8086053: Address inconsistencies regarding ZeroTLAB
author zmajo
Tue, 12 Jan 2016 09:19:09 +0100
changeset 35548 8d3afe96ffea
parent 35547 0ee84aa8e705
child 35550 633a22d66bd7
8086053: Address inconsistencies regarding ZeroTLAB Summary: Add zero-initialization to C1 for fast TLAB refills; strengthen C2 conditions for skipping zero-initialization. Reviewed-by: kvn, thartmann
hotspot/src/cpu/sparc/vm/c1_MacroAssembler_sparc.cpp
hotspot/src/cpu/sparc/vm/c1_MacroAssembler_sparc.hpp
hotspot/src/cpu/sparc/vm/c1_Runtime1_sparc.cpp
hotspot/src/cpu/sparc/vm/macroAssembler_sparc.cpp
hotspot/src/cpu/sparc/vm/macroAssembler_sparc.hpp
hotspot/src/cpu/x86/vm/c1_MacroAssembler_x86.cpp
hotspot/src/cpu/x86/vm/c1_MacroAssembler_x86.hpp
hotspot/src/cpu/x86/vm/c1_Runtime1_x86.cpp
hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp
hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp
hotspot/src/share/vm/gc/shared/threadLocalAllocBuffer.cpp
hotspot/src/share/vm/gc/shared/threadLocalAllocBuffer.hpp
hotspot/src/share/vm/opto/library_call.cpp
hotspot/src/share/vm/opto/macro.cpp
hotspot/src/share/vm/opto/macroArrayCopy.cpp
hotspot/src/share/vm/opto/memnode.cpp
hotspot/test/TEST.groups
hotspot/test/compiler/memoryinitialization/ZeroTLABTest.java
--- a/hotspot/src/cpu/sparc/vm/c1_MacroAssembler_sparc.cpp	Mon Jan 11 14:23:35 2016 +0100
+++ b/hotspot/src/cpu/sparc/vm/c1_MacroAssembler_sparc.cpp	Tue Jan 12 09:19:09 2016 +0100
@@ -205,12 +205,7 @@
 
 
 void C1_MacroAssembler::initialize_body(Register base, Register index) {
-  assert_different_registers(base, index);
-  Label loop;
-  bind(loop);
-  subcc(index, HeapWordSize, index);
-  brx(Assembler::greaterEqual, true, Assembler::pt, loop);
-  delayed()->st_ptr(G0, base, index);
+  zero_memory(base, index);  // the clearing loop removed above now lives in MacroAssembler::zero_memory
 }
 
 
@@ -237,7 +232,7 @@
   }
   try_allocate(obj, noreg, obj_size * wordSize, t2, t3, slow_case);
 
-  initialize_object(obj, klass, noreg, obj_size * HeapWordSize, t1, t2);
+  initialize_object(obj, klass, noreg, obj_size * HeapWordSize, t1, t2, /* is_tlab_allocated */ UseTLAB);
 }
 
 void C1_MacroAssembler::initialize_object(
@@ -246,7 +241,8 @@
   Register var_size_in_bytes,          // object size in bytes if unknown at compile time; invalid otherwise
   int      con_size_in_bytes,          // object size in bytes if   known at compile time
   Register t1,                         // temp register
-  Register t2                          // temp register
+  Register t2,                         // temp register
+  bool     is_tlab_allocated           // the object was allocated in a TLAB; relevant for the implementation of ZeroTLAB
   ) {
   const int hdr_size_in_bytes = instanceOopDesc::header_size() * HeapWordSize;
 
@@ -269,31 +265,33 @@
 
 #endif
 
-  // initialize body
-  const int threshold = 5 * HeapWordSize;              // approximate break even point for code size
-  if (var_size_in_bytes != noreg) {
-    // use a loop
-    add(obj, hdr_size_in_bytes, t1);               // compute address of first element
-    sub(var_size_in_bytes, hdr_size_in_bytes, t2); // compute size of body
-    initialize_body(t1, t2);
+  if (!(UseTLAB && ZeroTLAB && is_tlab_allocated)) {
+    // initialize body
+    const int threshold = 5 * HeapWordSize;              // approximate break even point for code size
+    if (var_size_in_bytes != noreg) {
+      // use a loop
+      add(obj, hdr_size_in_bytes, t1);               // compute address of first element
+      sub(var_size_in_bytes, hdr_size_in_bytes, t2); // compute size of body
+      initialize_body(t1, t2);
 #ifndef _LP64
-  } else if (con_size_in_bytes < threshold * 2) {
-    // on v9 we can do double word stores to fill twice as much space.
-    assert(hdr_size_in_bytes % 8 == 0, "double word aligned");
-    assert(con_size_in_bytes % 8 == 0, "double word aligned");
-    for (int i = hdr_size_in_bytes; i < con_size_in_bytes; i += 2 * HeapWordSize) stx(G0, obj, i);
+    } else if (con_size_in_bytes < threshold * 2) {
+      // on v9 we can do double word stores to fill twice as much space.
+      assert(hdr_size_in_bytes % 8 == 0, "double word aligned");
+      assert(con_size_in_bytes % 8 == 0, "double word aligned");
+      for (int i = hdr_size_in_bytes; i < con_size_in_bytes; i += 2 * HeapWordSize) stx(G0, obj, i);
 #endif
-  } else if (con_size_in_bytes <= threshold) {
-    // use explicit NULL stores
-    for (int i = hdr_size_in_bytes; i < con_size_in_bytes; i += HeapWordSize)     st_ptr(G0, obj, i);
-  } else if (con_size_in_bytes > hdr_size_in_bytes) {
-    // use a loop
-    const Register base  = t1;
-    const Register index = t2;
-    add(obj, hdr_size_in_bytes, base);               // compute address of first element
-    // compute index = number of words to clear
-    set(con_size_in_bytes - hdr_size_in_bytes, index);
-    initialize_body(base, index);
+    } else if (con_size_in_bytes <= threshold) {
+      // use explicit NULL stores
+      for (int i = hdr_size_in_bytes; i < con_size_in_bytes; i += HeapWordSize)     st_ptr(G0, obj, i);
+    } else if (con_size_in_bytes > hdr_size_in_bytes) {
+      // use a loop
+      const Register base  = t1;
+      const Register index = t2;
+      add(obj, hdr_size_in_bytes, base);               // compute address of first element
+      // compute index = number of bytes to clear
+      set(con_size_in_bytes - hdr_size_in_bytes, index);
+      initialize_body(base, index);
+    }
   }
 
   if (CURRENT_ENV->dtrace_alloc_probes()) {
--- a/hotspot/src/cpu/sparc/vm/c1_MacroAssembler_sparc.hpp	Mon Jan 11 14:23:35 2016 +0100
+++ b/hotspot/src/cpu/sparc/vm/c1_MacroAssembler_sparc.hpp	Tue Jan 12 09:19:09 2016 +0100
@@ -50,7 +50,8 @@
     Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
     int      con_size_in_bytes,        // object size in bytes if   known at compile time
     Register t1,                       // temp register
-    Register t2                        // temp register
+    Register t2,                       // temp register
+    bool is_tlab_allocated             // the object was allocated in a TLAB; relevant for the implementation of ZeroTLAB
   );
 
   // allocation of fixed-size objects
--- a/hotspot/src/cpu/sparc/vm/c1_Runtime1_sparc.cpp	Mon Jan 11 14:23:35 2016 +0100
+++ b/hotspot/src/cpu/sparc/vm/c1_Runtime1_sparc.cpp	Tue Jan 12 09:19:09 2016 +0100
@@ -435,7 +435,7 @@
 
           __ tlab_allocate(O0_obj, G1_obj_size, 0, G3_t1, slow_path);
 
-          __ initialize_object(O0_obj, G5_klass, G1_obj_size, 0, G3_t1, G4_t2);
+          __ initialize_object(O0_obj, G5_klass, G1_obj_size, 0, G3_t1, G4_t2, /* is_tlab_allocated */ true);
           __ verify_oop(O0_obj);
           __ mov(O0, I0);
           __ ret();
@@ -447,7 +447,7 @@
           __ eden_allocate(O0_obj, G1_obj_size, 0, G3_t1, G4_t2, slow_path);
           __ incr_allocated_bytes(G1_obj_size, G3_t1, G4_t2);
 
-          __ initialize_object(O0_obj, G5_klass, G1_obj_size, 0, G3_t1, G4_t2);
+          __ initialize_object(O0_obj, G5_klass, G1_obj_size, 0, G3_t1, G4_t2, /* is_tlab_allocated */ false);
           __ verify_oop(O0_obj);
           __ mov(O0, I0);
           __ ret();
@@ -542,7 +542,9 @@
           __ ldub(klass_lh, G3_t1, klass_lh_header_size_offset);
           __ sub(G1_arr_size, G3_t1, O1_t2);  // body length
           __ add(O0_obj, G3_t1, G3_t1);       // body start
-          __ initialize_body(G3_t1, O1_t2);
+          if (!ZeroTLAB) {
+            __ initialize_body(G3_t1, O1_t2);
+          }
           __ verify_oop(O0_obj);
           __ retl();
           __ delayed()->nop();
--- a/hotspot/src/cpu/sparc/vm/macroAssembler_sparc.cpp	Mon Jan 11 14:23:35 2016 +0100
+++ b/hotspot/src/cpu/sparc/vm/macroAssembler_sparc.cpp	Tue Jan 12 09:19:09 2016 +0100
@@ -3459,11 +3459,27 @@
   add(top, t1, top); // t1 is tlab_size
   sub(top, ThreadLocalAllocBuffer::alignment_reserve_in_bytes(), top);
   st_ptr(top, G2_thread, in_bytes(JavaThread::tlab_end_offset()));
+
+  if (ZeroTLAB) {
+    // This is a fast TLAB refill, therefore the GC is not notified of it.
+    // So compiled code must fill the new TLAB with zeroes.
+    ld_ptr(G2_thread, in_bytes(JavaThread::tlab_start_offset()), t2);
+    zero_memory(t2, t1);
+  }
   verify_tlab();
   ba(retry);
   delayed()->nop();
 }
 
+void MacroAssembler::zero_memory(Register base, Register index) {  // stores G0 over the index bytes starting at base, highest word first; index is destroyed
+  assert_different_registers(base, index);
+  Label loop;
+  bind(loop);
+  subcc(index, HeapWordSize, index);  // step index down by one word and set the condition codes
+  brx(Assembler::greaterEqual, true, Assembler::pt, loop);  // keep looping while index >= 0; 'true' is presumably the annul bit, so the store below is skipped on loop exit (confirm)
+  delayed()->st_ptr(G0, base, index);  // delay slot: store G0 to the word at base + index
+}
+
 void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes,
                                           Register t1, Register t2) {
   // Bump total bytes allocated by this thread
--- a/hotspot/src/cpu/sparc/vm/macroAssembler_sparc.hpp	Mon Jan 11 14:23:35 2016 +0100
+++ b/hotspot/src/cpu/sparc/vm/macroAssembler_sparc.hpp	Tue Jan 12 09:19:09 2016 +0100
@@ -1278,6 +1278,7 @@
     Label&   slow_case                 // continuation point if fast allocation fails
   );
   void tlab_refill(Label& retry_tlab, Label& try_eden, Label& slow_case);
+  void zero_memory(Register base, Register index);
   void incr_allocated_bytes(RegisterOrConstant size_in_bytes,
                             Register t1, Register t2);
 
--- a/hotspot/src/cpu/x86/vm/c1_MacroAssembler_x86.cpp	Mon Jan 11 14:23:35 2016 +0100
+++ b/hotspot/src/cpu/x86/vm/c1_MacroAssembler_x86.cpp	Tue Jan 12 09:19:09 2016 +0100
@@ -182,54 +182,13 @@
 
 // preserves obj, destroys len_in_bytes
 void C1_MacroAssembler::initialize_body(Register obj, Register len_in_bytes, int hdr_size_in_bytes, Register t1) {
+  assert(hdr_size_in_bytes >= 0, "header size must be positive or 0");
   Label done;
-  assert(obj != len_in_bytes && obj != t1 && t1 != len_in_bytes, "registers must be different");
-  assert((hdr_size_in_bytes & (BytesPerWord - 1)) == 0, "header size is not a multiple of BytesPerWord");
-  Register index = len_in_bytes;
-  // index is positive and ptr sized
-  subptr(index, hdr_size_in_bytes);
+
+  // len_in_bytes is positive and ptr sized
+  subptr(len_in_bytes, hdr_size_in_bytes);  // drop the header so len_in_bytes covers only the bytes after it
   jcc(Assembler::zero, done);
-  // initialize topmost word, divide index by 2, check if odd and test if zero
-  // note: for the remaining code to work, index must be a multiple of BytesPerWord
-#ifdef ASSERT
-  { Label L;
-    testptr(index, BytesPerWord - 1);
-    jcc(Assembler::zero, L);
-    stop("index is not a multiple of BytesPerWord");
-    bind(L);
-  }
-#endif
-  xorptr(t1, t1);    // use _zero reg to clear memory (shorter code)
-  if (UseIncDec) {
-    shrptr(index, 3);  // divide by 8/16 and set carry flag if bit 2 was set
-  } else {
-    shrptr(index, 2);  // use 2 instructions to avoid partial flag stall
-    shrptr(index, 1);
-  }
-#ifndef _LP64
-  // index could have been not a multiple of 8 (i.e., bit 2 was set)
-  { Label even;
-    // note: if index was a multiple of 8, than it cannot
-    //       be 0 now otherwise it must have been 0 before
-    //       => if it is even, we don't need to check for 0 again
-    jcc(Assembler::carryClear, even);
-    // clear topmost word (no jump needed if conditional assignment would work here)
-    movptr(Address(obj, index, Address::times_8, hdr_size_in_bytes - 0*BytesPerWord), t1);
-    // index could be 0 now, need to check again
-    jcc(Assembler::zero, done);
-    bind(even);
-  }
-#endif // !_LP64
-  // initialize remaining object fields: rdx is a multiple of 2 now
-  { Label loop;
-    bind(loop);
-    movptr(Address(obj, index, Address::times_8, hdr_size_in_bytes - 1*BytesPerWord), t1);
-    NOT_LP64(movptr(Address(obj, index, Address::times_8, hdr_size_in_bytes - 2*BytesPerWord), t1);)
-    decrement(index);
-    jcc(Assembler::notZero, loop);
-  }
-
-  // done
+  zero_memory(obj, len_in_bytes, hdr_size_in_bytes, t1);  // zero len_in_bytes bytes starting at obj + hdr_size_in_bytes
   bind(done);
 }
 
@@ -241,47 +200,49 @@
 
   try_allocate(obj, noreg, object_size * BytesPerWord, t1, t2, slow_case);
 
-  initialize_object(obj, klass, noreg, object_size * HeapWordSize, t1, t2);
+  initialize_object(obj, klass, noreg, object_size * HeapWordSize, t1, t2, UseTLAB);
 }
 
-void C1_MacroAssembler::initialize_object(Register obj, Register klass, Register var_size_in_bytes, int con_size_in_bytes, Register t1, Register t2) {
+void C1_MacroAssembler::initialize_object(Register obj, Register klass, Register var_size_in_bytes, int con_size_in_bytes, Register t1, Register t2, bool is_tlab_allocated) {
   assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0,
          "con_size_in_bytes is not multiple of alignment");
   const int hdr_size_in_bytes = instanceOopDesc::header_size() * HeapWordSize;
 
   initialize_header(obj, klass, noreg, t1, t2);
 
-  // clear rest of allocated space
-  const Register t1_zero = t1;
-  const Register index = t2;
-  const int threshold = 6 * BytesPerWord;   // approximate break even point for code size (see comments below)
-  if (var_size_in_bytes != noreg) {
-    mov(index, var_size_in_bytes);
-    initialize_body(obj, index, hdr_size_in_bytes, t1_zero);
-  } else if (con_size_in_bytes <= threshold) {
-    // use explicit null stores
-    // code size = 2 + 3*n bytes (n = number of fields to clear)
-    xorptr(t1_zero, t1_zero); // use t1_zero reg to clear memory (shorter code)
-    for (int i = hdr_size_in_bytes; i < con_size_in_bytes; i += BytesPerWord)
-      movptr(Address(obj, i), t1_zero);
-  } else if (con_size_in_bytes > hdr_size_in_bytes) {
-    // use loop to null out the fields
-    // code size = 16 bytes for even n (n = number of fields to clear)
-    // initialize last object field first if odd number of fields
-    xorptr(t1_zero, t1_zero); // use t1_zero reg to clear memory (shorter code)
-    movptr(index, (con_size_in_bytes - hdr_size_in_bytes) >> 3);
-    // initialize last object field if constant size is odd
-    if (((con_size_in_bytes - hdr_size_in_bytes) & 4) != 0)
-      movptr(Address(obj, con_size_in_bytes - (1*BytesPerWord)), t1_zero);
-    // initialize remaining object fields: rdx is a multiple of 2
-    { Label loop;
-      bind(loop);
-      movptr(Address(obj, index, Address::times_8, hdr_size_in_bytes - (1*BytesPerWord)),
-             t1_zero);
-      NOT_LP64(movptr(Address(obj, index, Address::times_8, hdr_size_in_bytes - (2*BytesPerWord)),
-             t1_zero);)
-      decrement(index);
-      jcc(Assembler::notZero, loop);
+  if (!(UseTLAB && ZeroTLAB && is_tlab_allocated)) {
+    // clear rest of allocated space
+    const Register t1_zero = t1;
+    const Register index = t2;
+    const int threshold = 6 * BytesPerWord;   // approximate break even point for code size (see comments below)
+    if (var_size_in_bytes != noreg) {
+      mov(index, var_size_in_bytes);
+      initialize_body(obj, index, hdr_size_in_bytes, t1_zero);
+    } else if (con_size_in_bytes <= threshold) {
+      // use explicit null stores
+      // code size = 2 + 3*n bytes (n = number of fields to clear)
+      xorptr(t1_zero, t1_zero); // use t1_zero reg to clear memory (shorter code)
+      for (int i = hdr_size_in_bytes; i < con_size_in_bytes; i += BytesPerWord)
+        movptr(Address(obj, i), t1_zero);
+    } else if (con_size_in_bytes > hdr_size_in_bytes) {
+      // use loop to null out the fields
+      // code size = 16 bytes for even n (n = number of fields to clear)
+      // initialize last object field first if odd number of fields
+      xorptr(t1_zero, t1_zero); // use t1_zero reg to clear memory (shorter code)
+      movptr(index, (con_size_in_bytes - hdr_size_in_bytes) >> 3);
+      // initialize last object field if constant size is odd
+      if (((con_size_in_bytes - hdr_size_in_bytes) & 4) != 0)
+        movptr(Address(obj, con_size_in_bytes - (1*BytesPerWord)), t1_zero);
+      // initialize remaining object fields: rdx is a multiple of 2
+      { Label loop;
+        bind(loop);
+        movptr(Address(obj, index, Address::times_8, hdr_size_in_bytes - (1*BytesPerWord)),
+               t1_zero);
+        NOT_LP64(movptr(Address(obj, index, Address::times_8, hdr_size_in_bytes - (2*BytesPerWord)),
+               t1_zero);)
+        decrement(index);
+        jcc(Assembler::notZero, loop);
+      }
     }
   }
 
--- a/hotspot/src/cpu/x86/vm/c1_MacroAssembler_x86.hpp	Mon Jan 11 14:23:35 2016 +0100
+++ b/hotspot/src/cpu/x86/vm/c1_MacroAssembler_x86.hpp	Tue Jan 12 09:19:09 2016 +0100
@@ -65,7 +65,8 @@
     Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
     int      con_size_in_bytes,        // object size in bytes if   known at compile time
     Register t1,                       // temp register
-    Register t2                        // temp register
+    Register t2,                       // temp register
+    bool     is_tlab_allocated         // the object was allocated in a TLAB; relevant for the implementation of ZeroTLAB
   );
 
   // allocation of fixed-size objects
--- a/hotspot/src/cpu/x86/vm/c1_Runtime1_x86.cpp	Mon Jan 11 14:23:35 2016 +0100
+++ b/hotspot/src/cpu/x86/vm/c1_Runtime1_x86.cpp	Tue Jan 12 09:19:09 2016 +0100
@@ -1040,7 +1040,7 @@
 
           __ tlab_allocate(obj, obj_size, 0, t1, t2, slow_path);
 
-          __ initialize_object(obj, klass, obj_size, 0, t1, t2);
+          __ initialize_object(obj, klass, obj_size, 0, t1, t2, /* is_tlab_allocated */ true);
           __ verify_oop(obj);
           __ pop(rbx);
           __ pop(rdi);
@@ -1053,7 +1053,7 @@
           __ eden_allocate(obj, obj_size, 0, t1, slow_path);
           __ incr_allocated_bytes(thread, obj_size, 0);
 
-          __ initialize_object(obj, klass, obj_size, 0, t1, t2);
+          __ initialize_object(obj, klass, obj_size, 0, t1, t2, /* is_tlab_allocated */ false);
           __ verify_oop(obj);
           __ pop(rbx);
           __ pop(rdi);
@@ -1169,7 +1169,9 @@
           __ andptr(t1, Klass::_lh_header_size_mask);
           __ subptr(arr_size, t1);  // body length
           __ addptr(t1, obj);       // body start
-          __ initialize_body(t1, arr_size, 0, t2);
+          if (!ZeroTLAB) {
+            __ initialize_body(t1, arr_size, 0, t2);
+          }
           __ verify_oop(obj);
           __ ret(0);
 
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp	Mon Jan 11 14:23:35 2016 +0100
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp	Tue Jan 12 09:19:09 2016 +0100
@@ -5426,7 +5426,7 @@
                                      Label& try_eden,
                                      Label& slow_case) {
   Register top = rax;
-  Register t1  = rcx;
+  Register t1  = rcx; // object size
   Register t2  = rsi;
   Register thread_reg = NOT_LP64(rdi) LP64_ONLY(r15_thread);
   assert_different_registers(top, thread_reg, t1, t2, /* preserve: */ rbx, rdx);
@@ -5522,12 +5522,76 @@
   addptr(top, t1);
   subptr(top, (int32_t)ThreadLocalAllocBuffer::alignment_reserve_in_bytes());
   movptr(Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())), top);
+
+  if (ZeroTLAB) {
+    // This is a fast TLAB refill, therefore the GC is not notified of it.
+    // So compiled code must fill the new TLAB with zeroes.
+    movptr(top, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
+    zero_memory(top, t1, 0, t2);
+  }
+
   verify_tlab();
   jmp(retry);
 
   return thread_reg; // for use by caller
 }
 
+// Zeroes length_in_bytes bytes of memory starting at address + offset_in_bytes. Preserves the contents of address; destroys the contents of length_in_bytes and temp.
+void MacroAssembler::zero_memory(Register address, Register length_in_bytes, int offset_in_bytes, Register temp) {
+  assert(address != length_in_bytes && address != temp && temp != length_in_bytes, "registers must be different");
+  assert((offset_in_bytes & (BytesPerWord - 1)) == 0, "offset must be a multiple of BytesPerWord");
+  Label done;
+
+  testptr(length_in_bytes, length_in_bytes);  // nothing to clear when the length is zero
+  jcc(Assembler::zero, done);
+
+  // initialize topmost word, divide index by 2, check if odd and test if zero
+  // note: for the remaining code to work, index must be a multiple of BytesPerWord
+#ifdef ASSERT
+  {
+    Label L;
+    testptr(length_in_bytes, BytesPerWord - 1);
+    jcc(Assembler::zero, L);
+    stop("length must be a multiple of BytesPerWord");
+    bind(L);
+  }
+#endif
+  Register index = length_in_bytes;  // alias: the length register is reused as the loop counter
+  xorptr(temp, temp);    // use _zero reg to clear memory (shorter code)
+  if (UseIncDec) {
+    shrptr(index, 3);  // divide by 8/16 and set carry flag if bit 2 was set
+  } else {
+    shrptr(index, 2);  // use 2 instructions to avoid partial flag stall
+    shrptr(index, 1);
+  }
+#ifndef _LP64
+  // index might not have been a multiple of 8 (i.e., bit 2 was set)
+  {
+    Label even;
+    // note: if index was a multiple of 8, then it cannot
+    //       be 0 now otherwise it must have been 0 before
+    //       => if it is even, we don't need to check for 0 again
+    jcc(Assembler::carryClear, even);
+    // clear topmost word (no jump would be needed if conditional assignment worked here)
+    movptr(Address(address, index, Address::times_8, offset_in_bytes - 0*BytesPerWord), temp);
+    // index could be 0 now, must check again
+    jcc(Assembler::zero, done);
+    bind(even);
+  }
+#endif // !_LP64
+  // initialize remaining object fields: index is a multiple of 2 now
+  {
+    Label loop;
+    bind(loop);
+    movptr(Address(address, index, Address::times_8, offset_in_bytes - 1*BytesPerWord), temp);  // index counts 8-byte units and runs down to 1
+    NOT_LP64(movptr(Address(address, index, Address::times_8, offset_in_bytes - 2*BytesPerWord), temp);)
+    decrement(index);
+    jcc(Assembler::notZero, loop);
+  }
+
+  bind(done);
+}
+
 void MacroAssembler::incr_allocated_bytes(Register thread,
                                           Register var_size_in_bytes,
                                           int con_size_in_bytes,
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp	Mon Jan 11 14:23:35 2016 +0100
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp	Tue Jan 12 09:19:09 2016 +0100
@@ -522,6 +522,8 @@
     Label&   slow_case                 // continuation point if fast allocation fails
   );
   Register tlab_refill(Label& retry_tlab, Label& try_eden, Label& slow_case); // returns TLS address
+  void zero_memory(Register address, Register length_in_bytes, int offset_in_bytes, Register temp);
+
   void incr_allocated_bytes(Register thread,
                             Register var_size_in_bytes, int con_size_in_bytes,
                             Register t1 = noreg);
--- a/hotspot/src/share/vm/gc/shared/threadLocalAllocBuffer.cpp	Mon Jan 11 14:23:35 2016 +0100
+++ b/hotspot/src/share/vm/gc/shared/threadLocalAllocBuffer.cpp	Tue Jan 12 09:19:09 2016 +0100
@@ -105,7 +105,7 @@
 // an illusion of a contiguous Eden and optionally retires the tlab.
 // Waste accounting should be done in caller as appropriate; see,
 // for example, clear_before_allocation().
-void ThreadLocalAllocBuffer::make_parsable(bool retire) {
+void ThreadLocalAllocBuffer::make_parsable(bool retire, bool zap) {
   if (end() != NULL) {
     invariants();
 
@@ -113,7 +113,7 @@
       myThread()->incr_allocated_bytes(used_bytes());
     }
 
-    CollectedHeap::fill_with_object(top(), hard_end(), retire);
+    CollectedHeap::fill_with_object(top(), hard_end(), retire && zap);
 
     if (retire || ZeroTLAB) {  // "Reset" the TLAB
       set_start(NULL);
--- a/hotspot/src/share/vm/gc/shared/threadLocalAllocBuffer.hpp	Mon Jan 11 14:23:35 2016 +0100
+++ b/hotspot/src/share/vm/gc/shared/threadLocalAllocBuffer.hpp	Tue Jan 12 09:19:09 2016 +0100
@@ -145,8 +145,8 @@
   // Initialization at startup
   static void startup_initialization();
 
-  // Make an in-use tlab parsable, optionally also retiring it.
-  void make_parsable(bool retire);
+  // Make an in-use tlab parsable, optionally retiring and/or zapping it.
+  void make_parsable(bool retire, bool zap = true);
 
   // Retire in-use tlab before allocation of a new tlab
   void clear_before_allocation();
--- a/hotspot/src/share/vm/opto/library_call.cpp	Mon Jan 11 14:23:35 2016 +0100
+++ b/hotspot/src/share/vm/opto/library_call.cpp	Tue Jan 12 09:19:09 2016 +0100
@@ -3077,7 +3077,7 @@
   set_control( _gvn.transform(new IfTrueNode(iff_arg)));
 #else
   // To return true on Windows you must read the _interrupted field
-  // and check the the event state i.e. take the slow path.
+  // and check the event state i.e. take the slow path.
 #endif // TARGET_OS_FAMILY_windows
 
   // (d) Otherwise, go to the slow path.
--- a/hotspot/src/share/vm/opto/macro.cpp	Mon Jan 11 14:23:35 2016 +0100
+++ b/hotspot/src/share/vm/opto/macro.cpp	Tue Jan 12 09:19:09 2016 +0100
@@ -1813,10 +1813,11 @@
     // there can be two Allocates to one Initialize.  The answer in all these
     // edge cases is safety first.  It is always safe to clear immediately
     // within an Allocate, and then (maybe or maybe not) clear some more later.
-    if (!ZeroTLAB)
+    if (!(UseTLAB && ZeroTLAB)) {
       rawmem = ClearArrayNode::clear_memory(control, rawmem, object,
                                             header_size, size_in_bytes,
                                             &_igvn);
+    }
   } else {
     if (!init->is_complete()) {
       // Try to win by zeroing only what the init does not store.
--- a/hotspot/src/share/vm/opto/macroArrayCopy.cpp	Mon Jan 11 14:23:35 2016 +0100
+++ b/hotspot/src/share/vm/opto/macroArrayCopy.cpp	Tue Jan 12 09:19:09 2016 +0100
@@ -295,7 +295,7 @@
   // out-edges of the dest, we need to avoid making derived pointers
   // from it until we have checked its uses.)
   if (ReduceBulkZeroing
-      && !ZeroTLAB              // pointless if already zeroed
+      && !(UseTLAB && ZeroTLAB) // pointless if already zeroed
       && basic_elem_type != T_CONFLICT // avoid corner case
       && !src->eqv_uncast(dest)
       && alloc != NULL
--- a/hotspot/src/share/vm/opto/memnode.cpp	Mon Jan 11 14:23:35 2016 +0100
+++ b/hotspot/src/share/vm/opto/memnode.cpp	Tue Jan 12 09:19:09 2016 +0100
@@ -3850,7 +3850,7 @@
   bool do_zeroing = true;       // we might give up if inits are very sparse
   int  big_init_gaps = 0;       // how many large gaps have we seen?
 
-  if (ZeroTLAB)  do_zeroing = false;
+  if (UseTLAB && ZeroTLAB)  do_zeroing = false;
   if (!ReduceFieldZeroing && !ReduceBulkZeroing)  do_zeroing = false;
 
   for (uint i = InitializeNode::RawStores, limit = req(); i < limit; i++) {
@@ -3951,7 +3951,7 @@
   remove_extra_zeroes();        // clear out all the zmems left over
   add_req(inits);
 
-  if (!ZeroTLAB) {
+  if (!(UseTLAB && ZeroTLAB)) {
     // If anything remains to be zeroed, zero it all now.
     zeroes_done = align_size_down(zeroes_done, BytesPerInt);
     // if it is the last unused 4 bytes of an instance, forget about it
--- a/hotspot/test/TEST.groups	Mon Jan 11 14:23:35 2016 +0100
+++ b/hotspot/test/TEST.groups	Tue Jan 12 09:19:09 2016 +0100
@@ -288,6 +288,7 @@
   compiler/jsr292/ \
   compiler/loopopts/ \
   compiler/macronodes/ \
+  compiler/memoryinitialization/ \
   compiler/osr/ \
   compiler/regalloc/ \
   compiler/runtime/ \
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/test/compiler/memoryinitialization/ZeroTLABTest.java	Tue Jan 12 09:19:09 2016 +0100
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/*
+ * @test
+ * @bug 8086053
+ * @run main/othervm -Xcomp -XX:+UseTLAB -XX:+ZeroTLAB ZeroTLABTest
+ * @run main/othervm -Xcomp -XX:+UseTLAB -XX:-ZeroTLAB ZeroTLABTest
+ * @run main/othervm -Xcomp -XX:-UseTLAB -XX:+ZeroTLAB ZeroTLABTest
+ * @run main/othervm -Xcomp -XX:-UseTLAB -XX:-ZeroTLAB ZeroTLABTest
+ */
+public class ZeroTLABTest {
+    public static void main(String args[]) {
+        System.out.println("Test PASSED");  // no explicit checks: main only prints this line, once per @run flag combination listed above
+    }
+}