# HG changeset patch # User aph # Date 1482143610 28800 # Node ID c89e1f0a084ebc6b356cd15ba5109039d8f11d17 # Parent 525f24ac5db038149bb00c91364290e6bec3ab05 8169177: AArch64: SIGSEGV when "-XX:+ZeroTLAB" is specified along with GC options Reviewed-by: aph Contributed-by: kavitha.natarajan@linaro.org diff -r 525f24ac5db0 -r c89e1f0a084e hotspot/src/cpu/aarch64/vm/c1_MacroAssembler_aarch64.cpp --- a/hotspot/src/cpu/aarch64/vm/c1_MacroAssembler_aarch64.cpp Mon Dec 19 08:31:01 2016 +0100 +++ b/hotspot/src/cpu/aarch64/vm/c1_MacroAssembler_aarch64.cpp Mon Dec 19 02:33:30 2016 -0800 @@ -195,95 +195,22 @@ } } -// Zero words; len is in bytes -// Destroys all registers except addr -// len must be a nonzero multiple of wordSize -void C1_MacroAssembler::zero_memory(Register addr, Register len, Register t1) { - assert_different_registers(addr, len, t1, rscratch1, rscratch2); - -#ifdef ASSERT - { Label L; - tst(len, BytesPerWord - 1); - br(Assembler::EQ, L); - stop("len is not a multiple of BytesPerWord"); - bind(L); - } -#endif - -#ifndef PRODUCT - block_comment("zero memory"); -#endif - - Label loop; - Label entry; - -// Algorithm: -// -// scratch1 = cnt & 7; -// cnt -= scratch1; -// p += scratch1; -// switch (scratch1) { -// do { -// cnt -= 8; -// p[-8] = 0; -// case 7: -// p[-7] = 0; -// case 6: -// p[-6] = 0; -// // ... -// case 1: -// p[-1] = 0; -// case 0: -// p += 8; -// } while (cnt); -// } - - const int unroll = 8; // Number of str(zr) instructions we'll unroll - - lsr(len, len, LogBytesPerWord); - andr(rscratch1, len, unroll - 1); // tmp1 = cnt % unroll - sub(len, len, rscratch1); // cnt -= unroll - // t1 always points to the end of the region we're about to zero - add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord); - adr(rscratch2, entry); - sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2); - br(rscratch2); - bind(loop); - sub(len, len, unroll); - for (int i = -unroll; i < 0; i++) - str(zr, Address(t1, i * wordSize)); - bind(entry); - add(t1, t1, unroll * wordSize); - cbnz(len, loop); -} - // preserves obj, destroys len_in_bytes void C1_MacroAssembler::initialize_body(Register obj, Register len_in_bytes, int hdr_size_in_bytes, Register t1) { + assert(hdr_size_in_bytes >= 0, "header size must be positive or 0"); Label done; - assert(obj != len_in_bytes && obj != t1 && t1 != len_in_bytes, "registers must be different"); - assert((hdr_size_in_bytes & (BytesPerWord - 1)) == 0, "header size is not a multiple of BytesPerWord"); - Register index = len_in_bytes; - // index is positive and ptr sized - subs(index, index, hdr_size_in_bytes); + + // len_in_bytes is positive and ptr sized + subs(len_in_bytes, len_in_bytes, hdr_size_in_bytes); br(Assembler::EQ, done); - // note: for the remaining code to work, index must be a multiple of BytesPerWord -#ifdef ASSERT - { Label L; - tst(index, BytesPerWord - 1); - br(Assembler::EQ, L); - stop("index is not a multiple of BytesPerWord"); - bind(L); - } -#endif // Preserve obj if (hdr_size_in_bytes) add(obj, obj, hdr_size_in_bytes); - zero_memory(obj, index, t1); + zero_memory(obj, len_in_bytes, t1); if (hdr_size_in_bytes) sub(obj, obj, hdr_size_in_bytes); - // done bind(done); } @@ -294,57 +221,59 @@ try_allocate(obj, noreg, object_size * BytesPerWord, t1, t2, slow_case); - initialize_object(obj, klass, noreg, object_size * HeapWordSize, t1, t2); + initialize_object(obj, klass, noreg, object_size * HeapWordSize, t1, t2, UseTLAB); } -void C1_MacroAssembler::initialize_object(Register obj, Register klass, Register var_size_in_bytes, int con_size_in_bytes, Register t1, Register t2) { +void C1_MacroAssembler::initialize_object(Register obj, Register klass, Register var_size_in_bytes, int con_size_in_bytes, Register t1, Register t2, bool is_tlab_allocated) { assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "con_size_in_bytes is not multiple of alignment"); const int hdr_size_in_bytes = instanceOopDesc::header_size() * HeapWordSize; initialize_header(obj, klass, noreg, t1, t2); - // clear rest of allocated space - const Register index = t2; - const int threshold = 16 * BytesPerWord; // approximate break even point for code size (see comments below) - if (var_size_in_bytes != noreg) { - mov(index, var_size_in_bytes); - initialize_body(obj, index, hdr_size_in_bytes, t1); - } else if (con_size_in_bytes <= threshold) { - // use explicit null stores - int i = hdr_size_in_bytes; - if (i < con_size_in_bytes && (con_size_in_bytes % (2 * BytesPerWord))) { - str(zr, Address(obj, i)); - i += BytesPerWord; - } - for (; i < con_size_in_bytes; i += 2 * BytesPerWord) - stp(zr, zr, Address(obj, i)); - } else if (con_size_in_bytes > hdr_size_in_bytes) { - block_comment("zero memory"); - // use loop to null out the fields + if (!(UseTLAB && ZeroTLAB && is_tlab_allocated)) { + // clear rest of allocated space + const Register index = t2; + const int threshold = 16 * BytesPerWord; // approximate break even point for code size (see comments below) + if (var_size_in_bytes != noreg) { + mov(index, var_size_in_bytes); + initialize_body(obj, index, hdr_size_in_bytes, t1); + } else if (con_size_in_bytes <= threshold) { + // use explicit null stores + int i = hdr_size_in_bytes; + if (i < con_size_in_bytes && (con_size_in_bytes % (2 * BytesPerWord))) { + str(zr, Address(obj, i)); + i += BytesPerWord; + } + for (; i < con_size_in_bytes; i += 2 * BytesPerWord) + stp(zr, zr, Address(obj, i)); + } else if (con_size_in_bytes > hdr_size_in_bytes) { + block_comment("zero memory"); + // use loop to null out the fields - int words = (con_size_in_bytes - hdr_size_in_bytes) / BytesPerWord; - mov(index, words / 8); + int words = (con_size_in_bytes - hdr_size_in_bytes) / BytesPerWord; + mov(index, words / 8); - const int unroll = 8; // Number of str(zr) instructions we'll unroll - int remainder = words % unroll; - lea(rscratch1, Address(obj, hdr_size_in_bytes + remainder * BytesPerWord)); + const int unroll = 8; // Number of str(zr) instructions we'll unroll + int remainder = words % unroll; + lea(rscratch1, Address(obj, hdr_size_in_bytes + remainder * BytesPerWord)); - Label entry_point, loop; - b(entry_point); + Label entry_point, loop; + b(entry_point); - bind(loop); - sub(index, index, 1); - for (int i = -unroll; i < 0; i++) { - if (-i == remainder) - bind(entry_point); - str(zr, Address(rscratch1, i * wordSize)); - } - if (remainder == 0) - bind(entry_point); - add(rscratch1, rscratch1, unroll * wordSize); - cbnz(index, loop); + bind(loop); + sub(index, index, 1); + for (int i = -unroll; i < 0; i++) { + if (-i == remainder) + bind(entry_point); + str(zr, Address(rscratch1, i * wordSize)); + } + if (remainder == 0) + bind(entry_point); + add(rscratch1, rscratch1, unroll * wordSize); + cbnz(index, loop); + } } membar(StoreStore); diff -r 525f24ac5db0 -r c89e1f0a084e hotspot/src/cpu/aarch64/vm/c1_MacroAssembler_aarch64.hpp --- a/hotspot/src/cpu/aarch64/vm/c1_MacroAssembler_aarch64.hpp Mon Dec 19 08:31:01 2016 +0100 +++ b/hotspot/src/cpu/aarch64/vm/c1_MacroAssembler_aarch64.hpp Mon Dec 19 02:33:30 2016 -0800 @@ -36,7 +36,6 @@ // initialization void pd_init() { _rsp_offset = 0; } -void zero_memory(Register addr, Register len, Register t1); public: void try_allocate( @@ -75,7 +74,8 @@ Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise int con_size_in_bytes, // object size in bytes if known at compile time Register t1, // temp register - Register t2 // temp register + Register t2, // temp register + bool is_tlab_allocated // the object was allocated in a TLAB; relevant for the implementation of ZeroTLAB ); // allocation of fixed-size objects diff -r 525f24ac5db0 -r c89e1f0a084e hotspot/src/cpu/aarch64/vm/c1_Runtime1_aarch64.cpp --- a/hotspot/src/cpu/aarch64/vm/c1_Runtime1_aarch64.cpp Mon Dec 19 08:31:01 2016 +0100 +++ b/hotspot/src/cpu/aarch64/vm/c1_Runtime1_aarch64.cpp Mon Dec 19 02:33:30 2016 -0800 @@ -728,7 +728,7 @@ __ tlab_allocate(obj, obj_size, 0, t1, t2, slow_path); - __ initialize_object(obj, klass, obj_size, 0, t1, t2); + __ initialize_object(obj, klass, obj_size, 0, t1, t2, /* is_tlab_allocated */ true); __ verify_oop(obj); __ ldp(r5, r19, Address(__ post(sp, 2 * wordSize))); __ ret(lr); @@ -740,7 +740,7 @@ __ eden_allocate(obj, obj_size, 0, t1, slow_path); __ incr_allocated_bytes(rthread, obj_size, 0, rscratch1); - __ initialize_object(obj, klass, obj_size, 0, t1, t2); + __ initialize_object(obj, klass, obj_size, 0, t1, t2, /* is_tlab_allocated */ false); __ verify_oop(obj); __ ldp(r5, r19, Address(__ post(sp, 2 * wordSize))); __ ret(lr); @@ -853,7 +853,9 @@ __ andr(t1, t1, Klass::_lh_header_size_mask); __ sub(arr_size, arr_size, t1); // body length __ add(t1, t1, obj); // body start - __ initialize_body(t1, arr_size, 0, t2); + if (!ZeroTLAB) { + __ initialize_body(t1, arr_size, 0, t2); + } __ verify_oop(obj); __ ret(lr); diff -r 525f24ac5db0 -r c89e1f0a084e hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp --- a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp Mon Dec 19 08:31:01 2016 +0100 +++ b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp Mon Dec 19 02:33:30 2016 -0800 @@ -3944,12 +3944,82 @@ add(top, top, t1); sub(top, top, (int32_t)ThreadLocalAllocBuffer::alignment_reserve_in_bytes()); str(top, Address(rthread, in_bytes(JavaThread::tlab_end_offset()))); + + if (ZeroTLAB) { + // This is a fast TLAB refill, therefore the GC is not notified of it. + // So compiled code must fill the new TLAB with zeroes. + ldr(top, Address(rthread, in_bytes(JavaThread::tlab_start_offset()))); + zero_memory(top,t1,t2); + } + verify_tlab(); b(retry); return rthread; // for use by caller } +// Zero words; len is in bytes +// Destroys all registers except addr +// len must be a nonzero multiple of wordSize +void MacroAssembler::zero_memory(Register addr, Register len, Register t1) { + assert_different_registers(addr, len, t1, rscratch1, rscratch2); + +#ifdef ASSERT + { Label L; + tst(len, BytesPerWord - 1); + br(Assembler::EQ, L); + stop("len is not a multiple of BytesPerWord"); + bind(L); + } +#endif + +#ifndef PRODUCT + block_comment("zero memory"); +#endif + + Label loop; + Label entry; + +// Algorithm: +// +// scratch1 = cnt & 7; +// cnt -= scratch1; +// p += scratch1; +// switch (scratch1) { +// do { +// cnt -= 8; +// p[-8] = 0; +// case 7: +// p[-7] = 0; +// case 6: +// p[-6] = 0; +// // ... +// case 1: +// p[-1] = 0; +// case 0: +// p += 8; +// } while (cnt); +// } + + const int unroll = 8; // Number of str(zr) instructions we'll unroll + + lsr(len, len, LogBytesPerWord); + andr(rscratch1, len, unroll - 1); // tmp1 = cnt % unroll + sub(len, len, rscratch1); // cnt -= unroll + // t1 always points to the end of the region we're about to zero + add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord); + adr(rscratch2, entry); + sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2); + br(rscratch2); + bind(loop); + sub(len, len, unroll); + for (int i = -unroll; i < 0; i++) + str(zr, Address(t1, i * wordSize)); + bind(entry); + add(t1, t1, unroll * wordSize); + cbnz(len, loop); +} + // Defines obj, preserves var_size_in_bytes void MacroAssembler::eden_allocate(Register obj, Register var_size_in_bytes, diff -r 525f24ac5db0 -r c89e1f0a084e hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp --- a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp Mon Dec 19 08:31:01 2016 +0100 +++ b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp Mon Dec 19 02:33:30 2016 -0800 @@ -857,6 +857,7 @@ Label& slow_case // continuation point if fast allocation fails ); Register tlab_refill(Label& retry_tlab, Label& try_eden, Label& slow_case); // returns TLS address + void zero_memory(Register addr, Register len, Register t1); void verify_tlab(); void incr_allocated_bytes(Register thread,