# HG changeset patch # User avoitylov # Date 1537796664 -10800 # Node ID cc1a4a26779898bf547f020b4a25a683487d6f44 # Parent f5daffd7ec7a421eb46e765ae19d2cc56d0482b3 8210466: Modularize allocations in assembler Reviewed-by: rkennke, dsamersoff diff -r f5daffd7ec7a -r cc1a4a267798 src/hotspot/cpu/arm/c1_MacroAssembler_arm.cpp --- a/src/hotspot/cpu/arm/c1_MacroAssembler_arm.cpp Mon Sep 24 16:39:02 2018 +0300 +++ b/src/hotspot/cpu/arm/c1_MacroAssembler_arm.cpp Mon Sep 24 16:44:24 2018 +0300 @@ -90,7 +90,6 @@ tlab_allocate(obj, obj_end, tmp1, size_expression, slow_case); } else { eden_allocate(obj, obj_end, tmp1, tmp2, size_expression, slow_case); - incr_allocated_bytes(size_expression, tmp1); } } diff -r f5daffd7ec7a -r cc1a4a267798 src/hotspot/cpu/arm/c1_Runtime1_arm.cpp --- a/src/hotspot/cpu/arm/c1_Runtime1_arm.cpp Mon Sep 24 16:39:02 2018 +0300 +++ b/src/hotspot/cpu/arm/c1_Runtime1_arm.cpp Mon Sep 24 16:44:24 2018 +0300 @@ -569,7 +569,6 @@ __ ldr_u32(obj_size, Address(klass, Klass::layout_helper_offset())); __ eden_allocate(result, obj_end, tmp1, tmp2, obj_size, slow_case); // initializes result and obj_end - __ incr_allocated_bytes(obj_size, tmp2); __ initialize_object(result, obj_end, klass, noreg /* len */, tmp1, tmp2, instanceOopDesc::header_size() * HeapWordSize, -1, /* is_tlab_allocated */ false); @@ -658,7 +657,6 @@ // eden_allocate destroys tmp2, so reload header_size after allocation // eden_allocate initializes result and obj_end __ eden_allocate(result, obj_end, tmp1, tmp2, arr_size, slow_case); - __ incr_allocated_bytes(arr_size, tmp2); __ ldrb(tmp2, Address(klass, in_bytes(Klass::layout_helper_offset()) + Klass::_lh_header_size_shift / BitsPerByte)); __ initialize_object(result, obj_end, klass, length, tmp1, tmp2, tmp2, -1, /* is_tlab_allocated */ false); diff -r f5daffd7ec7a -r cc1a4a267798 src/hotspot/cpu/arm/gc/shared/barrierSetAssembler_arm.cpp --- a/src/hotspot/cpu/arm/gc/shared/barrierSetAssembler_arm.cpp Mon Sep 24 16:39:02 2018 +0300 +++ 
b/src/hotspot/cpu/arm/gc/shared/barrierSetAssembler_arm.cpp Mon Sep 24 16:44:24 2018 +0300 @@ -24,6 +24,8 @@ #include "precompiled.hpp" #include "gc/shared/barrierSetAssembler.hpp" +#include "gc/shared/collectedHeap.hpp" +#include "runtime/thread.hpp" #define __ masm-> @@ -166,3 +168,118 @@ Register obj1, Register obj2) { __ cmp(obj1, obj2); } + +// Puts address of allocated object into register `obj` and end of allocated object into register `obj_end`. +void BarrierSetAssembler::eden_allocate(MacroAssembler* masm, Register obj, Register obj_end, Register tmp1, Register tmp2, + RegisterOrConstant size_expression, Label& slow_case) { + if (!Universe::heap()->supports_inline_contig_alloc()) { + __ b(slow_case); + return; + } + + CollectedHeap* ch = Universe::heap(); + + const Register top_addr = tmp1; + const Register heap_end = tmp2; + + if (size_expression.is_register()) { + assert_different_registers(obj, obj_end, top_addr, heap_end, size_expression.as_register()); + } else { + assert_different_registers(obj, obj_end, top_addr, heap_end); + } + + bool load_const = AARCH64_ONLY(false) NOT_AARCH64(VM_Version::supports_movw() ); // TODO-AARCH64 check performance + if (load_const) { + __ mov_address(top_addr, (address)Universe::heap()->top_addr(), symbolic_Relocation::eden_top_reference); + } else { + __ ldr(top_addr, Address(Rthread, JavaThread::heap_top_addr_offset())); + } + // Calculate new heap_top by adding the size of the object + Label retry; + __ bind(retry); + +#ifdef AARCH64 + __ ldxr(obj, top_addr); +#else + __ ldr(obj, Address(top_addr)); +#endif // AARCH64 + + __ ldr(heap_end, Address(top_addr, (intptr_t)ch->end_addr() - (intptr_t)ch->top_addr())); + __ add_rc(obj_end, obj, size_expression); + // Check if obj_end wrapped around, i.e., obj_end < obj. If yes, jump to the slow case. 
+ __ cmp(obj_end, obj); + __ b(slow_case, lo); + // Update heap_top if allocation succeeded + __ cmp(obj_end, heap_end); + __ b(slow_case, hi); + +#ifdef AARCH64 + __ stxr(heap_end/*scratched*/, obj_end, top_addr); + __ cbnz_w(heap_end, retry); +#else + __ atomic_cas_bool(obj, obj_end, top_addr, 0, heap_end/*scratched*/); + __ b(retry, ne); +#endif // AARCH64 + + incr_allocated_bytes(masm, size_expression, tmp1); +} + +// Puts address of allocated object into register `obj` and end of allocated object into register `obj_end`. +void BarrierSetAssembler::tlab_allocate(MacroAssembler* masm, Register obj, Register obj_end, Register tmp1, + RegisterOrConstant size_expression, Label& slow_case) { + const Register tlab_end = tmp1; + assert_different_registers(obj, obj_end, tlab_end); + + __ ldr(obj, Address(Rthread, JavaThread::tlab_top_offset())); + __ ldr(tlab_end, Address(Rthread, JavaThread::tlab_end_offset())); + __ add_rc(obj_end, obj, size_expression); + __ cmp(obj_end, tlab_end); + __ b(slow_case, hi); + __ str(obj_end, Address(Rthread, JavaThread::tlab_top_offset())); +} + +void BarrierSetAssembler::incr_allocated_bytes(MacroAssembler* masm, RegisterOrConstant size_in_bytes, Register tmp) { +#ifdef AARCH64 + __ ldr(tmp, Address(Rthread, in_bytes(JavaThread::allocated_bytes_offset()))); + __ add_rc(tmp, tmp, size_in_bytes); + __ str(tmp, Address(Rthread, in_bytes(JavaThread::allocated_bytes_offset()))); +#else + // Bump total bytes allocated by this thread + Label done; + + // Borrow the Rthread for alloc counter + Register Ralloc = Rthread; + __ add(Ralloc, Ralloc, in_bytes(JavaThread::allocated_bytes_offset())); + __ ldr(tmp, Address(Ralloc)); + __ adds(tmp, tmp, size_in_bytes); + __ str(tmp, Address(Ralloc), cc); + __ b(done, cc); + + // Increment the high word and store single-copy atomically (that is an unlikely scenario on typical embedded systems as it means >4GB has been allocated) + // To do so, ldrd/strd instructions are used, which require an even-odd pair of
registers. Such a request could be difficult to satisfy by + // allocating those registers on a higher level, therefore the routine is ready to allocate a pair itself. + Register low, high; + // Select either R0/R1 or R2/R3 + + if (size_in_bytes.is_register() && (size_in_bytes.as_register() == R0 || size_in_bytes.as_register() == R1)) { + low = R2; + high = R3; + } else { + low = R0; + high = R1; + } + __ push(RegisterSet(low, high)); + + __ ldrd(low, Address(Ralloc)); + __ adds(low, low, size_in_bytes); + __ adc(high, high, 0); + __ strd(low, Address(Ralloc)); + + __ pop(RegisterSet(low, high)); + + __ bind(done); + + // Unborrow the Rthread + __ sub(Rthread, Ralloc, in_bytes(JavaThread::allocated_bytes_offset())); +#endif // AARCH64 +} diff -r f5daffd7ec7a -r cc1a4a267798 src/hotspot/cpu/arm/gc/shared/barrierSetAssembler_arm.hpp --- a/src/hotspot/cpu/arm/gc/shared/barrierSetAssembler_arm.hpp Mon Sep 24 16:39:02 2018 +0300 +++ b/src/hotspot/cpu/arm/gc/shared/barrierSetAssembler_arm.hpp Mon Sep 24 16:44:24 2018 +0300 @@ -30,6 +30,12 @@ #include "oops/access.hpp" class BarrierSetAssembler: public CHeapObj<mtGC> { +private: + void incr_allocated_bytes(MacroAssembler* masm, + RegisterOrConstant size_in_bytes, + Register tmp +); + public: virtual void arraycopy_prologue(MacroAssembler* masm, DecoratorSet decorators, bool is_oop, Register addr, Register count, int callee_saved_regs) {} @@ -44,6 +50,23 @@ virtual void obj_equals(MacroAssembler* masm, Register obj1, Register obj2); + virtual void eden_allocate(MacroAssembler* masm, + Register obj, // result: pointer to object after successful allocation + Register obj_end, // result: pointer to end of object after successful allocation + Register tmp1, // temp register + Register tmp2, // temp register + RegisterOrConstant size_expression, // size of object + Label& slow_case // continuation point if fast allocation fails + ); + + virtual void tlab_allocate(MacroAssembler* masm, + Register obj, // result: pointer to object after
successful allocation + Register obj_end, // result: pointer to end of object after successful allocation + Register tmp1, // temp register + RegisterOrConstant size_expression, // size of object + Label& slow_case // continuation point if fast allocation fails + ); + virtual void barrier_stubs_init() {} }; diff -r f5daffd7ec7a -r cc1a4a267798 src/hotspot/cpu/arm/macroAssembler_arm.cpp --- a/src/hotspot/cpu/arm/macroAssembler_arm.cpp Mon Sep 24 16:39:02 2018 +0300 +++ b/src/hotspot/cpu/arm/macroAssembler_arm.cpp Mon Sep 24 16:44:24 2018 +0300 @@ -1256,68 +1256,15 @@ // Puts address of allocated object into register `obj` and end of allocated object into register `obj_end`. void MacroAssembler::eden_allocate(Register obj, Register obj_end, Register tmp1, Register tmp2, RegisterOrConstant size_expression, Label& slow_case) { - if (!Universe::heap()->supports_inline_contig_alloc()) { - b(slow_case); - return; - } - - CollectedHeap* ch = Universe::heap(); - - const Register top_addr = tmp1; - const Register heap_end = tmp2; - - if (size_expression.is_register()) { - assert_different_registers(obj, obj_end, top_addr, heap_end, size_expression.as_register()); - } else { - assert_different_registers(obj, obj_end, top_addr, heap_end); - } - - bool load_const = AARCH64_ONLY(false) NOT_AARCH64(VM_Version::supports_movw() ); // TODO-AARCH64 check performance - if (load_const) { - mov_address(top_addr, (address)Universe::heap()->top_addr(), symbolic_Relocation::eden_top_reference); - } else { - ldr(top_addr, Address(Rthread, JavaThread::heap_top_addr_offset())); - } - // Calculate new heap_top by adding the size of the object - Label retry; - bind(retry); - -#ifdef AARCH64 - ldxr(obj, top_addr); -#else - ldr(obj, Address(top_addr)); -#endif // AARCH64 - - ldr(heap_end, Address(top_addr, (intptr_t)ch->end_addr() - (intptr_t)ch->top_addr())); - add_rc(obj_end, obj, size_expression); - // Check if obj_end wrapped around, i.e., obj_end < obj. If yes, jump to the slow case. 
- cmp(obj_end, obj); - b(slow_case, lo); - // Update heap_top if allocation succeeded - cmp(obj_end, heap_end); - b(slow_case, hi); - -#ifdef AARCH64 - stxr(heap_end/*scratched*/, obj_end, top_addr); - cbnz_w(heap_end, retry); -#else - atomic_cas_bool(obj, obj_end, top_addr, 0, heap_end/*scratched*/); - b(retry, ne); -#endif // AARCH64 + BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); + bs->eden_allocate(this, obj, obj_end, tmp1, tmp2, size_expression, slow_case); } // Puts address of allocated object into register `obj` and end of allocated object into register `obj_end`. void MacroAssembler::tlab_allocate(Register obj, Register obj_end, Register tmp1, RegisterOrConstant size_expression, Label& slow_case) { - const Register tlab_end = tmp1; - assert_different_registers(obj, obj_end, tlab_end); - - ldr(obj, Address(Rthread, JavaThread::tlab_top_offset())); - ldr(tlab_end, Address(Rthread, JavaThread::tlab_end_offset())); - add_rc(obj_end, obj, size_expression); - cmp(obj_end, tlab_end); - b(slow_case, hi); - str(obj_end, Address(Rthread, JavaThread::tlab_top_offset())); + BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); + bs->tlab_allocate(this, obj, obj_end, tmp1, size_expression, slow_case); } // Fills memory regions [start..end] with zeroes. Clobbers `start` and `tmp` registers. 
@@ -1363,52 +1310,6 @@ #endif // AARCH64 } -void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register tmp) { -#ifdef AARCH64 - ldr(tmp, Address(Rthread, in_bytes(JavaThread::allocated_bytes_offset()))); - add_rc(tmp, tmp, size_in_bytes); - str(tmp, Address(Rthread, in_bytes(JavaThread::allocated_bytes_offset()))); -#else - // Bump total bytes allocated by this thread - Label done; - - // Borrow the Rthread for alloc counter - Register Ralloc = Rthread; - add(Ralloc, Ralloc, in_bytes(JavaThread::allocated_bytes_offset())); - ldr(tmp, Address(Ralloc)); - adds(tmp, tmp, size_in_bytes); - str(tmp, Address(Ralloc), cc); - b(done, cc); - - // Increment the high word and store single-copy atomically (that is an unlikely scenario on typical embedded systems as it means >4GB has been allocated) - // To do so ldrd/strd instructions used which require an even-odd pair of registers. Such a request could be difficult to satisfy by - // allocating those registers on a higher level, therefore the routine is ready to allocate a pair itself. 
- Register low, high; - // Select ether R0/R1 or R2/R3 - - if (size_in_bytes.is_register() && (size_in_bytes.as_register() == R0 || size_in_bytes.as_register() == R1)) { - low = R2; - high = R3; - } else { - low = R0; - high = R1; - } - push(RegisterSet(low, high)); - - ldrd(low, Address(Ralloc)); - adds(low, low, size_in_bytes); - adc(high, high, 0); - strd(low, Address(Ralloc)); - - pop(RegisterSet(low, high)); - - bind(done); - - // Unborrow the Rthread - sub(Rthread, Ralloc, in_bytes(JavaThread::allocated_bytes_offset())); -#endif // AARCH64 -} - void MacroAssembler::arm_stack_overflow_check(int frame_size_in_bytes, Register tmp) { // Version of AbstractAssembler::generate_stack_overflow_check optimized for ARM if (UseStackBanging) { diff -r f5daffd7ec7a -r cc1a4a267798 src/hotspot/cpu/arm/macroAssembler_arm.hpp --- a/src/hotspot/cpu/arm/macroAssembler_arm.hpp Mon Sep 24 16:39:02 2018 +0300 +++ b/src/hotspot/cpu/arm/macroAssembler_arm.hpp Mon Sep 24 16:44:24 2018 +0300 @@ -361,8 +361,6 @@ void zero_memory(Register start, Register end, Register tmp); - void incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register tmp); - static bool needs_explicit_null_check(intptr_t offset); void arm_stack_overflow_check(int frame_size_in_bytes, Register tmp); diff -r f5daffd7ec7a -r cc1a4a267798 src/hotspot/cpu/arm/templateTable_arm.cpp --- a/src/hotspot/cpu/arm/templateTable_arm.cpp Mon Sep 24 16:39:02 2018 +0300 +++ b/src/hotspot/cpu/arm/templateTable_arm.cpp Mon Sep 24 16:44:24 2018 +0300 @@ -4502,12 +4502,7 @@ const Register Rtlab_end = R2_tmp; assert_different_registers(Robj, Rsize, Rklass, Rtlab_top, Rtlab_end); - __ ldr(Robj, Address(Rthread, JavaThread::tlab_top_offset())); - __ ldr(Rtlab_end, Address(Rthread, in_bytes(JavaThread::tlab_end_offset()))); - __ add(Rtlab_top, Robj, Rsize); - __ cmp(Rtlab_top, Rtlab_end); - __ b(slow_case, hi); - __ str(Rtlab_top, Address(Rthread, JavaThread::tlab_top_offset())); + __ tlab_allocate(Robj, Rtlab_top, Rtlab_end, Rsize, 
slow_case); if (ZeroTLAB) { // the fields have been already cleared __ b(initialize_header); @@ -4523,34 +4518,7 @@ const Register Rheap_end = Rtemp; assert_different_registers(Robj, Rklass, Rsize, Rheap_top_addr, Rheap_top, Rheap_end, LR); - // heap_end now (re)loaded in the loop since also used as a scratch register in the CAS - __ ldr_literal(Rheap_top_addr, Lheap_top_addr); - - Label retry; - __ bind(retry); - -#ifdef AARCH64 - __ ldxr(Robj, Rheap_top_addr); -#else - __ ldr(Robj, Address(Rheap_top_addr)); -#endif // AARCH64 - - __ ldr(Rheap_end, Address(Rheap_top_addr, (intptr_t)Universe::heap()->end_addr()-(intptr_t)Universe::heap()->top_addr())); - __ add(Rheap_top, Robj, Rsize); - __ cmp(Rheap_top, Rheap_end); - __ b(slow_case, hi); - - // Update heap top atomically. - // If someone beats us on the allocation, try again, otherwise continue. -#ifdef AARCH64 - __ stxr(Rtemp2, Rheap_top, Rheap_top_addr); - __ cbnz_w(Rtemp2, retry); -#else - __ atomic_cas_bool(Robj, Rheap_top, Rheap_top_addr, 0, Rheap_end/*scratched*/); - __ b(retry, ne); -#endif // AARCH64 - - __ incr_allocated_bytes(Rsize, Rtemp); + __ eden_allocate(Robj, Rheap_top, Rheap_top_addr, Rheap_end, Rsize, slow_case); } }