8033805: Move Fast_Lock/Fast_Unlock code from .ad files to macroassembler
Summary: Consolidated C2 x86 locking code in one place in macroAssembler_x86.cpp.
Reviewed-by: roland
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp Fri Feb 21 08:09:15 2014 -0800
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp Mon Feb 24 15:12:26 2014 -0800
@@ -98,217 +98,6 @@
return Address::make_array(adr);
}
-int MacroAssembler::biased_locking_enter(Register lock_reg,
- Register obj_reg,
- Register swap_reg,
- Register tmp_reg,
- bool swap_reg_contains_mark,
- Label& done,
- Label* slow_case,
- BiasedLockingCounters* counters) {
- assert(UseBiasedLocking, "why call this otherwise?");
- assert(swap_reg == rax, "swap_reg must be rax, for cmpxchg");
- assert_different_registers(lock_reg, obj_reg, swap_reg);
-
- if (PrintBiasedLockingStatistics && counters == NULL)
- counters = BiasedLocking::counters();
-
- bool need_tmp_reg = false;
- if (tmp_reg == noreg) {
- need_tmp_reg = true;
- tmp_reg = lock_reg;
- } else {
- assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
- }
- assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
- Address mark_addr (obj_reg, oopDesc::mark_offset_in_bytes());
- Address klass_addr (obj_reg, oopDesc::klass_offset_in_bytes());
- Address saved_mark_addr(lock_reg, 0);
-
- // Biased locking
- // See whether the lock is currently biased toward our thread and
- // whether the epoch is still valid
- // Note that the runtime guarantees sufficient alignment of JavaThread
- // pointers to allow age to be placed into low bits
- // First check to see whether biasing is even enabled for this object
- Label cas_label;
- int null_check_offset = -1;
- if (!swap_reg_contains_mark) {
- null_check_offset = offset();
- movl(swap_reg, mark_addr);
- }
- if (need_tmp_reg) {
- push(tmp_reg);
- }
- movl(tmp_reg, swap_reg);
- andl(tmp_reg, markOopDesc::biased_lock_mask_in_place);
- cmpl(tmp_reg, markOopDesc::biased_lock_pattern);
- if (need_tmp_reg) {
- pop(tmp_reg);
- }
- jcc(Assembler::notEqual, cas_label);
- // The bias pattern is present in the object's header. Need to check
- // whether the bias owner and the epoch are both still current.
- // Note that because there is no current thread register on x86 we
- // need to store off the mark word we read out of the object to
- // avoid reloading it and needing to recheck invariants below. This
- // store is unfortunate but it makes the overall code shorter and
- // simpler.
- movl(saved_mark_addr, swap_reg);
- if (need_tmp_reg) {
- push(tmp_reg);
- }
- get_thread(tmp_reg);
- xorl(swap_reg, tmp_reg);
- if (swap_reg_contains_mark) {
- null_check_offset = offset();
- }
- movl(tmp_reg, klass_addr);
- xorl(swap_reg, Address(tmp_reg, Klass::prototype_header_offset()));
- andl(swap_reg, ~((int) markOopDesc::age_mask_in_place));
- if (need_tmp_reg) {
- pop(tmp_reg);
- }
- if (counters != NULL) {
- cond_inc32(Assembler::zero,
- ExternalAddress((address)counters->biased_lock_entry_count_addr()));
- }
- jcc(Assembler::equal, done);
-
- Label try_revoke_bias;
- Label try_rebias;
-
- // At this point we know that the header has the bias pattern and
- // that we are not the bias owner in the current epoch. We need to
- // figure out more details about the state of the header in order to
- // know what operations can be legally performed on the object's
- // header.
-
- // If the low three bits in the xor result aren't clear, that means
- // the prototype header is no longer biased and we have to revoke
- // the bias on this object.
- testl(swap_reg, markOopDesc::biased_lock_mask_in_place);
- jcc(Assembler::notZero, try_revoke_bias);
-
- // Biasing is still enabled for this data type. See whether the
- // epoch of the current bias is still valid, meaning that the epoch
- // bits of the mark word are equal to the epoch bits of the
- // prototype header. (Note that the prototype header's epoch bits
- // only change at a safepoint.) If not, attempt to rebias the object
- // toward the current thread. Note that we must be absolutely sure
- // that the current epoch is invalid in order to do this because
- // otherwise the manipulations it performs on the mark word are
- // illegal.
- testl(swap_reg, markOopDesc::epoch_mask_in_place);
- jcc(Assembler::notZero, try_rebias);
-
- // The epoch of the current bias is still valid but we know nothing
- // about the owner; it might be set or it might be clear. Try to
- // acquire the bias of the object using an atomic operation. If this
- // fails we will go in to the runtime to revoke the object's bias.
- // Note that we first construct the presumed unbiased header so we
- // don't accidentally blow away another thread's valid bias.
- movl(swap_reg, saved_mark_addr);
- andl(swap_reg,
- markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
- if (need_tmp_reg) {
- push(tmp_reg);
- }
- get_thread(tmp_reg);
- orl(tmp_reg, swap_reg);
- if (os::is_MP()) {
- lock();
- }
- cmpxchgptr(tmp_reg, Address(obj_reg, 0));
- if (need_tmp_reg) {
- pop(tmp_reg);
- }
- // If the biasing toward our thread failed, this means that
- // another thread succeeded in biasing it toward itself and we
- // need to revoke that bias. The revocation will occur in the
- // interpreter runtime in the slow case.
- if (counters != NULL) {
- cond_inc32(Assembler::zero,
- ExternalAddress((address)counters->anonymously_biased_lock_entry_count_addr()));
- }
- if (slow_case != NULL) {
- jcc(Assembler::notZero, *slow_case);
- }
- jmp(done);
-
- bind(try_rebias);
- // At this point we know the epoch has expired, meaning that the
- // current "bias owner", if any, is actually invalid. Under these
- // circumstances _only_, we are allowed to use the current header's
- // value as the comparison value when doing the cas to acquire the
- // bias in the current epoch. In other words, we allow transfer of
- // the bias from one thread to another directly in this situation.
- //
- // FIXME: due to a lack of registers we currently blow away the age
- // bits in this situation. Should attempt to preserve them.
- if (need_tmp_reg) {
- push(tmp_reg);
- }
- get_thread(tmp_reg);
- movl(swap_reg, klass_addr);
- orl(tmp_reg, Address(swap_reg, Klass::prototype_header_offset()));
- movl(swap_reg, saved_mark_addr);
- if (os::is_MP()) {
- lock();
- }
- cmpxchgptr(tmp_reg, Address(obj_reg, 0));
- if (need_tmp_reg) {
- pop(tmp_reg);
- }
- // If the biasing toward our thread failed, then another thread
- // succeeded in biasing it toward itself and we need to revoke that
- // bias. The revocation will occur in the runtime in the slow case.
- if (counters != NULL) {
- cond_inc32(Assembler::zero,
- ExternalAddress((address)counters->rebiased_lock_entry_count_addr()));
- }
- if (slow_case != NULL) {
- jcc(Assembler::notZero, *slow_case);
- }
- jmp(done);
-
- bind(try_revoke_bias);
- // The prototype mark in the klass doesn't have the bias bit set any
- // more, indicating that objects of this data type are not supposed
- // to be biased any more. We are going to try to reset the mark of
- // this object to the prototype value and fall through to the
- // CAS-based locking scheme. Note that if our CAS fails, it means
- // that another thread raced us for the privilege of revoking the
- // bias of this particular object, so it's okay to continue in the
- // normal locking code.
- //
- // FIXME: due to a lack of registers we currently blow away the age
- // bits in this situation. Should attempt to preserve them.
- movl(swap_reg, saved_mark_addr);
- if (need_tmp_reg) {
- push(tmp_reg);
- }
- movl(tmp_reg, klass_addr);
- movl(tmp_reg, Address(tmp_reg, Klass::prototype_header_offset()));
- if (os::is_MP()) {
- lock();
- }
- cmpxchgptr(tmp_reg, Address(obj_reg, 0));
- if (need_tmp_reg) {
- pop(tmp_reg);
- }
- // Fall through to the normal CAS-based lock, because no matter what
- // the result of the above CAS, some thread must have succeeded in
- // removing the bias bit from the object's header.
- if (counters != NULL) {
- cond_inc32(Assembler::zero,
- ExternalAddress((address)counters->revoked_lock_entry_count_addr()));
- }
-
- bind(cas_label);
-
- return null_check_offset;
-}
void MacroAssembler::call_VM_leaf_base(address entry_point,
int number_of_arguments) {
call(RuntimeAddress(entry_point));
@@ -726,165 +515,6 @@
return array;
}
-int MacroAssembler::biased_locking_enter(Register lock_reg,
- Register obj_reg,
- Register swap_reg,
- Register tmp_reg,
- bool swap_reg_contains_mark,
- Label& done,
- Label* slow_case,
- BiasedLockingCounters* counters) {
- assert(UseBiasedLocking, "why call this otherwise?");
- assert(swap_reg == rax, "swap_reg must be rax for cmpxchgq");
- assert(tmp_reg != noreg, "tmp_reg must be supplied");
- assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
- assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
- Address mark_addr (obj_reg, oopDesc::mark_offset_in_bytes());
- Address saved_mark_addr(lock_reg, 0);
-
- if (PrintBiasedLockingStatistics && counters == NULL)
- counters = BiasedLocking::counters();
-
- // Biased locking
- // See whether the lock is currently biased toward our thread and
- // whether the epoch is still valid
- // Note that the runtime guarantees sufficient alignment of JavaThread
- // pointers to allow age to be placed into low bits
- // First check to see whether biasing is even enabled for this object
- Label cas_label;
- int null_check_offset = -1;
- if (!swap_reg_contains_mark) {
- null_check_offset = offset();
- movq(swap_reg, mark_addr);
- }
- movq(tmp_reg, swap_reg);
- andq(tmp_reg, markOopDesc::biased_lock_mask_in_place);
- cmpq(tmp_reg, markOopDesc::biased_lock_pattern);
- jcc(Assembler::notEqual, cas_label);
- // The bias pattern is present in the object's header. Need to check
- // whether the bias owner and the epoch are both still current.
- load_prototype_header(tmp_reg, obj_reg);
- orq(tmp_reg, r15_thread);
- xorq(tmp_reg, swap_reg);
- andq(tmp_reg, ~((int) markOopDesc::age_mask_in_place));
- if (counters != NULL) {
- cond_inc32(Assembler::zero,
- ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
- }
- jcc(Assembler::equal, done);
-
- Label try_revoke_bias;
- Label try_rebias;
-
- // At this point we know that the header has the bias pattern and
- // that we are not the bias owner in the current epoch. We need to
- // figure out more details about the state of the header in order to
- // know what operations can be legally performed on the object's
- // header.
-
- // If the low three bits in the xor result aren't clear, that means
- // the prototype header is no longer biased and we have to revoke
- // the bias on this object.
- testq(tmp_reg, markOopDesc::biased_lock_mask_in_place);
- jcc(Assembler::notZero, try_revoke_bias);
-
- // Biasing is still enabled for this data type. See whether the
- // epoch of the current bias is still valid, meaning that the epoch
- // bits of the mark word are equal to the epoch bits of the
- // prototype header. (Note that the prototype header's epoch bits
- // only change at a safepoint.) If not, attempt to rebias the object
- // toward the current thread. Note that we must be absolutely sure
- // that the current epoch is invalid in order to do this because
- // otherwise the manipulations it performs on the mark word are
- // illegal.
- testq(tmp_reg, markOopDesc::epoch_mask_in_place);
- jcc(Assembler::notZero, try_rebias);
-
- // The epoch of the current bias is still valid but we know nothing
- // about the owner; it might be set or it might be clear. Try to
- // acquire the bias of the object using an atomic operation. If this
- // fails we will go in to the runtime to revoke the object's bias.
- // Note that we first construct the presumed unbiased header so we
- // don't accidentally blow away another thread's valid bias.
- andq(swap_reg,
- markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
- movq(tmp_reg, swap_reg);
- orq(tmp_reg, r15_thread);
- if (os::is_MP()) {
- lock();
- }
- cmpxchgq(tmp_reg, Address(obj_reg, 0));
- // If the biasing toward our thread failed, this means that
- // another thread succeeded in biasing it toward itself and we
- // need to revoke that bias. The revocation will occur in the
- // interpreter runtime in the slow case.
- if (counters != NULL) {
- cond_inc32(Assembler::zero,
- ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
- }
- if (slow_case != NULL) {
- jcc(Assembler::notZero, *slow_case);
- }
- jmp(done);
-
- bind(try_rebias);
- // At this point we know the epoch has expired, meaning that the
- // current "bias owner", if any, is actually invalid. Under these
- // circumstances _only_, we are allowed to use the current header's
- // value as the comparison value when doing the cas to acquire the
- // bias in the current epoch. In other words, we allow transfer of
- // the bias from one thread to another directly in this situation.
- //
- // FIXME: due to a lack of registers we currently blow away the age
- // bits in this situation. Should attempt to preserve them.
- load_prototype_header(tmp_reg, obj_reg);
- orq(tmp_reg, r15_thread);
- if (os::is_MP()) {
- lock();
- }
- cmpxchgq(tmp_reg, Address(obj_reg, 0));
- // If the biasing toward our thread failed, then another thread
- // succeeded in biasing it toward itself and we need to revoke that
- // bias. The revocation will occur in the runtime in the slow case.
- if (counters != NULL) {
- cond_inc32(Assembler::zero,
- ExternalAddress((address) counters->rebiased_lock_entry_count_addr()));
- }
- if (slow_case != NULL) {
- jcc(Assembler::notZero, *slow_case);
- }
- jmp(done);
-
- bind(try_revoke_bias);
- // The prototype mark in the klass doesn't have the bias bit set any
- // more, indicating that objects of this data type are not supposed
- // to be biased any more. We are going to try to reset the mark of
- // this object to the prototype value and fall through to the
- // CAS-based locking scheme. Note that if our CAS fails, it means
- // that another thread raced us for the privilege of revoking the
- // bias of this particular object, so it's okay to continue in the
- // normal locking code.
- //
- // FIXME: due to a lack of registers we currently blow away the age
- // bits in this situation. Should attempt to preserve them.
- load_prototype_header(tmp_reg, obj_reg);
- if (os::is_MP()) {
- lock();
- }
- cmpxchgq(tmp_reg, Address(obj_reg, 0));
- // Fall through to the normal CAS-based lock, because no matter what
- // the result of the above CAS, some thread must have succeeded in
- // removing the bias bit from the object's header.
- if (counters != NULL) {
- cond_inc32(Assembler::zero,
- ExternalAddress((address) counters->revoked_lock_entry_count_addr()));
- }
-
- bind(cas_label);
-
- return null_check_offset;
-}
-
void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
Label L, E;
@@ -1360,9 +990,16 @@
void MacroAssembler::atomic_incl(AddressLiteral counter_addr) {
pushf();
- if (os::is_MP())
- lock();
- incrementl(counter_addr);
+ if (reachable(counter_addr)) {
+ if (os::is_MP())
+ lock();
+ incrementl(as_Address(counter_addr));
+ } else {
+ lea(rscratch1, counter_addr);
+ if (os::is_MP())
+ lock();
+ incrementl(Address(rscratch1, 0));
+ }
popf();
}
@@ -1393,6 +1030,234 @@
}
}
+int MacroAssembler::biased_locking_enter(Register lock_reg,
+ Register obj_reg,
+ Register swap_reg,
+ Register tmp_reg,
+ bool swap_reg_contains_mark,
+ Label& done,
+ Label* slow_case,
+ BiasedLockingCounters* counters) {
+ assert(UseBiasedLocking, "why call this otherwise?");
+ assert(swap_reg == rax, "swap_reg must be rax for cmpxchgq");
+ LP64_ONLY( assert(tmp_reg != noreg, "tmp_reg must be supplied"); )
+ bool need_tmp_reg = false;
+ if (tmp_reg == noreg) {
+ need_tmp_reg = true;
+ tmp_reg = lock_reg;
+ assert_different_registers(lock_reg, obj_reg, swap_reg);
+ } else {
+ assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
+ }
+ assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
+ Address mark_addr (obj_reg, oopDesc::mark_offset_in_bytes());
+ Address saved_mark_addr(lock_reg, 0);
+
+ if (PrintBiasedLockingStatistics && counters == NULL) {
+ counters = BiasedLocking::counters();
+ }
+ // Biased locking
+ // See whether the lock is currently biased toward our thread and
+ // whether the epoch is still valid
+ // Note that the runtime guarantees sufficient alignment of JavaThread
+ // pointers to allow age to be placed into low bits
+ // First check to see whether biasing is even enabled for this object
+ Label cas_label;
+ int null_check_offset = -1;
+ if (!swap_reg_contains_mark) {
+ null_check_offset = offset();
+ movptr(swap_reg, mark_addr);
+ }
+ if (need_tmp_reg) {
+ push(tmp_reg);
+ }
+ movptr(tmp_reg, swap_reg);
+ andptr(tmp_reg, markOopDesc::biased_lock_mask_in_place);
+ cmpptr(tmp_reg, markOopDesc::biased_lock_pattern);
+ if (need_tmp_reg) {
+ pop(tmp_reg);
+ }
+ jcc(Assembler::notEqual, cas_label);
+ // The bias pattern is present in the object's header. Need to check
+ // whether the bias owner and the epoch are both still current.
+#ifndef _LP64
+ // Note that because there is no current thread register on x86_32 we
+ // need to store off the mark word we read out of the object to
+ // avoid reloading it and needing to recheck invariants below. This
+ // store is unfortunate but it makes the overall code shorter and
+ // simpler.
+ movptr(saved_mark_addr, swap_reg);
+#endif
+ if (need_tmp_reg) {
+ push(tmp_reg);
+ }
+ if (swap_reg_contains_mark) {
+ null_check_offset = offset();
+ }
+ load_prototype_header(tmp_reg, obj_reg);
+#ifdef _LP64
+ orptr(tmp_reg, r15_thread);
+ xorptr(tmp_reg, swap_reg);
+ Register header_reg = tmp_reg;
+#else
+ xorptr(tmp_reg, swap_reg);
+ get_thread(swap_reg);
+ xorptr(swap_reg, tmp_reg);
+ Register header_reg = swap_reg;
+#endif
+ andptr(header_reg, ~((int) markOopDesc::age_mask_in_place));
+ if (need_tmp_reg) {
+ pop(tmp_reg);
+ }
+ if (counters != NULL) {
+ cond_inc32(Assembler::zero,
+ ExternalAddress((address) counters->biased_lock_entry_count_addr()));
+ }
+ jcc(Assembler::equal, done);
+
+ Label try_revoke_bias;
+ Label try_rebias;
+
+ // At this point we know that the header has the bias pattern and
+ // that we are not the bias owner in the current epoch. We need to
+ // figure out more details about the state of the header in order to
+ // know what operations can be legally performed on the object's
+ // header.
+
+ // If the low three bits in the xor result aren't clear, that means
+ // the prototype header is no longer biased and we have to revoke
+ // the bias on this object.
+ testptr(header_reg, markOopDesc::biased_lock_mask_in_place);
+ jccb(Assembler::notZero, try_revoke_bias);
+
+ // Biasing is still enabled for this data type. See whether the
+ // epoch of the current bias is still valid, meaning that the epoch
+ // bits of the mark word are equal to the epoch bits of the
+ // prototype header. (Note that the prototype header's epoch bits
+ // only change at a safepoint.) If not, attempt to rebias the object
+ // toward the current thread. Note that we must be absolutely sure
+ // that the current epoch is invalid in order to do this because
+ // otherwise the manipulations it performs on the mark word are
+ // illegal.
+ testptr(header_reg, markOopDesc::epoch_mask_in_place);
+ jccb(Assembler::notZero, try_rebias);
+
+ // The epoch of the current bias is still valid but we know nothing
+ // about the owner; it might be set or it might be clear. Try to
+ // acquire the bias of the object using an atomic operation. If this
+ // fails we will go in to the runtime to revoke the object's bias.
+ // Note that we first construct the presumed unbiased header so we
+ // don't accidentally blow away another thread's valid bias.
+ NOT_LP64( movptr(swap_reg, saved_mark_addr); )
+ andptr(swap_reg,
+ markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
+ if (need_tmp_reg) {
+ push(tmp_reg);
+ }
+#ifdef _LP64
+ movptr(tmp_reg, swap_reg);
+ orptr(tmp_reg, r15_thread);
+#else
+ get_thread(tmp_reg);
+ orptr(tmp_reg, swap_reg);
+#endif
+ if (os::is_MP()) {
+ lock();
+ }
+ cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
+ if (need_tmp_reg) {
+ pop(tmp_reg);
+ }
+ // If the biasing toward our thread failed, this means that
+ // another thread succeeded in biasing it toward itself and we
+ // need to revoke that bias. The revocation will occur in the
+ // interpreter runtime in the slow case.
+ if (counters != NULL) {
+ cond_inc32(Assembler::zero,
+ ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
+ }
+ if (slow_case != NULL) {
+ jcc(Assembler::notZero, *slow_case);
+ }
+ jmp(done);
+
+ bind(try_rebias);
+ // At this point we know the epoch has expired, meaning that the
+ // current "bias owner", if any, is actually invalid. Under these
+ // circumstances _only_, we are allowed to use the current header's
+ // value as the comparison value when doing the cas to acquire the
+ // bias in the current epoch. In other words, we allow transfer of
+ // the bias from one thread to another directly in this situation.
+ //
+ // FIXME: due to a lack of registers we currently blow away the age
+ // bits in this situation. Should attempt to preserve them.
+ if (need_tmp_reg) {
+ push(tmp_reg);
+ }
+ load_prototype_header(tmp_reg, obj_reg);
+#ifdef _LP64
+ orptr(tmp_reg, r15_thread);
+#else
+ get_thread(swap_reg);
+ orptr(tmp_reg, swap_reg);
+ movptr(swap_reg, saved_mark_addr);
+#endif
+ if (os::is_MP()) {
+ lock();
+ }
+ cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
+ if (need_tmp_reg) {
+ pop(tmp_reg);
+ }
+ // If the biasing toward our thread failed, then another thread
+ // succeeded in biasing it toward itself and we need to revoke that
+ // bias. The revocation will occur in the runtime in the slow case.
+ if (counters != NULL) {
+ cond_inc32(Assembler::zero,
+ ExternalAddress((address) counters->rebiased_lock_entry_count_addr()));
+ }
+ if (slow_case != NULL) {
+ jcc(Assembler::notZero, *slow_case);
+ }
+ jmp(done);
+
+ bind(try_revoke_bias);
+ // The prototype mark in the klass doesn't have the bias bit set any
+ // more, indicating that objects of this data type are not supposed
+ // to be biased any more. We are going to try to reset the mark of
+ // this object to the prototype value and fall through to the
+ // CAS-based locking scheme. Note that if our CAS fails, it means
+ // that another thread raced us for the privilege of revoking the
+ // bias of this particular object, so it's okay to continue in the
+ // normal locking code.
+ //
+ // FIXME: due to a lack of registers we currently blow away the age
+ // bits in this situation. Should attempt to preserve them.
+ NOT_LP64( movptr(swap_reg, saved_mark_addr); )
+ if (need_tmp_reg) {
+ push(tmp_reg);
+ }
+ load_prototype_header(tmp_reg, obj_reg);
+ if (os::is_MP()) {
+ lock();
+ }
+ cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
+ if (need_tmp_reg) {
+ pop(tmp_reg);
+ }
+ // Fall through to the normal CAS-based lock, because no matter what
+ // the result of the above CAS, some thread must have succeeded in
+ // removing the bias bit from the object's header.
+ if (counters != NULL) {
+ cond_inc32(Assembler::zero,
+ ExternalAddress((address) counters->revoked_lock_entry_count_addr()));
+ }
+
+ bind(cas_label);
+
+ return null_check_offset;
+}
+
void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
assert(UseBiasedLocking, "why call this otherwise?");
@@ -1408,6 +1273,620 @@
jcc(Assembler::equal, done);
}
+#ifdef COMPILER2
+// Fast_Lock and Fast_Unlock used by C2
+
+// Because the transitions from emitted code to the runtime
+// monitorenter/exit helper stubs are so slow it's critical that
+// we inline both the stack-locking fast-path and the inflated fast path.
+//
+// See also: cmpFastLock and cmpFastUnlock.
+//
+// What follows is a specialized inline transliteration of the code
+// in slow_enter() and slow_exit(). If we're concerned about I$ bloat
+// another option would be to emit TrySlowEnter and TrySlowExit methods
+// at startup-time. These methods would accept arguments as
+// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
+// indications in the icc.ZFlag. Fast_Lock and Fast_Unlock would simply
+// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
+// In practice, however, the # of lock sites is bounded and is usually small.
+// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
+// if the processor uses simple bimodal branch predictors keyed by EIP,
+// since the helper routines would be called from multiple synchronization
+// sites.
+//
+// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
+// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
+// to those specialized methods. That'd give us a mostly platform-independent
+// implementation that the JITs could optimize and inline at their pleasure.
+// Done correctly, the only time we'd need to cross to native code would be
+// to park() or unpark() threads. We'd also need a few more unsafe operators
+// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
+// (b) explicit barriers or fence operations.
+//
+// TODO:
+//
+// * Arrange for C2 to pass "Self" into Fast_Lock and Fast_Unlock in one of the registers (scr).
+// This avoids manifesting the Self pointer in the Fast_Lock and Fast_Unlock terminals.
+// Given TLAB allocation, Self is usually manifested in a register, so passing it into
+// the lock operators would typically be faster than reifying Self.
+//
+// * Ideally I'd define the primitives as:
+// fast_lock (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
+// fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
+// Unfortunately ADLC bugs prevent us from expressing the ideal form.
+// Instead, we're stuck with rather awkward and brittle register assignments below.
+// Furthermore the register assignments are overconstrained, possibly resulting in
+// sub-optimal code near the synchronization site.
+//
+// * Eliminate the sp-proximity tests and just use "== Self" tests instead.
+// Alternately, use a better sp-proximity test.
+//
+// * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
+// Either one is sufficient to uniquely identify a thread.
+// TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
+//
+// * Intrinsify notify() and notifyAll() for the common cases where the
+// object is locked by the calling thread but the waitlist is empty.
+// This avoids the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
+//
+// * use jccb and jmpb instead of jcc and jmp to improve code density.
+// But beware of excessive branch density on AMD Opterons.
+//
+// * Both Fast_Lock and Fast_Unlock set the ICC.ZF to indicate success
+// or failure of the fast-path. If the fast-path fails then we pass
+// control to the slow-path, typically in C. In Fast_Lock and
+// Fast_Unlock we often branch to DONE_LABEL, just to find that C2
+// will emit a conditional branch immediately after the node.
+// So we have branches to branches and lots of ICC.ZF games.
+// Instead, it might be better to have C2 pass a "FailureLabel"
+// into Fast_Lock and Fast_Unlock. In the case of success, control
+// will drop through the node. ICC.ZF is undefined at exit.
+// In the case of failure, the node will branch directly to the
+// FailureLabel
+
+
+// obj: object to lock
+// box: on-stack box address (displaced header location) - KILLED
+// rax: tmp -- KILLED
+// scr: tmp -- KILLED
+void MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg, Register scrReg, BiasedLockingCounters* counters) {
+ // Ensure the register assignments are disjoint
+ guarantee (objReg != boxReg, "");
+ guarantee (objReg != tmpReg, "");
+ guarantee (objReg != scrReg, "");
+ guarantee (boxReg != tmpReg, "");
+ guarantee (boxReg != scrReg, "");
+ guarantee (tmpReg == rax, "");
+
+ if (counters != NULL) {
+ atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()));
+ }
+ if (EmitSync & 1) {
+ // set box->dhw = unused_mark (3)
+ // Force all sync thru slow-path: slow_enter() and slow_exit()
+ movptr (Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
+ cmpptr (rsp, (int32_t)NULL_WORD);
+ } else
+ if (EmitSync & 2) {
+ Label DONE_LABEL ;
+ if (UseBiasedLocking) {
+ // Note: tmpReg maps to the swap_reg argument and scrReg to the tmp_reg argument.
+ biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, counters);
+ }
+
+ movptr(tmpReg, Address(objReg, 0)); // fetch markword
+ orptr (tmpReg, 0x1);
+ movptr(Address(boxReg, 0), tmpReg); // Anticipate successful CAS
+ if (os::is_MP()) {
+ lock();
+ }
+ cmpxchgptr(boxReg, Address(objReg, 0)); // Updates tmpReg
+ jccb(Assembler::equal, DONE_LABEL);
+ // Recursive locking
+ subptr(tmpReg, rsp);
+ andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
+ movptr(Address(boxReg, 0), tmpReg);
+ bind(DONE_LABEL);
+ } else {
+ // Possible cases that we'll encounter in fast_lock
+ // ------------------------------------------------
+ // * Inflated
+ // -- unlocked
+ // -- Locked
+ // = by self
+ // = by other
+ // * biased
+ // -- by Self
+ // -- by other
+ // * neutral
+ // * stack-locked
+ // -- by self
+ // = sp-proximity test hits
+ // = sp-proximity test generates false-negative
+ // -- by other
+ //
+
+ Label IsInflated, DONE_LABEL;
+
+ // it's stack-locked, biased or neutral
+ // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
+ // order to reduce the number of conditional branches in the most common cases.
+ // Beware -- there's a subtle invariant that fetch of the markword
+ // at [FETCH], below, will never observe a biased encoding (*101b).
+ // If this invariant is not held we risk exclusion (safety) failure.
+ if (UseBiasedLocking && !UseOptoBiasInlining) {
+ biased_locking_enter(boxReg, objReg, tmpReg, scrReg, true, DONE_LABEL, NULL, counters);
+ }
+
+ movptr(tmpReg, Address(objReg, 0)); // [FETCH]
+ testl (tmpReg, markOopDesc::monitor_value); // inflated vs stack-locked|neutral|biased
+ jccb (Assembler::notZero, IsInflated);
+
+ // Attempt stack-locking ...
+ orptr (tmpReg, 0x1);
+ movptr(Address(boxReg, 0), tmpReg); // Anticipate successful CAS
+ if (os::is_MP()) {
+ lock();
+ }
+ cmpxchgptr(boxReg, Address(objReg, 0)); // Updates tmpReg
+ if (counters != NULL) {
+ cond_inc32(Assembler::equal,
+ ExternalAddress((address)counters->fast_path_entry_count_addr()));
+ }
+ jccb(Assembler::equal, DONE_LABEL);
+
+ // Recursive locking
+ subptr(tmpReg, rsp);
+ andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
+ movptr(Address(boxReg, 0), tmpReg);
+ if (counters != NULL) {
+ cond_inc32(Assembler::equal,
+ ExternalAddress((address)counters->fast_path_entry_count_addr()));
+ }
+ jmpb(DONE_LABEL);
+
+ bind(IsInflated);
+#ifndef _LP64
+ // The object is inflated.
+ //
+ // TODO-FIXME: eliminate the ugly use of manifest constants:
+ // Use markOopDesc::monitor_value instead of "2".
+ // use markOop::unused_mark() instead of "3".
+ // The tmpReg value is an objectMonitor reference ORed with
+ // markOopDesc::monitor_value (2). We can either convert tmpReg to an
+ // objectmonitor pointer by masking off the "2" bit or we can just
+ // use tmpReg as an objectmonitor pointer but bias the objectmonitor
+ // field offsets with "-2" to compensate for and annul the low-order tag bit.
+ //
+ // I use the latter as it avoids AGI stalls.
+ // As such, we write "mov r, [tmpReg+OFFSETOF(Owner)-2]"
+ // instead of "mov r, [tmpReg+OFFSETOF(Owner)]".
+ //
+ #define OFFSET_SKEWED(f) ((ObjectMonitor::f ## _offset_in_bytes())-2)
+
+ // boxReg refers to the on-stack BasicLock in the current frame.
+ // We'd like to write:
+ // set box->_displaced_header = markOop::unused_mark(). Any non-0 value suffices.
+ // This is convenient but results a ST-before-CAS penalty. The following CAS suffers
+ // additional latency as we have another ST in the store buffer that must drain.
+
+ if (EmitSync & 8192) {
+ movptr(Address(boxReg, 0), 3); // results in ST-before-CAS penalty
+ get_thread (scrReg);
+ movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2]
+ movptr(tmpReg, NULL_WORD); // consider: xor vs mov
+ if (os::is_MP()) {
+ lock();
+ }
+ cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2));
+ } else
+ if ((EmitSync & 128) == 0) { // avoid ST-before-CAS
+ movptr(scrReg, boxReg);
+ movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2]
+
+ // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
+ if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
+ // prefetchw [eax + Offset(_owner)-2]
+ prefetchw(Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
+ }
+
+ if ((EmitSync & 64) == 0) {
+ // Optimistic form: consider XORL tmpReg,tmpReg
+ movptr(tmpReg, NULL_WORD);
+ } else {
+ // Can suffer RTS->RTO upgrades on shared or cold $ lines
+ // Test-And-CAS instead of CAS
+ movptr(tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)); // rax, = m->_owner
+ testptr(tmpReg, tmpReg); // Locked ?
+ jccb (Assembler::notZero, DONE_LABEL);
+ }
+
+ // Appears unlocked - try to swing _owner from null to non-null.
+ // Ideally, I'd manifest "Self" with get_thread and then attempt
+ // to CAS the register containing Self into m->Owner.
+ // But we don't have enough registers, so instead we can either try to CAS
+ // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds
+ // we later store "Self" into m->Owner. Transiently storing a stack address
+ // (rsp or the address of the box) into m->owner is harmless.
+ // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
+ if (os::is_MP()) {
+ lock();
+ }
+ cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2));
+ movptr(Address(scrReg, 0), 3); // box->_displaced_header = 3
+ jccb (Assembler::notZero, DONE_LABEL);
+ get_thread (scrReg); // beware: clobbers ICCs
+ movptr(Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2), scrReg);
+ xorptr(boxReg, boxReg); // set icc.ZFlag = 1 to indicate success
+
+ // If the CAS fails we can either retry or pass control to the slow-path.
+ // We use the latter tactic.
+ // Pass the CAS result in the icc.ZFlag into DONE_LABEL
+ // If the CAS was successful ...
+ // Self has acquired the lock
+ // Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
+ // Intentional fall-through into DONE_LABEL ...
+ } else {
+ movptr(Address(boxReg, 0), intptr_t(markOopDesc::unused_mark())); // results in ST-before-CAS penalty
+ movptr(boxReg, tmpReg);
+
+ // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
+ if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
+ // prefetchw [eax + Offset(_owner)-2]
+ prefetchw(Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
+ }
+
+ if ((EmitSync & 64) == 0) {
+ // Optimistic form
+ xorptr (tmpReg, tmpReg);
+ } else {
+ // Can suffer RTS->RTO upgrades on shared or cold $ lines
+ movptr(tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)); // rax, = m->_owner
+ testptr(tmpReg, tmpReg); // Locked ?
+ jccb (Assembler::notZero, DONE_LABEL);
+ }
+
+ // Appears unlocked - try to swing _owner from null to non-null.
+ // Use either "Self" (in scr) or rsp as thread identity in _owner.
+ // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
+ get_thread (scrReg);
+ if (os::is_MP()) {
+ lock();
+ }
+ cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2));
+
+ // If the CAS fails we can either retry or pass control to the slow-path.
+ // We use the latter tactic.
+ // Pass the CAS result in the icc.ZFlag into DONE_LABEL
+ // If the CAS was successful ...
+ // Self has acquired the lock
+ // Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
+ // Intentional fall-through into DONE_LABEL ...
+ }
+#else // _LP64
+ // It's inflated
+
+ // TODO: someday avoid the ST-before-CAS penalty by
+ // relocating (deferring) the following ST.
+ // We should also think about trying a CAS without having
+ // fetched _owner. If the CAS is successful we may
+ // avoid an RTO->RTS upgrade on the $line.
+
+ // Without cast to int32_t a movptr will destroy r10 which is typically obj
+ movptr(Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
+
+ mov (boxReg, tmpReg);
+ movptr (tmpReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2));
+ testptr(tmpReg, tmpReg);
+ jccb (Assembler::notZero, DONE_LABEL);
+
+ // It's inflated and appears unlocked
+ if (os::is_MP()) {
+ lock();
+ }
+ cmpxchgptr(r15_thread, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2));
+ // Intentional fall-through into DONE_LABEL ...
+
+#endif
+
+ // DONE_LABEL is a hot target - we'd really like to place it at the
+ // start of cache line by padding with NOPs.
+ // See the AMD and Intel software optimization manuals for the
+ // most efficient "long" NOP encodings.
+ // Unfortunately none of our alignment mechanisms suffice.
+ bind(DONE_LABEL);
+
+ // At DONE_LABEL the icc ZFlag is set as follows ...
+ // Fast_Unlock uses the same protocol.
+ // ZFlag == 1 -> Success
+ // ZFlag == 0 -> Failure - force control through the slow-path
+ }
+}
+
+// obj: object to unlock
+// box: box address (displaced header location), killed. Must be EAX.
+// tmp: killed, cannot be obj nor box.
+//
+// Some commentary on balanced locking:
+//
+// Fast_Lock and Fast_Unlock are emitted only for provably balanced lock sites.
+// Methods that don't have provably balanced locking are forced to run in the
+// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
+// The interpreter provides two properties:
+// I1: At return-time the interpreter automatically and quietly unlocks any
+// objects acquired by the current activation (frame). Recall that the
+// interpreter maintains an on-stack list of locks currently held by
+// a frame.
+// I2: If a method attempts to unlock an object that is not held by
+// the frame, the interpreter throws IllegalMonitorStateException (IMSX).
+//
+// Let's say A(), which has provably balanced locking, acquires O and then calls B().
+// B() doesn't have provably balanced locking so it runs in the interpreter.
+// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
+// is still locked by A().
+//
+// The only other source of unbalanced locking would be JNI. The "Java Native Interface:
+// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
+// should not be unlocked by "normal" java-level locking and vice-versa. The specification
+// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
+
+void MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg) { // emits the inline monitor-exit fast path; the outcome is left in ZF
+ guarantee (objReg != boxReg, ""); // the three registers must be pairwise distinct
+ guarantee (objReg != tmpReg, "");
+ guarantee (boxReg != tmpReg, "");
+ guarantee (boxReg == rax, ""); // box doubles as the implicit cmpxchg comparand
+
+ if (EmitSync & 4) {
+ // Disable - inhibit all inlining. Force control through the slow-path
+ cmpptr (rsp, 0); // presumably leaves ZF == 0 (rsp is non-zero), i.e. "failure"
+ } else
+ if (EmitSync & 8) {
+ Label DONE_LABEL;
+ if (UseBiasedLocking) {
+ biased_locking_exit(objReg, tmpReg, DONE_LABEL);
+ }
+ // Classic stack-locking code ...
+ // Check whether the displaced header is 0
+ // (=> recursive unlock)
+ movptr(tmpReg, Address(boxReg, 0));
+ testptr(tmpReg, tmpReg);
+ jccb(Assembler::zero, DONE_LABEL);
+ // If not recursive lock, reset the header to displaced header
+ if (os::is_MP()) {
+ lock();
+ }
+ cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses RAX which is box
+ bind(DONE_LABEL);
+ } else {
+ Label DONE_LABEL, Stacked, CheckSucc;
+
+ // Critically, the biased locking test must have precedence over
+ // and appear before the (box->dhw == 0) recursive stack-lock test.
+ if (UseBiasedLocking && !UseOptoBiasInlining) {
+ biased_locking_exit(objReg, tmpReg, DONE_LABEL);
+ }
+
+ cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD); // Examine the displaced header
+ movptr(tmpReg, Address(objReg, 0)); // Examine the object's markword
+ jccb (Assembler::zero, DONE_LABEL); // 0 indicates recursive stack-lock (tests the flags of the cmpptr above)
+
+ testptr(tmpReg, 0x02); // Inflated? (0x02 is markOopDesc::monitor_value)
+ jccb (Assembler::zero, Stacked);
+
+ // It's inflated.
+ // Despite our balanced locking property we still check that m->_owner == Self
+ // as java routines or native JNI code called by this thread might
+ // have released the lock.
+ // Refer to the comments in synchronizer.cpp for how we might encode extra
+ // state in _succ so we can avoid fetching EntryList|cxq.
+ //
+ // I'd like to add more cases in fast_lock() and fast_unlock() --
+ // such as recursive enter and exit -- but we have to be wary of
+ // I$ bloat, T$ effects and BP$ effects.
+ //
+ // If there's no contention try a 1-0 exit. That is, exit without
+ // a costly MEMBAR or CAS. See synchronizer.cpp for details on how
+ // we detect and recover from the race that the 1-0 exit admits.
+ //
+ // Conceptually Fast_Unlock() must execute a STST|LDST "release" barrier
+ // before it STs null into _owner, releasing the lock. Updates
+ // to data protected by the critical section must be visible before
+ // we drop the lock (and thus before any other thread could acquire
+ // the lock and observe the fields protected by the lock).
+ // IA32's memory-model is SPO, so STs are ordered with respect to
+ // each other and there's no need for an explicit barrier (fence).
+ // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
+#ifndef _LP64
+ get_thread (boxReg);
+ if ((EmitSync & 4096) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
+ // prefetchw [ebx + Offset(_owner)-2] (the -2 annuls the monitor_value tag bit still set in tmpReg)
+ prefetchw(Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
+ }
+
+ // Note that we could employ various encoding schemes to reduce
+ // the number of loads below (currently 4) to just 2 or 3.
+ // Refer to the comments in synchronizer.cpp.
+ // In practice the chain of fetches doesn't seem to impact performance, however.
+ if ((EmitSync & 65536) == 0 && (EmitSync & 256)) {
+ // Attempt to reduce branch density - AMD's branch predictor.
+ xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
+ orptr(boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2));
+ orptr(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2));
+ orptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2));
+ jccb (Assembler::notZero, DONE_LABEL);
+ movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD);
+ jmpb (DONE_LABEL);
+ } else {
+ xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
+ orptr(boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2));
+ jccb (Assembler::notZero, DONE_LABEL);
+ movptr(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2));
+ orptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2));
+ jccb (Assembler::notZero, CheckSucc);
+ movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD);
+ jmpb (DONE_LABEL);
+ }
+
+ // The following code fragment (EmitSync & 65536) improves the performance of
+ // contended applications and contended synchronization microbenchmarks.
+ // Unfortunately the emission of the code - even though not executed - causes regressions
+ // in scimark and jetstream, evidently because of $ effects. Replacing the code
+ // with an equal number of never-executed NOPs results in the same regression.
+ // We leave it off by default.
+
+ if ((EmitSync & 65536) != 0) {
+ Label LSuccess, LGoSlowPath ;
+
+ bind (CheckSucc);
+
+ // Optional pre-test ... it's safe to elide this
+ if ((EmitSync & 16) == 0) {
+ cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), (int32_t)NULL_WORD);
+ jccb (Assembler::zero, LGoSlowPath);
+ }
+
+ // We have a classic Dekker-style idiom:
+ // ST m->_owner = 0 ; MEMBAR; LD m->_succ
+ // There are a number of ways to implement the barrier:
+ // (1) lock:andl &m->_owner, 0
+ // is fast, but masm doesn't currently support the "ANDL M,IMM32" form.
+ // LOCK: ANDL [ebx+Offset(_Owner)-2], 0
+ // Encodes as 81 31 OFF32 IMM32 or 83 63 OFF8 IMM8
+ // (2) If supported, an explicit MFENCE is appealing.
+ // In older IA32 processors MFENCE is slower than lock:add or xchg
+ // particularly if the write-buffer is full as might be the case if
+ // stores closely precede the fence or fence-equivalent instruction.
+ // In more modern implementations MFENCE appears faster, however.
+ // (3) In lieu of an explicit fence, use lock:addl to the top-of-stack
+ // The $lines underlying the top-of-stack should be in M-state.
+ // The locked add instruction is serializing, of course.
+ // (4) Use xchg, which is serializing
+ // mov boxReg, 0; xchgl boxReg, [tmpReg + Offset(_owner)-2] also works
+ // (5) ST m->_owner = 0 and then execute lock:orl &m->_succ, 0.
+ // The integer condition codes will tell us if succ was 0.
+ // Since _succ and _owner should reside in the same $line and
+ // we just stored into _owner, it's likely that the $line
+ // remains in M-state for the lock:orl.
+ //
+ // We currently use (3), although it's likely that switching to (2)
+ // is correct for the future.
+
+ movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD);
+ if (os::is_MP()) {
+ if (VM_Version::supports_sse2() && 1 == FenceInstruction) {
+ mfence();
+ } else {
+ lock (); addptr(Address(rsp, 0), 0);
+ }
+ }
+ // Ratify _succ remains non-null
+ cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), 0);
+ jccb (Assembler::notZero, LSuccess);
+
+ xorptr(boxReg, boxReg); // box is really EAX
+ if (os::is_MP()) { lock(); }
+ cmpxchgptr(rsp, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
+ jccb (Assembler::notEqual, LSuccess);
+ // Since we're low on registers we installed rsp as a placeholder in _owner.
+ // Now install Self over rsp. This is safe as we're transitioning from
+ // non-null to non-null.
+ get_thread (boxReg);
+ movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), boxReg);
+ // Intentional fall-through into LGoSlowPath ...
+
+ bind (LGoSlowPath);
+ orptr(boxReg, 1); // set ICC.ZF=0 to indicate failure
+ jmpb (DONE_LABEL);
+
+ bind (LSuccess);
+ xorptr(boxReg, boxReg); // set ICC.ZF=1 to indicate success
+ jmpb (DONE_LABEL);
+ }
+
+ bind (Stacked);
+ // It's not inflated and it's not recursively stack-locked and it's not biased.
+ // It must be stack-locked.
+ // Try to reset the header to displaced header.
+ // The "box" value on the stack is stable, so we can reload
+ // and be assured we observe the same value as above.
+ movptr(tmpReg, Address(boxReg, 0));
+ if (os::is_MP()) {
+ lock();
+ }
+ cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses RAX which is box
+ // Intentional fall-through into DONE_LABEL
+
+ // DONE_LABEL is a hot target - we'd really like to place it at the
+ // start of cache line by padding with NOPs.
+ // See the AMD and Intel software optimization manuals for the
+ // most efficient "long" NOP encodings.
+ // Unfortunately none of our alignment mechanisms suffice.
+ if ((EmitSync & 65536) == 0) {
+ bind (CheckSucc); // succ handshake not emitted: CheckSucc lands on DONE_LABEL with ZF=0 (slow path)
+ }
+#else // _LP64
+ // It's inflated
+ movptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)); // the -2 annuls the monitor_value tag bit still set in tmpReg
+ xorptr(boxReg, r15_thread);
+ orptr (boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2));
+ jccb (Assembler::notZero, DONE_LABEL);
+ movptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2));
+ orptr (boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2));
+ jccb (Assembler::notZero, CheckSucc);
+ movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), (int32_t)NULL_WORD);
+ jmpb (DONE_LABEL);
+
+ if ((EmitSync & 65536) == 0) {
+ Label LSuccess, LGoSlowPath ;
+ bind (CheckSucc);
+ cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), (int32_t)NULL_WORD);
+ jccb (Assembler::zero, LGoSlowPath);
+
+ // I'd much rather use lock:andl m->_owner, 0 as it's faster than
+ // the explicit ST;MEMBAR combination, but masm doesn't currently support
+ // "ANDQ M,IMM". Don't use MFENCE here. lock:add to TOS, xchg, etc
+ // are all faster when the write buffer is populated.
+ movptr (Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), (int32_t)NULL_WORD);
+ if (os::is_MP()) {
+ lock (); addl (Address(rsp, 0), 0);
+ }
+ cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), (int32_t)NULL_WORD);
+ jccb (Assembler::notZero, LSuccess);
+
+ movptr (boxReg, (int32_t)NULL_WORD); // box is really EAX
+ if (os::is_MP()) { lock(); }
+ cmpxchgptr(r15_thread, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
+ jccb (Assembler::notEqual, LSuccess);
+ // Intentional fall-through into slow-path
+
+ bind (LGoSlowPath);
+ orl (boxReg, 1); // set ICC.ZF=0 to indicate failure
+ jmpb (DONE_LABEL);
+
+ bind (LSuccess);
+ testl (boxReg, 0); // set ICC.ZF=1 to indicate success
+ jmpb (DONE_LABEL);
+ }
+
+ bind (Stacked);
+ movptr(tmpReg, Address (boxReg, 0)); // re-fetch
+ if (os::is_MP()) { lock(); }
+ cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses RAX which is box
+
+ if (EmitSync & 65536) {
+ bind (CheckSucc); // succ handshake not emitted: CheckSucc lands on DONE_LABEL with ZF=0 (slow path)
+ }
+#endif
+ bind(DONE_LABEL); // ZF == 1 -> success, ZF == 0 -> failure (caller takes the slow path)
+ // Avoid branch to branch on AMD processors
+ if (EmitSync & 32768) {
+ nop();
+ }
+ }
+}
+#endif // COMPILER2
+
void MacroAssembler::c2bool(Register x) {
// implements x == 0 ? 0 : 1
// note: must only look at least-significant byte of x
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp Fri Feb 21 08:09:15 2014 -0800
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp Mon Feb 24 15:12:26 2014 -0800
@@ -651,7 +651,12 @@
Label& done, Label* slow_case = NULL,
BiasedLockingCounters* counters = NULL);
void biased_locking_exit (Register obj_reg, Register temp_reg, Label& done);
-
+#ifdef COMPILER2
+ // Code used by cmpFastLock and cmpFastUnlock mach instructions in .ad file.
+ // See full description in macroAssembler_x86.cpp.
+ void fast_lock(Register obj, Register box, Register tmp, Register scr, BiasedLockingCounters* counters);
+ void fast_unlock(Register obj, Register box, Register tmp); // box must be rax; box and tmp are killed
+#endif
Condition negate_condition(Condition cond);
--- a/hotspot/src/cpu/x86/vm/x86_32.ad Fri Feb 21 08:09:15 2014 -0800
+++ b/hotspot/src/cpu/x86/vm/x86_32.ad Mon Feb 24 15:12:26 2014 -0800
@@ -2918,542 +2918,6 @@
emit_d8 (cbuf,0 );
%}
-
- // Because the transitions from emitted code to the runtime
- // monitorenter/exit helper stubs are so slow it's critical that
- // we inline both the stack-locking fast-path and the inflated fast path.
- //
- // See also: cmpFastLock and cmpFastUnlock.
- //
- // What follows is a specialized inline transliteration of the code
- // in slow_enter() and slow_exit(). If we're concerned about I$ bloat
- // another option would be to emit TrySlowEnter and TrySlowExit methods
- // at startup-time. These methods would accept arguments as
- // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
- // indications in the icc.ZFlag. Fast_Lock and Fast_Unlock would simply
- // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
- // In practice, however, the # of lock sites is bounded and is usually small.
- // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
- // if the processor uses simple bimodal branch predictors keyed by EIP
- // Since the helper routines would be called from multiple synchronization
- // sites.
- //
- // An even better approach would be write "MonitorEnter()" and "MonitorExit()"
- // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
- // to those specialized methods. That'd give us a mostly platform-independent
- // implementation that the JITs could optimize and inline at their pleasure.
- // Done correctly, the only time we'd need to cross to native could would be
- // to park() or unpark() threads. We'd also need a few more unsafe operators
- // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
- // (b) explicit barriers or fence operations.
- //
- // TODO:
- //
- // * Arrange for C2 to pass "Self" into Fast_Lock and Fast_Unlock in one of the registers (scr).
- // This avoids manifesting the Self pointer in the Fast_Lock and Fast_Unlock terminals.
- // Given TLAB allocation, Self is usually manifested in a register, so passing it into
- // the lock operators would typically be faster than reifying Self.
- //
- // * Ideally I'd define the primitives as:
- // fast_lock (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
- // fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
- // Unfortunately ADLC bugs prevent us from expressing the ideal form.
- // Instead, we're stuck with a rather awkward and brittle register assignments below.
- // Furthermore the register assignments are overconstrained, possibly resulting in
- // sub-optimal code near the synchronization site.
- //
- // * Eliminate the sp-proximity tests and just use "== Self" tests instead.
- // Alternately, use a better sp-proximity test.
- //
- // * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
- // Either one is sufficient to uniquely identify a thread.
- // TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
- //
- // * Intrinsify notify() and notifyAll() for the common cases where the
- // object is locked by the calling thread but the waitlist is empty.
- // avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
- //
- // * use jccb and jmpb instead of jcc and jmp to improve code density.
- // But beware of excessive branch density on AMD Opterons.
- //
- // * Both Fast_Lock and Fast_Unlock set the ICC.ZF to indicate success
- // or failure of the fast-path. If the fast-path fails then we pass
- // control to the slow-path, typically in C. In Fast_Lock and
- // Fast_Unlock we often branch to DONE_LABEL, just to find that C2
- // will emit a conditional branch immediately after the node.
- // So we have branches to branches and lots of ICC.ZF games.
- // Instead, it might be better to have C2 pass a "FailureLabel"
- // into Fast_Lock and Fast_Unlock. In the case of success, control
- // will drop through the node. ICC.ZF is undefined at exit.
- // In the case of failure, the node will branch directly to the
- // FailureLabel
-
-
- // obj: object to lock
- // box: on-stack box address (displaced header location) - KILLED
- // rax,: tmp -- KILLED
- // scr: tmp -- KILLED
- enc_class Fast_Lock( eRegP obj, eRegP box, eAXRegI tmp, eRegP scr ) %{
-
- Register objReg = as_Register($obj$$reg);
- Register boxReg = as_Register($box$$reg);
- Register tmpReg = as_Register($tmp$$reg);
- Register scrReg = as_Register($scr$$reg);
-
- // Ensure the register assignents are disjoint
- guarantee (objReg != boxReg, "") ;
- guarantee (objReg != tmpReg, "") ;
- guarantee (objReg != scrReg, "") ;
- guarantee (boxReg != tmpReg, "") ;
- guarantee (boxReg != scrReg, "") ;
- guarantee (tmpReg == as_Register(EAX_enc), "") ;
-
- MacroAssembler masm(&cbuf);
-
- if (_counters != NULL) {
- masm.atomic_incl(ExternalAddress((address) _counters->total_entry_count_addr()));
- }
- if (EmitSync & 1) {
- // set box->dhw = unused_mark (3)
- // Force all sync thru slow-path: slow_enter() and slow_exit()
- masm.movptr (Address(boxReg, 0), int32_t(markOopDesc::unused_mark())) ;
- masm.cmpptr (rsp, (int32_t)0) ;
- } else
- if (EmitSync & 2) {
- Label DONE_LABEL ;
- if (UseBiasedLocking) {
- // Note: tmpReg maps to the swap_reg argument and scrReg to the tmp_reg argument.
- masm.biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, _counters);
- }
-
- masm.movptr(tmpReg, Address(objReg, 0)) ; // fetch markword
- masm.orptr (tmpReg, 0x1);
- masm.movptr(Address(boxReg, 0), tmpReg); // Anticipate successful CAS
- if (os::is_MP()) { masm.lock(); }
- masm.cmpxchgptr(boxReg, Address(objReg, 0)); // Updates tmpReg
- masm.jcc(Assembler::equal, DONE_LABEL);
- // Recursive locking
- masm.subptr(tmpReg, rsp);
- masm.andptr(tmpReg, (int32_t) 0xFFFFF003 );
- masm.movptr(Address(boxReg, 0), tmpReg);
- masm.bind(DONE_LABEL) ;
- } else {
- // Possible cases that we'll encounter in fast_lock
- // ------------------------------------------------
- // * Inflated
- // -- unlocked
- // -- Locked
- // = by self
- // = by other
- // * biased
- // -- by Self
- // -- by other
- // * neutral
- // * stack-locked
- // -- by self
- // = sp-proximity test hits
- // = sp-proximity test generates false-negative
- // -- by other
- //
-
- Label IsInflated, DONE_LABEL, PopDone ;
-
- // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
- // order to reduce the number of conditional branches in the most common cases.
- // Beware -- there's a subtle invariant that fetch of the markword
- // at [FETCH], below, will never observe a biased encoding (*101b).
- // If this invariant is not held we risk exclusion (safety) failure.
- if (UseBiasedLocking && !UseOptoBiasInlining) {
- masm.biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, _counters);
- }
-
- masm.movptr(tmpReg, Address(objReg, 0)) ; // [FETCH]
- masm.testptr(tmpReg, 0x02) ; // Inflated v (Stack-locked or neutral)
- masm.jccb (Assembler::notZero, IsInflated) ;
-
- // Attempt stack-locking ...
- masm.orptr (tmpReg, 0x1);
- masm.movptr(Address(boxReg, 0), tmpReg); // Anticipate successful CAS
- if (os::is_MP()) { masm.lock(); }
- masm.cmpxchgptr(boxReg, Address(objReg, 0)); // Updates tmpReg
- if (_counters != NULL) {
- masm.cond_inc32(Assembler::equal,
- ExternalAddress((address)_counters->fast_path_entry_count_addr()));
- }
- masm.jccb (Assembler::equal, DONE_LABEL);
-
- // Recursive locking
- masm.subptr(tmpReg, rsp);
- masm.andptr(tmpReg, 0xFFFFF003 );
- masm.movptr(Address(boxReg, 0), tmpReg);
- if (_counters != NULL) {
- masm.cond_inc32(Assembler::equal,
- ExternalAddress((address)_counters->fast_path_entry_count_addr()));
- }
- masm.jmp (DONE_LABEL) ;
-
- masm.bind (IsInflated) ;
-
- // The object is inflated.
- //
- // TODO-FIXME: eliminate the ugly use of manifest constants:
- // Use markOopDesc::monitor_value instead of "2".
- // use markOop::unused_mark() instead of "3".
- // The tmpReg value is an objectMonitor reference ORed with
- // markOopDesc::monitor_value (2). We can either convert tmpReg to an
- // objectmonitor pointer by masking off the "2" bit or we can just
- // use tmpReg as an objectmonitor pointer but bias the objectmonitor
- // field offsets with "-2" to compensate for and annul the low-order tag bit.
- //
- // I use the latter as it avoids AGI stalls.
- // As such, we write "mov r, [tmpReg+OFFSETOF(Owner)-2]"
- // instead of "mov r, [tmpReg+OFFSETOF(Owner)]".
- //
- #define OFFSET_SKEWED(f) ((ObjectMonitor::f ## _offset_in_bytes())-2)
-
- // boxReg refers to the on-stack BasicLock in the current frame.
- // We'd like to write:
- // set box->_displaced_header = markOop::unused_mark(). Any non-0 value suffices.
- // This is convenient but results a ST-before-CAS penalty. The following CAS suffers
- // additional latency as we have another ST in the store buffer that must drain.
-
- if (EmitSync & 8192) {
- masm.movptr(Address(boxReg, 0), 3) ; // results in ST-before-CAS penalty
- masm.get_thread (scrReg) ;
- masm.movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2]
- masm.movptr(tmpReg, NULL_WORD); // consider: xor vs mov
- if (os::is_MP()) { masm.lock(); }
- masm.cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
- } else
- if ((EmitSync & 128) == 0) { // avoid ST-before-CAS
- masm.movptr(scrReg, boxReg) ;
- masm.movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2]
-
- // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
- if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
- // prefetchw [eax + Offset(_owner)-2]
- masm.prefetchw(Address(rax, ObjectMonitor::owner_offset_in_bytes()-2));
- }
-
- if ((EmitSync & 64) == 0) {
- // Optimistic form: consider XORL tmpReg,tmpReg
- masm.movptr(tmpReg, NULL_WORD) ;
- } else {
- // Can suffer RTS->RTO upgrades on shared or cold $ lines
- // Test-And-CAS instead of CAS
- masm.movptr(tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ; // rax, = m->_owner
- masm.testptr(tmpReg, tmpReg) ; // Locked ?
- masm.jccb (Assembler::notZero, DONE_LABEL) ;
- }
-
- // Appears unlocked - try to swing _owner from null to non-null.
- // Ideally, I'd manifest "Self" with get_thread and then attempt
- // to CAS the register containing Self into m->Owner.
- // But we don't have enough registers, so instead we can either try to CAS
- // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds
- // we later store "Self" into m->Owner. Transiently storing a stack address
- // (rsp or the address of the box) into m->owner is harmless.
- // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
- if (os::is_MP()) { masm.lock(); }
- masm.cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
- masm.movptr(Address(scrReg, 0), 3) ; // box->_displaced_header = 3
- masm.jccb (Assembler::notZero, DONE_LABEL) ;
- masm.get_thread (scrReg) ; // beware: clobbers ICCs
- masm.movptr(Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2), scrReg) ;
- masm.xorptr(boxReg, boxReg) ; // set icc.ZFlag = 1 to indicate success
-
- // If the CAS fails we can either retry or pass control to the slow-path.
- // We use the latter tactic.
- // Pass the CAS result in the icc.ZFlag into DONE_LABEL
- // If the CAS was successful ...
- // Self has acquired the lock
- // Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
- // Intentional fall-through into DONE_LABEL ...
- } else {
- masm.movptr(Address(boxReg, 0), 3) ; // results in ST-before-CAS penalty
- masm.movptr(boxReg, tmpReg) ;
-
- // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
- if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
- // prefetchw [eax + Offset(_owner)-2]
- masm.prefetchw(Address(rax, ObjectMonitor::owner_offset_in_bytes()-2));
- }
-
- if ((EmitSync & 64) == 0) {
- // Optimistic form
- masm.xorptr (tmpReg, tmpReg) ;
- } else {
- // Can suffer RTS->RTO upgrades on shared or cold $ lines
- masm.movptr(tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ; // rax, = m->_owner
- masm.testptr(tmpReg, tmpReg) ; // Locked ?
- masm.jccb (Assembler::notZero, DONE_LABEL) ;
- }
-
- // Appears unlocked - try to swing _owner from null to non-null.
- // Use either "Self" (in scr) or rsp as thread identity in _owner.
- // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
- masm.get_thread (scrReg) ;
- if (os::is_MP()) { masm.lock(); }
- masm.cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
-
- // If the CAS fails we can either retry or pass control to the slow-path.
- // We use the latter tactic.
- // Pass the CAS result in the icc.ZFlag into DONE_LABEL
- // If the CAS was successful ...
- // Self has acquired the lock
- // Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
- // Intentional fall-through into DONE_LABEL ...
- }
-
- // DONE_LABEL is a hot target - we'd really like to place it at the
- // start of cache line by padding with NOPs.
- // See the AMD and Intel software optimization manuals for the
- // most efficient "long" NOP encodings.
- // Unfortunately none of our alignment mechanisms suffice.
- masm.bind(DONE_LABEL);
-
- // Avoid branch-to-branch on AMD processors
- // This appears to be superstition.
- if (EmitSync & 32) masm.nop() ;
-
-
- // At DONE_LABEL the icc ZFlag is set as follows ...
- // Fast_Unlock uses the same protocol.
- // ZFlag == 1 -> Success
- // ZFlag == 0 -> Failure - force control through the slow-path
- }
- %}
-
- // obj: object to unlock
- // box: box address (displaced header location), killed. Must be EAX.
- // rbx,: killed tmp; cannot be obj nor box.
- //
- // Some commentary on balanced locking:
- //
- // Fast_Lock and Fast_Unlock are emitted only for provably balanced lock sites.
- // Methods that don't have provably balanced locking are forced to run in the
- // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
- // The interpreter provides two properties:
- // I1: At return-time the interpreter automatically and quietly unlocks any
- // objects acquired the current activation (frame). Recall that the
- // interpreter maintains an on-stack list of locks currently held by
- // a frame.
- // I2: If a method attempts to unlock an object that is not held by the
- // the frame the interpreter throws IMSX.
- //
- // Lets say A(), which has provably balanced locking, acquires O and then calls B().
- // B() doesn't have provably balanced locking so it runs in the interpreter.
- // Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
- // is still locked by A().
- //
- // The only other source of unbalanced locking would be JNI. The "Java Native Interface:
- // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
- // should not be unlocked by "normal" java-level locking and vice-versa. The specification
- // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
-
- enc_class Fast_Unlock( nabxRegP obj, eAXRegP box, eRegP tmp) %{
-
- Register objReg = as_Register($obj$$reg);
- Register boxReg = as_Register($box$$reg);
- Register tmpReg = as_Register($tmp$$reg);
-
- guarantee (objReg != boxReg, "") ;
- guarantee (objReg != tmpReg, "") ;
- guarantee (boxReg != tmpReg, "") ;
- guarantee (boxReg == as_Register(EAX_enc), "") ;
- MacroAssembler masm(&cbuf);
-
- if (EmitSync & 4) {
- // Disable - inhibit all inlining. Force control through the slow-path
- masm.cmpptr (rsp, 0) ;
- } else
- if (EmitSync & 8) {
- Label DONE_LABEL ;
- if (UseBiasedLocking) {
- masm.biased_locking_exit(objReg, tmpReg, DONE_LABEL);
- }
- // classic stack-locking code ...
- masm.movptr(tmpReg, Address(boxReg, 0)) ;
- masm.testptr(tmpReg, tmpReg) ;
- masm.jcc (Assembler::zero, DONE_LABEL) ;
- if (os::is_MP()) { masm.lock(); }
- masm.cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses EAX which is box
- masm.bind(DONE_LABEL);
- } else {
- Label DONE_LABEL, Stacked, CheckSucc, Inflated ;
-
- // Critically, the biased locking test must have precedence over
- // and appear before the (box->dhw == 0) recursive stack-lock test.
- if (UseBiasedLocking && !UseOptoBiasInlining) {
- masm.biased_locking_exit(objReg, tmpReg, DONE_LABEL);
- }
-
- masm.cmpptr(Address(boxReg, 0), 0) ; // Examine the displaced header
- masm.movptr(tmpReg, Address(objReg, 0)) ; // Examine the object's markword
- masm.jccb (Assembler::zero, DONE_LABEL) ; // 0 indicates recursive stack-lock
-
- masm.testptr(tmpReg, 0x02) ; // Inflated?
- masm.jccb (Assembler::zero, Stacked) ;
-
- masm.bind (Inflated) ;
- // It's inflated.
- // Despite our balanced locking property we still check that m->_owner == Self
- // as java routines or native JNI code called by this thread might
- // have released the lock.
- // Refer to the comments in synchronizer.cpp for how we might encode extra
- // state in _succ so we can avoid fetching EntryList|cxq.
- //
- // I'd like to add more cases in fast_lock() and fast_unlock() --
- // such as recursive enter and exit -- but we have to be wary of
- // I$ bloat, T$ effects and BP$ effects.
- //
- // If there's no contention try a 1-0 exit. That is, exit without
- // a costly MEMBAR or CAS. See synchronizer.cpp for details on how
- // we detect and recover from the race that the 1-0 exit admits.
- //
- // Conceptually Fast_Unlock() must execute a STST|LDST "release" barrier
- // before it STs null into _owner, releasing the lock. Updates
- // to data protected by the critical section must be visible before
- // we drop the lock (and thus before any other thread could acquire
- // the lock and observe the fields protected by the lock).
- // IA32's memory-model is SPO, so STs are ordered with respect to
- // each other and there's no need for an explicit barrier (fence).
- // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
-
- masm.get_thread (boxReg) ;
- if ((EmitSync & 4096) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
- // prefetchw [ebx + Offset(_owner)-2]
- masm.prefetchw(Address(rbx, ObjectMonitor::owner_offset_in_bytes()-2));
- }
-
- // Note that we could employ various encoding schemes to reduce
- // the number of loads below (currently 4) to just 2 or 3.
- // Refer to the comments in synchronizer.cpp.
- // In practice the chain of fetches doesn't seem to impact performance, however.
- if ((EmitSync & 65536) == 0 && (EmitSync & 256)) {
- // Attempt to reduce branch density - AMD's branch predictor.
- masm.xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
- masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)) ;
- masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)) ;
- masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)) ;
- masm.jccb (Assembler::notZero, DONE_LABEL) ;
- masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD) ;
- masm.jmpb (DONE_LABEL) ;
- } else {
- masm.xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
- masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)) ;
- masm.jccb (Assembler::notZero, DONE_LABEL) ;
- masm.movptr(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)) ;
- masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)) ;
- masm.jccb (Assembler::notZero, CheckSucc) ;
- masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD) ;
- masm.jmpb (DONE_LABEL) ;
- }
-
- // The Following code fragment (EmitSync & 65536) improves the performance of
- // contended applications and contended synchronization microbenchmarks.
- // Unfortunately the emission of the code - even though not executed - causes regressions
- // in scimark and jetstream, evidently because of $ effects. Replacing the code
- // with an equal number of never-executed NOPs results in the same regression.
- // We leave it off by default.
-
- if ((EmitSync & 65536) != 0) {
- Label LSuccess, LGoSlowPath ;
-
- masm.bind (CheckSucc) ;
-
- // Optional pre-test ... it's safe to elide this
- if ((EmitSync & 16) == 0) {
- masm.cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), 0) ;
- masm.jccb (Assembler::zero, LGoSlowPath) ;
- }
-
- // We have a classic Dekker-style idiom:
- // ST m->_owner = 0 ; MEMBAR; LD m->_succ
- // There are a number of ways to implement the barrier:
- // (1) lock:andl &m->_owner, 0
- // is fast, but mask doesn't currently support the "ANDL M,IMM32" form.
- // LOCK: ANDL [ebx+Offset(_Owner)-2], 0
- // Encodes as 81 31 OFF32 IMM32 or 83 63 OFF8 IMM8
- // (2) If supported, an explicit MFENCE is appealing.
- // In older IA32 processors MFENCE is slower than lock:add or xchg
- // particularly if the write-buffer is full as might be the case if
- // if stores closely precede the fence or fence-equivalent instruction.
- // In more modern implementations MFENCE appears faster, however.
- // (3) In lieu of an explicit fence, use lock:addl to the top-of-stack
- // The $lines underlying the top-of-stack should be in M-state.
- // The locked add instruction is serializing, of course.
- // (4) Use xchg, which is serializing
- // mov boxReg, 0; xchgl boxReg, [tmpReg + Offset(_owner)-2] also works
- // (5) ST m->_owner = 0 and then execute lock:orl &m->_succ, 0.
- // The integer condition codes will tell us if succ was 0.
- // Since _succ and _owner should reside in the same $line and
- // we just stored into _owner, it's likely that the $line
- // remains in M-state for the lock:orl.
- //
- // We currently use (3), although it's likely that switching to (2)
- // is correct for the future.
-
- masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD) ;
- if (os::is_MP()) {
- if (VM_Version::supports_sse2() && 1 == FenceInstruction) {
- masm.mfence();
- } else {
- masm.lock () ; masm.addptr(Address(rsp, 0), 0) ;
- }
- }
- // Ratify _succ remains non-null
- masm.cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), 0) ;
- masm.jccb (Assembler::notZero, LSuccess) ;
-
- masm.xorptr(boxReg, boxReg) ; // box is really EAX
- if (os::is_MP()) { masm.lock(); }
- masm.cmpxchgptr(rsp, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
- masm.jccb (Assembler::notEqual, LSuccess) ;
- // Since we're low on registers we installed rsp as a placeholding in _owner.
- // Now install Self over rsp. This is safe as we're transitioning from
- // non-null to non=null
- masm.get_thread (boxReg) ;
- masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), boxReg) ;
- // Intentional fall-through into LGoSlowPath ...
-
- masm.bind (LGoSlowPath) ;
- masm.orptr(boxReg, 1) ; // set ICC.ZF=0 to indicate failure
- masm.jmpb (DONE_LABEL) ;
-
- masm.bind (LSuccess) ;
- masm.xorptr(boxReg, boxReg) ; // set ICC.ZF=1 to indicate success
- masm.jmpb (DONE_LABEL) ;
- }
-
- masm.bind (Stacked) ;
- // It's not inflated and it's not recursively stack-locked and it's not biased.
- // It must be stack-locked.
- // Try to reset the header to displaced header.
- // The "box" value on the stack is stable, so we can reload
- // and be assured we observe the same value as above.
- masm.movptr(tmpReg, Address(boxReg, 0)) ;
- if (os::is_MP()) { masm.lock(); }
- masm.cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses EAX which is box
- // Intention fall-thru into DONE_LABEL
-
-
- // DONE_LABEL is a hot target - we'd really like to place it at the
- // start of cache line by padding with NOPs.
- // See the AMD and Intel software optimization manuals for the
- // most efficient "long" NOP encodings.
- // Unfortunately none of our alignment mechanisms suffice.
- if ((EmitSync & 65536) == 0) {
- masm.bind (CheckSucc) ;
- }
- masm.bind(DONE_LABEL);
-
- // Avoid branch to branch on AMD processors
- if (EmitSync & 32768) { masm.nop() ; }
- }
- %}
-
-
enc_class enc_pop_rdx() %{
emit_opcode(cbuf,0x5A);
%}
@@ -13157,23 +12621,26 @@
// inlined locking and unlocking
-
-instruct cmpFastLock( eFlagsReg cr, eRegP object, eBXRegP box, eAXRegI tmp, eRegP scr) %{
- match( Set cr (FastLock object box) );
- effect( TEMP tmp, TEMP scr, USE_KILL box );
+instruct cmpFastLock(eFlagsReg cr, eRegP object, eBXRegP box, eAXRegI tmp, eRegP scr) %{
+ match(Set cr (FastLock object box));
+ effect(TEMP tmp, TEMP scr, USE_KILL box);
ins_cost(300);
format %{ "FASTLOCK $object,$box\t! kills $box,$tmp,$scr" %}
- ins_encode( Fast_Lock(object,box,tmp,scr) );
- ins_pipe( pipe_slow );
-%}
-
-instruct cmpFastUnlock( eFlagsReg cr, eRegP object, eAXRegP box, eRegP tmp ) %{
- match( Set cr (FastUnlock object box) );
- effect( TEMP tmp, USE_KILL box );
+ ins_encode %{
+ __ fast_lock($object$$Register, $box$$Register, $tmp$$Register, $scr$$Register, _counters);
+ %}
+ ins_pipe(pipe_slow);
+%}
+
+instruct cmpFastUnlock(eFlagsReg cr, eRegP object, eAXRegP box, eRegP tmp) %{
+ match(Set cr (FastUnlock object box));
+ effect(TEMP tmp, USE_KILL box);
ins_cost(300);
format %{ "FASTUNLOCK $object,$box\t! kills $box,$tmp" %}
- ins_encode( Fast_Unlock(object,box,tmp) );
- ins_pipe( pipe_slow );
+ ins_encode %{
+ __ fast_unlock($object$$Register, $box$$Register, $tmp$$Register);
+ %}
+ ins_pipe(pipe_slow);
%}
--- a/hotspot/src/cpu/x86/vm/x86_64.ad Fri Feb 21 08:09:15 2014 -0800
+++ b/hotspot/src/cpu/x86/vm/x86_64.ad Mon Feb 24 15:12:26 2014 -0800
@@ -2599,231 +2599,6 @@
%}
- // obj: object to lock
- // box: box address (header location) -- killed
- // tmp: rax -- killed
- // scr: rbx -- killed
- //
- // What follows is a direct transliteration of fast_lock() and fast_unlock()
- // from i486.ad. See that file for comments.
- // TODO: where possible switch from movq (r, 0) to movl(r,0) and
- // use the shorter encoding. (Movl clears the high-order 32-bits).
-
-
- enc_class Fast_Lock(rRegP obj, rRegP box, rax_RegI tmp, rRegP scr)
- %{
- Register objReg = as_Register((int)$obj$$reg);
- Register boxReg = as_Register((int)$box$$reg);
- Register tmpReg = as_Register($tmp$$reg);
- Register scrReg = as_Register($scr$$reg);
- MacroAssembler masm(&cbuf);
-
- // Verify uniqueness of register assignments -- necessary but not sufficient
- assert (objReg != boxReg && objReg != tmpReg &&
- objReg != scrReg && tmpReg != scrReg, "invariant") ;
-
- if (_counters != NULL) {
- masm.atomic_incl(ExternalAddress((address) _counters->total_entry_count_addr()));
- }
- if (EmitSync & 1) {
- // Without cast to int32_t a movptr will destroy r10 which is typically obj
- masm.movptr (Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark())) ;
- masm.cmpptr(rsp, (int32_t)NULL_WORD) ;
- } else
- if (EmitSync & 2) {
- Label DONE_LABEL;
- if (UseBiasedLocking) {
- // Note: tmpReg maps to the swap_reg argument and scrReg to the tmp_reg argument.
- masm.biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, _counters);
- }
- // QQQ was movl...
- masm.movptr(tmpReg, 0x1);
- masm.orptr(tmpReg, Address(objReg, 0));
- masm.movptr(Address(boxReg, 0), tmpReg);
- if (os::is_MP()) {
- masm.lock();
- }
- masm.cmpxchgptr(boxReg, Address(objReg, 0)); // Updates tmpReg
- masm.jcc(Assembler::equal, DONE_LABEL);
-
- // Recursive locking
- masm.subptr(tmpReg, rsp);
- masm.andptr(tmpReg, 7 - os::vm_page_size());
- masm.movptr(Address(boxReg, 0), tmpReg);
-
- masm.bind(DONE_LABEL);
- masm.nop(); // avoid branch to branch
- } else {
- Label DONE_LABEL, IsInflated, Egress;
-
- masm.movptr(tmpReg, Address(objReg, 0)) ;
- masm.testl (tmpReg, 0x02) ; // inflated vs stack-locked|neutral|biased
- masm.jcc (Assembler::notZero, IsInflated) ;
-
- // it's stack-locked, biased or neutral
- // TODO: optimize markword triage order to reduce the number of
- // conditional branches in the most common cases.
- // Beware -- there's a subtle invariant that fetch of the markword
- // at [FETCH], below, will never observe a biased encoding (*101b).
- // If this invariant is not held we'll suffer exclusion (safety) failure.
-
- if (UseBiasedLocking && !UseOptoBiasInlining) {
- masm.biased_locking_enter(boxReg, objReg, tmpReg, scrReg, true, DONE_LABEL, NULL, _counters);
- masm.movptr(tmpReg, Address(objReg, 0)) ; // [FETCH]
- }
-
- // was q will it destroy high?
- masm.orl (tmpReg, 1) ;
- masm.movptr(Address(boxReg, 0), tmpReg) ;
- if (os::is_MP()) { masm.lock(); }
- masm.cmpxchgptr(boxReg, Address(objReg, 0)); // Updates tmpReg
- if (_counters != NULL) {
- masm.cond_inc32(Assembler::equal,
- ExternalAddress((address) _counters->fast_path_entry_count_addr()));
- }
- masm.jcc (Assembler::equal, DONE_LABEL);
-
- // Recursive locking
- masm.subptr(tmpReg, rsp);
- masm.andptr(tmpReg, 7 - os::vm_page_size());
- masm.movptr(Address(boxReg, 0), tmpReg);
- if (_counters != NULL) {
- masm.cond_inc32(Assembler::equal,
- ExternalAddress((address) _counters->fast_path_entry_count_addr()));
- }
- masm.jmp (DONE_LABEL) ;
-
- masm.bind (IsInflated) ;
- // It's inflated
-
- // TODO: someday avoid the ST-before-CAS penalty by
- // relocating (deferring) the following ST.
- // We should also think about trying a CAS without having
- // fetched _owner. If the CAS is successful we may
- // avoid an RTO->RTS upgrade on the $line.
- // Without cast to int32_t a movptr will destroy r10 which is typically obj
- masm.movptr(Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark())) ;
-
- masm.mov (boxReg, tmpReg) ;
- masm.movptr (tmpReg, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
- masm.testptr(tmpReg, tmpReg) ;
- masm.jcc (Assembler::notZero, DONE_LABEL) ;
-
- // It's inflated and appears unlocked
- if (os::is_MP()) { masm.lock(); }
- masm.cmpxchgptr(r15_thread, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
- // Intentional fall-through into DONE_LABEL ...
-
- masm.bind (DONE_LABEL) ;
- masm.nop () ; // avoid jmp to jmp
- }
- %}
-
- // obj: object to unlock
- // box: box address (displaced header location), killed
- // RBX: killed tmp; cannot be obj nor box
- enc_class Fast_Unlock(rRegP obj, rax_RegP box, rRegP tmp)
- %{
-
- Register objReg = as_Register($obj$$reg);
- Register boxReg = as_Register($box$$reg);
- Register tmpReg = as_Register($tmp$$reg);
- MacroAssembler masm(&cbuf);
-
- if (EmitSync & 4) {
- masm.cmpptr(rsp, 0) ;
- } else
- if (EmitSync & 8) {
- Label DONE_LABEL;
- if (UseBiasedLocking) {
- masm.biased_locking_exit(objReg, tmpReg, DONE_LABEL);
- }
-
- // Check whether the displaced header is 0
- //(=> recursive unlock)
- masm.movptr(tmpReg, Address(boxReg, 0));
- masm.testptr(tmpReg, tmpReg);
- masm.jcc(Assembler::zero, DONE_LABEL);
-
- // If not recursive lock, reset the header to displaced header
- if (os::is_MP()) {
- masm.lock();
- }
- masm.cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses RAX which is box
- masm.bind(DONE_LABEL);
- masm.nop(); // avoid branch to branch
- } else {
- Label DONE_LABEL, Stacked, CheckSucc ;
-
- if (UseBiasedLocking && !UseOptoBiasInlining) {
- masm.biased_locking_exit(objReg, tmpReg, DONE_LABEL);
- }
-
- masm.movptr(tmpReg, Address(objReg, 0)) ;
- masm.cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD) ;
- masm.jcc (Assembler::zero, DONE_LABEL) ;
- masm.testl (tmpReg, 0x02) ;
- masm.jcc (Assembler::zero, Stacked) ;
-
- // It's inflated
- masm.movptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
- masm.xorptr(boxReg, r15_thread) ;
- masm.orptr (boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)) ;
- masm.jcc (Assembler::notZero, DONE_LABEL) ;
- masm.movptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)) ;
- masm.orptr (boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)) ;
- masm.jcc (Assembler::notZero, CheckSucc) ;
- masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), (int32_t)NULL_WORD) ;
- masm.jmp (DONE_LABEL) ;
-
- if ((EmitSync & 65536) == 0) {
- Label LSuccess, LGoSlowPath ;
- masm.bind (CheckSucc) ;
- masm.cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), (int32_t)NULL_WORD) ;
- masm.jcc (Assembler::zero, LGoSlowPath) ;
-
- // I'd much rather use lock:andl m->_owner, 0 as it's faster than the
- // the explicit ST;MEMBAR combination, but masm doesn't currently support
- // "ANDQ M,IMM". Don't use MFENCE here. lock:add to TOS, xchg, etc
- // are all faster when the write buffer is populated.
- masm.movptr (Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), (int32_t)NULL_WORD) ;
- if (os::is_MP()) {
- masm.lock () ; masm.addl (Address(rsp, 0), 0) ;
- }
- masm.cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), (int32_t)NULL_WORD) ;
- masm.jcc (Assembler::notZero, LSuccess) ;
-
- masm.movptr (boxReg, (int32_t)NULL_WORD) ; // box is really EAX
- if (os::is_MP()) { masm.lock(); }
- masm.cmpxchgptr(r15_thread, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
- masm.jcc (Assembler::notEqual, LSuccess) ;
- // Intentional fall-through into slow-path
-
- masm.bind (LGoSlowPath) ;
- masm.orl (boxReg, 1) ; // set ICC.ZF=0 to indicate failure
- masm.jmp (DONE_LABEL) ;
-
- masm.bind (LSuccess) ;
- masm.testl (boxReg, 0) ; // set ICC.ZF=1 to indicate success
- masm.jmp (DONE_LABEL) ;
- }
-
- masm.bind (Stacked) ;
- masm.movptr(tmpReg, Address (boxReg, 0)) ; // re-fetch
- if (os::is_MP()) { masm.lock(); }
- masm.cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses RAX which is box
-
- if (EmitSync & 65536) {
- masm.bind (CheckSucc) ;
- }
- masm.bind(DONE_LABEL);
- if (EmitSync & 32768) {
- masm.nop(); // avoid branch to branch
- }
- }
- %}
-
-
enc_class enc_rethrow()
%{
cbuf.set_insts_mark();
@@ -11453,27 +11228,25 @@
// ============================================================================
// inlined locking and unlocking
-instruct cmpFastLock(rFlagsReg cr,
- rRegP object, rbx_RegP box, rax_RegI tmp, rRegP scr)
-%{
+instruct cmpFastLock(rFlagsReg cr, rRegP object, rbx_RegP box, rax_RegI tmp, rRegP scr) %{
match(Set cr (FastLock object box));
effect(TEMP tmp, TEMP scr, USE_KILL box);
-
ins_cost(300);
format %{ "fastlock $object,$box\t! kills $box,$tmp,$scr" %}
- ins_encode(Fast_Lock(object, box, tmp, scr));
+ ins_encode %{
+ __ fast_lock($object$$Register, $box$$Register, $tmp$$Register, $scr$$Register, _counters);
+ %}
ins_pipe(pipe_slow);
%}
-instruct cmpFastUnlock(rFlagsReg cr,
- rRegP object, rax_RegP box, rRegP tmp)
-%{
+instruct cmpFastUnlock(rFlagsReg cr, rRegP object, rax_RegP box, rRegP tmp) %{
match(Set cr (FastUnlock object box));
effect(TEMP tmp, USE_KILL box);
-
ins_cost(300);
format %{ "fastunlock $object,$box\t! kills $box,$tmp" %}
- ins_encode(Fast_Unlock(object, box, tmp));
+ ins_encode %{
+ __ fast_unlock($object$$Register, $box$$Register, $tmp$$Register);
+ %}
ins_pipe(pipe_slow);
%}