8073165: Contended Locking fast exit bucket
author dcubed
Thu, 16 Apr 2015 08:23:26 -0700
changeset 30244 d4e471395ff5
parent 30240 a7ba42fa1df6
child 30245 1c5b90ba1d47
8073165: Contended Locking fast exit bucket
Summary: JEP-143/JDK-8073165 Contended Locking fast exit bucket
Reviewed-by: dholmes, acorn, dice, dcubed
Contributed-by: dave.dice@oracle.com, karen.kinnear@oracle.com, daniel.daugherty@oracle.com
hotspot/src/cpu/sparc/vm/macroAssembler_sparc.cpp
hotspot/src/cpu/sparc/vm/sharedRuntime_sparc.cpp
hotspot/src/cpu/x86/vm/globals_x86.hpp
hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp
hotspot/src/cpu/x86/vm/sharedRuntime_x86_32.cpp
hotspot/src/cpu/x86/vm/sharedRuntime_x86_64.cpp
hotspot/src/share/vm/opto/macro.cpp
hotspot/src/share/vm/opto/macro.hpp
hotspot/src/share/vm/opto/runtime.cpp
hotspot/src/share/vm/opto/runtime.hpp
hotspot/src/share/vm/runtime/sharedRuntime.cpp
hotspot/src/share/vm/runtime/sharedRuntime.hpp
hotspot/src/share/vm/runtime/synchronizer.cpp
hotspot/src/share/vm/runtime/synchronizer.hpp
--- a/hotspot/src/cpu/sparc/vm/macroAssembler_sparc.cpp	Wed Apr 15 17:34:28 2015 -0700
+++ b/hotspot/src/cpu/sparc/vm/macroAssembler_sparc.cpp	Thu Apr 16 08:23:26 2015 -0700
@@ -3019,44 +3019,107 @@
    // past the store that releases the lock.  But TSO is a strong memory model
    // and that particular flavor of barrier is a noop, so we can safely elide it.
    // Note that we use 1-0 locking by default for the inflated case.  We
-   // close the resultant (and rare) race by having contented threads in
+   // close the resultant (and rare) race by having contended threads in
    // monitorenter periodically poll _owner.
-   ld_ptr(Rmark, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner), Rscratch);
-   ld_ptr(Rmark, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions), Rbox);
-   xor3(Rscratch, G2_thread, Rscratch);
-   orcc(Rbox, Rscratch, Rbox);
-   brx(Assembler::notZero, false, Assembler::pn, done);
-   delayed()->
-   ld_ptr(Rmark, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList), Rscratch);
-   ld_ptr(Rmark, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq), Rbox);
-   orcc(Rbox, Rscratch, G0);
-   if (EmitSync & 65536) {
-      Label LSucc ;
-      brx(Assembler::notZero, false, Assembler::pn, LSucc);
-      delayed()->nop();
-      ba(done);
-      delayed()->st_ptr(G0, Rmark, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner));
-
-      bind(LSucc);
-      st_ptr(G0, Rmark, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner));
-      if (os::is_MP()) { membar (StoreLoad); }
-      ld_ptr(Rmark, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ), Rscratch);
-      andcc(Rscratch, Rscratch, G0);
-      brx(Assembler::notZero, false, Assembler::pt, done);
-      delayed()->andcc(G0, G0, G0);
-      add(Rmark, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner), Rmark);
-      mov(G2_thread, Rscratch);
-      cas_ptr(Rmark, G0, Rscratch);
-      // invert icc.zf and goto done
-      br_notnull(Rscratch, false, Assembler::pt, done);
-      delayed()->cmp(G0, G0);
-      ba(done);
-      delayed()->cmp(G0, 1);
+
+   if (EmitSync & 1024) {
+     // Emit code to check that _owner == Self
+     // We could fold the _owner test into subsequent code more efficiently
+     // than using a stand-alone check, but since _owner checking is off by
+     // default we don't bother. We also might consider predicating the
+     // _owner==Self check on Xcheck:jni or running on a debug build.
+     ld_ptr(Address(Rmark, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), Rscratch);
+     orcc(Rscratch, G0, G0);
+     brx(Assembler::notZero, false, Assembler::pn, done);
+     delayed()->nop();
+   }
+
+   if (EmitSync & 512) {
+     // classic lock release code absent 1-0 locking
+     //   m->Owner = null;
+     //   membar #storeload
+     //   if (m->cxq|m->EntryList) == null goto Success
+     //   if (m->succ != null) goto Success
+     //   if CAS (&m->Owner,0,Self) != 0 goto Success
+     //   goto SlowPath
+     ld_ptr(Address(Rmark, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), Rbox);
+     orcc(Rbox, G0, G0);
+     brx(Assembler::notZero, false, Assembler::pn, done);
+     delayed()->nop();
+     st_ptr(G0, Address(Rmark, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
+     if (os::is_MP()) { membar(StoreLoad); }
+     ld_ptr(Address(Rmark, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)), Rscratch);
+     ld_ptr(Address(Rmark, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)), Rbox);
+     orcc(Rbox, Rscratch, G0);
+     brx(Assembler::zero, false, Assembler::pt, done);
+     delayed()->
+     ld_ptr(Address(Rmark, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), Rscratch);
+     andcc(Rscratch, Rscratch, G0);
+     brx(Assembler::notZero, false, Assembler::pt, done);
+     delayed()->andcc(G0, G0, G0);
+     add(Rmark, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner), Rmark);
+     mov(G2_thread, Rscratch);
+     cas_ptr(Rmark, G0, Rscratch);
+     cmp(Rscratch, G0);
+     // invert icc.zf and goto done
+     brx(Assembler::notZero, false, Assembler::pt, done);
+     delayed()->cmp(G0, G0);
+     br(Assembler::always, false, Assembler::pt, done);
+     delayed()->cmp(G0, 1);
    } else {
-      brx(Assembler::notZero, false, Assembler::pn, done);
-      delayed()->nop();
-      ba(done);
-      delayed()->st_ptr(G0, Rmark, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner));
+     // 1-0 form : avoids CAS and MEMBAR in the common case
+     // Do not bother to ratify that m->Owner == Self.
+     ld_ptr(Address(Rmark, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), Rbox);
+     orcc(Rbox, G0, G0);
+     brx(Assembler::notZero, false, Assembler::pn, done);
+     delayed()->
+     ld_ptr(Address(Rmark, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)), Rscratch);
+     ld_ptr(Address(Rmark, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)), Rbox);
+     orcc(Rbox, Rscratch, G0);
+     if (EmitSync & 16384) {
+       // As an optional optimization, if (EntryList|cxq) != null and _succ is null then
+       // we should transfer control directly to the slow-path.
+       // This test makes the reacquire operation below very infrequent.
+       // The logic is equivalent to :
+       //   if (cxq|EntryList) == null : Owner=null; goto Success
+       //   if succ == null : goto SlowPath
+       //   Owner=null; membar #storeload
+       //   if succ != null : goto Success
+       //   if CAS(&Owner,null,Self) != null goto Success
+       //   goto SlowPath
+       brx(Assembler::zero, true, Assembler::pt, done);
+       delayed()->
+       st_ptr(G0, Address(Rmark, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
+       ld_ptr(Address(Rmark, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), Rscratch);
+       andcc(Rscratch, Rscratch, G0) ;
+       brx(Assembler::zero, false, Assembler::pt, done);
+       delayed()->orcc(G0, 1, G0);
+       st_ptr(G0, Address(Rmark, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
+     } else {
+       brx(Assembler::zero, false, Assembler::pt, done);
+       delayed()->
+       st_ptr(G0, Address(Rmark, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
+     }
+     if (os::is_MP()) { membar(StoreLoad); }
+     // Check that _succ is (or remains) non-zero
+     ld_ptr(Address(Rmark, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), Rscratch);
+     andcc(Rscratch, Rscratch, G0);
+     brx(Assembler::notZero, false, Assembler::pt, done);
+     delayed()->andcc(G0, G0, G0);
+     add(Rmark, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner), Rmark);
+     mov(G2_thread, Rscratch);
+     cas_ptr(Rmark, G0, Rscratch);
+     cmp(Rscratch, G0);
+     // invert icc.zf and goto done
+     // A slightly better v8+/v9 idiom would be the following:
+     //   movrnz Rscratch,1,Rscratch
+     //   ba done
+     //   xorcc Rscratch,1,G0
+     // In v8+ mode the idiom would be valid IFF Rscratch was a G or O register
+     brx(Assembler::notZero, false, Assembler::pt, done);
+     delayed()->cmp(G0, G0);
+     br(Assembler::always, false, Assembler::pt, done);
+     delayed()->cmp(G0, 1);
    }
 
    bind   (LStacked);
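
The default (non-EmitSync) path above is easier to follow without the delay slots. Below is a minimal stand-alone C++ sketch of the same 1-0 exit protocol using std::atomic; the Monitor struct, its field names, and fast_exit are simplified stand-ins for ObjectMonitor and the generated code, not HotSpot source.

#include <atomic>
#include <cstdint>

struct Thread;

struct Monitor {                               // simplified stand-in for ObjectMonitor
  std::atomic<Thread*>  owner{nullptr};
  std::atomic<intptr_t> recursions{0};
  std::atomic<void*>    cxq{nullptr};          // recently-arrived waiters
  std::atomic<void*>    EntryList{nullptr};    // waiters eligible for wakeup
  std::atomic<Thread*>  succ{nullptr};         // heir presumptive
};

// Returns true if the exit completed on the fast path; false means the caller
// must fall into the slow path, which performs succession explicitly.
bool fast_exit(Monitor* m, Thread* self) {
  if (m->recursions.load(std::memory_order_relaxed) != 0) {
    return false;                              // recursive exit: slow path decrements
  }
  if (m->cxq.load() == nullptr && m->EntryList.load() == nullptr) {
    // Uncontended: 1-0 release -- a plain store, no CAS and no fence.
    // A thread that enqueues in this window is covered by monitorenter
    // periodically re-polling _owner (see the comment in the code above).
    m->owner.store(nullptr, std::memory_order_release);
    return true;
  }
  // Waiters are queued. Optional pre-test: with no successor at all, go
  // straight to the slow path so it can arrange succession.
  if (m->succ.load() == nullptr) {
    return false;
  }
  // Dekker pivot: ST owner = null; MEMBAR; LD succ.
  m->owner.store(nullptr, std::memory_order_release);
  std::atomic_thread_fence(std::memory_order_seq_cst);
  if (m->succ.load() != nullptr) {
    return true;                               // the successor will re-acquire the lock
  }
  // The successor vanished in the window above: try to re-acquire. If the CAS
  // fails, the new owner is responsible for succession and we are done.
  Thread* expected = nullptr;
  if (!m->owner.compare_exchange_strong(expected, self)) {
    return true;
  }
  return false;                                // re-acquired: pass control to the slow path
}
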
--- a/hotspot/src/cpu/sparc/vm/sharedRuntime_sparc.cpp	Wed Apr 15 17:34:28 2015 -0700
+++ b/hotspot/src/cpu/sparc/vm/sharedRuntime_sparc.cpp	Thu Apr 16 08:23:26 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -2664,6 +2664,9 @@
     // disallows any pending_exception.
     __ mov(L3_box, O1);
 
+    // Pass in current thread pointer
+    __ mov(G2_thread, O2);
+
     __ call(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C), relocInfo::runtime_call_type);
     __ delayed()->mov(L4, O0);              // Need oop in O0
 
--- a/hotspot/src/cpu/x86/vm/globals_x86.hpp	Wed Apr 15 17:34:28 2015 -0700
+++ b/hotspot/src/cpu/x86/vm/globals_x86.hpp	Thu Apr 16 08:23:26 2015 -0700
@@ -87,9 +87,6 @@
   develop(bool, IEEEPrecision, true,                                        \
           "Enables IEEE precision (for INTEL only)")                        \
                                                                             \
-  product(intx, FenceInstruction, 0,                                        \
-          "(Unsafe,Unstable) Experimental")                                 \
-                                                                            \
   product(bool, UseStoreImmI16, true,                                       \
           "Use store immediate 16-bits value instruction on x86")           \
                                                                             \
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp	Wed Apr 15 17:34:28 2015 -0700
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp	Thu Apr 16 08:23:26 2015 -0700
@@ -1958,6 +1958,11 @@
 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
+// Arguably given that the spec legislates the JNI case as undefined our implementation
+// could reasonably *avoid* checking owner in Fast_Unlock().
+// In the interest of performance we elide m->Owner==Self check in unlock.
+// A perfectly viable alternative is to elide the owner check except when
+// Xcheck:jni is enabled.
 
 void MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
   assert(boxReg == rax, "");
@@ -1966,24 +1971,6 @@
   if (EmitSync & 4) {
     // Disable - inhibit all inlining.  Force control through the slow-path
     cmpptr (rsp, 0);
-  } else
-  if (EmitSync & 8) {
-    Label DONE_LABEL;
-    if (UseBiasedLocking) {
-       biased_locking_exit(objReg, tmpReg, DONE_LABEL);
-    }
-    // Classic stack-locking code ...
-    // Check whether the displaced header is 0
-    //(=> recursive unlock)
-    movptr(tmpReg, Address(boxReg, 0));
-    testptr(tmpReg, tmpReg);
-    jccb(Assembler::zero, DONE_LABEL);
-    // If not recursive lock, reset the header to displaced header
-    if (os::is_MP()) {
-      lock();
-    }
-    cmpxchgptr(tmpReg, Address(objReg, 0));   // Uses RAX which is box
-    bind(DONE_LABEL);
   } else {
     Label DONE_LABEL, Stacked, CheckSucc;
 
@@ -2060,9 +2047,9 @@
     // the number of loads below (currently 4) to just 2 or 3.
     // Refer to the comments in synchronizer.cpp.
     // In practice the chain of fetches doesn't seem to impact performance, however.
+    xorptr(boxReg, boxReg);
     if ((EmitSync & 65536) == 0 && (EmitSync & 256)) {
        // Attempt to reduce branch density - AMD's branch predictor.
-       xorptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
        orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
        orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
        orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
@@ -2070,7 +2057,6 @@
        movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
        jmpb  (DONE_LABEL);
     } else {
-       xorptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
        orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
        jccb  (Assembler::notZero, DONE_LABEL);
        movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
@@ -2093,10 +2079,8 @@
        bind  (CheckSucc);
 
        // Optional pre-test ... it's safe to elide this
-       if ((EmitSync & 16) == 0) {
-          cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
-          jccb  (Assembler::zero, LGoSlowPath);
-       }
+       cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
+       jccb(Assembler::zero, LGoSlowPath);
 
        // We have a classic Dekker-style idiom:
        //    ST m->_owner = 0 ; MEMBAR; LD m->_succ
@@ -2109,7 +2093,8 @@
        //     In older IA32 processors MFENCE is slower than lock:add or xchg
        //     particularly if the write-buffer is full as might be the case if
        //     if stores closely precede the fence or fence-equivalent instruction.
-       //     In more modern implementations MFENCE appears faster, however.
+       //     See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
+       //     as the situation has changed with Nehalem and Shanghai.
        // (3) In lieu of an explicit fence, use lock:addl to the top-of-stack
        //     The $lines underlying the top-of-stack should be in M-state.
        //     The locked add instruction is serializing, of course.
@@ -2126,11 +2111,7 @@
 
        movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
        if (os::is_MP()) {
-          if (VM_Version::supports_sse2() && 1 == FenceInstruction) {
-            mfence();
-          } else {
-            lock (); addptr(Address(rsp, 0), 0);
-          }
+         lock(); addptr(Address(rsp, 0), 0);
        }
        // Ratify _succ remains non-null
        cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), 0);
@@ -2179,8 +2160,17 @@
     }
 #else // _LP64
     // It's inflated
-    movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
-    xorptr(boxReg, r15_thread);
+    if (EmitSync & 1024) {
+      // Emit code to check that _owner == Self
+      // We could fold the _owner test into subsequent code more efficiently
+      // than using a stand-alone check, but since _owner checking is off by
+      // default we don't bother. We also might consider predicating the
+      // _owner==Self check on Xcheck:jni or running on a debug build.
+      movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
+      xorptr(boxReg, r15_thread);
+    } else {
+      xorptr(boxReg, boxReg);
+    }
     orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
     jccb  (Assembler::notZero, DONE_LABEL);
     movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
@@ -2190,23 +2180,51 @@
     jmpb  (DONE_LABEL);
 
     if ((EmitSync & 65536) == 0) {
+      // Try to avoid passing control into the slow_path ...
       Label LSuccess, LGoSlowPath ;
       bind  (CheckSucc);
+
+      // The following optional optimization can be elided if necessary
+      // Effectively: if (succ == null) goto SlowPath
+      // The code reduces the window for a race, however,
+      // and thus benefits performance.
       cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
       jccb  (Assembler::zero, LGoSlowPath);
 
-      // I'd much rather use lock:andl m->_owner, 0 as it's faster than the
-      // the explicit ST;MEMBAR combination, but masm doesn't currently support
-      // "ANDQ M,IMM".  Don't use MFENCE here.  lock:add to TOS, xchg, etc
-      // are all faster when the write buffer is populated.
-      movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
-      if (os::is_MP()) {
-         lock (); addl (Address(rsp, 0), 0);
+      if ((EmitSync & 16) && os::is_MP()) {
+        orptr(boxReg, boxReg);
+        xchgptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
+      } else {
+        movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
+        if (os::is_MP()) {
+          // Memory barrier/fence
+          // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
+          // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
+          // This is faster on Nehalem and AMD Shanghai/Barcelona.
+          // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
+          // We might also restructure (ST Owner=0;barrier;LD _Succ) to
+          // (mov box,0; xchgq box, &m->Owner; LD _succ) .
+          lock(); addl(Address(rsp, 0), 0);
+        }
       }
       cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
       jccb  (Assembler::notZero, LSuccess);
 
-      movptr (boxReg, (int32_t)NULL_WORD);                   // box is really EAX
+      // Rare inopportune interleaving - race.
+      // The successor vanished in the small window above.
+      // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
+      // We need to ensure progress and succession.
+      // Try to reacquire the lock.
+      // If that fails then the new owner is responsible for succession and this
+      // thread needs to take no further action and can exit via the fast path (success).
+      // If the re-acquire succeeds then pass control into the slow path.
+      // As implemented, this latter mode is horrible because we generated more
+      // coherence traffic on the lock *and* artificially extended the critical section
+      // length by virtue of passing control into the slow path.
+
+      // box is really RAX -- the following CMPXCHG depends on that binding
+      // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
+      movptr(boxReg, (int32_t)NULL_WORD);
       if (os::is_MP()) { lock(); }
       cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
       jccb  (Assembler::notEqual, LSuccess);
@@ -2231,10 +2249,6 @@
     }
 #endif
     bind(DONE_LABEL);
-    // Avoid branch to branch on AMD processors
-    if (EmitSync & 32768) {
-       nop();
-    }
   }
 }
 #endif // COMPILER2
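
The fence discussion above boils down to two shapes for the ST _owner; MEMBAR; LD _succ pivot: a plain store followed by a full fence (emitted as a locked add to the top of stack rather than MFENCE), or the EmitSync & 16 variant where a locked XCHG clears the owner and serializes in one step. A hedged stand-alone sketch with a simplified Monitor type, not HotSpot code:

#include <atomic>

struct Thread;
struct Monitor {                               // simplified stand-in for ObjectMonitor
  std::atomic<Thread*> owner{nullptr};
  std::atomic<Thread*> succ{nullptr};
};

Thread* release_then_load_succ_fenced(Monitor* m) {
  m->owner.store(nullptr, std::memory_order_release);
  // On x86 a seq_cst fence is typically emitted as `lock addl $0, (%rsp)`;
  // the comments above argue this is usually cheaper than MFENCE.
  std::atomic_thread_fence(std::memory_order_seq_cst);
  return m->succ.load(std::memory_order_relaxed);
}

Thread* release_then_load_succ_xchg(Monitor* m) {
  // EmitSync & 16 variant: a locked XCHG both clears the owner and serializes,
  // so no separate fence is needed before the _succ load.
  m->owner.exchange(nullptr, std::memory_order_seq_cst);
  return m->succ.load(std::memory_order_relaxed);
}
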
--- a/hotspot/src/cpu/x86/vm/sharedRuntime_x86_32.cpp	Wed Apr 15 17:34:28 2015 -0700
+++ b/hotspot/src/cpu/x86/vm/sharedRuntime_x86_32.cpp	Thu Apr 16 08:23:26 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -2343,12 +2343,14 @@
 
     // should be a peal
     // +wordSize because of the push above
+    // args are (oop obj, BasicLock* lock, JavaThread* thread)
+    __ push(thread);
     __ lea(rax, Address(rbp, lock_slot_rbp_offset));
     __ push(rax);
 
     __ push(obj_reg);
     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
-    __ addptr(rsp, 2*wordSize);
+    __ addptr(rsp, 3*wordSize);
 #ifdef ASSERT
     {
       Label L;
--- a/hotspot/src/cpu/x86/vm/sharedRuntime_x86_64.cpp	Wed Apr 15 17:34:28 2015 -0700
+++ b/hotspot/src/cpu/x86/vm/sharedRuntime_x86_64.cpp	Thu Apr 16 08:23:26 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -2581,6 +2581,7 @@
     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
 
     __ mov(c_rarg0, obj_reg);
+    __ mov(c_rarg2, r15_thread);
     __ mov(r12, rsp); // remember sp
     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
     __ andptr(rsp, -16); // align stack as required by ABI
@@ -2590,6 +2591,7 @@
     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
 
+    // args are (oop obj, BasicLock* lock, JavaThread* thread)
     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
     __ mov(rsp, r12); // restore sp
     __ reinit_heapbase();
--- a/hotspot/src/share/vm/opto/macro.cpp	Wed Apr 15 17:34:28 2015 -0700
+++ b/hotspot/src/share/vm/opto/macro.cpp	Thu Apr 16 08:23:26 2015 -0700
@@ -144,7 +144,9 @@
 }
 
 //------------------------------make_slow_call---------------------------------
-CallNode* PhaseMacroExpand::make_slow_call(CallNode *oldcall, const TypeFunc* slow_call_type, address slow_call, const char* leaf_name, Node* slow_path, Node* parm0, Node* parm1) {
+CallNode* PhaseMacroExpand::make_slow_call(CallNode *oldcall, const TypeFunc* slow_call_type,
+                                           address slow_call, const char* leaf_name, Node* slow_path,
+                                           Node* parm0, Node* parm1, Node* parm2) {
 
   // Slow-path call
  CallNode *call = leaf_name
@@ -155,6 +157,7 @@
   copy_predefined_input_for_runtime_call(slow_path, oldcall, call );
   if (parm0 != NULL)  call->init_req(TypeFunc::Parms+0, parm0);
   if (parm1 != NULL)  call->init_req(TypeFunc::Parms+1, parm1);
+  if (parm2 != NULL)  call->init_req(TypeFunc::Parms+2, parm2);
   copy_call_debug_info(oldcall, call);
   call->set_cnt(PROB_UNLIKELY_MAG(4));  // Same effect as RC_UNCOMMON.
   _igvn.replace_node(oldcall, call);
@@ -2328,7 +2331,9 @@
   }
 
   // Make slow path call
-  CallNode *call = make_slow_call( (CallNode *) lock, OptoRuntime::complete_monitor_enter_Type(), OptoRuntime::complete_monitor_locking_Java(), NULL, slow_path, obj, box );
+  CallNode *call = make_slow_call((CallNode *) lock, OptoRuntime::complete_monitor_enter_Type(),
+                                  OptoRuntime::complete_monitor_locking_Java(), NULL, slow_path,
+                                  obj, box, NULL);
 
   extract_call_projections(call);
 
@@ -2395,8 +2400,11 @@
   funlock = transform_later( funlock )->as_FastUnlock();
   // Optimize test; set region slot 2
   Node *slow_path = opt_bits_test(ctrl, region, 2, funlock, 0, 0);
+  Node *thread = transform_later(new ThreadLocalNode());
 
-  CallNode *call = make_slow_call( (CallNode *) unlock, OptoRuntime::complete_monitor_exit_Type(), CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C), "complete_monitor_unlocking_C", slow_path, obj, box );
+  CallNode *call = make_slow_call((CallNode *) unlock, OptoRuntime::complete_monitor_exit_Type(),
+                                  CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C),
+                                  "complete_monitor_unlocking_C", slow_path, obj, box, thread);
 
   extract_call_projections(call);
 
--- a/hotspot/src/share/vm/opto/macro.hpp	Wed Apr 15 17:34:28 2015 -0700
+++ b/hotspot/src/share/vm/opto/macro.hpp	Thu Apr 16 08:23:26 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2005, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -186,7 +186,8 @@
   Node* opt_bits_test(Node* ctrl, Node* region, int edge, Node* word, int mask, int bits, bool return_fast_path = false);
   void copy_predefined_input_for_runtime_call(Node * ctrl, CallNode* oldcall, CallNode* call);
   CallNode* make_slow_call(CallNode *oldcall, const TypeFunc* slow_call_type, address slow_call,
-                       const char* leaf_name, Node* slow_path, Node* parm0, Node* parm1);
+                           const char* leaf_name, Node* slow_path, Node* parm0, Node* parm1,
+                           Node* parm2);
   void extract_call_projections(CallNode *call);
 
   Node* initialize_object(AllocateNode* alloc,
--- a/hotspot/src/share/vm/opto/runtime.cpp	Wed Apr 15 17:34:28 2015 -0700
+++ b/hotspot/src/share/vm/opto/runtime.cpp	Thu Apr 16 08:23:26 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1998, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -600,10 +600,11 @@
 //-----------------------------------------------------------------------------
 const TypeFunc *OptoRuntime::complete_monitor_exit_Type() {
   // create input type (domain)
-  const Type **fields = TypeTuple::fields(2);
+  const Type **fields = TypeTuple::fields(3);
   fields[TypeFunc::Parms+0] = TypeInstPtr::NOTNULL;  // Object to be Locked
-  fields[TypeFunc::Parms+1] = TypeRawPtr::BOTTOM;   // Address of stack location for lock
-  const TypeTuple *domain = TypeTuple::make(TypeFunc::Parms+2,fields);
+  fields[TypeFunc::Parms+1] = TypeRawPtr::BOTTOM;    // Address of stack location for lock - BasicLock
+  fields[TypeFunc::Parms+2] = TypeRawPtr::BOTTOM;    // Thread pointer (Self)
+  const TypeTuple *domain = TypeTuple::make(TypeFunc::Parms+3,fields);
 
   // create result type (range)
   fields = TypeTuple::fields(0);
--- a/hotspot/src/share/vm/opto/runtime.hpp	Wed Apr 15 17:34:28 2015 -0700
+++ b/hotspot/src/share/vm/opto/runtime.hpp	Thu Apr 16 08:23:26 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1998, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -184,7 +184,7 @@
 public:
   // Slow-path Locking and Unlocking
   static void complete_monitor_locking_C(oopDesc* obj, BasicLock* lock, JavaThread* thread);
-  static void complete_monitor_unlocking_C(oopDesc* obj, BasicLock* lock);
+  static void complete_monitor_unlocking_C(oopDesc* obj, BasicLock* lock, JavaThread* thread);
 
 private:
 
--- a/hotspot/src/share/vm/runtime/sharedRuntime.cpp	Wed Apr 15 17:34:28 2015 -0700
+++ b/hotspot/src/share/vm/runtime/sharedRuntime.cpp	Thu Apr 16 08:23:26 2015 -0700
@@ -1819,9 +1819,9 @@
 JRT_END
 
 // Handles the uncommon cases of monitor unlocking in compiled code
-JRT_LEAF(void, SharedRuntime::complete_monitor_unlocking_C(oopDesc* _obj, BasicLock* lock))
+JRT_LEAF(void, SharedRuntime::complete_monitor_unlocking_C(oopDesc* _obj, BasicLock* lock, JavaThread * THREAD))
    oop obj(_obj);
-  Thread* THREAD = JavaThread::current();
+  assert(JavaThread::current() == THREAD, "invariant");
   // I'm not convinced we need the code contained by MIGHT_HAVE_PENDING anymore
   // testing was unable to ever fire the assert that guarded it so I have removed it.
   assert(!HAS_PENDING_EXCEPTION, "Do we need code below anymore?");
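
The platform-specific hunks above (O2 on SPARC, the extra push on x86_32, c_rarg2 on x86_64, and the ThreadLocalNode wired in by C2's macro expansion) all exist to feed the single signature change shown in this hunk. A stand-alone restatement of the new contract; every type and body below is a stand-in, not HotSpot code:

// New leaf-call contract: the compiled caller passes its own JavaThread*, so
// the runtime stub asserts identity instead of calling JavaThread::current().
class oopDesc {};
class BasicLock {};
class JavaThread {};

// was: complete_monitor_unlocking_C(oopDesc* obj, BasicLock* lock)
void complete_monitor_unlocking_C(oopDesc* obj, BasicLock* lock, JavaThread* thread) {
  (void)obj; (void)lock; (void)thread;
  // real version: assert(JavaThread::current() == thread, "invariant"); then slow-path exit
}

// What each compiled unlock stub now arranges before the call:
void compiled_unlock_slow_path(oopDesc* obj, BasicLock* lock, JavaThread* self) {
  complete_monitor_unlocking_C(obj, lock, self);
}
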
--- a/hotspot/src/share/vm/runtime/sharedRuntime.hpp	Wed Apr 15 17:34:28 2015 -0700
+++ b/hotspot/src/share/vm/runtime/sharedRuntime.hpp	Thu Apr 16 08:23:26 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -472,7 +472,7 @@
 
   // Slow-path Locking and Unlocking
   static void complete_monitor_locking_C(oopDesc* obj, BasicLock* lock, JavaThread* thread);
-  static void complete_monitor_unlocking_C(oopDesc* obj, BasicLock* lock);
+  static void complete_monitor_unlocking_C(oopDesc* obj, BasicLock* lock, JavaThread* thread);
 
   // Resolving of calls
   static address resolve_static_call_C     (JavaThread *thread);
--- a/hotspot/src/share/vm/runtime/synchronizer.cpp	Wed Apr 15 17:34:28 2015 -0700
+++ b/hotspot/src/share/vm/runtime/synchronizer.cpp	Thu Apr 16 08:23:26 2015 -0700
@@ -109,17 +109,24 @@
 }
 
 #define NINFLATIONLOCKS 256
-static volatile intptr_t InflationLocks[NINFLATIONLOCKS];
+static volatile intptr_t gInflationLocks[NINFLATIONLOCKS];
 
+// global list of blocks of monitors
 // gBlockList is really PaddedEnd<ObjectMonitor> *, but we don't
 // want to expose the PaddedEnd template more than necessary.
 ObjectMonitor * ObjectSynchronizer::gBlockList = NULL;
+// global monitor free list
 ObjectMonitor * volatile ObjectSynchronizer::gFreeList  = NULL;
+// global monitor in-use list, for moribund threads,
+// monitors they inflated need to be scanned for deflation
 ObjectMonitor * volatile ObjectSynchronizer::gOmInUseList  = NULL;
+// count of entries in gOmInUseList
 int ObjectSynchronizer::gOmInUseCount = 0;
-static volatile intptr_t ListLock = 0;      // protects global monitor free-list cache
-static volatile int MonitorFreeCount  = 0;  // # on gFreeList
-static volatile int MonitorPopulation = 0;  // # Extant -- in circulation
+
+static volatile intptr_t gListLock = 0;      // protects global monitor lists
+static volatile int gMonitorFreeCount  = 0;  // # on gFreeList
+static volatile int gMonitorPopulation = 0;  // # Extant -- in circulation
+
 #define CHAINMARKER (cast_to_oop<intptr_t>(-1))
 
 
@@ -528,7 +535,7 @@
         int YieldThenBlock = 0;
         assert(ix >= 0 && ix < NINFLATIONLOCKS, "invariant");
         assert((NINFLATIONLOCKS & (NINFLATIONLOCKS-1)) == 0, "invariant");
-        Thread::muxAcquire(InflationLocks + ix, "InflationLock");
+        Thread::muxAcquire(gInflationLocks + ix, "gInflationLock");
         while (obj->mark() == markOopDesc::INFLATING()) {
           // Beware: NakedYield() is advisory and has almost no effect on some platforms
           // so we periodically call Self->_ParkEvent->park(1).
@@ -539,7 +546,7 @@
             os::naked_yield();
           }
         }
-        Thread::muxRelease(InflationLocks + ix);
+        Thread::muxRelease(gInflationLocks + ix);
         TEVENT(Inflate: INFLATING - yield/park);
       }
     } else {
@@ -882,7 +889,7 @@
 // STW-time -- disassociates idle monitors from objects.  Such
 // scavenged monitors are returned to the gFreeList.
 //
-// The global list is protected by ListLock.  All the critical sections
+// The global list is protected by gListLock.  All the critical sections
 // are short and operate in constant-time.
 //
 // ObjectMonitors reside in type-stable memory (TSM) and are immortal.
@@ -937,17 +944,17 @@
 
 void ObjectSynchronizer::verifyInUse(Thread *Self) {
   ObjectMonitor* mid;
-  int inusetally = 0;
+  int in_use_tally = 0;
   for (mid = Self->omInUseList; mid != NULL; mid = mid->FreeNext) {
-    inusetally++;
+    in_use_tally++;
   }
-  assert(inusetally == Self->omInUseCount, "inuse count off");
+  assert(in_use_tally == Self->omInUseCount, "in-use count off");
 
-  int freetally = 0;
+  int free_tally = 0;
   for (mid = Self->omFreeList; mid != NULL; mid = mid->FreeNext) {
-    freetally++;
+    free_tally++;
   }
-  assert(freetally == Self->omFreeCount, "free count off");
+  assert(free_tally == Self->omFreeCount, "free count off");
 }
 
 ObjectMonitor * NOINLINE ObjectSynchronizer::omAlloc(Thread * Self) {
@@ -964,7 +971,7 @@
     // Threads will attempt to allocate first from their local list, then
     // from the global list, and only after those attempts fail will the thread
     // attempt to instantiate new monitors.   Thread-local free lists take
-    // heat off the ListLock and improve allocation latency, as well as reducing
+    // heat off the gListLock and improve allocation latency, as well as reducing
     // coherency traffic on the shared global list.
     m = Self->omFreeList;
     if (m != NULL) {
@@ -994,9 +1001,9 @@
       // Reprovision the thread's omFreeList.
       // Use bulk transfers to reduce the allocation rate and heat
       // on various locks.
-      Thread::muxAcquire(&ListLock, "omAlloc");
+      Thread::muxAcquire(&gListLock, "omAlloc");
       for (int i = Self->omFreeProvision; --i >= 0 && gFreeList != NULL;) {
-        MonitorFreeCount--;
+        gMonitorFreeCount--;
         ObjectMonitor * take = gFreeList;
         gFreeList = take->FreeNext;
         guarantee(take->object() == NULL, "invariant");
@@ -1004,13 +1011,13 @@
         take->Recycle();
         omRelease(Self, take, false);
       }
-      Thread::muxRelease(&ListLock);
+      Thread::muxRelease(&gListLock);
       Self->omFreeProvision += 1 + (Self->omFreeProvision/2);
       if (Self->omFreeProvision > MAXPRIVATE) Self->omFreeProvision = MAXPRIVATE;
       TEVENT(omFirst - reprovision);
 
       const int mx = MonitorBound;
-      if (mx > 0 && (MonitorPopulation-MonitorFreeCount) > mx) {
+      if (mx > 0 && (gMonitorPopulation-gMonitorFreeCount) > mx) {
         // We can't safely induce a STW safepoint from omAlloc() as our thread
         // state may not be appropriate for such activities and callers may hold
         // naked oops, so instead we defer the action.
@@ -1068,11 +1075,11 @@
     // block in hand.  This avoids some lock traffic and redundant
     // list activity.
 
-    // Acquire the ListLock to manipulate BlockList and FreeList.
+    // Acquire the gListLock to manipulate gBlockList and gFreeList.
     // An Oyama-Taura-Yonezawa scheme might be more efficient.
-    Thread::muxAcquire(&ListLock, "omAlloc [2]");
-    MonitorPopulation += _BLOCKSIZE-1;
-    MonitorFreeCount += _BLOCKSIZE-1;
+    Thread::muxAcquire(&gListLock, "omAlloc [2]");
+    gMonitorPopulation += _BLOCKSIZE-1;
+    gMonitorFreeCount += _BLOCKSIZE-1;
 
     // Add the new block to the list of extant blocks (gBlockList).
     // The very first objectMonitor in a block is reserved and dedicated.
@@ -1083,7 +1090,7 @@
     // Add the new string of objectMonitors to the global free list
     temp[_BLOCKSIZE - 1].FreeNext = gFreeList;
     gFreeList = temp + 1;
-    Thread::muxRelease(&ListLock);
+    Thread::muxRelease(&gListLock);
     TEVENT(Allocate block of monitors);
   }
 }
@@ -1094,32 +1101,36 @@
 // omRelease is to return a monitor to the free list after a CAS
 // attempt failed.  This doesn't allow unbounded #s of monitors to
 // accumulate on a thread's free list.
+//
+// Key constraint: all ObjectMonitors on a thread's free list and the global
+// free list must have their object field set to null. This prevents the
+// scavenger -- deflate_idle_monitors -- from reclaiming them.
 
 void ObjectSynchronizer::omRelease(Thread * Self, ObjectMonitor * m,
                                    bool fromPerThreadAlloc) {
   guarantee(m->object() == NULL, "invariant");
-
+  guarantee(((m->is_busy()|m->_recursions) == 0), "freeing in-use monitor");
   // Remove from omInUseList
   if (MonitorInUseLists && fromPerThreadAlloc) {
-    ObjectMonitor* curmidinuse = NULL;
-    for (ObjectMonitor* mid = Self->omInUseList; mid != NULL;) {
+    ObjectMonitor* cur_mid_in_use = NULL;
+    bool extracted = false;
+    for (ObjectMonitor* mid = Self->omInUseList; mid != NULL; cur_mid_in_use = mid, mid = mid->FreeNext) {
       if (m == mid) {
-        // extract from per-thread in-use-list
+        // extract from per-thread in-use list
         if (mid == Self->omInUseList) {
           Self->omInUseList = mid->FreeNext;
-        } else if (curmidinuse != NULL) {
-          curmidinuse->FreeNext = mid->FreeNext; // maintain the current thread inuselist
+        } else if (cur_mid_in_use != NULL) {
+          cur_mid_in_use->FreeNext = mid->FreeNext; // maintain the current thread in-use list
         }
+        extracted = true;
         Self->omInUseCount--;
         if (ObjectMonitor::Knob_VerifyInUse) {
           verifyInUse(Self);
         }
         break;
-      } else {
-        curmidinuse = mid;
-        mid = mid->FreeNext;
       }
     }
+    assert(extracted, "Should have extracted from in-use list");
   }
 
   // FreeNext is used for both omInUseList and omFreeList, so clear old before setting new
@@ -1149,52 +1160,60 @@
 // operator.
 
 void ObjectSynchronizer::omFlush(Thread * Self) {
-  ObjectMonitor * List = Self->omFreeList;  // Null-terminated SLL
+  ObjectMonitor * list = Self->omFreeList;  // Null-terminated SLL
   Self->omFreeList = NULL;
-  ObjectMonitor * Tail = NULL;
-  int Tally = 0;
-  if (List != NULL) {
+  ObjectMonitor * tail = NULL;
+  int tally = 0;
+  if (list != NULL) {
     ObjectMonitor * s;
-    for (s = List; s != NULL; s = s->FreeNext) {
-      Tally++;
-      Tail = s;
+    // The thread is going away, the per-thread free monitors
+    // are freed via set_owner(NULL)
+    // Link them to tail, which will be linked into the global free list
+    // gFreeList below, under the gListLock
+    for (s = list; s != NULL; s = s->FreeNext) {
+      tally++;
+      tail = s;
       guarantee(s->object() == NULL, "invariant");
       guarantee(!s->is_busy(), "invariant");
       s->set_owner(NULL);   // redundant but good hygiene
       TEVENT(omFlush - Move one);
     }
-    guarantee(Tail != NULL && List != NULL, "invariant");
+    guarantee(tail != NULL && list != NULL, "invariant");
   }
 
-  ObjectMonitor * InUseList = Self->omInUseList;
-  ObjectMonitor * InUseTail = NULL;
-  int InUseTally = 0;
-  if (InUseList != NULL) {
+  ObjectMonitor * inUseList = Self->omInUseList;
+  ObjectMonitor * inUseTail = NULL;
+  int inUseTally = 0;
+  if (inUseList != NULL) {
     Self->omInUseList = NULL;
-    ObjectMonitor *curom;
-    for (curom = InUseList; curom != NULL; curom = curom->FreeNext) {
-      InUseTail = curom;
-      InUseTally++;
+    ObjectMonitor *cur_om;
+    // The thread is going away, however the omInUseList inflated
+    // monitors may still be in-use by other threads.
+    // Link them to inUseTail, which will be linked into the global in-use list
+    // gOmInUseList below, under the gListLock
+    for (cur_om = inUseList; cur_om != NULL; cur_om = cur_om->FreeNext) {
+      inUseTail = cur_om;
+      inUseTally++;
     }
-    assert(Self->omInUseCount == InUseTally, "inuse count off");
+    assert(Self->omInUseCount == inUseTally, "in-use count off");
     Self->omInUseCount = 0;
-    guarantee(InUseTail != NULL && InUseList != NULL, "invariant");
+    guarantee(inUseTail != NULL && inUseList != NULL, "invariant");
   }
 
-  Thread::muxAcquire(&ListLock, "omFlush");
-  if (Tail != NULL) {
-    Tail->FreeNext = gFreeList;
-    gFreeList = List;
-    MonitorFreeCount += Tally;
+  Thread::muxAcquire(&gListLock, "omFlush");
+  if (tail != NULL) {
+    tail->FreeNext = gFreeList;
+    gFreeList = list;
+    gMonitorFreeCount += tally;
   }
 
-  if (InUseTail != NULL) {
-    InUseTail->FreeNext = gOmInUseList;
-    gOmInUseList = InUseList;
-    gOmInUseCount += InUseTally;
+  if (inUseTail != NULL) {
+    inUseTail->FreeNext = gOmInUseList;
+    gOmInUseList = inUseList;
+    gOmInUseCount += inUseTally;
   }
 
-  Thread::muxRelease(&ListLock);
+  Thread::muxRelease(&gListLock);
   TEVENT(omFlush);
 }
 
@@ -1411,14 +1430,14 @@
 //
 // We have added a flag, MonitorInUseLists, which creates a list
 // of active monitors for each thread. deflate_idle_monitors()
-// only scans the per-thread inuse lists. omAlloc() puts all
+// only scans the per-thread in-use lists. omAlloc() puts all
 // assigned monitors on the per-thread list. deflate_idle_monitors()
 // returns the non-busy monitors to the global free list.
 // When a thread dies, omFlush() adds the list of active monitors for
 // that thread to a global gOmInUseList acquiring the
 // global list lock. deflate_idle_monitors() acquires the global
 // list lock to scan for non-busy monitors to the global free list.
-// An alternative could have used a single global inuse list. The
+// An alternative could have used a single global in-use list. The
 // downside would have been the additional cost of acquiring the global list lock
 // for every omAlloc().
 //
@@ -1432,8 +1451,8 @@
   MaximumRecheckInterval  = 1000
 };
 
-// Deflate a single monitor if not in use
-// Return true if deflated, false if in use
+// Deflate a single monitor if not in-use
+// Return true if deflated, false if in-use
 bool ObjectSynchronizer::deflate_monitor(ObjectMonitor* mid, oop obj,
                                          ObjectMonitor** freeHeadp,
                                          ObjectMonitor** freeTailp) {
@@ -1465,11 +1484,11 @@
 
     assert(mid->object() == NULL, "invariant");
 
-    // Move the object to the working free list defined by FreeHead,FreeTail.
+    // Move the object to the working free list defined by freeHeadp, freeTailp
     if (*freeHeadp == NULL) *freeHeadp = mid;
     if (*freeTailp != NULL) {
       ObjectMonitor * prevtail = *freeTailp;
-      assert(prevtail->FreeNext == NULL, "cleaned up deflated?"); // TODO KK
+      assert(prevtail->FreeNext == NULL, "cleaned up deflated?");
       prevtail->FreeNext = mid;
     }
     *freeTailp = mid;
@@ -1478,38 +1497,37 @@
   return deflated;
 }
 
-// Caller acquires ListLock
-int ObjectSynchronizer::walk_monitor_list(ObjectMonitor** listheadp,
-                                          ObjectMonitor** freeHeadp,
-                                          ObjectMonitor** freeTailp) {
+// Walk a given monitor list, and deflate idle monitors
+// The given list could be a per-thread list or a global list
+// Caller acquires gListLock
+int ObjectSynchronizer::deflate_monitor_list(ObjectMonitor** listHeadp,
+                                             ObjectMonitor** freeHeadp,
+                                             ObjectMonitor** freeTailp) {
   ObjectMonitor* mid;
   ObjectMonitor* next;
-  ObjectMonitor* curmidinuse = NULL;
-  int deflatedcount = 0;
+  ObjectMonitor* cur_mid_in_use = NULL;
+  int deflated_count = 0;
 
-  for (mid = *listheadp; mid != NULL;) {
+  for (mid = *listHeadp; mid != NULL;) {
     oop obj = (oop) mid->object();
-    bool deflated = false;
-    if (obj != NULL) {
-      deflated = deflate_monitor(mid, obj, freeHeadp, freeTailp);
-    }
-    if (deflated) {
-      // extract from per-thread in-use-list
-      if (mid == *listheadp) {
-        *listheadp = mid->FreeNext;
-      } else if (curmidinuse != NULL) {
-        curmidinuse->FreeNext = mid->FreeNext; // maintain the current thread inuselist
+    if (obj != NULL && deflate_monitor(mid, obj, freeHeadp, freeTailp)) {
+      // if deflate_monitor succeeded,
+      // extract from per-thread in-use list
+      if (mid == *listHeadp) {
+        *listHeadp = mid->FreeNext;
+      } else if (cur_mid_in_use != NULL) {
+        cur_mid_in_use->FreeNext = mid->FreeNext; // maintain the current thread in-use list
       }
       next = mid->FreeNext;
-      mid->FreeNext = NULL;  // This mid is current tail in the FreeHead list
+      mid->FreeNext = NULL;  // This mid is current tail in the freeHeadp list
       mid = next;
-      deflatedcount++;
+      deflated_count++;
     } else {
-      curmidinuse = mid;
+      cur_mid_in_use = mid;
       mid = mid->FreeNext;
     }
   }
-  return deflatedcount;
+  return deflated_count;
 }
 
 void ObjectSynchronizer::deflate_idle_monitors() {
@@ -1519,34 +1537,34 @@
   int nScavenged = 0;          // reclaimed
   bool deflated = false;
 
-  ObjectMonitor * FreeHead = NULL;  // Local SLL of scavenged monitors
-  ObjectMonitor * FreeTail = NULL;
+  ObjectMonitor * freeHeadp = NULL;  // Local SLL of scavenged monitors
+  ObjectMonitor * freeTailp = NULL;
 
   TEVENT(deflate_idle_monitors);
   // Prevent omFlush from changing mids in Thread dtor's during deflation
   // And in case the vm thread is acquiring a lock during a safepoint
   // See e.g. 6320749
-  Thread::muxAcquire(&ListLock, "scavenge - return");
+  Thread::muxAcquire(&gListLock, "scavenge - return");
 
   if (MonitorInUseLists) {
     int inUse = 0;
     for (JavaThread* cur = Threads::first(); cur != NULL; cur = cur->next()) {
       nInCirculation+= cur->omInUseCount;
-      int deflatedcount = walk_monitor_list(cur->omInUseList_addr(), &FreeHead, &FreeTail);
-      cur->omInUseCount-= deflatedcount;
+      int deflated_count = deflate_monitor_list(cur->omInUseList_addr(), &freeHeadp, &freeTailp);
+      cur->omInUseCount-= deflated_count;
       if (ObjectMonitor::Knob_VerifyInUse) {
         verifyInUse(cur);
       }
-      nScavenged += deflatedcount;
+      nScavenged += deflated_count;
       nInuse += cur->omInUseCount;
     }
 
     // For moribund threads, scan gOmInUseList
     if (gOmInUseList) {
       nInCirculation += gOmInUseCount;
-      int deflatedcount = walk_monitor_list((ObjectMonitor **)&gOmInUseList, &FreeHead, &FreeTail);
-      gOmInUseCount-= deflatedcount;
-      nScavenged += deflatedcount;
+      int deflated_count = deflate_monitor_list((ObjectMonitor **)&gOmInUseList, &freeHeadp, &freeTailp);
+      gOmInUseCount-= deflated_count;
+      nScavenged += deflated_count;
       nInuse += gOmInUseCount;
     }
 
@@ -1568,7 +1586,7 @@
         guarantee(!mid->is_busy(), "invariant");
         continue;
       }
-      deflated = deflate_monitor(mid, obj, &FreeHead, &FreeTail);
+      deflated = deflate_monitor(mid, obj, &freeHeadp, &freeTailp);
 
       if (deflated) {
         mid->FreeNext = NULL;
@@ -1579,28 +1597,28 @@
     }
   }
 
-  MonitorFreeCount += nScavenged;
+  gMonitorFreeCount += nScavenged;
 
-  // Consider: audit gFreeList to ensure that MonitorFreeCount and list agree.
+  // Consider: audit gFreeList to ensure that gMonitorFreeCount and list agree.
 
   if (ObjectMonitor::Knob_Verbose) {
     ::printf("Deflate: InCirc=%d InUse=%d Scavenged=%d ForceMonitorScavenge=%d : pop=%d free=%d\n",
              nInCirculation, nInuse, nScavenged, ForceMonitorScavenge,
-             MonitorPopulation, MonitorFreeCount);
+             gMonitorPopulation, gMonitorFreeCount);
     ::fflush(stdout);
   }
 
   ForceMonitorScavenge = 0;    // Reset
 
   // Move the scavenged monitors back to the global free list.
-  if (FreeHead != NULL) {
-    guarantee(FreeTail != NULL && nScavenged > 0, "invariant");
-    assert(FreeTail->FreeNext == NULL, "invariant");
+  if (freeHeadp != NULL) {
+    guarantee(freeTailp != NULL && nScavenged > 0, "invariant");
+    assert(freeTailp->FreeNext == NULL, "invariant");
     // constant-time list splice - prepend scavenged segment to gFreeList
-    FreeTail->FreeNext = gFreeList;
-    gFreeList = FreeHead;
+    freeTailp->FreeNext = gFreeList;
+    gFreeList = freeHeadp;
   }
-  Thread::muxRelease(&ListLock);
+  Thread::muxRelease(&gListLock);
 
   if (ObjectMonitor::_sync_Deflations != NULL) ObjectMonitor::_sync_Deflations->inc(nScavenged);
   if (ObjectMonitor::_sync_MonExtant  != NULL) ObjectMonitor::_sync_MonExtant ->set_value(nInCirculation);
@@ -1648,9 +1666,9 @@
   assert(THREAD == JavaThread::current(), "must be current Java thread");
   No_Safepoint_Verifier nsv;
   ReleaseJavaMonitorsClosure rjmc(THREAD);
-  Thread::muxAcquire(&ListLock, "release_monitors_owned_by_thread");
+  Thread::muxAcquire(&gListLock, "release_monitors_owned_by_thread");
   ObjectSynchronizer::monitors_iterate(&rjmc);
-  Thread::muxRelease(&ListLock);
+  Thread::muxRelease(&gListLock);
   THREAD->clear_pending_exception();
 }
 
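The renamings in this file (gListLock, gMonitorFreeCount, deflate_monitor_list, ...) do not change the shape of the scavenge; the list surgery deflate_monitor_list performs is restated below as a stand-alone sketch. Node and can_deflate are simplified stand-ins, not HotSpot code:

// Walk an intrusive singly-linked in-use list, unlink every node that can be
// "deflated", and append it to a local free list described by (freeHeadp, freeTailp).
struct Node {
  Node* FreeNext = nullptr;
  bool  busy     = false;          // stand-in for "monitor is in use"
};

static bool can_deflate(Node* n) { return !n->busy; }

int deflate_list(Node** listHeadp, Node** freeHeadp, Node** freeTailp) {
  Node* cur_mid_in_use = nullptr;  // predecessor of mid in the in-use list
  int deflated_count = 0;
  for (Node* mid = *listHeadp; mid != nullptr; ) {
    if (can_deflate(mid)) {
      // Unlink mid from the in-use list ...
      if (mid == *listHeadp) {
        *listHeadp = mid->FreeNext;
      } else if (cur_mid_in_use != nullptr) {
        cur_mid_in_use->FreeNext = mid->FreeNext;
      }
      Node* next = mid->FreeNext;
      // ... and append it to the local free list (constant-time tail insert).
      mid->FreeNext = nullptr;
      if (*freeHeadp == nullptr) *freeHeadp = mid;
      if (*freeTailp != nullptr) (*freeTailp)->FreeNext = mid;
      *freeTailp = mid;
      deflated_count++;
      mid = next;
    } else {
      cur_mid_in_use = mid;
      mid = mid->FreeNext;
    }
  }
  return deflated_count;           // caller adds this to gMonitorFreeCount under gListLock
}
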
--- a/hotspot/src/share/vm/runtime/synchronizer.hpp	Wed Apr 15 17:34:28 2015 -0700
+++ b/hotspot/src/share/vm/runtime/synchronizer.hpp	Thu Apr 16 08:23:26 2015 -0700
@@ -116,9 +116,10 @@
   // Basically we deflate all monitors that are not busy.
   // An adaptive profile-based deflation policy could be used if needed
   static void deflate_idle_monitors();
-  static int walk_monitor_list(ObjectMonitor** listheadp,
-                               ObjectMonitor** freeHeadp,
-                               ObjectMonitor** freeTailp);
+  // For a given monitor list: global or per-thread, deflate idle monitors
+  static int deflate_monitor_list(ObjectMonitor** listheadp,
+                                  ObjectMonitor** freeHeadp,
+                                  ObjectMonitor** freeTailp);
   static bool deflate_monitor(ObjectMonitor* mid, oop obj,
                               ObjectMonitor** freeHeadp,
                               ObjectMonitor** freeTailp);
@@ -135,16 +136,17 @@
 
  private:
   enum { _BLOCKSIZE = 128 };
+  // global list of blocks of monitors
   // gBlockList is really PaddedEnd<ObjectMonitor> *, but we don't
   // want to expose the PaddedEnd template more than necessary.
-  static ObjectMonitor* gBlockList;
+  static ObjectMonitor * gBlockList;
+  // global monitor free list
   static ObjectMonitor * volatile gFreeList;
-  // global monitor in use list, for moribund threads,
+  // global monitor in-use list, for moribund threads,
   // monitors they inflated need to be scanned for deflation
   static ObjectMonitor * volatile gOmInUseList;
   // count of entries in gOmInUseList
   static int gOmInUseCount;
-
 };
 
 // ObjectLocker enforced balanced locking and can never thrown an