diff -r 11c11e616b91 -r dc9b63952682 hotspot/src/share/vm/runtime/thread.cpp
--- a/hotspot/src/share/vm/runtime/thread.cpp	Mon Oct 18 09:33:24 2010 -0700
+++ b/hotspot/src/share/vm/runtime/thread.cpp	Fri Oct 22 15:59:34 2010 -0400
@@ -2995,8 +2995,8 @@
   // crash Linux VM, see notes in os_linux.cpp.
   main_thread->create_stack_guard_pages();
 
-  // Initialize Java-Leve synchronization subsystem
-  ObjectSynchronizer::Initialize() ;
+  // Initialize Java-Level synchronization subsystem
+  ObjectMonitor::Initialize() ;
 
   // Initialize global modules
   jint status = init_globals();
@@ -3965,215 +3965,272 @@
   }
 }
 
-
-// Lifecycle management for TSM ParkEvents.
-// ParkEvents are type-stable (TSM).
-// In our particular implementation they happen to be immortal.
+// Internal SpinLock and Mutex
+// Based on ParkEvent
+
+// Ad-hoc mutual exclusion primitives: SpinLock and Mux
 //
-// We manage concurrency on the FreeList with a CAS-based
-// detach-modify-reattach idiom that avoids the ABA problems
-// that would otherwise be present in a simple CAS-based
-// push-pop implementation.   (push-one and pop-all)
+// We employ SpinLocks _only for low-contention, fixed-length
+// short-duration critical sections where we're concerned
+// about native mutex_t or HotSpot Mutex:: latency.
+// The mux construct provides a spin-then-block mutual exclusion
+// mechanism.
+//
+// Testing has shown that contention on the ListLock guarding gFreeList
+// is common.  If we implement ListLock as a simple SpinLock it's common
+// for the JVM to devolve to yielding with little progress.  This is true
+// despite the fact that the critical sections protected by ListLock are
+// extremely short.
 //
-// Caveat: Allocate() and Release() may be called from threads
-// other than the thread associated with the Event!
-// If we need to call Allocate() when running as the thread in
-// question then look for the PD calls to initialize native TLS.
-// Native TLS (Win32/Linux/Solaris) can only be initialized or
-// accessed by the associated thread.
-// See also pd_initialize().
-//
-// Note that we could defer associating a ParkEvent with a thread
-// until the 1st time the thread calls park().  unpark() calls to
-// an unprovisioned thread would be ignored.  The first park() call
-// for a thread would allocate and associate a ParkEvent and return
-// immediately.
-
-volatile int ParkEvent::ListLock = 0 ;
-ParkEvent * volatile ParkEvent::FreeList = NULL ;
-
-ParkEvent * ParkEvent::Allocate (Thread * t) {
-  // In rare cases -- JVM_RawMonitor* operations -- we can find t == null.
-  ParkEvent * ev ;
-
-  // Start by trying to recycle an existing but unassociated
-  // ParkEvent from the global free list.
+// TODO-FIXME: ListLock should be of type SpinLock.
+// We should make this a 1st-class type, integrated into the lock
+// hierarchy as leaf-locks.  Critically, the SpinLock structure
+// should have sufficient padding to avoid false-sharing and excessive
+// cache-coherency traffic.
+
+
+typedef volatile int SpinLockT ;
+
+void Thread::SpinAcquire (volatile int * adr, const char * LockName) {
+  if (Atomic::cmpxchg (1, adr, 0) == 0) {
+     return ;   // normal fast-path return
+  }
+
+  // Slow-path : We've encountered contention -- Spin/Yield/Block strategy.
+  TEVENT (SpinAcquire - ctx) ;
+  int ctr = 0 ;
+  int Yields = 0 ;
   for (;;) {
-    ev = FreeList ;
-    if (ev == NULL) break ;
-    // 1: Detach - sequester or privatize the list
-    // Tantamount to ev = Swap (&FreeList, NULL)
-    if (Atomic::cmpxchg_ptr (NULL, &FreeList, ev) != ev) {
-       continue ;
-    }
-
-    // We've detached the list.  The list in-hand is now
-    // local to this thread.   This thread can operate on the
-    // list without risk of interference from other threads.
-    // 2: Extract -- pop the 1st element from the list.
-    ParkEvent * List = ev->FreeNext ;
-    if (List == NULL) break ;
-    for (;;) {
-        // 3: Try to reattach the residual list
-        guarantee (List != NULL, "invariant") ;
-        ParkEvent * Arv =  (ParkEvent *) Atomic::cmpxchg_ptr (List, &FreeList, NULL) ;
-        if (Arv == NULL) break ;
-
-        // New nodes arrived.  Try to detach the recent arrivals.
-        if (Atomic::cmpxchg_ptr (NULL, &FreeList, Arv) != Arv) {
-            continue ;
+     while (*adr != 0) {
+        ++ctr ;
+        if ((ctr & 0xFFF) == 0 || !os::is_MP()) {
+           if (Yields > 5) {
+             // Consider using a simple NakedSleep() instead.
+             // Then SpinAcquire could be called by non-JVM threads
+             Thread::current()->_ParkEvent->park(1) ;
+           } else {
+             os::NakedYield() ;
+             ++Yields ;
+           }
+        } else {
+           SpinPause() ;
         }
-        guarantee (Arv != NULL, "invariant") ;
-        // 4: Merge Arv into List
-        ParkEvent * Tail = List ;
-        while (Tail->FreeNext != NULL) Tail = Tail->FreeNext ;
-        Tail->FreeNext = Arv ;
-    }
-    break ;
-  }
-
-  if (ev != NULL) {
-    guarantee (ev->AssociatedWith == NULL, "invariant") ;
-  } else {
-    // Do this the hard way -- materialize a new ParkEvent.
-    // In rare cases an allocating thread might detach a long list --
-    // installing null into FreeList -- and then stall or be obstructed.
-    // A 2nd thread calling Allocate() would see FreeList == null.
-    // The list held privately by the 1st thread is unavailable to the 2nd thread.
-    // In that case the 2nd thread would have to materialize a new ParkEvent,
-    // even though free ParkEvents existed in the system.  In this case we end up
-    // with more ParkEvents in circulation than we need, but the race is
-    // rare and the outcome is benign.  Ideally, the # of extant ParkEvents
-    // is equal to the maximum # of threads that existed at any one time.
-    // Because of the race mentioned above, segments of the freelist
-    // can be transiently inaccessible.  At worst we may end up with the
-    // # of ParkEvents in circulation slightly above the ideal.
-    // Note that if we didn't have the TSM/immortal constraint, then
-    // when reattaching, above, we could trim the list.
-    ev = new ParkEvent () ;
-    guarantee ((intptr_t(ev) & 0xFF) == 0, "invariant") ;
-  }
-  ev->reset() ;                     // courtesy to caller
-  ev->AssociatedWith = t ;          // Associate ev with t
-  ev->FreeNext       = NULL ;
-  return ev ;
-}
-
-void ParkEvent::Release (ParkEvent * ev) {
-  if (ev == NULL) return ;
-  guarantee (ev->FreeNext == NULL      , "invariant") ;
-  ev->AssociatedWith = NULL ;
-  for (;;) {
-    // Push ev onto FreeList
-    // The mechanism is "half" lock-free.
-    ParkEvent * List = FreeList ;
-    ev->FreeNext = List ;
-    if (Atomic::cmpxchg_ptr (ev, &FreeList, List) == List) break ;
+     }
+     if (Atomic::cmpxchg (1, adr, 0) == 0) return ;
   }
 }
 
-// Override operator new and delete so we can ensure that the
-// least significant byte of ParkEvent addresses is 0.
-// Beware that excessive address alignment is undesirable
-// as it can result in D$ index usage imbalance as
-// well as bank access imbalance on Niagara-like platforms,
-// although Niagara's hash function should help.
-
-void * ParkEvent::operator new (size_t sz) {
-  return (void *) ((intptr_t (CHeapObj::operator new (sz + 256)) + 256) & -256) ;
-}
-
-void ParkEvent::operator delete (void * a) {
-  // ParkEvents are type-stable and immortal ...
-  ShouldNotReachHere();
+void Thread::SpinRelease (volatile int * adr) {
+  assert (*adr != 0, "invariant") ;
+  OrderAccess::fence() ;      // guarantee at least release consistency.
+  // Roach-motel semantics.
+  // It's safe if subsequent LDs and STs float "up" into the critical section,
+  // but prior LDs and STs within the critical section can't be allowed
+  // to reorder or float past the ST that releases the lock.
+  *adr = 0 ;
 }
 
-
-// 6399321 As a temporary measure we copied & modified the ParkEvent::
-// allocate() and release() code for use by Parkers.  The Parker:: forms
-// will eventually be removed as we consolide and shift over to ParkEvents
-// for both builtin synchronization and JSR166 operations.
-
-volatile int Parker::ListLock = 0 ;
-Parker * volatile Parker::FreeList = NULL ;
-
-Parker * Parker::Allocate (JavaThread * t) {
-  guarantee (t != NULL, "invariant") ;
-  Parker * p ;
-
-  // Start by trying to recycle an existing but unassociated
-  // Parker from the global free list.
+// muxAcquire and muxRelease:
+//
+// *  muxAcquire and muxRelease support a single-word lock-word construct.
+//    The LSB of the word is set IFF the lock is held.
+//    The remainder of the word points to the head of a singly-linked list
+//    of threads blocked on the lock.
+//
+// *  The current implementation of muxAcquire-muxRelease uses its own
+//    dedicated Thread._MuxEvent instance.  If we're interested in
+//    minimizing the peak number of extant ParkEvent instances then
+//    we could eliminate _MuxEvent and "borrow" _ParkEvent as long
+//    as certain invariants were satisfied.  Specifically, care would need
+//    to be taken with regards to consuming unpark() "permits".
+//    A safe rule of thumb is that a thread would never call muxAcquire()
+//    if it's enqueued (cxq, EntryList, WaitList, etc) and will subsequently
+//    park().  Otherwise the _ParkEvent park() operation in muxAcquire() could
+//    consume an unpark() permit intended for monitorenter, for instance.
+//    One way around this would be to widen the restricted-range semaphore
+//    implemented in park().  Another alternative would be to provide
+//    multiple instances of the PlatformEvent() for each thread.  One
+//    instance would be dedicated to muxAcquire-muxRelease, for instance.
+//
+// *  Usage:
+//    -- Only as leaf locks
+//    -- for short-term locking only as muxAcquire does not perform
+//       thread state transitions.
+//
+// Alternatives:
+// *  We could implement muxAcquire and muxRelease with MCS or CLH locks
+//    but with parking or spin-then-park instead of pure spinning.
+// *  Use Taura-Oyama-Yonenzawa locks.
+// *  It's possible to construct a 1-0 lock if we encode the lockword as
+//    (List,LockByte).  Acquire will CAS the full lockword while Release
+//    will STB 0 into the LockByte.  The 1-0 scheme admits stranding, so
+//    acquiring threads use timers (ParkTimed) to detect and recover from
+//    the stranding window.  Thread/Node structures must be aligned on 256-byte
+//    boundaries by using placement-new.
+// *  Augment MCS with advisory back-link fields maintained with CAS().
+//    Pictorially:  LockWord -> T1 <-> T2 <-> T3 <-> ... <-> Tn <-> Owner.
+//    The validity of the backlinks must be ratified before we trust the value.
+//    If the backlinks are invalid the exiting thread must back-track through the
+//    the forward links, which are always trustworthy.
+// *  Add a successor indication.  The LockWord is currently encoded as
+//    (List, LOCKBIT:1).  We could also add a SUCCBIT or an explicit _succ variable
+//    to provide the usual futile-wakeup optimization.
+//    See RTStt for details.
+// *  Consider schedctl.sc_nopreempt to cover the critical section.
+//
+
+
+typedef volatile intptr_t MutexT ;      // Mux Lock-word
+enum MuxBits { LOCKBIT = 1 } ;
+
+void Thread::muxAcquire (volatile intptr_t * Lock, const char * LockName) {
+  intptr_t w = Atomic::cmpxchg_ptr (LOCKBIT, Lock, 0) ;
+  if (w == 0) return ;
+  if ((w & LOCKBIT) == 0 && Atomic::cmpxchg_ptr (w|LOCKBIT, Lock, w) == w) {
+     return ;
+  }
+
+  TEVENT (muxAcquire - Contention) ;
+  ParkEvent * const Self = Thread::current()->_MuxEvent ;
+  assert ((intptr_t(Self) & LOCKBIT) == 0, "invariant") ;
   for (;;) {
-    p = FreeList ;
-    if (p  == NULL) break ;
-    // 1: Detach
-    // Tantamount to p = Swap (&FreeList, NULL)
-    if (Atomic::cmpxchg_ptr (NULL, &FreeList, p) != p) {
-       continue ;
-    }
-
-    // We've detached the list.  The list in-hand is now
-    // local to this thread.   This thread can operate on the
-    // list without risk of interference from other threads.
-    // 2: Extract -- pop the 1st element from the list.
-    Parker * List = p->FreeNext ;
-    if (List == NULL) break ;
-    for (;;) {
-        // 3: Try to reattach the residual list
-        guarantee (List != NULL, "invariant") ;
-        Parker * Arv =  (Parker *) Atomic::cmpxchg_ptr (List, &FreeList, NULL) ;
-        if (Arv == NULL) break ;
-
-        // New nodes arrived.  Try to detach the recent arrivals.
-        if (Atomic::cmpxchg_ptr (NULL, &FreeList, Arv) != Arv) {
-            continue ;
+     int its = (os::is_MP() ? 100 : 0) + 1 ;
+
+     // Optional spin phase: spin-then-park strategy
+     while (--its >= 0) {
+       w = *Lock ;
+       if ((w & LOCKBIT) == 0 && Atomic::cmpxchg_ptr (w|LOCKBIT, Lock, w) == w) {
+          return ;
+       }
+     }
+
+     Self->reset() ;
+     Self->OnList = intptr_t(Lock) ;
+     // The following fence() isn't _strictly necessary as the subsequent
+     // CAS() both serializes execution and ratifies the fetched *Lock value.
+     OrderAccess::fence();
+     for (;;) {
+        w = *Lock ;
+        if ((w & LOCKBIT) == 0) {
+            if (Atomic::cmpxchg_ptr (w|LOCKBIT, Lock, w) == w) {
+                Self->OnList = 0 ;   // hygiene - allows stronger asserts
+                return ;
+            }
+            continue ;      // Interference -- *Lock changed -- Just retry
         }
-        guarantee (Arv != NULL, "invariant") ;
-        // 4: Merge Arv into List
-        Parker * Tail = List ;
-        while (Tail->FreeNext != NULL) Tail = Tail->FreeNext ;
-        Tail->FreeNext = Arv ;
-    }
-    break ;
-  }
-
-  if (p != NULL) {
-    guarantee (p->AssociatedWith == NULL, "invariant") ;
-  } else {
-    // Do this the hard way -- materialize a new Parker..
-    // In rare cases an allocating thread might detach
-    // a long list -- installing null into FreeList --and
-    // then stall.  Another thread calling Allocate() would see
-    // FreeList == null and then invoke the ctor.  In this case we
-    // end up with more Parkers in circulation than we need, but
-    // the race is rare and the outcome is benign.
-    // Ideally, the # of extant Parkers is equal to the
-    // maximum # of threads that existed at any one time.
-    // Because of the race mentioned above, segments of the
-    // freelist can be transiently inaccessible.  At worst
-    // we may end up with the # of Parkers in circulation
-    // slightly above the ideal.
-    p = new Parker() ;
-  }
-  p->AssociatedWith = t ;          // Associate p with t
-  p->FreeNext       = NULL ;
-  return p ;
-}
-
-
-void Parker::Release (Parker * p) {
-  if (p == NULL) return ;
-  guarantee (p->AssociatedWith != NULL, "invariant") ;
-  guarantee (p->FreeNext == NULL      , "invariant") ;
-  p->AssociatedWith = NULL ;
-  for (;;) {
-    // Push p onto FreeList
-    Parker * List = FreeList ;
-    p->FreeNext = List ;
-    if (Atomic::cmpxchg_ptr (p, &FreeList, List) == List) break ;
+        assert (w & LOCKBIT, "invariant") ;
+        Self->ListNext = (ParkEvent *) (w & ~LOCKBIT );
+        if (Atomic::cmpxchg_ptr (intptr_t(Self)|LOCKBIT, Lock, w) == w) break ;
+     }
+
+     while (Self->OnList != 0) {
+        Self->park() ;
+     }
   }
 }
 
+void Thread::muxAcquireW (volatile intptr_t * Lock, ParkEvent * ev) {
+  intptr_t w = Atomic::cmpxchg_ptr (LOCKBIT, Lock, 0) ;
+  if (w == 0) return ;
+  if ((w & LOCKBIT) == 0 && Atomic::cmpxchg_ptr (w|LOCKBIT, Lock, w) == w) {
+    return ;
+  }
+
+  TEVENT (muxAcquire - Contention) ;
+  ParkEvent * ReleaseAfter = NULL ;
+  if (ev == NULL) {
+    ev = ReleaseAfter = ParkEvent::Allocate (NULL) ;
+  }
+  assert ((intptr_t(ev) & LOCKBIT) == 0, "invariant") ;
+  for (;;) {
+    guarantee (ev->OnList == 0, "invariant") ;
+    int its = (os::is_MP() ? 100 : 0) + 1 ;
+
+    // Optional spin phase: spin-then-park strategy
+    while (--its >= 0) {
+      w = *Lock ;
+      if ((w & LOCKBIT) == 0 && Atomic::cmpxchg_ptr (w|LOCKBIT, Lock, w) == w) {
+        if (ReleaseAfter != NULL) {
+          ParkEvent::Release (ReleaseAfter) ;
+        }
+        return ;
+      }
+    }
+
+    ev->reset() ;
+    ev->OnList = intptr_t(Lock) ;
+    // The following fence() isn't _strictly necessary as the subsequent
+    // CAS() both serializes execution and ratifies the fetched *Lock value.
+    OrderAccess::fence();
+    for (;;) {
+      w = *Lock ;
+      if ((w & LOCKBIT) == 0) {
+        if (Atomic::cmpxchg_ptr (w|LOCKBIT, Lock, w) == w) {
+          ev->OnList = 0 ;
+          // We call ::Release while holding the outer lock, thus
+          // artificially lengthening the critical section.
+          // Consider deferring the ::Release() until the subsequent unlock(),
+          // after we've dropped the outer lock.
+          if (ReleaseAfter != NULL) {
+            ParkEvent::Release (ReleaseAfter) ;
+          }
+          return ;
+        }
+        continue ;      // Interference -- *Lock changed -- Just retry
+      }
+      assert (w & LOCKBIT, "invariant") ;
+      ev->ListNext = (ParkEvent *) (w & ~LOCKBIT );
+      if (Atomic::cmpxchg_ptr (intptr_t(ev)|LOCKBIT, Lock, w) == w) break ;
+    }
+
+    while (ev->OnList != 0) {
+      ev->park() ;
+    }
+  }
+}
+
+// Release() must extract a successor from the list and then wake that thread.
+// It can "pop" the front of the list or use a detach-modify-reattach (DMR) scheme
+// similar to that used by ParkEvent::Allocate() and ::Release().  DMR-based
+// Release() would :
+// (A) CAS() or swap() null to *Lock, releasing the lock and detaching the list.
+// (B) Extract a successor from the private list "in-hand"
+// (C) attempt to CAS() the residual back into *Lock over null.
+//     If there were any newly arrived threads and the CAS() would fail.
+//     In that case Release() would detach the RATs, re-merge the list in-hand
+//     with the RATs and repeat as needed.  Alternately, Release() might
+//     detach and extract a successor, but then pass the residual list to the wakee.
+//     The wakee would be responsible for reattaching and remerging before it
+//     competed for the lock.
+//
+// Both "pop" and DMR are immune from ABA corruption -- there can be
+// multiple concurrent pushers, but only one popper or detacher.
+// This implementation pops from the head of the list.  This is unfair,
+// but tends to provide excellent throughput as hot threads remain hot.
+// (We wake recently run threads first).
+
+void Thread::muxRelease (volatile intptr_t * Lock)  {
+  for (;;) {
+    const intptr_t w = Atomic::cmpxchg_ptr (0, Lock, LOCKBIT) ;
+    assert (w & LOCKBIT, "invariant") ;
+    if (w == LOCKBIT) return ;
+    ParkEvent * List = (ParkEvent *) (w & ~LOCKBIT) ;
+    assert (List != NULL, "invariant") ;
+    assert (List->OnList == intptr_t(Lock), "invariant") ;
+    ParkEvent * nxt = List->ListNext ;
+
+    // The following CAS() releases the lock and pops the head element.
+    if (Atomic::cmpxchg_ptr (intptr_t(nxt), Lock, w) != w) {
+      continue ;
+    }
+    List->OnList = 0 ;
+    OrderAccess::fence() ;
+    List->unpark () ;
+    return ;
+  }
+}
+
+
 void Threads::verify() {
   ALL_JAVA_THREADS(p) {
     p->verify();