--- a/hotspot/src/share/vm/runtime/thread.cpp Mon Oct 18 09:33:24 2010 -0700
+++ b/hotspot/src/share/vm/runtime/thread.cpp Fri Oct 22 15:59:34 2010 -0400
@@ -2995,8 +2995,8 @@
// crash Linux VM, see notes in os_linux.cpp.
main_thread->create_stack_guard_pages();
- // Initialize Java-Leve synchronization subsystem
- ObjectSynchronizer::Initialize() ;
+ // Initialize Java-Level synchronization subsystem
+ ObjectMonitor::Initialize() ;
// Initialize global modules
jint status = init_globals();
@@ -3965,215 +3965,272 @@
}
}
-
-// Lifecycle management for TSM ParkEvents.
-// ParkEvents are type-stable (TSM).
-// In our particular implementation they happen to be immortal.
+// Internal SpinLock and Mutex
+// Based on ParkEvent
+
+// Ad-hoc mutual exclusion primitives: SpinLock and Mux
//
-// We manage concurrency on the FreeList with a CAS-based
-// detach-modify-reattach idiom that avoids the ABA problems
-// that would otherwise be present in a simple CAS-based
-// push-pop implementation. (push-one and pop-all)
+// We employ SpinLocks _only for low-contention, fixed-length
+// short-duration critical sections where we're concerned
+// about native mutex_t or HotSpot Mutex:: latency.
+// The mux construct provides a spin-then-block mutual exclusion
+// mechanism.
+//
+// Testing has shown that contention on the ListLock guarding gFreeList
+// is common. If we implement ListLock as a simple SpinLock it's common
+// for the JVM to devolve to yielding with little progress. This is true
+// despite the fact that the critical sections protected by ListLock are
+// extremely short.
//
-// Caveat: Allocate() and Release() may be called from threads
-// other than the thread associated with the Event!
-// If we need to call Allocate() when running as the thread in
-// question then look for the PD calls to initialize native TLS.
-// Native TLS (Win32/Linux/Solaris) can only be initialized or
-// accessed by the associated thread.
-// See also pd_initialize().
-//
-// Note that we could defer associating a ParkEvent with a thread
-// until the 1st time the thread calls park(). unpark() calls to
-// an unprovisioned thread would be ignored. The first park() call
-// for a thread would allocate and associate a ParkEvent and return
-// immediately.
-
-volatile int ParkEvent::ListLock = 0 ;
-ParkEvent * volatile ParkEvent::FreeList = NULL ;
-
-ParkEvent * ParkEvent::Allocate (Thread * t) {
- // In rare cases -- JVM_RawMonitor* operations -- we can find t == null.
- ParkEvent * ev ;
-
- // Start by trying to recycle an existing but unassociated
- // ParkEvent from the global free list.
+// TODO-FIXME: ListLock should be of type SpinLock.
+// We should make this a 1st-class type, integrated into the lock
+// hierarchy as leaf-locks. Critically, the SpinLock structure
+// should have sufficient padding to avoid false-sharing and excessive
+// cache-coherency traffic.
+
+
+typedef volatile int SpinLockT ;
+
+void Thread::SpinAcquire (volatile int * adr, const char * LockName) {
+ if (Atomic::cmpxchg (1, adr, 0) == 0) {
+ return ; // normal fast-path return
+ }
+
+ // Slow-path : We've encountered contention -- Spin/Yield/Block strategy.
+ TEVENT (SpinAcquire - ctx) ;
+ int ctr = 0 ;
+ int Yields = 0 ;
for (;;) {
- ev = FreeList ;
- if (ev == NULL) break ;
- // 1: Detach - sequester or privatize the list
- // Tantamount to ev = Swap (&FreeList, NULL)
- if (Atomic::cmpxchg_ptr (NULL, &FreeList, ev) != ev) {
- continue ;
- }
-
- // We've detached the list. The list in-hand is now
- // local to this thread. This thread can operate on the
- // list without risk of interference from other threads.
- // 2: Extract -- pop the 1st element from the list.
- ParkEvent * List = ev->FreeNext ;
- if (List == NULL) break ;
- for (;;) {
- // 3: Try to reattach the residual list
- guarantee (List != NULL, "invariant") ;
- ParkEvent * Arv = (ParkEvent *) Atomic::cmpxchg_ptr (List, &FreeList, NULL) ;
- if (Arv == NULL) break ;
-
- // New nodes arrived. Try to detach the recent arrivals.
- if (Atomic::cmpxchg_ptr (NULL, &FreeList, Arv) != Arv) {
- continue ;
+ while (*adr != 0) {
+ ++ctr ;
+ if ((ctr & 0xFFF) == 0 || !os::is_MP()) {
+ if (Yields > 5) {
+ // Consider using a simple NakedSleep() instead.
+ // Then SpinAcquire could be called by non-JVM threads
+ Thread::current()->_ParkEvent->park(1) ;
+ } else {
+ os::NakedYield() ;
+ ++Yields ;
+ }
+ } else {
+ SpinPause() ;
}
- guarantee (Arv != NULL, "invariant") ;
- // 4: Merge Arv into List
- ParkEvent * Tail = List ;
- while (Tail->FreeNext != NULL) Tail = Tail->FreeNext ;
- Tail->FreeNext = Arv ;
- }
- break ;
- }
-
- if (ev != NULL) {
- guarantee (ev->AssociatedWith == NULL, "invariant") ;
- } else {
- // Do this the hard way -- materialize a new ParkEvent.
- // In rare cases an allocating thread might detach a long list --
- // installing null into FreeList -- and then stall or be obstructed.
- // A 2nd thread calling Allocate() would see FreeList == null.
- // The list held privately by the 1st thread is unavailable to the 2nd thread.
- // In that case the 2nd thread would have to materialize a new ParkEvent,
- // even though free ParkEvents existed in the system. In this case we end up
- // with more ParkEvents in circulation than we need, but the race is
- // rare and the outcome is benign. Ideally, the # of extant ParkEvents
- // is equal to the maximum # of threads that existed at any one time.
- // Because of the race mentioned above, segments of the freelist
- // can be transiently inaccessible. At worst we may end up with the
- // # of ParkEvents in circulation slightly above the ideal.
- // Note that if we didn't have the TSM/immortal constraint, then
- // when reattaching, above, we could trim the list.
- ev = new ParkEvent () ;
- guarantee ((intptr_t(ev) & 0xFF) == 0, "invariant") ;
- }
- ev->reset() ; // courtesy to caller
- ev->AssociatedWith = t ; // Associate ev with t
- ev->FreeNext = NULL ;
- return ev ;
-}
-
-void ParkEvent::Release (ParkEvent * ev) {
- if (ev == NULL) return ;
- guarantee (ev->FreeNext == NULL , "invariant") ;
- ev->AssociatedWith = NULL ;
- for (;;) {
- // Push ev onto FreeList
- // The mechanism is "half" lock-free.
- ParkEvent * List = FreeList ;
- ev->FreeNext = List ;
- if (Atomic::cmpxchg_ptr (ev, &FreeList, List) == List) break ;
+ }
+ if (Atomic::cmpxchg (1, adr, 0) == 0) return ;
}
}
-// Override operator new and delete so we can ensure that the
-// least significant byte of ParkEvent addresses is 0.
-// Beware that excessive address alignment is undesirable
-// as it can result in D$ index usage imbalance as
-// well as bank access imbalance on Niagara-like platforms,
-// although Niagara's hash function should help.
-
-void * ParkEvent::operator new (size_t sz) {
- return (void *) ((intptr_t (CHeapObj::operator new (sz + 256)) + 256) & -256) ;
-}
-
-void ParkEvent::operator delete (void * a) {
- // ParkEvents are type-stable and immortal ...
- ShouldNotReachHere();
+void Thread::SpinRelease (volatile int * adr) {
+ assert (*adr != 0, "invariant") ;
+ OrderAccess::fence() ; // guarantee at least release consistency.
+ // Roach-motel semantics.
+ // It's safe if subsequent LDs and STs float "up" into the critical section,
+ // but prior LDs and STs within the critical section can't be allowed
+ // to reorder or float past the ST that releases the lock.
+ *adr = 0 ;
}
-
-// 6399321 As a temporary measure we copied & modified the ParkEvent::
-// allocate() and release() code for use by Parkers. The Parker:: forms
-// will eventually be removed as we consolide and shift over to ParkEvents
-// for both builtin synchronization and JSR166 operations.
-
-volatile int Parker::ListLock = 0 ;
-Parker * volatile Parker::FreeList = NULL ;
-
-Parker * Parker::Allocate (JavaThread * t) {
- guarantee (t != NULL, "invariant") ;
- Parker * p ;
-
- // Start by trying to recycle an existing but unassociated
- // Parker from the global free list.
+// muxAcquire and muxRelease:
+//
+// * muxAcquire and muxRelease support a single-word lock-word construct.
+// The LSB of the word is set IFF the lock is held.
+// The remainder of the word points to the head of a singly-linked list
+// of threads blocked on the lock.
+//
+// * The current implementation of muxAcquire-muxRelease uses its own
+// dedicated Thread._MuxEvent instance. If we're interested in
+// minimizing the peak number of extant ParkEvent instances then
+// we could eliminate _MuxEvent and "borrow" _ParkEvent as long
+// as certain invariants were satisfied. Specifically, care would need
+// to be taken with regards to consuming unpark() "permits".
+// A safe rule of thumb is that a thread would never call muxAcquire()
+// if it's enqueued (cxq, EntryList, WaitList, etc) and will subsequently
+// park(). Otherwise the _ParkEvent park() operation in muxAcquire() could
+// consume an unpark() permit intended for monitorenter, for instance.
+// One way around this would be to widen the restricted-range semaphore
+// implemented in park(). Another alternative would be to provide
+// multiple instances of the PlatformEvent() for each thread. One
+// instance would be dedicated to muxAcquire-muxRelease, for instance.
+//
+// * Usage:
+// -- Only as leaf locks
+// -- for short-term locking only as muxAcquire does not perform
+// thread state transitions.
+//
+// Alternatives:
+// * We could implement muxAcquire and muxRelease with MCS or CLH locks
+// but with parking or spin-then-park instead of pure spinning.
+// * Use Taura-Oyama-Yonenzawa locks.
+// * It's possible to construct a 1-0 lock if we encode the lockword as
+// (List,LockByte). Acquire will CAS the full lockword while Release
+// will STB 0 into the LockByte. The 1-0 scheme admits stranding, so
+// acquiring threads use timers (ParkTimed) to detect and recover from
+// the stranding window. Thread/Node structures must be aligned on 256-byte
+// boundaries by using placement-new.
+// * Augment MCS with advisory back-link fields maintained with CAS().
+// Pictorially: LockWord -> T1 <-> T2 <-> T3 <-> ... <-> Tn <-> Owner.
+// The validity of the backlinks must be ratified before we trust the value.
+// If the backlinks are invalid the exiting thread must back-track through the
+// the forward links, which are always trustworthy.
+// * Add a successor indication. The LockWord is currently encoded as
+// (List, LOCKBIT:1). We could also add a SUCCBIT or an explicit _succ variable
+// to provide the usual futile-wakeup optimization.
+// See RTStt for details.
+// * Consider schedctl.sc_nopreempt to cover the critical section.
+//
+
+
+typedef volatile intptr_t MutexT ; // Mux Lock-word
+enum MuxBits { LOCKBIT = 1 } ;
+
+void Thread::muxAcquire (volatile intptr_t * Lock, const char * LockName) {
+ intptr_t w = Atomic::cmpxchg_ptr (LOCKBIT, Lock, 0) ;
+ if (w == 0) return ;
+ if ((w & LOCKBIT) == 0 && Atomic::cmpxchg_ptr (w|LOCKBIT, Lock, w) == w) {
+ return ;
+ }
+
+ TEVENT (muxAcquire - Contention) ;
+ ParkEvent * const Self = Thread::current()->_MuxEvent ;
+ assert ((intptr_t(Self) & LOCKBIT) == 0, "invariant") ;
for (;;) {
- p = FreeList ;
- if (p == NULL) break ;
- // 1: Detach
- // Tantamount to p = Swap (&FreeList, NULL)
- if (Atomic::cmpxchg_ptr (NULL, &FreeList, p) != p) {
- continue ;
- }
-
- // We've detached the list. The list in-hand is now
- // local to this thread. This thread can operate on the
- // list without risk of interference from other threads.
- // 2: Extract -- pop the 1st element from the list.
- Parker * List = p->FreeNext ;
- if (List == NULL) break ;
- for (;;) {
- // 3: Try to reattach the residual list
- guarantee (List != NULL, "invariant") ;
- Parker * Arv = (Parker *) Atomic::cmpxchg_ptr (List, &FreeList, NULL) ;
- if (Arv == NULL) break ;
-
- // New nodes arrived. Try to detach the recent arrivals.
- if (Atomic::cmpxchg_ptr (NULL, &FreeList, Arv) != Arv) {
- continue ;
+ int its = (os::is_MP() ? 100 : 0) + 1 ;
+
+ // Optional spin phase: spin-then-park strategy
+ while (--its >= 0) {
+ w = *Lock ;
+ if ((w & LOCKBIT) == 0 && Atomic::cmpxchg_ptr (w|LOCKBIT, Lock, w) == w) {
+ return ;
+ }
+ }
+
+ Self->reset() ;
+ Self->OnList = intptr_t(Lock) ;
+ // The following fence() isn't _strictly necessary as the subsequent
+ // CAS() both serializes execution and ratifies the fetched *Lock value.
+ OrderAccess::fence();
+ for (;;) {
+ w = *Lock ;
+ if ((w & LOCKBIT) == 0) {
+ if (Atomic::cmpxchg_ptr (w|LOCKBIT, Lock, w) == w) {
+ Self->OnList = 0 ; // hygiene - allows stronger asserts
+ return ;
+ }
+ continue ; // Interference -- *Lock changed -- Just retry
}
- guarantee (Arv != NULL, "invariant") ;
- // 4: Merge Arv into List
- Parker * Tail = List ;
- while (Tail->FreeNext != NULL) Tail = Tail->FreeNext ;
- Tail->FreeNext = Arv ;
- }
- break ;
- }
-
- if (p != NULL) {
- guarantee (p->AssociatedWith == NULL, "invariant") ;
- } else {
- // Do this the hard way -- materialize a new Parker..
- // In rare cases an allocating thread might detach
- // a long list -- installing null into FreeList --and
- // then stall. Another thread calling Allocate() would see
- // FreeList == null and then invoke the ctor. In this case we
- // end up with more Parkers in circulation than we need, but
- // the race is rare and the outcome is benign.
- // Ideally, the # of extant Parkers is equal to the
- // maximum # of threads that existed at any one time.
- // Because of the race mentioned above, segments of the
- // freelist can be transiently inaccessible. At worst
- // we may end up with the # of Parkers in circulation
- // slightly above the ideal.
- p = new Parker() ;
- }
- p->AssociatedWith = t ; // Associate p with t
- p->FreeNext = NULL ;
- return p ;
-}
-
-
-void Parker::Release (Parker * p) {
- if (p == NULL) return ;
- guarantee (p->AssociatedWith != NULL, "invariant") ;
- guarantee (p->FreeNext == NULL , "invariant") ;
- p->AssociatedWith = NULL ;
- for (;;) {
- // Push p onto FreeList
- Parker * List = FreeList ;
- p->FreeNext = List ;
- if (Atomic::cmpxchg_ptr (p, &FreeList, List) == List) break ;
+ assert (w & LOCKBIT, "invariant") ;
+ Self->ListNext = (ParkEvent *) (w & ~LOCKBIT );
+ if (Atomic::cmpxchg_ptr (intptr_t(Self)|LOCKBIT, Lock, w) == w) break ;
+ }
+
+ while (Self->OnList != 0) {
+ Self->park() ;
+ }
}
}
+void Thread::muxAcquireW (volatile intptr_t * Lock, ParkEvent * ev) {
+ intptr_t w = Atomic::cmpxchg_ptr (LOCKBIT, Lock, 0) ;
+ if (w == 0) return ;
+ if ((w & LOCKBIT) == 0 && Atomic::cmpxchg_ptr (w|LOCKBIT, Lock, w) == w) {
+ return ;
+ }
+
+ TEVENT (muxAcquire - Contention) ;
+ ParkEvent * ReleaseAfter = NULL ;
+ if (ev == NULL) {
+ ev = ReleaseAfter = ParkEvent::Allocate (NULL) ;
+ }
+ assert ((intptr_t(ev) & LOCKBIT) == 0, "invariant") ;
+ for (;;) {
+ guarantee (ev->OnList == 0, "invariant") ;
+ int its = (os::is_MP() ? 100 : 0) + 1 ;
+
+ // Optional spin phase: spin-then-park strategy
+ while (--its >= 0) {
+ w = *Lock ;
+ if ((w & LOCKBIT) == 0 && Atomic::cmpxchg_ptr (w|LOCKBIT, Lock, w) == w) {
+ if (ReleaseAfter != NULL) {
+ ParkEvent::Release (ReleaseAfter) ;
+ }
+ return ;
+ }
+ }
+
+ ev->reset() ;
+ ev->OnList = intptr_t(Lock) ;
+ // The following fence() isn't _strictly necessary as the subsequent
+ // CAS() both serializes execution and ratifies the fetched *Lock value.
+ OrderAccess::fence();
+ for (;;) {
+ w = *Lock ;
+ if ((w & LOCKBIT) == 0) {
+ if (Atomic::cmpxchg_ptr (w|LOCKBIT, Lock, w) == w) {
+ ev->OnList = 0 ;
+ // We call ::Release while holding the outer lock, thus
+ // artificially lengthening the critical section.
+ // Consider deferring the ::Release() until the subsequent unlock(),
+ // after we've dropped the outer lock.
+ if (ReleaseAfter != NULL) {
+ ParkEvent::Release (ReleaseAfter) ;
+ }
+ return ;
+ }
+ continue ; // Interference -- *Lock changed -- Just retry
+ }
+ assert (w & LOCKBIT, "invariant") ;
+ ev->ListNext = (ParkEvent *) (w & ~LOCKBIT );
+ if (Atomic::cmpxchg_ptr (intptr_t(ev)|LOCKBIT, Lock, w) == w) break ;
+ }
+
+ while (ev->OnList != 0) {
+ ev->park() ;
+ }
+ }
+}
+
+// Release() must extract a successor from the list and then wake that thread.
+// It can "pop" the front of the list or use a detach-modify-reattach (DMR) scheme
+// similar to that used by ParkEvent::Allocate() and ::Release(). DMR-based
+// Release() would :
+// (A) CAS() or swap() null to *Lock, releasing the lock and detaching the list.
+// (B) Extract a successor from the private list "in-hand"
+// (C) attempt to CAS() the residual back into *Lock over null.
+// If there were any newly arrived threads and the CAS() would fail.
+// In that case Release() would detach the RATs, re-merge the list in-hand
+// with the RATs and repeat as needed. Alternately, Release() might
+// detach and extract a successor, but then pass the residual list to the wakee.
+// The wakee would be responsible for reattaching and remerging before it
+// competed for the lock.
+//
+// Both "pop" and DMR are immune from ABA corruption -- there can be
+// multiple concurrent pushers, but only one popper or detacher.
+// This implementation pops from the head of the list. This is unfair,
+// but tends to provide excellent throughput as hot threads remain hot.
+// (We wake recently run threads first).
+
+void Thread::muxRelease (volatile intptr_t * Lock) {
+ for (;;) {
+ const intptr_t w = Atomic::cmpxchg_ptr (0, Lock, LOCKBIT) ;
+ assert (w & LOCKBIT, "invariant") ;
+ if (w == LOCKBIT) return ;
+ ParkEvent * List = (ParkEvent *) (w & ~LOCKBIT) ;
+ assert (List != NULL, "invariant") ;
+ assert (List->OnList == intptr_t(Lock), "invariant") ;
+ ParkEvent * nxt = List->ListNext ;
+
+ // The following CAS() releases the lock and pops the head element.
+ if (Atomic::cmpxchg_ptr (intptr_t(nxt), Lock, w) != w) {
+ continue ;
+ }
+ List->OnList = 0 ;
+ OrderAccess::fence() ;
+ List->unpark () ;
+ return ;
+ }
+}
+
+
void Threads::verify() {
ALL_JAVA_THREADS(p) {
p->verify();