# HG changeset patch
# User pchilanomate
# Date 1549397533 18000
# Node ID 043ae846819f4cd86973fcb255b2994a2ab9da20
# Parent  2c6c0fabe6a2482110cccd96696260f1729165f4
8210832: Remove sneaky locking in class Monitor
Summary: Removed sneaky locking and simplified vm monitors implementation
Reviewed-by: rehn, dcubed, pliden, dholmes, coleenp
Contributed-by: david.holmes@oracle.com, patricio.chilano.mateo@oracle.com

diff -r 2c6c0fabe6a2 -r 043ae846819f src/hotspot/os/posix/os_posix.cpp
--- a/src/hotspot/os/posix/os_posix.cpp	Tue Feb 05 13:21:59 2019 -0500
+++ b/src/hotspot/os/posix/os_posix.cpp	Tue Feb 05 15:12:13 2019 -0500
@@ -2215,5 +2215,74 @@
   }
 }
 
+// Platform Monitor implementation
+
+os::PlatformMonitor::PlatformMonitor() {  // Initializes _cond first, then _mutex.
+  int status = pthread_cond_init(&_cond, _condAttr);  // _condAttr is declared outside this hunk
+  assert_status(status == 0, status, "cond_init");
+  status = pthread_mutex_init(&_mutex, _mutexAttr);  // _mutexAttr is declared outside this hunk
+  assert_status(status == 0, status, "mutex_init");
+}
+
+os::PlatformMonitor::~PlatformMonitor() {  // Destroys _cond first, then _mutex.
+  int status = pthread_cond_destroy(&_cond);
+  assert_status(status == 0, status, "cond_destroy");
+  status = pthread_mutex_destroy(&_mutex);
+  assert_status(status == 0, status, "mutex_destroy");
+}
+
+void os::PlatformMonitor::lock() {  // Acquires _mutex via pthread_mutex_lock.
+  int status = pthread_mutex_lock(&_mutex);
+  assert_status(status == 0, status, "mutex_lock");  // only success (0) is expected
+}
+
+void os::PlatformMonitor::unlock() {  // Releases _mutex via pthread_mutex_unlock.
+  int status = pthread_mutex_unlock(&_mutex);
+  assert_status(status == 0, status, "mutex_unlock");  // only success (0) is expected
+}
+
+bool os::PlatformMonitor::try_lock() {  // Single pthread_mutex_trylock attempt on _mutex.
+  int status = pthread_mutex_trylock(&_mutex);
+  assert_status(status == 0 || status == EBUSY, status, "mutex_trylock");
+  return status == 0;  // true only when the mutex was obtained; EBUSY yields false
+}
+
+// Waits on _cond; _mutex must already be locked by the caller. millis == 0 waits with no timeout.
+int os::PlatformMonitor::wait(jlong millis) {
+  assert(millis >= 0, "negative timeout");
+  if (millis > 0) {
+    struct timespec abst;
+    // Converting millis to nanos below could overflow for very large values.
+    // Such values would end up limited to MAX_SECS anyway, so clamp millis
+    // to MAX_SECS worth of milliseconds before doing the conversion.
+    if (millis / MILLIUNITS > MAX_SECS) {
+      millis = jlong(MAX_SECS) * MILLIUNITS;
+    }
+    to_abstime(&abst, millis * (NANOUNITS / MILLIUNITS), false, false);
+
+    int ret = OS_TIMEOUT;  // result unless the timed wait reports success
+    int status = pthread_cond_timedwait(&_cond, &_mutex, &abst);
+    assert_status(status == 0 || status == ETIMEDOUT,
+                  status, "cond_timedwait");
+    if (status == 0) {
+      ret = OS_OK;
+    }
+    return ret;
+  } else {
+    int status = pthread_cond_wait(&_cond, &_mutex);  // untimed wait
+    assert_status(status == 0, status, "cond_wait");
+    return OS_OK;
+  }
+}
+
+void os::PlatformMonitor::notify() {  // Signals _cond via pthread_cond_signal.
+  int status = pthread_cond_signal(&_cond);
+  assert_status(status == 0, status, "cond_signal");
+}
+
+void os::PlatformMonitor::notify_all() {  // Broadcasts on _cond via pthread_cond_broadcast.
+  int status = pthread_cond_broadcast(&_cond);
+  assert_status(status == 0, status, "cond_broadcast");
+}
 
 #endif // !SOLARIS
diff -r 2c6c0fabe6a2 -r 043ae846819f src/hotspot/os/posix/os_posix.hpp
--- a/src/hotspot/os/posix/os_posix.hpp	Tue Feb 05 13:21:59 2019 -0500
+++ b/src/hotspot/os/posix/os_posix.hpp	Tue Feb 05 15:12:13 2019 -0500
@@ -224,6 +224,23 @@
   PlatformParker();
 };
 
+// Platform specific implementation that underpins the VM Monitor/Mutex class (pthreads based)
+class PlatformMonitor : public CHeapObj<mtInternal> {
+ private:
+  pthread_mutex_t _mutex; // Native mutex for locking
+  pthread_cond_t  _cond;  // Native condition variable for blocking
+
+ public:
+  PlatformMonitor();       // initializes _cond and _mutex
+  ~PlatformMonitor();      // destroys _cond and _mutex
+  void lock();             // acquires _mutex
+  void unlock();           // releases _mutex
+  bool try_lock();         // true if _mutex was obtained
+  int wait(jlong millis);  // _mutex must be held; 0 means no timeout; OS_OK or OS_TIMEOUT
+  void notify();           // signals _cond
+  void notify_all();       // broadcasts on _cond
+};
+
 #endif // !SOLARIS
 
 #endif // OS_POSIX_OS_POSIX_HPP
diff -r 2c6c0fabe6a2 -r 043ae846819f src/hotspot/os/solaris/os_solaris.cpp
--- a/src/hotspot/os/solaris/os_solaris.cpp	Tue Feb 05 13:21:59 2019 -0500
+++ b/src/hotspot/os/solaris/os_solaris.cpp	Tue Feb 05 15:12:13 2019 -0500
@@ -5192,6 +5192,72 @@
   }
 }
 
+// Platform Monitor implementation
+
+os::PlatformMonitor::PlatformMonitor() {  // Initializes _cond first, then _mutex.
+  int status = os::Solaris::cond_init(&_cond);
+  assert_status(status == 0, status, "cond_init");
+  status = os::Solaris::mutex_init(&_mutex);
+  assert_status(status == 0, status, "mutex_init");
+}
+
+os::PlatformMonitor::~PlatformMonitor() {  // Destroys _cond first, then _mutex.
+  int status = os::Solaris::cond_destroy(&_cond);
+  assert_status(status == 0, status, "cond_destroy");
+  status = os::Solaris::mutex_destroy(&_mutex);
+  assert_status(status == 0, status, "mutex_destroy");
+}
+
+void os::PlatformMonitor::lock() {  // Acquires _mutex via os::Solaris::mutex_lock.
+  int status = os::Solaris::mutex_lock(&_mutex);
+  assert_status(status == 0, status, "mutex_lock");  // only success (0) is expected
+}
+
+void os::PlatformMonitor::unlock() {  // Releases _mutex via os::Solaris::mutex_unlock.
+  int status = os::Solaris::mutex_unlock(&_mutex);
+  assert_status(status == 0, status, "mutex_unlock");  // only success (0) is expected
+}
+
+bool os::PlatformMonitor::try_lock() {  // Single os::Solaris::mutex_trylock attempt on _mutex.
+  int status = os::Solaris::mutex_trylock(&_mutex);
+  assert_status(status == 0 || status == EBUSY, status, "mutex_trylock");
+  return status == 0;  // true only when the mutex was obtained; EBUSY yields false
+}
+
+// Waits on _cond; _mutex must already be locked by the caller. millis == 0 waits with no timeout.
+int os::PlatformMonitor::wait(jlong millis) {
+  assert(millis >= 0, "negative timeout");
+  if (millis > 0) {
+    timestruc_t abst;
+    int ret = OS_TIMEOUT;  // stays OS_TIMEOUT for ETIME / ETIMEDOUT
+    compute_abstime(&abst, millis);  // helper defined outside this hunk
+    int status = os::Solaris::cond_timedwait(&_cond, &_mutex, &abst);
+    assert_status(status == 0 || status == EINTR ||
+                  status == ETIME || status == ETIMEDOUT,
+                  status, "cond_timedwait");
+    // EINTR is reported as OS_OK: it acts as a spurious wakeup, which is permitted anyway
+    if (status == 0 || status == EINTR) {
+      ret = OS_OK;
+    }
+    return ret;
+  } else {
+    int status = os::Solaris::cond_wait(&_cond, &_mutex);  // untimed wait; EINTR tolerated below
+    assert_status(status == 0 || status == EINTR,
+                  status, "cond_wait");
+    return OS_OK;
+  }
+}
+
+void os::PlatformMonitor::notify() {  // Signals _cond via os::Solaris::cond_signal.
+  int status = os::Solaris::cond_signal(&_cond);
+  assert_status(status == 0, status, "cond_signal");
+}
+
+void os::PlatformMonitor::notify_all() {  // Broadcasts on _cond via os::Solaris::cond_broadcast.
+  int status = os::Solaris::cond_broadcast(&_cond);
+  assert_status(status == 0, status, "cond_broadcast");
+}
+
 extern char** environ;
 
 // Run the specified command in a separate process. Return its exit value,
diff -r 2c6c0fabe6a2 -r 043ae846819f src/hotspot/os/solaris/os_solaris.hpp
--- a/src/hotspot/os/solaris/os_solaris.hpp	Tue Feb 05 13:21:59 2019 -0500
+++ b/src/hotspot/os/solaris/os_solaris.hpp	Tue Feb 05 15:12:13 2019 -0500
@@ -335,4 +335,21 @@
   }
 };
 
+// Platform specific implementation that underpins the VM Monitor/Mutex class (Solaris primitives)
+class PlatformMonitor : public CHeapObj<mtInternal> {
+ private:
+  mutex_t _mutex; // Native mutex for locking
+  cond_t  _cond;  // Native condition variable for blocking
+
+ public:
+  PlatformMonitor();       // initializes _cond and _mutex
+  ~PlatformMonitor();      // destroys _cond and _mutex
+  void lock();             // acquires _mutex
+  void unlock();           // releases _mutex
+  bool try_lock();         // true if _mutex was obtained
+  int wait(jlong millis);  // _mutex must be held; 0 means no timeout; OS_OK or OS_TIMEOUT
+  void notify();           // signals _cond
+  void notify_all();       // broadcasts on _cond
+};
+
 #endif // OS_SOLARIS_OS_SOLARIS_HPP
diff -r 2c6c0fabe6a2 -r 043ae846819f src/hotspot/os/windows/os_windows.cpp
--- a/src/hotspot/os/windows/os_windows.cpp	Tue Feb 05 13:21:59 2019 -0500
+++ b/src/hotspot/os/windows/os_windows.cpp	Tue Feb 05 15:12:13 2019 -0500
@@ -5277,6 +5277,55 @@
   SetEvent(_ParkEvent);
 }
 
+// Platform Monitor implementation
+
+os::PlatformMonitor::PlatformMonitor() {  // Initializes _cond first, then _mutex.
+  InitializeConditionVariable(&_cond);
+  InitializeCriticalSection(&_mutex);
+}
+
+os::PlatformMonitor::~PlatformMonitor() {  // Only _mutex is torn down; no call is made for _cond.
+  DeleteCriticalSection(&_mutex);
+}
+
+void os::PlatformMonitor::lock() {  // Acquires _mutex via EnterCriticalSection.
+  EnterCriticalSection(&_mutex);
+}
+
+void os::PlatformMonitor::unlock() {  // Releases _mutex via LeaveCriticalSection.
+  LeaveCriticalSection(&_mutex);
+}
+
+bool os::PlatformMonitor::try_lock() {  // Single TryEnterCriticalSection attempt on _mutex.
+  return TryEnterCriticalSection(&_mutex);  // nonzero result converts to true
+}
+
+// Must already be locked. millis == 0 waits forever; returns OS_OK if woken, OS_TIMEOUT on timeout.
+int os::PlatformMonitor::wait(jlong millis) {
+  assert(millis >= 0, "negative timeout");
+  if (millis >= INFINITE) millis = INFINITE - 1;  // clamp: narrowing to DWORD could yield 0 or INFINITE
+  int ret = OS_TIMEOUT;
+  int status = SleepConditionVariableCS(&_cond, &_mutex, millis == 0 ? INFINITE : (DWORD)millis);
+  if (status != 0) {
+    ret = OS_OK;
+  }
+  #ifndef PRODUCT
+  else {
+    DWORD err = GetLastError();
+    assert(err == ERROR_TIMEOUT, "SleepConditionVariableCS: %ld:", err);
+  }
+  #endif
+  return ret;
+}
+
+void os::PlatformMonitor::notify() {  // Wakes via WakeConditionVariable on _cond.
+  WakeConditionVariable(&_cond);
+}
+
+void os::PlatformMonitor::notify_all() {  // Wakes via WakeAllConditionVariable on _cond.
+  WakeAllConditionVariable(&_cond);
+}
+
 // Run the specified command in a separate process. Return its exit value,
 // or -1 on failure (e.g. can't create a new process).
 int os::fork_and_exec(char* cmd, bool use_vfork_if_available) {
diff -r 2c6c0fabe6a2 -r 043ae846819f src/hotspot/os/windows/os_windows.hpp
--- a/src/hotspot/os/windows/os_windows.hpp	Tue Feb 05 13:21:59 2019 -0500
+++ b/src/hotspot/os/windows/os_windows.hpp	Tue Feb 05 15:12:13 2019 -0500
@@ -187,4 +187,21 @@
 
 } ;
 
+// Platform specific implementation that underpins the VM Monitor/Mutex class (Win32 primitives)
+class PlatformMonitor : public CHeapObj<mtInternal> {
+ private:
+  CRITICAL_SECTION   _mutex; // Native mutex for locking
+  CONDITION_VARIABLE _cond;  // Native condition variable for blocking
+
+ public:
+  PlatformMonitor();       // initializes _cond and _mutex
+  ~PlatformMonitor();      // deletes _mutex
+  void lock();             // acquires _mutex
+  void unlock();           // releases _mutex
+  bool try_lock();         // true if _mutex was obtained
+  int wait(jlong millis);  // _mutex must be held; 0 means no timeout; OS_OK or OS_TIMEOUT
+  void notify();           // WakeConditionVariable on _cond
+  void notify_all();       // WakeAllConditionVariable on _cond
+};
+
 #endif // OS_WINDOWS_OS_WINDOWS_HPP
diff -r 2c6c0fabe6a2 -r 043ae846819f src/hotspot/share/logging/logTag.hpp
--- a/src/hotspot/share/logging/logTag.hpp	Tue Feb 05 13:21:59 2019 -0500
+++ b/src/hotspot/share/logging/logTag.hpp	Tue Feb 05 15:12:13 2019 -0500
@@ -169,6 +169,7 @@
   LOG_TAG(mirror) \
   LOG_TAG(verification) \
   LOG_TAG(verify) \
+  LOG_TAG(vmmonitor) \
   LOG_TAG(vmoperation) \
   LOG_TAG(vmthread) \
   LOG_TAG(vtables) \
diff -r 2c6c0fabe6a2 -r 043ae846819f src/hotspot/share/runtime/interfaceSupport.inline.hpp
--- a/src/hotspot/share/runtime/interfaceSupport.inline.hpp	Tue Feb 05 13:21:59 2019 -0500
+++ b/src/hotspot/share/runtime/interfaceSupport.inline.hpp	Tue Feb 05 15:12:13 2019 -0500
@@ -286,6 +286,69 @@
   }
 };
 
+// Variant of ThreadBlockInVM designed to avoid certain deadlock scenarios while making
+// transitions inside class Monitor in cases where we need to block for a safepoint or handshake.
+// Compared to ThreadBlockInVM it receives one extra argument: the address of a pointer to the
+// monitor we are trying to acquire. That pointer is used to access and release the monitor
+// if needed to avoid said deadlocks.
+// It works like ThreadBlockInVM but differs from it in two ways:
+// - When transitioning in (constructor), it checks for safepoints without blocking, i.e., calls
+//   back if needed to allow a pending safepoint to continue but does not block in it.
+// - When transitioning back (destructor), if there is a pending safepoint or handshake it releases
+//   the monitor that is only partially acquired and sets the caller's monitor pointer to NULL.
+class ThreadBlockInVMWithDeadlockCheck : public ThreadStateTransition {
+ private:
+  Monitor** _in_flight_monitor_adr;  // address supplied to the constructor; *adr is the monitor being acquired
+
+  void release_monitor() {  // releases the in-flight monitor, if any, and clears the pointer
+    assert(_in_flight_monitor_adr != NULL, "_in_flight_monitor_adr should have been set on constructor");
+    Monitor* in_flight_monitor = *_in_flight_monitor_adr;
+    if (in_flight_monitor != NULL) {
+      in_flight_monitor->release_for_safepoint();
+      *_in_flight_monitor_adr = NULL;  // nothing is in flight any more
+    }
+  }
+ public:
+  ThreadBlockInVMWithDeadlockCheck(JavaThread* thread, Monitor** in_flight_monitor_adr)
+  : ThreadStateTransition(thread), _in_flight_monitor_adr(in_flight_monitor_adr) {
+    // Once we are blocked vm expects stack to be walkable
+    thread->frame_anchor()->make_walkable(thread);
+
+    thread->set_thread_state((JavaThreadState)(_thread_in_vm + 1));  // presumably the transition state after _thread_in_vm
+    InterfaceSupport::serialize_thread_state_with_handler(thread);
+
+    SafepointMechanism::callback_if_safepoint(thread);  // safepoint check that, per the class comment, does not block
+
+    thread->set_thread_state(_thread_blocked);
+
+    CHECK_UNHANDLED_OOPS_ONLY(_thread->clear_unhandled_oops();)
+  }
+  ~ThreadBlockInVMWithDeadlockCheck() {
+    // Change to transition state
+    _thread->set_thread_state((JavaThreadState)(_thread_blocked + 1));
+
+    InterfaceSupport::serialize_thread_state_with_handler(_thread);
+
+    if (SafepointMechanism::should_block(_thread)) {
+      release_monitor();  // drop the in-flight monitor before calling back for the safepoint
+      SafepointMechanism::callback_if_safepoint(_thread);
+      // The VMThread might have read that we were in a _thread_blocked state
+      // and proceeded to process a handshake for us. If that's the case then
+      // we need to block.
+      // By doing this we are also making the current thread process its own
+      // handshake if there is one pending and the VMThread didn't try to process
+      // it yet. This is more of a side-effect and not really necessary; the
+      // handshake could be processed later on.
+      if (_thread->has_handshake()) {
+        _thread->handshake_process_by_self();
+      }
+    }
+
+    _thread->set_thread_state(_thread_in_vm);
+    CHECK_UNHANDLED_OOPS_ONLY(_thread->clear_unhandled_oops();)
+  }
+};
+
 
 // This special transition class is only used to prevent asynchronous exceptions
 // from being installed on vm exit in situations where we can't tolerate them.
diff -r 2c6c0fabe6a2 -r 043ae846819f src/hotspot/share/runtime/mutex.cpp
--- a/src/hotspot/share/runtime/mutex.cpp	Tue Feb 05 13:21:59 2019 -0500
+++ b/src/hotspot/share/runtime/mutex.cpp	Tue Feb 05 15:12:13 2019 -0500
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998, 2018, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1998, 2019, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -23,915 +23,81 @@
  */
 
 #include "precompiled.hpp"
-#include "runtime/atomic.hpp"
+#include "logging/log.hpp"
 #include "runtime/interfaceSupport.inline.hpp"
 #include "runtime/mutex.hpp"
-#include "runtime/orderAccess.hpp"
 #include "runtime/osThread.hpp"
 #include "runtime/safepointMechanism.inline.hpp"
 #include "runtime/thread.inline.hpp"
 #include "utilities/events.hpp"
 #include "utilities/macros.hpp"
 
-// o-o-o-o-o-o-o-o-o-o-o-o-o-o-o-o-o-o-o-o-o-o-o-o-o-o-o-o-o-o-o-o-o-o-o-o-o-o-o-o
-//
-// Native Monitor-Mutex locking - theory of operations
-//
-// * Native Monitors are completely unrelated to Java-level monitors,
-//   although the "back-end" slow-path implementations share a common lineage.
-//   See objectMonitor:: in synchronizer.cpp.
-//   Native Monitors do *not* support nesting or recursion but otherwise
-//   they're basically Hoare-flavor monitors.
-//
-// * A thread acquires ownership of a Monitor/Mutex by CASing the LockByte
-//   in the _LockWord from zero to non-zero.  Note that the _Owner field
-//   is advisory and is used only to verify that the thread calling unlock()
-//   is indeed the last thread to have acquired the lock.
-//
-// * Contending threads "push" themselves onto the front of the contention
-//   queue -- called the cxq -- with CAS and then spin/park.
-//   The _LockWord contains the LockByte as well as the pointer to the head
-//   of the cxq.  Colocating the LockByte with the cxq precludes certain races.
-//
-// * Using a separately addressable LockByte allows for CAS:MEMBAR or CAS:0
-//   idioms.  We currently use MEMBAR in the uncontended unlock() path, as
-//   MEMBAR often has less latency than CAS.  If warranted, we could switch to
-//   a CAS:0 mode, using timers to close the resultant race, as is done
-//   with Java Monitors in synchronizer.cpp.
-//
-//   See the following for a discussion of the relative cost of atomics (CAS)
-//   MEMBAR, and ways to eliminate such instructions from the common-case paths:
-//   -- http://blogs.sun.com/dave/entry/biased_locking_in_hotspot
-//   -- http://blogs.sun.com/dave/resource/MustangSync.pdf
-//   -- http://blogs.sun.com/dave/resource/synchronization-public2.pdf
-//   -- synchronizer.cpp
-//
-// * Overall goals - desiderata
-//   1. Minimize context switching
-//   2. Minimize lock migration
-//   3. Minimize CPI -- affinity and locality
-//   4. Minimize the execution of high-latency instructions such as CAS or MEMBAR
-//   5. Minimize outer lock hold times
-//   6. Behave gracefully on a loaded system
-//
-// * Thread flow and list residency:
-//
-//   Contention queue --> EntryList --> OnDeck --> Owner --> !Owner
-//   [..resident on monitor list..]
-//   [...........contending..................]
-//
-//   -- The contention queue (cxq) contains recently-arrived threads (RATs).
-//      Threads on the cxq eventually drain into the EntryList.
-//   -- Invariant: a thread appears on at most one list -- cxq, EntryList
-//      or WaitSet -- at any one time.
-//   -- For a given monitor there can be at most one "OnDeck" thread at any
-//      given time but if needbe this particular invariant could be relaxed.
-//
-// * The WaitSet and EntryList linked lists are composed of ParkEvents.
-//   I use ParkEvent instead of threads as ParkEvents are immortal and
-//   type-stable, meaning we can safely unpark() a possibly stale
-//   list element in the unlock()-path.  (That's benign).
-//
-// * Succession policy - providing for progress:
-//
-//   As necessary, the unlock()ing thread identifies, unlinks, and unparks
-//   an "heir presumptive" tentative successor thread from the EntryList.
-//   This becomes the so-called "OnDeck" thread, of which there can be only
-//   one at any given time for a given monitor.  The wakee will recontend
-//   for ownership of monitor.
-//
-//   Succession is provided for by a policy of competitive handoff.
-//   The exiting thread does _not_ grant or pass ownership to the
-//   successor thread.  (This is also referred to as "handoff" succession").
-//   Instead the exiting thread releases ownership and possibly wakes
-//   a successor, so the successor can (re)compete for ownership of the lock.
-//
-//   Competitive handoff provides excellent overall throughput at the expense
-//   of short-term fairness.  If fairness is a concern then one remedy might
-//   be to add an AcquireCounter field to the monitor.  After a thread acquires
-//   the lock it will decrement the AcquireCounter field.  When the count
-//   reaches 0 the thread would reset the AcquireCounter variable, abdicate
-//   the lock directly to some thread on the EntryList, and then move itself to the
-//   tail of the EntryList.
-//
-//   But in practice most threads engage or otherwise participate in resource
-//   bounded producer-consumer relationships, so lock domination is not usually
-//   a practical concern.  Recall too, that in general it's easier to construct
-//   a fair lock from a fast lock, but not vice-versa.
-//
-// * The cxq can have multiple concurrent "pushers" but only one concurrent
-//   detaching thread.  This mechanism is immune from the ABA corruption.
-//   More precisely, the CAS-based "push" onto cxq is ABA-oblivious.
-//   We use OnDeck as a pseudo-lock to enforce the at-most-one detaching
-//   thread constraint.
-//
-// * Taken together, the cxq and the EntryList constitute or form a
-//   single logical queue of threads stalled trying to acquire the lock.
-//   We use two distinct lists to reduce heat on the list ends.
-//   Threads in lock() enqueue onto cxq while threads in unlock() will
-//   dequeue from the EntryList.  (c.f. Michael Scott's "2Q" algorithm).
-//   A key desideratum is to minimize queue & monitor metadata manipulation
-//   that occurs while holding the "outer" monitor lock -- that is, we want to
-//   minimize monitor lock holds times.
-//
-//   The EntryList is ordered by the prevailing queue discipline and
-//   can be organized in any convenient fashion, such as a doubly-linked list or
-//   a circular doubly-linked list.  If we need a priority queue then something akin
-//   to Solaris' sleepq would work nicely.  Viz.,
-//   -- http://agg.eng/ws/on10_nightly/source/usr/src/uts/common/os/sleepq.c.
-//   -- http://cvs.opensolaris.org/source/xref/onnv/onnv-gate/usr/src/uts/common/os/sleepq.c
-//   Queue discipline is enforced at ::unlock() time, when the unlocking thread
-//   drains the cxq into the EntryList, and orders or reorders the threads on the
-//   EntryList accordingly.
-//
-//   Barring "lock barging", this mechanism provides fair cyclic ordering,
-//   somewhat similar to an elevator-scan.
-//
-// * OnDeck
-//   --  For a given monitor there can be at most one OnDeck thread at any given
-//       instant.  The OnDeck thread is contending for the lock, but has been
-//       unlinked from the EntryList and cxq by some previous unlock() operations.
-//       Once a thread has been designated the OnDeck thread it will remain so
-//       until it manages to acquire the lock -- being OnDeck is a stable property.
-//   --  Threads on the EntryList or cxq are _not allowed to attempt lock acquisition.
-//   --  OnDeck also serves as an "inner lock" as follows.  Threads in unlock() will, after
-//       having cleared the LockByte and dropped the outer lock,  attempt to "trylock"
-//       OnDeck by CASing the field from null to non-null.  If successful, that thread
-//       is then responsible for progress and succession and can use CAS to detach and
-//       drain the cxq into the EntryList.  By convention, only this thread, the holder of
-//       the OnDeck inner lock, can manipulate the EntryList or detach and drain the
-//       RATs on the cxq into the EntryList.  This avoids ABA corruption on the cxq as
-//       we allow multiple concurrent "push" operations but restrict detach concurrency
-//       to at most one thread.  Having selected and detached a successor, the thread then
-//       changes the OnDeck to refer to that successor, and then unparks the successor.
-//       That successor will eventually acquire the lock and clear OnDeck.  Beware
-//       that the OnDeck usage as a lock is asymmetric.  A thread in unlock() transiently
-//       "acquires" OnDeck, performs queue manipulations, passes OnDeck to some successor,
-//       and then the successor eventually "drops" OnDeck.  Note that there's never
-//       any sense of contention on the inner lock, however.  Threads never contend
-//       or wait for the inner lock.
-//   --  OnDeck provides for futile wakeup throttling a described in section 3.3 of
-//       See http://www.usenix.org/events/jvm01/full_papers/dice/dice.pdf
-//       In a sense, OnDeck subsumes the ObjectMonitor _Succ and ObjectWaiter
-//       TState fields found in Java-level objectMonitors.  (See synchronizer.cpp).
-//
-// * Waiting threads reside on the WaitSet list -- wait() puts
-//   the caller onto the WaitSet.  Notify() or notifyAll() simply
-//   transfers threads from the WaitSet to either the EntryList or cxq.
-//   Subsequent unlock() operations will eventually unpark the notifyee.
-//   Unparking a notifee in notify() proper is inefficient - if we were to do so
-//   it's likely the notifyee would simply impale itself on the lock held
-//   by the notifier.
-//
-// * The mechanism is obstruction-free in that if the holder of the transient
-//   OnDeck lock in unlock() is preempted or otherwise stalls, other threads
-//   can still acquire and release the outer lock and continue to make progress.
-//   At worst, waking of already blocked contending threads may be delayed,
-//   but nothing worse.  (We only use "trylock" operations on the inner OnDeck
-//   lock).
-//
-// * Note that thread-local storage must be initialized before a thread
-//   uses Native monitors or mutexes.  The native monitor-mutex subsystem
-//   depends on Thread::current().
-//
-// * The monitor synchronization subsystem avoids the use of native
-//   synchronization primitives except for the narrow platform-specific
-//   park-unpark abstraction.  See the comments in os_solaris.cpp regarding
-//   the semantics of park-unpark.  Put another way, this monitor implementation
-//   depends only on atomic operations and park-unpark.  The monitor subsystem
-//   manages all RUNNING->BLOCKED and BLOCKED->READY transitions while the
-//   underlying OS manages the READY<->RUN transitions.
-//
-// * The memory consistency model provide by lock()-unlock() is at least as
-//   strong or stronger than the Java Memory model defined by JSR-133.
-//   That is, we guarantee at least entry consistency, if not stronger.
-//   See http://g.oswego.edu/dl/jmm/cookbook.html.
-//
-// * Thread:: currently contains a set of purpose-specific ParkEvents:
-//   _MutexEvent, _ParkEvent, etc.  A better approach might be to do away with
-//   the purpose-specific ParkEvents and instead implement a general per-thread
-//   stack of available ParkEvents which we could provision on-demand.  The
-//   stack acts as a local cache to avoid excessive calls to ParkEvent::Allocate()
-//   and ::Release().  A thread would simply pop an element from the local stack before it
-//   enqueued or park()ed.  When the contention was over the thread would
-//   push the no-longer-needed ParkEvent back onto its stack.
-//
-// * A slightly reduced form of ILock() and IUnlock() have been partially
-//   model-checked (Murphi) for safety and progress at T=1,2,3 and 4.
-//   It'd be interesting to see if TLA/TLC could be useful as well.
-//
-// * Mutex-Monitor is a low-level "leaf" subsystem.  That is, the monitor
-//   code should never call other code in the JVM that might itself need to
-//   acquire monitors or mutexes.  That's true *except* in the case of the
-//   ThreadBlockInVM state transition wrappers.  The ThreadBlockInVM DTOR handles
-//   mutator reentry (ingress) by checking for a pending safepoint in which case it will
-//   call SafepointSynchronize::block(), which in turn may call Safepoint_lock->lock(), etc.
-//   In that particular case a call to lock() for a given Monitor can end up recursively
-//   calling lock() on another monitor.   While distasteful, this is largely benign
-//   as the calls come from jacket that wraps lock(), and not from deep within lock() itself.
-//
-//   It's unfortunate that native mutexes and thread state transitions were convolved.
-//   They're really separate concerns and should have remained that way.  Melding
-//   them together was facile -- a bit too facile.   The current implementation badly
-//   conflates the two concerns.
-//
-// * TODO-FIXME:
-//
-//   -- Add DTRACE probes for contended acquire, contended acquired, contended unlock
-//      We should also add DTRACE probes in the ParkEvent subsystem for
-//      Park-entry, Park-exit, and Unpark.
-//
-//   -- We have an excess of mutex-like constructs in the JVM, namely:
-//      1. objectMonitors for Java-level synchronization (synchronizer.cpp)
-//      2. low-level muxAcquire and muxRelease
-//      3. low-level spinAcquire and spinRelease
-//      4. native Mutex:: and Monitor::
-//      5. jvm_raw_lock() and _unlock()
-//      6. JVMTI raw monitors -- distinct from (5) despite having a confusingly
-//         similar name.
-//
-// o-o-o-o-o-o-o-o-o-o-o-o-o-o-o-o-o-o-o-o-o-o-o-o-o-o-o-o-o-o-o-o-o-o-o-o-o-o-o-o
 
-#define UNS(x) (uintptr_t(x))
-#define TRACE(m)                   \
-  {                                \
-    static volatile int ctr = 0;   \
-    int x = ++ctr;                 \
-    if ((x & (x - 1)) == 0) {      \
-      ::printf("%d:%s\n", x, #m);  \
-      ::fflush(stdout);            \
-    }                              \
-  }
-
-const intptr_t _LBIT = 1;
-
-// Endian-ness ... index of least-significant byte in SplitWord.Bytes[]
-#ifdef VM_LITTLE_ENDIAN
- #define _LSBINDEX 0
-#else
- #define _LSBINDEX (sizeof(intptr_t)-1)
-#endif
-
-// Simplistic low-quality Marsaglia SHIFT-XOR RNG.
-// Bijective except for the trailing mask operation.
-// Useful for spin loops as the compiler can't optimize it away.
-
-static inline jint MarsagliaXORV(jint x) {
-  if (x == 0) x = 1|os::random();
-  x ^= x << 6;
-  x ^= ((unsigned)x) >> 21;
-  x ^= x << 7;
-  return x & 0x7FFFFFFF;
-}
-
-static int Stall(int its) {
-  static volatile jint rv = 1;
-  volatile int OnFrame = 0;
-  jint v = rv ^ UNS(OnFrame);
-  while (--its >= 0) {
-    v = MarsagliaXORV(v);
-  }
-  // Make this impossible for the compiler to optimize away,
-  // but (mostly) avoid W coherency sharing on MP systems.
-  if (v == 0x12345) rv = v;
-  return v;
-}
-
-int Monitor::TryLock() {
-  intptr_t v = _LockWord.FullWord;
-  for (;;) {
-    if ((v & _LBIT) != 0) return 0;
-    const intptr_t u = Atomic::cmpxchg(v|_LBIT, &_LockWord.FullWord, v);
-    if (v == u) return 1;
-    v = u;
-  }
-}
-
-int Monitor::TryFast() {
-  // Optimistic fast-path form ...
-  // Fast-path attempt for the common uncontended case.
-  // Avoid RTS->RTO $ coherence upgrade on typical SMP systems.
-  intptr_t v = Atomic::cmpxchg(_LBIT, &_LockWord.FullWord, (intptr_t)0);  // agro ...
-  if (v == 0) return 1;
-
-  for (;;) {
-    if ((v & _LBIT) != 0) return 0;
-    const intptr_t u = Atomic::cmpxchg(v|_LBIT, &_LockWord.FullWord, v);
-    if (v == u) return 1;
-    v = u;
-  }
-}
-
-int Monitor::ILocked() {
-  const intptr_t w = _LockWord.FullWord & 0xFF;
-  assert(w == 0 || w == _LBIT, "invariant");
-  return w == _LBIT;
-}
-
-// Polite TATAS spinlock with exponential backoff - bounded spin.
-// Ideally we'd use processor cycles, time or vtime to control
-// the loop, but we currently use iterations.
-// All the constants within were derived empirically but work over
-// over the spectrum of J2SE reference platforms.
-// On Niagara-class systems the back-off is unnecessary but
-// is relatively harmless.  (At worst it'll slightly retard
-// acquisition times).  The back-off is critical for older SMP systems
-// where constant fetching of the LockWord would otherwise impair
-// scalability.
-//
-// Clamp spinning at approximately 1/2 of a context-switch round-trip.
-// See synchronizer.cpp for details and rationale.
-
-int Monitor::TrySpin(Thread * const Self) {
-  if (TryLock())    return 1;
-  if (!os::is_MP()) return 0;
-
-  int Probes  = 0;
-  int Delay   = 0;
-  int SpinMax = 20;
-  for (;;) {
-    intptr_t v = _LockWord.FullWord;
-    if ((v & _LBIT) == 0) {
-      if (Atomic::cmpxchg (v|_LBIT, &_LockWord.FullWord, v) == v) {
-        return 1;
-      }
-      continue;
-    }
-
-    SpinPause();
-
-    // Periodically increase Delay -- variable Delay form
-    // conceptually: delay *= 1 + 1/Exponent
-    ++Probes;
-    if (Probes > SpinMax) return 0;
-
-    if ((Probes & 0x7) == 0) {
-      Delay = ((Delay << 1)|1) & 0x7FF;
-      // CONSIDER: Delay += 1 + (Delay/4); Delay &= 0x7FF ;
-    }
-
-    // Stall for "Delay" time units - iterations in the current implementation.
-    // Avoid generating coherency traffic while stalled.
-    // Possible ways to delay:
-    //   PAUSE, SLEEP, MEMBAR #sync, MEMBAR #halt,
-    //   wr %g0,%asi, gethrtime, rdstick, rdtick, rdtsc, etc. ...
-    // Note that on Niagara-class systems we want to minimize STs in the
-    // spin loop.  N1 and brethren write-around the L1$ over the xbar into the L2$.
-    // Furthermore, they don't have a W$ like traditional SPARC processors.
-    // We currently use a Marsaglia Shift-Xor RNG loop.
-    if (Self != NULL) {
-      jint rv = Self->rng[0];
-      for (int k = Delay; --k >= 0;) {
-        rv = MarsagliaXORV(rv);
-        if (SafepointMechanism::should_block(Self)) return 0;
-      }
-      Self->rng[0] = rv;
-    } else {
-      Stall(Delay);
-    }
-  }
-}
-
-static int ParkCommon(ParkEvent * ev, jlong timo) {
-  // Diagnostic support - periodically unwedge blocked threads
-  int err = OS_OK;
-  if (0 == timo) {
-    ev->park();
-  } else {
-    err = ev->park(timo);
-  }
-  return err;
-}
-
-inline int Monitor::AcquireOrPush(ParkEvent * ESelf) {
-  intptr_t v = _LockWord.FullWord;
-  for (;;) {
-    if ((v & _LBIT) == 0) {
-      const intptr_t u = Atomic::cmpxchg(v|_LBIT, &_LockWord.FullWord, v);
-      if (u == v) return 1;        // indicate acquired
-      v = u;
-    } else {
-      // Anticipate success ...
-      ESelf->ListNext = (ParkEvent *)(v & ~_LBIT);
-      const intptr_t u = Atomic::cmpxchg(intptr_t(ESelf)|_LBIT, &_LockWord.FullWord, v);
-      if (u == v) return 0;        // indicate pushed onto cxq
-      v = u;
-    }
-    // Interference - LockWord change - just retry
-  }
-}
-
-// ILock and IWait are the lowest level primitive internal blocking
-// synchronization functions.  The callers of IWait and ILock must have
-// performed any needed state transitions beforehand.
-// IWait and ILock may directly call park() without any concern for thread state.
-// Note that ILock and IWait do *not* access _owner.
-// _owner is a higher-level logical concept.
-
-void Monitor::ILock(Thread * Self) {
-  assert(_OnDeck != Self->_MutexEvent, "invariant");
-
-  if (TryFast()) {
- Exeunt:
-    assert(ILocked(), "invariant");
-    return;
-  }
-
-  ParkEvent * const ESelf = Self->_MutexEvent;
-  assert(_OnDeck != ESelf, "invariant");
-
-  // As an optimization, spinners could conditionally try to set _OnDeck to _LBIT
-  // Synchronizer.cpp uses a similar optimization.
-  if (TrySpin(Self)) goto Exeunt;
-
-  // Slow-path - the lock is contended.
-  // Either Enqueue Self on cxq or acquire the outer lock.
-  // LockWord encoding = (cxq,LOCKBYTE)
-  ESelf->reset();
-  OrderAccess::fence();
-
-  if (AcquireOrPush(ESelf)) goto Exeunt;
-
-  // At any given time there is at most one ondeck thread.
-  // ondeck implies not resident on cxq and not resident on EntryList
-  // Only the OnDeck thread can try to acquire -- contend for -- the lock.
-  // CONSIDER: use Self->OnDeck instead of m->OnDeck.
-  // Deschedule Self so that others may run.
-  while (OrderAccess::load_acquire(&_OnDeck) != ESelf) {
-    ParkCommon(ESelf, 0);
-  }
-
-  // Self is now in the OnDeck position and will remain so until it
-  // manages to acquire the lock.
-  for (;;) {
-    assert(_OnDeck == ESelf, "invariant");
-    if (TrySpin(Self)) break;
-    // It's probably wise to spin only if we *actually* blocked
-    // CONSIDER: check the lockbyte, if it remains set then
-    // preemptively drain the cxq into the EntryList.
-    // The best place and time to perform queue operations -- lock metadata --
-    // is _before having acquired the outer lock, while waiting for the lock to drop.
-    ParkCommon(ESelf, 0);
-  }
-
-  assert(_OnDeck == ESelf, "invariant");
-  _OnDeck = NULL;
-
-  // Note that we current drop the inner lock (clear OnDeck) in the slow-path
-  // epilogue immediately after having acquired the outer lock.
-  // But instead we could consider the following optimizations:
-  // A. Shift or defer dropping the inner lock until the subsequent IUnlock() operation.
-  //    This might avoid potential reacquisition of the inner lock in IUlock().
-  // B. While still holding the inner lock, attempt to opportunistically select
-  //    and unlink the next OnDeck thread from the EntryList.
-  //    If successful, set OnDeck to refer to that thread, otherwise clear OnDeck.
-  //    It's critical that the select-and-unlink operation run in constant-time as
-  //    it executes when holding the outer lock and may artificially increase the
-  //    effective length of the critical section.
-  // Note that (A) and (B) are tantamount to succession by direct handoff for
-  // the inner lock.
-  goto Exeunt;
-}
-
-void Monitor::IUnlock(bool RelaxAssert) {
-  assert(ILocked(), "invariant");
-  // Conceptually we need a MEMBAR #storestore|#loadstore barrier or fence immediately
-  // before the store that releases the lock.  Crucially, all the stores and loads in the
-  // critical section must be globally visible before the store of 0 into the lock-word
-  // that releases the lock becomes globally visible.  That is, memory accesses in the
-  // critical section should not be allowed to bypass or overtake the following ST that
-  // releases the lock.  As such, to prevent accesses within the critical section
-  // from "leaking" out, we need a release fence between the critical section and the
-  // store that releases the lock.  In practice that release barrier is elided on
-  // platforms with strong memory models such as TSO.
-  //
-  // Note that the OrderAccess::storeload() fence that appears after unlock store
-  // provides for progress conditions and succession and is _not related to exclusion
-  // safety or lock release consistency.
-  OrderAccess::release_store(&_LockWord.Bytes[_LSBINDEX], jbyte(0)); // drop outer lock
-
-  OrderAccess::storeload();
-  ParkEvent * const w = _OnDeck; // raw load as we will just return if non-NULL
-  assert(RelaxAssert || w != Thread::current()->_MutexEvent, "invariant");
-  if (w != NULL) {
-    // Either we have a valid ondeck thread or ondeck is transiently "locked"
-    // by some exiting thread as it arranges for succession.  The LSBit of
-    // OnDeck allows us to discriminate two cases.  If the latter, the
-    // responsibility for progress and succession lies with that other thread.
-    // For good performance, we also depend on the fact that redundant unpark()
-    // operations are cheap.  That is, repeated Unpark()ing of the OnDeck thread
-    // is inexpensive.  This approach provides implicit futile wakeup throttling.
-    // Note that the referent "w" might be stale with respect to the lock.
-    // In that case the following unpark() is harmless and the worst that'll happen
-    // is a spurious return from a park() operation.  Critically, if "w" _is stale,
-    // then progress is known to have occurred as that means the thread associated
-    // with "w" acquired the lock.  In that case this thread need take no further
-    // action to guarantee progress.
-    if ((UNS(w) & _LBIT) == 0) w->unpark();
-    return;
-  }
-
-  intptr_t cxq = _LockWord.FullWord;
-  if (((cxq & ~_LBIT)|UNS(_EntryList)) == 0) {
-    return;      // normal fast-path exit - cxq and EntryList both empty
-  }
-  if (cxq & _LBIT) {
-    // Optional optimization ...
-    // Some other thread acquired the lock in the window since this
-    // thread released it.  Succession is now that thread's responsibility.
-    return;
-  }
-
- Succession:
-  // Slow-path exit - this thread must ensure succession and progress.
-  // OnDeck serves as lock to protect cxq and EntryList.
-  // Only the holder of OnDeck can manipulate EntryList or detach the RATs from cxq.
-  // Avoid ABA - allow multiple concurrent producers (enqueue via push-CAS)
-  // but only one concurrent consumer (detacher of RATs).
-  // Unlike a normal lock, however, the exiting thread "locks" OnDeck,
-  // picks a successor and marks that thread as OnDeck.  That successor
-  // thread will then clear OnDeck once it eventually acquires the outer lock.
-  if (!Atomic::replace_if_null((ParkEvent*)_LBIT, &_OnDeck)) {
-    return;
-  }
-
-  ParkEvent * List = _EntryList;
-  if (List != NULL) {
-    // Transfer the head of the EntryList to the OnDeck position.
-    // Once OnDeck, a thread stays OnDeck until it acquires the lock.
-    // For a given lock there is at most OnDeck thread at any one instant.
-   WakeOne:
-    assert(List == _EntryList, "invariant");
-    ParkEvent * const w = List;
-    assert(RelaxAssert || w != Thread::current()->_MutexEvent, "invariant");
-    _EntryList = w->ListNext;
-    // as a diagnostic measure consider setting w->_ListNext = BAD
-    assert(intptr_t(_OnDeck) == _LBIT, "invariant");
-
-    // Pass OnDeck role to w, ensuring that _EntryList has been set first.
-    // w will clear _OnDeck once it acquires the outer lock.
-    // Note that once we set _OnDeck that thread can acquire the mutex, proceed
-    // with its critical section and then enter this code to unlock the mutex. So
-    // you can have multiple threads active in IUnlock at the same time.
-    OrderAccess::release_store(&_OnDeck, w);
-
-    // Another optional optimization ...
-    // For heavily contended locks it's not uncommon that some other
-    // thread acquired the lock while this thread was arranging succession.
-    // Try to defer the unpark() operation - Delegate the responsibility
-    // for unpark()ing the OnDeck thread to the current or subsequent owners
-    // That is, the new owner is responsible for unparking the OnDeck thread.
-    OrderAccess::storeload();
-    cxq = _LockWord.FullWord;
-    if (cxq & _LBIT) return;
-
-    w->unpark();
-    return;
-  }
-
-  cxq = _LockWord.FullWord;
-  if ((cxq & ~_LBIT) != 0) {
-    // The EntryList is empty but the cxq is populated.
-    // drain RATs from cxq into EntryList
-    // Detach RATs segment with CAS and then merge into EntryList
-    for (;;) {
-      // optional optimization - if locked, the owner is responsible for succession
-      if (cxq & _LBIT) goto Punt;
-      const intptr_t vfy = Atomic::cmpxchg(cxq & _LBIT, &_LockWord.FullWord, cxq);
-      if (vfy == cxq) break;
-      cxq = vfy;
-      // Interference - LockWord changed - Just retry
-      // We can see concurrent interference from contending threads
-      // pushing themselves onto the cxq or from lock-unlock operations.
-      // From the perspective of this thread, EntryList is stable and
-      // the cxq is prepend-only -- the head is volatile but the interior
-      // of the cxq is stable.  In theory if we encounter interference from threads
-      // pushing onto cxq we could simply break off the original cxq suffix and
-      // move that segment to the EntryList, avoiding a 2nd or multiple CAS attempts
-      // on the high-traffic LockWord variable.   For instance lets say the cxq is "ABCD"
-      // when we first fetch cxq above.  Between the fetch -- where we observed "A"
-      // -- and CAS -- where we attempt to CAS null over A -- "PQR" arrive,
-      // yielding cxq = "PQRABCD".  In this case we could simply set A.ListNext
-      // null, leaving cxq = "PQRA" and transfer the "BCD" segment to the EntryList.
-      // Note too, that it's safe for this thread to traverse the cxq
-      // without taking any special concurrency precautions.
-    }
-
-    // We don't currently reorder the cxq segment as we move it onto
-    // the EntryList, but it might make sense to reverse the order
-    // or perhaps sort by thread priority.  See the comments in
-    // synchronizer.cpp objectMonitor::exit().
-    assert(_EntryList == NULL, "invariant");
-    _EntryList = List = (ParkEvent *)(cxq & ~_LBIT);
-    assert(List != NULL, "invariant");
-    goto WakeOne;
-  }
-
-  // cxq|EntryList is empty.
-  // w == NULL implies that cxq|EntryList == NULL in the past.
-  // Possible race - rare inopportune interleaving.
-  // A thread could have added itself to cxq since this thread previously checked.
-  // Detect and recover by refetching cxq.
- Punt:
-  assert(intptr_t(_OnDeck) == _LBIT, "invariant");
-  _OnDeck = NULL;            // Release inner lock.
-  OrderAccess::storeload();   // Dekker duality - pivot point
-
-  // Resample LockWord/cxq to recover from possible race.
-  // For instance, while this thread T1 held OnDeck, some other thread T2 might
-  // acquire the outer lock.  Another thread T3 might try to acquire the outer
-  // lock, but encounter contention and enqueue itself on cxq.  T2 then drops the
-  // outer lock, but skips succession as this thread T1 still holds OnDeck.
-  // T1 is and remains responsible for ensuring succession of T3.
-  //
-  // Note that we don't need to recheck EntryList, just cxq.
-  // If threads moved onto EntryList since we dropped OnDeck
-  // that implies some other thread forced succession.
-  cxq = _LockWord.FullWord;
-  if ((cxq & ~_LBIT) != 0 && (cxq & _LBIT) == 0) {
-    goto Succession;         // potential race -- re-run succession
-  }
-  return;
-}
-
-bool Monitor::notify() {
-  assert(_owner == Thread::current(), "invariant");
-  assert(ILocked(), "invariant");
-  if (_WaitSet == NULL) return true;
-
-  // Transfer one thread from the WaitSet to the EntryList or cxq.
-  // Currently we just unlink the head of the WaitSet and prepend to the cxq.
-  // And of course we could just unlink it and unpark it, too, but
-  // in that case it'd likely impale itself on the reentry.
-  Thread::muxAcquire(_WaitLock, "notify:WaitLock");
-  ParkEvent * nfy = _WaitSet;
-  if (nfy != NULL) {                  // DCL idiom
-    _WaitSet = nfy->ListNext;
-    assert(nfy->Notified == 0, "invariant");
-    // push nfy onto the cxq
-    for (;;) {
-      const intptr_t v = _LockWord.FullWord;
-      assert((v & 0xFF) == _LBIT, "invariant");
-      nfy->ListNext = (ParkEvent *)(v & ~_LBIT);
-      if (Atomic::cmpxchg(intptr_t(nfy)|_LBIT, &_LockWord.FullWord, v) == v) break;
-      // interference - _LockWord changed -- just retry
-    }
-    // Note that setting Notified before pushing nfy onto the cxq is
-    // also legal and safe, but the safety properties are much more
-    // subtle, so for the sake of code stewardship ...
-    OrderAccess::fence();
-    nfy->Notified = 1;
-  }
-  Thread::muxRelease(_WaitLock);
-  assert(ILocked(), "invariant");
-  return true;
-}
-
-// Currently notifyAll() transfers the waiters one-at-a-time from the waitset
-// to the cxq.  This could be done more efficiently with a single bulk en-mass transfer,
-// but in practice notifyAll() for large #s of threads is rare and not time-critical.
-// Beware too, that we invert the order of the waiters.  Lets say that the
-// waitset is "ABCD" and the cxq is "XYZ".  After a notifyAll() the waitset
-// will be empty and the cxq will be "DCBAXYZ".  This is benign, of course.
-
-bool Monitor::notify_all() {
-  assert(_owner == Thread::current(), "invariant");
-  assert(ILocked(), "invariant");
-  while (_WaitSet != NULL) notify();
-  return true;
-}
-
-int Monitor::IWait(Thread * Self, jlong timo) {
-  assert(ILocked(), "invariant");
-
-  // Phases:
-  // 1. Enqueue Self on WaitSet - currently prepend
-  // 2. unlock - drop the outer lock
-  // 3. wait for either notification or timeout
-  // 4. lock - reentry - reacquire the outer lock
-
-  ParkEvent * const ESelf = Self->_MutexEvent;
-  ESelf->Notified = 0;
-  ESelf->reset();
-  OrderAccess::fence();
-
-  // Add Self to WaitSet
-  // Ideally only the holder of the outer lock would manipulate the WaitSet -
-  // That is, the outer lock would implicitly protect the WaitSet.
-  // But if a thread in wait() encounters a timeout it will need to dequeue itself
-  // from the WaitSet _before it becomes the owner of the lock.  We need to dequeue
-  // as the ParkEvent -- which serves as a proxy for the thread -- can't reside
-  // on both the WaitSet and the EntryList|cxq at the same time..  That is, a thread
-  // on the WaitSet can't be allowed to compete for the lock until it has managed to
-  // unlink its ParkEvent from WaitSet.  Thus the need for WaitLock.
-  // Contention on the WaitLock is minimal.
-  //
-  // Another viable approach would be add another ParkEvent, "WaitEvent" to the
-  // thread class.  The WaitSet would be composed of WaitEvents.  Only the
-  // owner of the outer lock would manipulate the WaitSet.  A thread in wait()
-  // could then compete for the outer lock, and then, if necessary, unlink itself
-  // from the WaitSet only after having acquired the outer lock.  More precisely,
-  // there would be no WaitLock.  A thread in in wait() would enqueue its WaitEvent
-  // on the WaitSet; release the outer lock; wait for either notification or timeout;
-  // reacquire the inner lock; and then, if needed, unlink itself from the WaitSet.
-  //
-  // Alternatively, a 2nd set of list link fields in the ParkEvent might suffice.
-  // One set would be for the WaitSet and one for the EntryList.
-  // We could also deconstruct the ParkEvent into a "pure" event and add a
-  // new immortal/TSM "ListElement" class that referred to ParkEvents.
-  // In that case we could have one ListElement on the WaitSet and another
-  // on the EntryList, with both referring to the same pure Event.
-
-  Thread::muxAcquire(_WaitLock, "wait:WaitLock:Add");
-  ESelf->ListNext = _WaitSet;
-  _WaitSet = ESelf;
-  Thread::muxRelease(_WaitLock);
-
-  // Release the outer lock
-  // We call IUnlock (RelaxAssert=true) as a thread T1 might
-  // enqueue itself on the WaitSet, call IUnlock(), drop the lock,
-  // and then stall before it can attempt to wake a successor.
-  // Some other thread T2 acquires the lock, and calls notify(), moving
-  // T1 from the WaitSet to the cxq.  T2 then drops the lock.  T1 resumes,
-  // and then finds *itself* on the cxq.  During the course of a normal
-  // IUnlock() call a thread should _never find itself on the EntryList
-  // or cxq, but in the case of wait() it's possible.
-  // See synchronizer.cpp objectMonitor::wait().
-  IUnlock(true);
-
-  // Wait for either notification or timeout
-  // Beware that in some circumstances we might propagate
-  // spurious wakeups back to the caller.
-
-  for (;;) {
-    if (ESelf->Notified) break;
-    int err = ParkCommon(ESelf, timo);
-    if (err == OS_TIMEOUT) break;
-  }
-
-  // Prepare for reentry - if necessary, remove ESelf from WaitSet
-  // ESelf can be:
-  // 1. Still on the WaitSet.  This can happen if we exited the loop by timeout.
-  // 2. On the cxq or EntryList
-  // 3. Not resident on cxq, EntryList or WaitSet, but in the OnDeck position.
-
-  OrderAccess::fence();
-  int WasOnWaitSet = 0;
-  if (ESelf->Notified == 0) {
-    Thread::muxAcquire(_WaitLock, "wait:WaitLock:remove");
-    if (ESelf->Notified == 0) {     // DCL idiom
-      assert(_OnDeck != ESelf, "invariant");   // can't be both OnDeck and on WaitSet
-      // ESelf is resident on the WaitSet -- unlink it.
-      // A doubly-linked list would be better here so we can unlink in constant-time.
-      // We have to unlink before we potentially recontend as ESelf might otherwise
-      // end up on the cxq|EntryList -- it can't be on two lists at once.
-      ParkEvent * p = _WaitSet;
-      ParkEvent * q = NULL;            // classic q chases p
-      while (p != NULL && p != ESelf) {
-        q = p;
-        p = p->ListNext;
-      }
-      assert(p == ESelf, "invariant");
-      if (p == _WaitSet) {      // found at head
-        assert(q == NULL, "invariant");
-        _WaitSet = p->ListNext;
-      } else {                  // found in interior
-        assert(q->ListNext == p, "invariant");
-        q->ListNext = p->ListNext;
-      }
-      WasOnWaitSet = 1;        // We were *not* notified but instead encountered timeout
-    }
-    Thread::muxRelease(_WaitLock);
-  }
-
-  // Reentry phase - reacquire the lock
-  if (WasOnWaitSet) {
-    // ESelf was previously on the WaitSet but we just unlinked it above
-    // because of a timeout.  ESelf is not resident on any list and is not OnDeck
-    assert(_OnDeck != ESelf, "invariant");
-    ILock(Self);
-  } else {
-    // A prior notify() operation moved ESelf from the WaitSet to the cxq.
-    // ESelf is now on the cxq, EntryList or at the OnDeck position.
-    // The following fragment is extracted from Monitor::ILock()
-    for (;;) {
-      if (OrderAccess::load_acquire(&_OnDeck) == ESelf && TrySpin(Self)) break;
-      ParkCommon(ESelf, 0);
-    }
-    assert(_OnDeck == ESelf, "invariant");
-    _OnDeck = NULL;
-  }
-
-  assert(ILocked(), "invariant");
-  return WasOnWaitSet != 0;        // return true IFF timeout
-}
-
-
-// ON THE VMTHREAD SNEAKING PAST HELD LOCKS:
-// In particular, there are certain types of global lock that may be held
-// by a Java thread while it is blocked at a safepoint but before it has
-// written the _owner field. These locks may be sneakily acquired by the
-// VM thread during a safepoint to avoid deadlocks. Alternatively, one should
-// identify all such locks, and ensure that Java threads never block at
-// safepoints while holding them (_no_safepoint_check_flag). While it
-// seems as though this could increase the time to reach a safepoint
-// (or at least increase the mean, if not the variance), the latter
-// approach might make for a cleaner, more maintainable JVM design.
-//
-// Sneaking is vile and reprehensible and should be excised at the 1st
-// opportunity.  It's possible that the need for sneaking could be obviated
-// as follows.  Currently, a thread might (a) while TBIVM, call pthread_mutex_lock
-// or ILock() thus acquiring the "physical" lock underlying Monitor/Mutex.
-// (b) stall at the TBIVM exit point as a safepoint is in effect.  Critically,
-// it'll stall at the TBIVM reentry state transition after having acquired the
-// underlying lock, but before having set _owner and having entered the actual
-// critical section.  The lock-sneaking facility leverages that fact and allowed the
-// VM thread to logically acquire locks that had already be physically locked by mutators
-// but where mutators were known blocked by the reentry thread state transition.
-//
-// If we were to modify the Monitor-Mutex so that TBIVM state transitions tightly
-// wrapped calls to park(), then we could likely do away with sneaking.  We'd
-// decouple lock acquisition and parking.  The critical invariant  to eliminating
-// sneaking is to ensure that we never "physically" acquire the lock while TBIVM.
-// An easy way to accomplish this is to wrap the park calls in a narrow TBIVM jacket.
-// One difficulty with this approach is that the TBIVM wrapper could recurse and
-// call lock() deep from within a lock() call, while the MutexEvent was already enqueued.
-// Using a stack (N=2 at minimum) of ParkEvents would take care of that problem.
-//
-// But of course the proper ultimate approach is to avoid schemes that require explicit
-// sneaking or dependence on any any clever invariants or subtle implementation properties
-// of Mutex-Monitor and instead directly address the underlying design flaw.
-
-void Monitor::lock(Thread * Self) {
+void Monitor::lock(Thread * self) {
   // Ensure that the Monitor requires/allows safepoint checks.
   assert(_safepoint_check_required != Monitor::_safepoint_check_never,
          "This lock should never have a safepoint check: %s", name());
 
 #ifdef CHECK_UNHANDLED_OOPS
-  // Clear unhandled oops so we get a crash right away.  Only clear for non-vm
-  // or GC threads.
-  if (Self->is_Java_thread()) {
-    Self->clear_unhandled_oops();
+  // Clear unhandled oops in JavaThreads so we get a crash right away.
+  if (self->is_Java_thread()) {
+    self->clear_unhandled_oops();
   }
 #endif // CHECK_UNHANDLED_OOPS
 
-  DEBUG_ONLY(check_prelock_state(Self, StrictSafepointChecks);)
-  assert(_owner != Self, "invariant");
-  assert(_OnDeck != Self->_MutexEvent, "invariant");
+  DEBUG_ONLY(check_prelock_state(self, StrictSafepointChecks);)
+  assert(_owner != self, "invariant");
+
+  Monitor* in_flight_monitor = NULL;  // shared with ThreadBlockInVMWithDeadlockCheck below
+  DEBUG_ONLY(int retry_cnt = 0;)
+  while (!_lock.try_lock()) {  // uncontended case: try_lock() succeeds and the loop body never runs
+    // The lock is contended; fall back to a blocking acquire.
+
+  #ifdef ASSERT
+    check_block_state(self);
+    if (retry_cnt++ > 3) {
+      log_trace(vmmonitor)("JavaThread " INTPTR_FORMAT " on %d attempt trying to acquire vmmonitor %s", p2i(self), retry_cnt, _name);
+    }
+  #endif // ASSERT
 
-  if (TryFast()) {
- Exeunt:
-    assert(ILocked(), "invariant");
-    assert(owner() == NULL, "invariant");
-    set_owner(Self);
-    return;
+    if (self->is_Java_thread()) {
+      assert(rank() > Mutex::special, "Potential deadlock with special or lesser rank mutex");
+      { ThreadBlockInVMWithDeadlockCheck tbivmdc((JavaThread *) self, &in_flight_monitor);
+        in_flight_monitor = this;  // save for ~ThreadBlockInVMWithDeadlockCheck
+        _lock.lock();
+      }
+      if (in_flight_monitor != NULL) {
+        // Not unlocked by ~ThreadBlockInVMWithDeadlockCheck, so we still hold _lock
+        break;
+      }  // else: presumably released by the destructor, so loop and retry try_lock()
+    } else {
+      _lock.lock();  // non-Java threads block directly, with no ThreadBlockInVMWithDeadlockCheck scope
+      break;
+    }
   }
 
-  // The lock is contended ...
-
-  bool can_sneak = Self->is_VM_thread() && SafepointSynchronize::is_at_safepoint();
-  if (can_sneak && _owner == NULL) {
-    // a java thread has locked the lock but has not entered the
-    // critical region -- let's just pretend we've locked the lock
-    // and go on.  we note this with _snuck so we can also
-    // pretend to unlock when the time comes.
-    _snuck = true;
-    goto Exeunt;
-  }
-
-  // Try a brief spin to avoid passing thru thread state transition ...
-  if (TrySpin(Self)) goto Exeunt;
-
-  DEBUG_ONLY(check_block_state(Self);)
-  if (Self->is_Java_thread()) {
-    // Horrible dictu - we suffer through a state transition
-    assert(rank() > Mutex::special, "Potential deadlock with special or lesser rank mutex");
-    ThreadBlockInVM tbivm((JavaThread *) Self);
-    ILock(Self);
-  } else {
-    // Mirabile dictu
-    ILock(Self);
-  }
-  goto Exeunt;
+  assert_owner(NULL);  // _lock is held on every path reaching here, but no owner is recorded yet
+  set_owner(self);
 }
 
 void Monitor::lock() {
   this->lock(Thread::current());
 }
 
-// Lock without safepoint check - a degenerate variant of lock().
-// Should ONLY be used by safepoint code and other code
-// that is guaranteed not to block while running inside the VM. If this is called with
-// thread state set to be in VM, the safepoint synchronization code will deadlock!
+// Lock without safepoint check - a degenerate variant of lock() for use by
+// JavaThreads when it is known to be safe to not check for a safepoint when
+// acquiring this lock. If the thread blocks acquiring the lock it is not
+// safepoint-safe and so will prevent a safepoint from being reached. If used
+// in the wrong way this can lead to a deadlock with the safepoint code.
 
-void Monitor::lock_without_safepoint_check(Thread * Self) {
-  // Ensure that the Monitor does not require or allow safepoint checks.
+void Monitor::lock_without_safepoint_check(Thread * self) {
+  // Ensure that the Monitor does not require safepoint checks.
   assert(_safepoint_check_required != Monitor::_safepoint_check_always,
          "This lock should always have a safepoint check: %s", name());
-  assert(_owner != Self, "invariant");
-  ILock(Self);
-  assert(_owner == NULL, "invariant");
-  set_owner(Self);
+  assert(_owner != self, "invariant");  // recursive acquisition by the current owner is not permitted
+  _lock.lock();  // acquired directly: unlike lock(Thread*), no ThreadBlockInVMWithDeadlockCheck scope is entered
+  assert_owner(NULL);  // replaces the former assert(_owner == NULL)
+  set_owner(self);  // ownership is recorded only after _lock has been acquired
 }
 
 void Monitor::lock_without_safepoint_check() {
@@ -942,117 +108,36 @@
 // Returns true if thread succeeds in grabbing the lock, otherwise false.
 
 bool Monitor::try_lock() {
-  Thread * const Self = Thread::current();
-  DEBUG_ONLY(check_prelock_state(Self, false);)
-  // assert(!thread->is_inside_signal_handler(), "don't lock inside signal handler");
+  Thread * const self = Thread::current();
+  DEBUG_ONLY(check_prelock_state(self, false);)
 
-  // Special case, where all Java threads are stopped.
-  // The lock may have been acquired but _owner is not yet set.
-  // In that case the VM thread can safely grab the lock.
-  // It strikes me this should appear _after the TryLock() fails, below.
-  bool can_sneak = Self->is_VM_thread() && SafepointSynchronize::is_at_safepoint();
-  if (can_sneak && _owner == NULL) {
-    set_owner(Self); // Do not need to be atomic, since we are at a safepoint
-    _snuck = true;
-    return true;
-  }
-
-  if (TryLock()) {
-    // We got the lock
-    assert(_owner == NULL, "invariant");
-    set_owner(Self);
+  if (_lock.try_lock()) {  // one attempt only; on failure we return false without retrying
+    assert_owner(NULL);  // replaces the former assert(_owner == NULL)
+    set_owner(self);  // we got the lock: record the calling thread as owner
     return true;
   }
   return false;
 }
 
-void Monitor::unlock() {
-  assert(_owner == Thread::current(), "invariant");
-  assert(_OnDeck != Thread::current()->_MutexEvent, "invariant");
-  set_owner(NULL);
-  if (_snuck) {
-    assert(SafepointSynchronize::is_at_safepoint() && Thread::current()->is_VM_thread(), "sneak");
-    _snuck = false;
-    return;
-  }
-  IUnlock(false);
+void Monitor::release_for_safepoint() {  // NOTE(review): presumably called from ~ThreadBlockInVMWithDeadlockCheck - confirm
+  assert_owner(NULL);  // unlike unlock(), expects that no owner has been recorded
+  _lock.unlock();  // releases the underlying lock only; _owner is not modified here
 }
 
-// Yet another degenerate version of Monitor::lock() or lock_without_safepoint_check()
-// jvm_raw_lock() and _unlock() can be called by non-Java threads via JVM_RawMonitorEnter.
-//
-// There's no expectation that JVM_RawMonitors will interoperate properly with the native
-// Mutex-Monitor constructs.  We happen to implement JVM_RawMonitors in terms of
-// native Mutex-Monitors simply as a matter of convenience.  A simple abstraction layer
-// over a pthread_mutex_t would work equally as well, but require more platform-specific
-// code -- a "PlatformMutex".  Alternatively, a simply layer over muxAcquire-muxRelease
-// would work too.
-//
-// Since the caller might be a foreign thread, we don't necessarily have a Thread.MutexEvent
-// instance available.  Instead, we transiently allocate a ParkEvent on-demand if
-// we encounter contention.  That ParkEvent remains associated with the thread
-// until it manages to acquire the lock, at which time we return the ParkEvent
-// to the global ParkEvent free list.  This is correct and suffices for our purposes.
-//
-// Beware that the original jvm_raw_unlock() had a "_snuck" test but that
-// jvm_raw_lock() didn't have the corresponding test.  I suspect that's an
-// oversight, but I've replicated the original suspect logic in the new code ...
-
-void Monitor::jvm_raw_lock() {
-  assert(rank() == native, "invariant");
-
-  if (TryLock()) {
- Exeunt:
-    assert(ILocked(), "invariant");
-    assert(_owner == NULL, "invariant");
-    // This can potentially be called by non-java Threads. Thus, the Thread::current_or_null()
-    // might return NULL. Don't call set_owner since it will break on an NULL owner
-    // Consider installing a non-null "ANON" distinguished value instead of just NULL.
-    _owner = Thread::current_or_null();
-    return;
-  }
-
-  if (TrySpin(NULL)) goto Exeunt;
-
-  // slow-path - apparent contention
-  // Allocate a ParkEvent for transient use.
-  // The ParkEvent remains associated with this thread until
-  // the time the thread manages to acquire the lock.
-  ParkEvent * const ESelf = ParkEvent::Allocate(NULL);
-  ESelf->reset();
-  OrderAccess::storeload();
-
-  // Either Enqueue Self on cxq or acquire the outer lock.
-  if (AcquireOrPush (ESelf)) {
-    ParkEvent::Release(ESelf);      // surrender the ParkEvent
-    goto Exeunt;
-  }
-
-  // At any given time there is at most one ondeck thread.
-  // ondeck implies not resident on cxq and not resident on EntryList
-  // Only the OnDeck thread can try to acquire -- contend for -- the lock.
-  // CONSIDER: use Self->OnDeck instead of m->OnDeck.
-  for (;;) {
-    if (OrderAccess::load_acquire(&_OnDeck) == ESelf && TrySpin(NULL)) break;
-    ParkCommon(ESelf, 0);
-  }
-
-  assert(_OnDeck == ESelf, "invariant");
-  _OnDeck = NULL;
-  ParkEvent::Release(ESelf);      // surrender the ParkEvent
-  goto Exeunt;
+void Monitor::unlock() {
+  assert_owner(Thread::current());  // only the recorded owner may unlock
+  set_owner(NULL);  // clear ownership first, while the underlying lock is still held
+  _lock.unlock();  // unconditional now: the former _snuck early-return path is gone
 }
 
-void Monitor::jvm_raw_unlock() {
-  // Nearly the same as Monitor::unlock() ...
-  // directly set _owner instead of using set_owner(null)
-  _owner = NULL;
-  if (_snuck) {         // ???
-    assert(SafepointSynchronize::is_at_safepoint() && Thread::current()->is_VM_thread(), "sneak");
-    _snuck = false;
-    return;
-  }
-  IUnlock(false);
+void Monitor::notify() {  // now returns void; the removed implementation always returned true
+  assert_owner(Thread::current());  // caller must be the recorded owner of this monitor
+  _lock.notify();  // forwarded to _lock; no WaitSet/cxq bookkeeping remains in Monitor
+}
+
+void Monitor::notify_all() {  // now returns void; the removed implementation always returned true
+  assert_owner(Thread::current());  // caller must be the recorded owner of this monitor
+  _lock.notify_all();  // forwarded to _lock instead of looping over notify() as the removed code did
 }
 
 bool Monitor::wait(bool no_safepoint_check, long timeout,
@@ -1063,22 +148,24 @@
   assert(!(_safepoint_check_required == Monitor::_safepoint_check_always && no_safepoint_check == true),
          "This lock should always have a safepoint check: %s", name());
 
-  Thread * const Self = Thread::current();
-  assert(_owner == Self, "invariant");
-  assert(ILocked(), "invariant");
+  // timeout is in milliseconds - with zero meaning never timeout
+  assert(timeout >= 0, "negative timeout");
+
+  Thread * const self = Thread::current();
+  assert_owner(self);
 
   // as_suspend_equivalent logically implies !no_safepoint_check
   guarantee(!as_suspend_equivalent || !no_safepoint_check, "invariant");
   // !no_safepoint_check logically implies java_thread
-  guarantee(no_safepoint_check || Self->is_Java_thread(), "invariant");
+  guarantee(no_safepoint_check || self->is_Java_thread(), "invariant");
 
   #ifdef ASSERT
-  Monitor * least = get_least_ranked_lock_besides_this(Self->owned_locks());
+  Monitor * least = get_least_ranked_lock_besides_this(self->owned_locks());
   assert(least != this, "Specification of get_least_... call above");
   if (least != NULL && least->rank() <= special) {
     ::tty->print("Attempting to wait on monitor %s/%d while holding"
-                 " lock %s/%d -- possible deadlock",
-                 name(), rank(), least->name(), least->rank());
+               " lock %s/%d -- possible deadlock",
+               name(), rank(), least->name(), least->rank());
     assert(false, "Shouldn't block(wait) while holding a lock of rank special");
   }
   #endif // ASSERT
@@ -1088,75 +175,79 @@
   // abdicating the lock in wait
   set_owner(NULL);
   if (no_safepoint_check) {
-    wait_status = IWait(Self, timeout);
+    wait_status = _lock.wait(timeout);
+    set_owner(self);
   } else {
-    assert(Self->is_Java_thread(), "invariant");
-    JavaThread *jt = (JavaThread *)Self;
+    assert(self->is_Java_thread(), "invariant");
+    JavaThread *jt = (JavaThread *)self;
+    Monitor* in_flight_monitor = NULL;
 
-    // Enter safepoint region - ornate and Rococo ...
-    ThreadBlockInVM tbivm(jt);
-    OSThreadWaitState osts(Self->osthread(), false /* not Object.wait() */);
+    {
+      ThreadBlockInVMWithDeadlockCheck tbivmdc(jt, &in_flight_monitor);
+      OSThreadWaitState osts(self->osthread(), false /* not Object.wait() */);
+      if (as_suspend_equivalent) {
+        jt->set_suspend_equivalent();
+        // cleared by handle_special_suspend_equivalent_condition() or
+        // java_suspend_self()
+      }
 
-    if (as_suspend_equivalent) {
-      jt->set_suspend_equivalent();
-      // cleared by handle_special_suspend_equivalent_condition() or
-      // java_suspend_self()
+      wait_status = _lock.wait(timeout);
+      in_flight_monitor = this;  // save for ~ThreadBlockInVMWithDeadlockCheck
+
+      // were we externally suspended while we were waiting?
+      if (as_suspend_equivalent && jt->handle_special_suspend_equivalent_condition()) {
+        // Our event wait has finished and we own the lock, but
+        // while we were waiting another thread suspended us. We don't
+        // want to hold the lock while suspended because that
+        // would surprise the thread that suspended us.
+        _lock.unlock();
+        jt->java_suspend_self();
+        _lock.lock();
+      }
     }
 
-    wait_status = IWait(Self, timeout);
-
-    // were we externally suspended while we were waiting?
-    if (as_suspend_equivalent && jt->handle_special_suspend_equivalent_condition()) {
-      // Our event wait has finished and we own the lock, but
-      // while we were waiting another thread suspended us. We don't
-      // want to hold the lock while suspended because that
-      // would surprise the thread that suspended us.
-      assert(ILocked(), "invariant");
-      IUnlock(true);
-      jt->java_suspend_self();
-      ILock(Self);
-      assert(ILocked(), "invariant");
+    if (in_flight_monitor != NULL) {
+      // Not unlocked by ~ThreadBlockInVMWithDeadlockCheck
+      assert_owner(NULL);
+      // Conceptually reestablish ownership of the lock.
+      set_owner(self);
+    } else {
+      lock(self);
     }
   }
-
-  // Conceptually reestablish ownership of the lock.
-  // The "real" lock -- the LockByte -- was reacquired by IWait().
-  assert(ILocked(), "invariant");
-  assert(_owner == NULL, "invariant");
-  set_owner(Self);
   return wait_status != 0;          // return true IFF timeout
 }
 
+
+// Temporary JVM_RawMonitor* support.
+// Yet another degenerate version of Monitor::lock() or lock_without_safepoint_check()
+// jvm_raw_lock() and _unlock() can be called by non-Java threads via JVM_RawMonitorEnter.
+// There's no expectation that JVM_RawMonitors will interoperate properly with the native
+// Mutex-Monitor constructs.  We happen to implement JVM_RawMonitors in terms of
+// native Mutex-Monitors simply as a matter of convenience.
+
+void Monitor::jvm_raw_lock() {
+  _lock.lock();
+  assert_owner(NULL);
+}
+
+void Monitor::jvm_raw_unlock() {
+  assert_owner(NULL);
+  _lock.unlock();
+}
+
 Monitor::~Monitor() {
-#ifdef ASSERT
-  uintptr_t owner = UNS(_owner);
-  uintptr_t lockword = UNS(_LockWord.FullWord);
-  uintptr_t entrylist = UNS(_EntryList);
-  uintptr_t waitset = UNS(_WaitSet);
-  uintptr_t ondeck = UNS(_OnDeck);
-  // Print _name with precision limit, in case failure is due to memory
-  // corruption that also trashed _name.
-  assert((owner|lockword|entrylist|waitset|ondeck) == 0,
-         "%.*s: _owner(" INTPTR_FORMAT ")|_LockWord(" INTPTR_FORMAT ")|_EntryList(" INTPTR_FORMAT ")|_WaitSet("
-         INTPTR_FORMAT ")|_OnDeck(" INTPTR_FORMAT ") != 0",
-         MONITOR_NAME_LEN, _name, owner, lockword, entrylist, waitset, ondeck);
-#endif
+  assert_owner(NULL);
 }
 
 void Monitor::ClearMonitor(Monitor * m, const char *name) {
   m->_owner             = NULL;
-  m->_snuck             = false;
   if (name == NULL) {
     strcpy(m->_name, "UNKNOWN");
   } else {
     strncpy(m->_name, name, MONITOR_NAME_LEN - 1);
     m->_name[MONITOR_NAME_LEN - 1] = '\0';
   }
-  m->_LockWord.FullWord = 0;
-  m->_EntryList         = NULL;
-  m->_OnDeck            = NULL;
-  m->_WaitSet           = NULL;
-  m->_WaitLock[0]       = 0;
 }
 
 Monitor::Monitor() {
@@ -1186,9 +277,7 @@
 }
 
 bool Monitor::owned_by_self() const {
-  bool ret = _owner == Thread::current();
-  assert(!ret || _LockWord.Bytes[_LSBINDEX] != 0, "invariant");
-  return ret;
+  return _owner == Thread::current();
 }
 
 void Monitor::print_on_error(outputStream* st) const {
@@ -1197,21 +286,32 @@
   st->print(" - owner thread: " PTR_FORMAT, p2i(_owner));
 }
 
-
-
-
 // ----------------------------------------------------------------------------------
 // Non-product code
 
 #ifndef PRODUCT
 void Monitor::print_on(outputStream* st) const {
-  st->print_cr("Mutex: [" PTR_FORMAT "/" PTR_FORMAT "] %s - owner: " PTR_FORMAT,
-               p2i(this), _LockWord.FullWord, _name, p2i(_owner));
+  st->print_cr("Mutex: [" PTR_FORMAT "] %s - owner: " PTR_FORMAT,
+               p2i(this), _name, p2i(_owner));
 }
 #endif
 
 #ifndef PRODUCT
 #ifdef ASSERT
+
+void Monitor::assert_owner(Thread * expected) {
+  const char* msg = "invalid owner";
+  if (expected == NULL) {
+    msg = "should be un-owned";
+  }
+  else if (expected == Thread::current()) {
+    msg = "should be owned by current thread";
+  }
+  assert(_owner == expected,
+         "%s: owner=" INTPTR_FORMAT ", should be=" INTPTR_FORMAT,
+         msg, p2i(_owner), p2i(expected));
+}
+
 Monitor * Monitor::get_least_ranked_lock(Monitor * locks) {
   Monitor *res, *tmp;
   for (res = tmp = locks; tmp != NULL; tmp = tmp->next()) {
@@ -1297,8 +397,8 @@
     // Deadlock avoidance rules require us to acquire Mutexes only in
     // a global total order. For example m1 is the lowest ranked mutex
     // that the thread holds and m2 is the mutex the thread is trying
-    // to acquire, then  deadlock avoidance rules require that the rank
-    // of m2 be less  than the rank of m1.
+    // to acquire, then deadlock avoidance rules require that the rank
+    // of m2 be less than the rank of m1.
     // The rank Mutex::native  is an exception in that it is not subject
     // to the verification rules.
     // Here are some further notes relating to mutex acquisition anomalies:
diff -r 2c6c0fabe6a2 -r 043ae846819f src/hotspot/share/runtime/mutex.hpp
--- a/src/hotspot/share/runtime/mutex.hpp	Tue Feb 05 13:21:59 2019 -0500
+++ b/src/hotspot/share/runtime/mutex.hpp	Tue Feb 05 15:12:13 2019 -0500
@@ -29,50 +29,10 @@
 #include "runtime/os.hpp"
 #include "utilities/histogram.hpp"
 
-// The SplitWord construct allows us to colocate the contention queue
-// (cxq) with the lock-byte.  The queue elements are ParkEvents, which are
-// always aligned on 256-byte addresses - the least significant byte of
-// a ParkEvent is always 0.  Colocating the lock-byte with the queue
-// allows us to easily avoid what would otherwise be a race in lock()
-// if we were to use two completely separate fields for the contention queue
-// and the lock indicator.  Specifically, colocation renders us immune
-// from the race where a thread might enqueue itself in the lock() slow-path
-// immediately after the lock holder drops the outer lock in the unlock()
-// fast-path.
-//
-// Colocation allows us to use a fast-path unlock() form that uses
-// A MEMBAR instead of a CAS.  MEMBAR has lower local latency than CAS
-// on many platforms.
-//
-// See:
-// +  http://blogs.sun.com/dave/entry/biased_locking_in_hotspot
-// +  http://blogs.sun.com/dave/resource/synchronization-public2.pdf
-//
-// Note that we're *not* using word-tearing the classic sense.
-// The lock() fast-path will CAS the lockword and the unlock()
-// fast-path will store into the lock-byte colocated within the lockword.
-// We depend on the fact that all our reference platforms have
-// coherent and atomic byte accesses.  More precisely, byte stores
-// interoperate in a safe, sane, and expected manner with respect to
-// CAS, ST and LDs to the full-word containing the byte.
-// If you're porting HotSpot to a platform where that isn't the case
-// then you'll want change the unlock() fast path from:
-//    STB;MEMBAR #storeload; LDN
-// to a full-word CAS of the lockword.
 
-
-union SplitWord {   // full-word with separately addressable LSB
-  volatile intptr_t FullWord ;
-  volatile void * Address ;
-  volatile jbyte Bytes [sizeof(intptr_t)] ;
-} ;
-
-class ParkEvent ;
-
-// See orderAccess.hpp.  We assume throughout the VM that mutex lock and
-// try_lock do fence-lock-acquire, and that unlock does a release-unlock,
-// *in that order*.  If their implementations change such that these
-// assumptions are violated, a whole lot of code will break.
+// A Mutex/Monitor is a simple wrapper around a native lock plus condition
+// variable that supports lock ownership tracking, lock ranking for deadlock
+// detection and coordinates with the safepoint protocol.
 
 // The default length of monitor name was originally chosen to be 64 to avoid
 // false sharing. Now, PaddedMonitor is available for this purpose.
@@ -118,22 +78,10 @@
        native         = max_nonleaf    +   1
   };
 
-  // The WaitSet and EntryList linked lists are composed of ParkEvents.
-  // I use ParkEvent instead of threads as ParkEvents are immortal and
-  // type-stable, meaning we can safely unpark() a possibly stale
-  // list element in the unlock()-path.
-
  protected:                              // Monitor-Mutex metadata
-  SplitWord _LockWord ;                  // Contention queue (cxq) colocated with Lock-byte
   Thread * volatile _owner;              // The owner of the lock
-                                         // Consider sequestering _owner on its own $line
-                                         // to aid future synchronization mechanisms.
-  ParkEvent * volatile _EntryList ;      // List of threads waiting for entry
-  ParkEvent * volatile _OnDeck ;         // heir-presumptive
-  volatile intptr_t _WaitLock [1] ;      // Protects _WaitSet
-  ParkEvent * volatile  _WaitSet ;       // LL of ParkEvents
-  volatile bool     _snuck;              // Used for sneaky locking (evil).
-  char _name[MONITOR_NAME_LEN];          // Name of mutex
+  os::PlatformMonitor _lock;             // Native monitor implementation
+  char _name[MONITOR_NAME_LEN];          // Name of mutex/monitor
 
   // Debugging fields for naming, deadlock detection, etc. (some only used in debug mode)
 #ifndef PRODUCT
@@ -149,8 +97,8 @@
   void set_owner_implementation(Thread* owner)                        PRODUCT_RETURN;
   void check_prelock_state     (Thread* thread, bool safepoint_check) PRODUCT_RETURN;
   void check_block_state       (Thread* thread)                       PRODUCT_RETURN;
+  void assert_owner            (Thread* expected)                     NOT_DEBUG_RETURN;
 
-  // platform-dependent support code can go here (in os_<os_family>.cpp)
  public:
   enum {
     _no_safepoint_check_flag    = true,
@@ -164,6 +112,9 @@
   // consistent checking for each lock.
   // A few existing locks will sometimes have a safepoint check and
   // sometimes not, but these locks are set up in such a way to avoid deadlocks.
+  // Note: monitors that may be shared between JavaThreads and the VMThread
+  // should never encounter a safepoint check whilst they are held, else a
+  // deadlock with the VMThread can occur.
   enum SafepointCheckRequired {
     _safepoint_check_never,       // Monitors with this value will cause errors
                                   // when acquired with a safepoint check.
@@ -176,22 +127,6 @@
 
   NOT_PRODUCT(SafepointCheckRequired _safepoint_check_required;)
 
-  enum WaitResults {
-    CONDVAR_EVENT,         // Wait returned because of condition variable notification
-    INTERRUPT_EVENT,       // Wait returned because waiting thread was interrupted
-    NUMBER_WAIT_RESULTS
-  };
-
- private:
-   int  TrySpin (Thread * Self) ;
-   int  TryLock () ;
-   int  TryFast () ;
-   int  AcquireOrPush (ParkEvent * ev) ;
-   void IUnlock (bool RelaxAssert) ;
-   void ILock (Thread * Self) ;
-   int  IWait (Thread * Self, jlong timo);
-   int  ILocked () ;
-
  protected:
    static void ClearMonitor (Monitor * m, const char* name = NULL) ;
    Monitor() ;
@@ -208,8 +143,8 @@
   bool wait(bool no_safepoint_check = !_no_safepoint_check_flag,
             long timeout = 0,
             bool as_suspend_equivalent = !_as_suspend_equivalent_flag);
-  bool notify();
-  bool notify_all();
+  void notify();
+  void notify_all();
 
 
   void lock(); // prints out warning if VM thread blocks
@@ -219,6 +154,8 @@
 
   bool try_lock(); // Like lock(), but unblocking. It returns false instead
 
+  void release_for_safepoint();
+
   // Lock without safepoint check. Should ONLY be used by safepoint code and other code
   // that is guaranteed not to block while running inside the VM.
   void lock_without_safepoint_check();
@@ -290,9 +227,6 @@
 // there may have been some benefit to having distinct mutexes and monitors, but that time
 // has past.
 //
-// The Mutex/Monitor design parallels that of Java-monitors, being based on
-// thread-specific park-unpark platform-specific primitives.
-
 
 class Mutex : public Monitor {      // degenerate Monitor
  public:
@@ -300,8 +234,8 @@
          SafepointCheckRequired safepoint_check_required = _safepoint_check_always);
   // default destructor
  private:
-   bool notify ()    { ShouldNotReachHere(); return false; }
-   bool notify_all() { ShouldNotReachHere(); return false; }
+   void notify ()    { ShouldNotReachHere(); }
+   void notify_all() { ShouldNotReachHere(); }
    bool wait (bool no_safepoint_check, long timeout, bool as_suspend_equivalent) {
      ShouldNotReachHere() ;
      return false ;
diff -r 2c6c0fabe6a2 -r 043ae846819f src/hotspot/share/runtime/mutexLocker.hpp
--- a/src/hotspot/share/runtime/mutexLocker.hpp	Tue Feb 05 13:21:59 2019 -0500
+++ b/src/hotspot/share/runtime/mutexLocker.hpp	Tue Feb 05 15:12:13 2019 -0500
@@ -272,18 +272,16 @@
     return false;
   }
 
-  bool notify_all() {
+  void notify_all() {
     if (_monitor != NULL) {
-      return _monitor->notify_all();
+      _monitor->notify_all();
     }
-    return true;
   }
 
-  bool notify() {
+  void notify() {
     if (_monitor != NULL) {
-      return _monitor->notify();
+      _monitor->notify();
     }
-    return true;
   }
 };
 
diff -r 2c6c0fabe6a2 -r 043ae846819f src/hotspot/share/runtime/safepoint.cpp
--- a/src/hotspot/share/runtime/safepoint.cpp	Tue Feb 05 13:21:59 2019 -0500
+++ b/src/hotspot/share/runtime/safepoint.cpp	Tue Feb 05 15:12:13 2019 -0500
@@ -793,7 +793,7 @@
 // -------------------------------------------------------------------------------------------------------
 // Implementation of Safepoint callback point
 
-void SafepointSynchronize::block(JavaThread *thread) {
+void SafepointSynchronize::block(JavaThread *thread, bool block_in_safepoint_check) {
   assert(thread != NULL, "thread must be set");
   assert(thread->is_Java_thread(), "not a Java thread");
 
@@ -848,28 +848,37 @@
         }
       }
 
-      // We transition the thread to state _thread_blocked here, but
-      // we can't do our usual check for external suspension and then
-      // self-suspend after the lock_without_safepoint_check() call
-      // below because we are often called during transitions while
-      // we hold different locks. That would leave us suspended while
-      // holding a resource which results in deadlocks.
-      thread->set_thread_state(_thread_blocked);
-      Safepoint_lock->unlock();
+      if (block_in_safepoint_check) {
+        // We transition the thread to state _thread_blocked here, but
+        // we can't do our usual check for external suspension and then
+        // self-suspend after the lock_without_safepoint_check() call
+        // below because we are often called during transitions while
+        // we hold different locks. That would leave us suspended while
+        // holding a resource which results in deadlocks.
+        thread->set_thread_state(_thread_blocked);
+        Safepoint_lock->unlock();
 
-      // We now try to acquire the threads lock. Since this lock is hold by the VM thread during
-      // the entire safepoint, the threads will all line up here during the safepoint.
-      Threads_lock->lock_without_safepoint_check();
-      // restore original state. This is important if the thread comes from compiled code, so it
-      // will continue to execute with the _thread_in_Java state.
-      thread->set_thread_state(state);
-      Threads_lock->unlock();
+        // We now try to acquire the threads lock. Since this lock is held by the VM thread during
+        // the entire safepoint, the threads will all line up here during the safepoint.
+        Threads_lock->lock_without_safepoint_check();
+        // restore original state. This is important if the thread comes from compiled code, so it
+        // will continue to execute with the _thread_in_Java state.
+        thread->set_thread_state(state);
+        Threads_lock->unlock();
+      } else {
+        // We choose not to block in this call since we would be
+        // caught when transitioning back anyway if the safepoint
+        // is still going on.
+        thread->set_thread_state(state);
+        Safepoint_lock->unlock();
+      }
       break;
 
     case _thread_in_native_trans:
     case _thread_blocked_trans:
     case _thread_new_trans:
-      if (thread->safepoint_state()->type() == ThreadSafepointState::_call_back) {
+      if (thread->safepoint_state()->type() == ThreadSafepointState::_call_back &&
+          block_in_safepoint_check) {
         thread->print_thread_state();
         fatal("Deadlock in safepoint code.  "
               "Should have called back to the VM before blocking.");
diff -r 2c6c0fabe6a2 -r 043ae846819f src/hotspot/share/runtime/safepoint.hpp
--- a/src/hotspot/share/runtime/safepoint.hpp	Tue Feb 05 13:21:59 2019 -0500
+++ b/src/hotspot/share/runtime/safepoint.hpp	Tue Feb 05 15:12:13 2019 -0500
@@ -143,7 +143,7 @@
   }
 
   // Called when a thread voluntarily blocks
-  static void   block(JavaThread *thread);
+  static void   block(JavaThread *thread, bool block_in_safepoint_check = true);
 
   friend class SafepointMechanism;
 
diff -r 2c6c0fabe6a2 -r 043ae846819f src/hotspot/share/runtime/safepointMechanism.hpp
--- a/src/hotspot/share/runtime/safepointMechanism.hpp	Tue Feb 05 13:21:59 2019 -0500
+++ b/src/hotspot/share/runtime/safepointMechanism.hpp	Tue Feb 05 15:12:13 2019 -0500
@@ -72,12 +72,15 @@
 #endif
   }
 
-  // Call this method to see if this thread should block for a safepoint.
+  // Call this method to see if this thread should block for a safepoint or process handshake.
   static inline bool should_block(Thread* thread);
 
-  // Blocks a thread until safepoint is completed
+  // Blocks a thread until safepoint/handshake is completed.
   static inline void block_if_requested(JavaThread* thread);
 
+  // Calls back if there is a pending safepoint but does not block for it.
+  static inline void callback_if_safepoint(JavaThread* thread);
+
   // Caller is responsible for using a memory barrier if needed.
   static inline void arm_local_poll(JavaThread* thread);
   static inline void disarm_local_poll(JavaThread* thread);
diff -r 2c6c0fabe6a2 -r 043ae846819f src/hotspot/share/runtime/safepointMechanism.inline.hpp
--- a/src/hotspot/share/runtime/safepointMechanism.inline.hpp	Tue Feb 05 13:21:59 2019 -0500
+++ b/src/hotspot/share/runtime/safepointMechanism.inline.hpp	Tue Feb 05 15:12:13 2019 -0500
@@ -62,6 +62,20 @@
   block_if_requested_slow(thread);
 }
 
+void SafepointMechanism::callback_if_safepoint(JavaThread* thread) {
+  if (!uses_thread_local_poll() || local_poll_armed(thread)) {
+    // If using thread local polls, we should not check the
+    // global_poll() and callback via block() if the VMThread
+    // has not yet armed the local poll. Otherwise, when used in
+    // combination with should_block(), the latter could miss
+    // detecting the same safepoint that this method would detect
+    // if only checking global polls.
+    if (global_poll()) {
+      SafepointSynchronize::block(thread, false);
+    }
+  }
+}
+
 void SafepointMechanism::arm_local_poll(JavaThread* thread) {
   thread->set_polling_page(poll_armed_value());
 }
diff -r 2c6c0fabe6a2 -r 043ae846819f src/hotspot/share/runtime/thread.cpp
--- a/src/hotspot/share/runtime/thread.cpp	Tue Feb 05 13:21:59 2019 -0500
+++ b/src/hotspot/share/runtime/thread.cpp	Tue Feb 05 15:12:13 2019 -0500
@@ -294,7 +294,6 @@
   // and ::Release()
   _ParkEvent   = ParkEvent::Allocate(this);
   _SleepEvent  = ParkEvent::Allocate(this);
-  _MutexEvent  = ParkEvent::Allocate(this);
   _MuxEvent    = ParkEvent::Allocate(this);
 
 #ifdef CHECK_UNHANDLED_OOPS
@@ -460,7 +459,6 @@
   // We NULL out the fields for good hygiene.
   ParkEvent::Release(_ParkEvent); _ParkEvent   = NULL;
   ParkEvent::Release(_SleepEvent); _SleepEvent  = NULL;
-  ParkEvent::Release(_MutexEvent); _MutexEvent  = NULL;
   ParkEvent::Release(_MuxEvent); _MuxEvent    = NULL;
 
   delete handle_area();
diff -r 2c6c0fabe6a2 -r 043ae846819f src/hotspot/share/runtime/thread.hpp
--- a/src/hotspot/share/runtime/thread.hpp	Tue Feb 05 13:21:59 2019 -0500
+++ b/src/hotspot/share/runtime/thread.hpp	Tue Feb 05 15:12:13 2019 -0500
@@ -782,7 +782,6 @@
   volatile int _TypeTag;
   ParkEvent * _ParkEvent;                     // for synchronized()
   ParkEvent * _SleepEvent;                    // for Thread.sleep
-  ParkEvent * _MutexEvent;                    // for native internal Mutex/Monitor
   ParkEvent * _MuxEvent;                      // for low-level muxAcquire-muxRelease
   int NativeSyncRecursion;                    // diagnostic
 
@@ -792,8 +791,6 @@
   jint _hashStateY;
   jint _hashStateZ;
 
-  volatile jint rng[4];                      // RNG for spin loop
-
   // Low-level leaf-lock primitives used to implement synchronization
   // and native monitor-mutex infrastructure.
   // Not for general synchronization use.