8212826: Make PtrQueue free list lock-free
author:    kbarrett
date:      Sat, 19 Jan 2019 19:50:01 -0500
changeset: 53404 9ff1e6cacac3
parent:    53403 683a112e0e1e
child:     53405 8d03f69b8325
8212826: Make PtrQueue free list lock-free
Summary: Add lock-free stack and use in BufferNode::Allocator.
Reviewed-by: tschatzl, sangheki
src/hotspot/share/gc/g1/g1BarrierSet.cpp
src/hotspot/share/gc/shared/ptrQueue.cpp
src/hotspot/share/gc/shared/ptrQueue.hpp
src/hotspot/share/logging/logTag.hpp
src/hotspot/share/memory/padded.hpp
src/hotspot/share/runtime/mutexLocker.cpp
src/hotspot/share/runtime/mutexLocker.hpp
src/hotspot/share/utilities/lockFreeStack.hpp
test/hotspot/gtest/gc/shared/test_ptrQueueBufferAllocator.cpp
test/hotspot/gtest/utilities/test_lockFreeStack.cpp
--- a/src/hotspot/share/gc/g1/g1BarrierSet.cpp	Sat Jan 19 11:20:01 2019 +0100
+++ b/src/hotspot/share/gc/g1/g1BarrierSet.cpp	Sat Jan 19 19:50:01 2019 -0500
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2018, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2019, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -55,8 +55,8 @@
                       make_barrier_set_c2<G1BarrierSetC2>(),
                       card_table,
                       BarrierSet::FakeRtti(BarrierSet::G1BarrierSet)),
-  _satb_mark_queue_buffer_allocator(G1SATBBufferSize, SATB_Q_FL_lock),
-  _dirty_card_queue_buffer_allocator(G1UpdateBufferSize, DirtyCardQ_FL_lock),
+  _satb_mark_queue_buffer_allocator("SATB Buffer Allocator", G1SATBBufferSize),
+  _dirty_card_queue_buffer_allocator("DC Buffer Allocator", G1UpdateBufferSize),
   _satb_mark_queue_set(),
   _dirty_card_queue_set()
 {}
--- a/src/hotspot/share/gc/shared/ptrQueue.cpp	Sat Jan 19 11:20:01 2019 +0100
+++ b/src/hotspot/share/gc/shared/ptrQueue.cpp	Sat Jan 19 19:50:01 2019 -0500
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2018, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2019, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -24,12 +24,15 @@
 
 #include "precompiled.hpp"
 #include "gc/shared/ptrQueue.hpp"
+#include "logging/log.hpp"
 #include "memory/allocation.hpp"
 #include "memory/allocation.inline.hpp"
 #include "runtime/atomic.hpp"
 #include "runtime/mutex.hpp"
 #include "runtime/mutexLocker.hpp"
+#include "runtime/orderAccess.hpp"
 #include "runtime/thread.inline.hpp"
+#include "utilities/globalCounter.inline.hpp"
 
 #include <new>
 
@@ -85,20 +88,29 @@
   FREE_C_HEAP_ARRAY(char, node);
 }
 
-BufferNode::Allocator::Allocator(size_t buffer_size, Mutex* lock) :
+BufferNode::Allocator::Allocator(const char* name, size_t buffer_size) :
   _buffer_size(buffer_size),
-  _lock(lock),
-  _free_list(NULL),
-  _free_count(0)
+  _pending_list(),
+  _free_list(),
+  _pending_count(0),
+  _free_count(0),
+  _transfer_lock(false)
 {
-  assert(lock != NULL, "precondition");
+  strncpy(_name, name, sizeof(_name));
+  _name[sizeof(_name) - 1] = '\0';
 }
 
 BufferNode::Allocator::~Allocator() {
-  while (_free_list != NULL) {
-    BufferNode* node = _free_list;
-    _free_list = node->next();
-    BufferNode::deallocate(node);
+  delete_list(_free_list.pop_all());
+  delete_list(_pending_list.pop_all());
+}
+
+void BufferNode::Allocator::delete_list(BufferNode* list) {
+  while (list != NULL) {
+    BufferNode* next = list->next();
+    DEBUG_ONLY(list->set_next(NULL);)
+    BufferNode::deallocate(list);
+    list = next;
   }
 }
 
@@ -107,55 +119,109 @@
 }
 
 BufferNode* BufferNode::Allocator::allocate() {
-  BufferNode* node = NULL;
+  BufferNode* node;
   {
-    MutexLockerEx ml(_lock, Mutex::_no_safepoint_check_flag);
-    node = _free_list;
-    if (node != NULL) {
-      _free_list = node->next();
-      --_free_count;
-      node->set_next(NULL);
-      node->set_index(0);
-      return node;
-    }
+    // Protect against ABA; see release().
+    GlobalCounter::CriticalSection cs(Thread::current());
+    node = _free_list.pop();
   }
-  return  BufferNode::allocate(_buffer_size);
+  if (node == NULL) {
+    node = BufferNode::allocate(_buffer_size);
+  } else {
+    // Decrement count after getting buffer from free list.  This, along
+    // with incrementing count before adding to free list, ensures count
+    // never underflows.
+    size_t count = Atomic::sub(1u, &_free_count);
+    assert((count + 1) != 0, "_free_count underflow");
+  }
+  return node;
 }
 
+// To solve the ABA problem for lock-free stack pop, allocate does the
+// pop inside a critical section, and release synchronizes on the
+// critical sections before adding to the _free_list.  But we don't
+// want to make every release have to do a synchronize.  Instead, we
+// initially place released nodes on the _pending_list, and transfer
+// them to the _free_list in batches.  Only one transfer at a time is
+// permitted, with a lock bit to control access to that phase.  A
+// transfer takes all the nodes from the _pending_list, synchronizes on
+// the _free_list pops, and then adds the former pending nodes to the
+// _free_list.  While that's happening, other threads might be adding
+// other nodes to the _pending_list, to be dealt with by some later
+// transfer.
 void BufferNode::Allocator::release(BufferNode* node) {
-  MutexLockerEx ml(_lock, Mutex::_no_safepoint_check_flag);
-  node->set_next(_free_list);
-  _free_list = node;
-  ++_free_count;
+  assert(node != NULL, "precondition");
+  assert(node->next() == NULL, "precondition");
+
+  // Desired minimum transfer batch size.  There is relatively little
+  // importance to the specific number.  It shouldn't be too big, else
+  // we're wasting space when the release rate is low.  If the release
+  // rate is high, we might accumulate more than this before being
+  // able to start a new transfer, but that's okay.  Also note that
+  // the allocation rate and the release rate are going to be fairly
+  // similar, due to how the buffers are used.
+  const size_t trigger_transfer = 10;
+
+  // Add to pending list. Update count first so no underflow in transfer.
+  size_t pending_count = Atomic::add(1u, &_pending_count);
+  _pending_list.push(*node);
+  if (pending_count > trigger_transfer) {
+    try_transfer_pending();
+  }
 }
 
-void BufferNode::Allocator::reduce_free_list() {
-  BufferNode* head = NULL;
-  {
-    MutexLockerEx ml(_lock, Mutex::_no_safepoint_check_flag);
-    // For now, delete half.
-    size_t remove = _free_count / 2;
-    if (remove > 0) {
-      head = _free_list;
-      BufferNode* tail = head;
-      BufferNode* prev = NULL;
-      for (size_t i = 0; i < remove; ++i) {
-        assert(tail != NULL, "free list size is wrong");
-        prev = tail;
-        tail = tail->next();
-      }
-      assert(prev != NULL, "invariant");
-      assert(prev->next() == tail, "invariant");
-      prev->set_next(NULL);
-      _free_list = tail;
-      _free_count -= remove;
+// Try to transfer nodes from _pending_list to _free_list, with a
+// synchronization delay for any in-progress pops from the _free_list,
+// to solve ABA there.  Return true if performed a (possibly empty)
+// transfer, false if blocked from doing so by some other thread's
+// in-progress transfer.
+bool BufferNode::Allocator::try_transfer_pending() {
+  // Attempt to claim the lock.
+  if (Atomic::load(&_transfer_lock) || // Skip CAS if likely to fail.
+      Atomic::cmpxchg(true, &_transfer_lock, false)) {
+    return false;
+  }
+  // Have the lock; perform the transfer.
+
+  // Claim all the pending nodes.
+  BufferNode* first = _pending_list.pop_all();
+  if (first != NULL) {
+    // Prepare to add the claimed nodes, and update _pending_count.
+    BufferNode* last = first;
+    size_t count = 1;
+    for (BufferNode* next = first->next(); next != NULL; next = next->next()) {
+      last = next;
+      ++count;
     }
+    Atomic::sub(count, &_pending_count);
+
+    // Wait for any in-progress pops, to avoid ABA for them.
+    GlobalCounter::write_synchronize();
+
+    // Add synchronized nodes to _free_list.
+    // Update count first so no underflow in allocate().
+    Atomic::add(count, &_free_count);
+    _free_list.prepend(*first, *last);
+    log_trace(gc, ptrqueue, freelist)
+             ("Transferred %s pending to free: " SIZE_FORMAT, name(), count);
   }
-  while (head != NULL) {
-    BufferNode* next = head->next();
-    BufferNode::deallocate(head);
-    head = next;
+  OrderAccess::release_store(&_transfer_lock, false);
+  return true;
+}
+
+size_t BufferNode::Allocator::reduce_free_list(size_t remove_goal) {
+  try_transfer_pending();
+  size_t removed = 0;
+  for ( ; removed < remove_goal; ++removed) {
+    BufferNode* node = _free_list.pop();
+    if (node == NULL) break;
+    BufferNode::deallocate(node);
   }
+  size_t new_count = Atomic::sub(removed, &_free_count);
+  log_debug(gc, ptrqueue, freelist)
+           ("Reduced %s free list by " SIZE_FORMAT " to " SIZE_FORMAT,
+            name(), removed, new_count);
+  return removed;
 }
 
 PtrQueueSet::PtrQueueSet(bool notify_when_complete) :
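
The release()/try_transfer_pending() pair above is an instance of a more general epoch-based pattern: pops happen inside a GlobalCounter critical section, and nodes are only re-published to the popped-from stack after a write_synchronize(). A minimal sketch of that pattern in isolation, with hypothetical names (Node, take, give_back, transfer, _transfer_claimed) and using only APIs that appear in this change:

#include "runtime/atomic.hpp"
#include "runtime/orderAccess.hpp"
#include "runtime/thread.inline.hpp"
#include "utilities/globalCounter.inline.hpp"
#include "utilities/lockFreeStack.hpp"

// Sketch only; not part of this changeset.
struct Node {
  Node* volatile _next;
  Node() : _next(NULL) {}
  static Node* volatile* next_ptr(Node& n) { return &n._next; }
};
typedef LockFreeStack<Node, &Node::next_ptr> NodeStack;

static NodeStack _free;                        // popped only inside critical sections
static NodeStack _pending;                     // receives plain lock-free pushes
static volatile bool _transfer_claimed = false;

Node* take() {
  // Pop inside a critical section so a concurrent transfer can wait for it.
  GlobalCounter::CriticalSection cs(Thread::current());
  return _free.pop();
}

void give_back(Node* node) {
  _pending.push(*node);                        // never pushed straight onto _free
}

bool transfer() {
  // Single claimer; other threads simply skip the transfer.
  if (Atomic::cmpxchg(true, &_transfer_claimed, false)) return false;
  Node* list = _pending.pop_all();
  if (list != NULL) {
    // Wait out any in-flight take() pops, so a node about to become
    // available again cannot match a stale top pointer (the ABA hazard).
    GlobalCounter::write_synchronize();
    _free.prepend(*list);                      // walks to the end of the claimed list
  }
  OrderAccess::release_store(&_transfer_claimed, false);
  return true;
}
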
--- a/src/hotspot/share/gc/shared/ptrQueue.hpp	Sat Jan 19 11:20:01 2019 +0100
+++ b/src/hotspot/share/gc/shared/ptrQueue.hpp	Sat Jan 19 19:50:01 2019 -0500
@@ -25,7 +25,10 @@
 #ifndef SHARE_GC_SHARED_PTRQUEUE_HPP
 #define SHARE_GC_SHARED_PTRQUEUE_HPP
 
+#include "memory/padded.hpp"
 #include "utilities/align.hpp"
+#include "utilities/debug.hpp"
+#include "utilities/lockFreeStack.hpp"
 #include "utilities/sizes.hpp"
 
 class Mutex;
@@ -215,7 +218,7 @@
 
 class BufferNode {
   size_t _index;
-  BufferNode* _next;
+  BufferNode* volatile _next;
   void* _buffer[1];             // Pseudo flexible array member.
 
   BufferNode() : _index(0), _next(NULL) { }
@@ -225,6 +228,8 @@
     return offset_of(BufferNode, _buffer);
   }
 
+  static BufferNode* volatile* next_ptr(BufferNode& bn) { return &bn._next; }
+
 AIX_ONLY(public:)               // xlC 12 on AIX doesn't implement C++ DR45.
   // Allocate a new BufferNode with the "buffer" having size elements.
   static BufferNode* allocate(size_t size);
@@ -233,6 +238,8 @@
   static void deallocate(BufferNode* node);
 
 public:
+  typedef LockFreeStack<BufferNode, &next_ptr> Stack;
+
   BufferNode* next() const     { return _next;  }
   void set_next(BufferNode* n) { _next = n;     }
   size_t index() const         { return _index; }
@@ -254,23 +261,52 @@
       reinterpret_cast<char*>(node) + buffer_offset());
   }
 
-  // Free-list based allocator.
-  class Allocator {
-    size_t _buffer_size;
-    Mutex* _lock;
-    BufferNode* _free_list;
-    volatile size_t _free_count;
+  class Allocator;              // Free-list based allocator.
+  class TestSupport;            // Unit test support.
+};
+
+// Allocation is based on a lock-free free list of nodes, linked through
+// BufferNode::_next (see BufferNode::Stack).  To solve the ABA problem,
+// popping a node from the free list is performed within a GlobalCounter
+// critical section, and pushing nodes onto the free list is done after
+// a GlobalCounter synchronization associated with the nodes to be pushed.
+// This is documented behavior so that other parts of the node life-cycle
+// can depend on and make use of it too.
+class BufferNode::Allocator {
+  friend class TestSupport;
+
+  // Since we don't expect many instances, and measured >15% speedup
+  // on stress gtest, padding seems like a good tradeoff here.
+#define DECLARE_PADDED_MEMBER(Id, Type, Name) \
+  Type Name; DEFINE_PAD_MINUS_SIZE(Id, DEFAULT_CACHE_LINE_SIZE, sizeof(Type))
 
-  public:
-    Allocator(size_t buffer_size, Mutex* lock);
-    ~Allocator();
+  const size_t _buffer_size;
+  char _name[DEFAULT_CACHE_LINE_SIZE - sizeof(size_t)]; // Use name as padding.
+  DECLARE_PADDED_MEMBER(1, Stack, _pending_list);
+  DECLARE_PADDED_MEMBER(2, Stack, _free_list);
+  DECLARE_PADDED_MEMBER(3, volatile size_t, _pending_count);
+  DECLARE_PADDED_MEMBER(4, volatile size_t, _free_count);
+  DECLARE_PADDED_MEMBER(5, volatile bool, _transfer_lock);
+
+#undef DECLARE_PADDED_MEMBER
+
+  void delete_list(BufferNode* list);
+  bool try_transfer_pending();
 
-    size_t buffer_size() const { return _buffer_size; }
-    size_t free_count() const;
-    BufferNode* allocate();
-    void release(BufferNode* node);
-    void reduce_free_list();
-  };
+public:
+  Allocator(const char* name, size_t buffer_size);
+  ~Allocator();
+
+  const char* name() const { return _name; }
+  size_t buffer_size() const { return _buffer_size; }
+  size_t free_count() const;
+  BufferNode* allocate();
+  void release(BufferNode* node);
+
+  // Deallocate some of the available buffers.  remove_goal is the target
+  // number to remove.  Returns the number actually deallocated, which may
+  // be less than the goal if there were fewer available.
+  size_t reduce_free_list(size_t remove_goal);
 };
 
 // A PtrQueueSet represents resources common to a set of pointer queues.
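
A minimal usage sketch of the reworked interface from a client's point of view (hypothetical function; the gtest later in this change does essentially the same): construction takes a name instead of a mutex, release() parks the node on the allocator's pending list, and reduce_free_list() takes a removal goal and reports how many nodes it actually deallocated.

#include "gc/shared/ptrQueue.hpp"

// Sketch only; not part of this changeset.
void example_allocator_use() {
  BufferNode::Allocator allocator("Example Buffer Allocator", 256);
  BufferNode* node = allocator.allocate();     // from the free list, or a new node
  // ... fill the node's buffer ...
  allocator.release(node);                     // lock-free; may sit on the pending list
  size_t removed = allocator.reduce_free_list(10);  // transfers pending first, then trims
  assert(removed <= 10, "never removes more than the goal");
}
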
--- a/src/hotspot/share/logging/logTag.hpp	Sat Jan 19 11:20:01 2019 +0100
+++ b/src/hotspot/share/logging/logTag.hpp	Sat Jan 19 19:50:01 2019 -0500
@@ -133,6 +133,7 @@
   LOG_TAG(reloc) \
   LOG_TAG(remset) \
   LOG_TAG(parser) \
+  LOG_TAG(ptrqueue) \
   LOG_TAG(purge) \
   LOG_TAG(resolve) \
   LOG_TAG(safepoint) \
--- a/src/hotspot/share/memory/padded.hpp	Sat Jan 19 11:20:01 2019 +0100
+++ b/src/hotspot/share/memory/padded.hpp	Sat Jan 19 19:50:01 2019 -0500
@@ -25,6 +25,7 @@
 #ifndef SHARE_MEMORY_PADDED_HPP
 #define SHARE_MEMORY_PADDED_HPP
 
+#include "memory/allocation.hpp"
 #include "utilities/align.hpp"
 #include "utilities/globalDefinitions.hpp"
 
--- a/src/hotspot/share/runtime/mutexLocker.cpp	Sat Jan 19 11:20:01 2019 +0100
+++ b/src/hotspot/share/runtime/mutexLocker.cpp	Sat Jan 19 19:50:01 2019 -0500
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -79,10 +79,8 @@
 Monitor* CGC_lock                     = NULL;
 Monitor* STS_lock                     = NULL;
 Monitor* FullGCCount_lock             = NULL;
-Mutex*   SATB_Q_FL_lock               = NULL;
 Monitor* SATB_Q_CBL_mon               = NULL;
 Mutex*   Shared_SATB_Q_lock           = NULL;
-Mutex*   DirtyCardQ_FL_lock           = NULL;
 Monitor* DirtyCardQ_CBL_mon           = NULL;
 Mutex*   Shared_DirtyCardQ_lock       = NULL;
 Mutex*   MarkStackFreeList_lock       = NULL;
@@ -214,11 +212,9 @@
     def(FullGCCount_lock           , PaddedMonitor, leaf,        true,  Monitor::_safepoint_check_never);      // in support of ExplicitGCInvokesConcurrent
   }
   if (UseG1GC) {
-    def(SATB_Q_FL_lock             , PaddedMutex  , access,      true,  Monitor::_safepoint_check_never);
     def(SATB_Q_CBL_mon             , PaddedMonitor, access,      true,  Monitor::_safepoint_check_never);
     def(Shared_SATB_Q_lock         , PaddedMutex  , access + 1,  true,  Monitor::_safepoint_check_never);
 
-    def(DirtyCardQ_FL_lock         , PaddedMutex  , access,      true,  Monitor::_safepoint_check_never);
     def(DirtyCardQ_CBL_mon         , PaddedMonitor, access,      true,  Monitor::_safepoint_check_never);
     def(Shared_DirtyCardQ_lock     , PaddedMutex  , access + 1,  true,  Monitor::_safepoint_check_never);
 
@@ -235,7 +231,6 @@
     def(MonitoringSupport_lock     , PaddedMutex  , native   ,   true,  Monitor::_safepoint_check_never);      // used for serviceability monitoring support
   }
   if (UseShenandoahGC) {
-    def(SATB_Q_FL_lock             , PaddedMutex  , access,      true,  Monitor::_safepoint_check_never);
     def(SATB_Q_CBL_mon             , PaddedMonitor, access,      true,  Monitor::_safepoint_check_never);
     def(Shared_SATB_Q_lock         , PaddedMutex  , access + 1,  true,  Monitor::_safepoint_check_never);
 
--- a/src/hotspot/share/runtime/mutexLocker.hpp	Sat Jan 19 11:20:01 2019 +0100
+++ b/src/hotspot/share/runtime/mutexLocker.hpp	Sat Jan 19 19:50:01 2019 -0500
@@ -76,16 +76,12 @@
                                                  // fore- & background GC threads.
 extern Monitor* STS_lock;                        // used for joining/leaving SuspendibleThreadSet.
 extern Monitor* FullGCCount_lock;                // in support of "concurrent" full gc
-extern Mutex*   SATB_Q_FL_lock;                  // Protects SATB Q
-                                                 // buffer free list.
 extern Monitor* SATB_Q_CBL_mon;                  // Protects SATB Q
                                                  // completed buffer queue.
 extern Mutex*   Shared_SATB_Q_lock;              // Lock protecting SATB
                                                  // queue shared by
                                                  // non-Java threads.
 
-extern Mutex*   DirtyCardQ_FL_lock;              // Protects dirty card Q
-                                                 // buffer free list.
 extern Monitor* DirtyCardQ_CBL_mon;              // Protects dirty card Q
                                                  // completed buffer queue.
 extern Mutex*   Shared_DirtyCardQ_lock;          // Lock protecting dirty card
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/hotspot/share/utilities/lockFreeStack.hpp	Sat Jan 19 19:50:01 2019 -0500
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef SHARE_UTILITIES_LOCKFREESTACK_HPP
+#define SHARE_UTILITIES_LOCKFREESTACK_HPP
+
+#include "runtime/atomic.hpp"
+#include "utilities/debug.hpp"
+#include "utilities/macros.hpp"
+
+// The LockFreeStack class template provides a lock-free LIFO. The objects
+// in the sequence are intrusively linked via a member in the objects.  As
+// a result, there is no allocation involved in adding objects to the stack
+// or removing them from the stack.
+//
+// To be used in a LockFreeStack of objects of type T, an object of
+// type T must have a list entry member of type T* volatile, with an
+// non-member accessor function returning a pointer to that member.  A
+// LockFreeStack is associated with the class of its elements and an
+// entry member from that class.
+//
+// An object can be in multiple stacks at the same time, so long as
+// each stack uses a different entry member. That is, the class of the
+// object must have multiple LockFreeStack entry members, one for each
+// stack in which the object may simultaneously be an element.
+//
+// LockFreeStacks support polymorphic elements.  Because the objects
+// in a stack are externally managed, rather than being embedded
+// values in the stack, the actual type of such objects may be more
+// specific than the stack's element type.
+//
+// \tparam T is the class of the elements in the stack.
+//
+// \tparam next_ptr is a function pointer.  Applying this function to
+// an object of type T must return a pointer to the list entry member
+// of the object associated with the LockFreeStack type.
+template<typename T, T* volatile* (*next_ptr)(T&)>
+class LockFreeStack {
+  T* volatile _top;
+
+  void prepend_impl(T* first, T* last) {
+    T* cur = top();
+    T* old;
+    do {
+      old = cur;
+      set_next(*last, cur);
+      cur = Atomic::cmpxchg(first, &_top, cur);
+    } while (old != cur);
+  }
+
+  // Noncopyable.
+  LockFreeStack(const LockFreeStack&);
+  LockFreeStack& operator=(const LockFreeStack&);
+
+public:
+  LockFreeStack() : _top(NULL) {}
+  ~LockFreeStack() { assert(empty(), "stack not empty"); }
+
+  // Atomically removes the top object from this stack and returns a
+  // pointer to that object, or NULL if this stack is empty. Acts as a
+  // full memory barrier. Subject to ABA behavior; callers must ensure
+  // usage is safe.
+  T* pop() {
+    T* result = top();
+    T* old;
+    do {
+      old = result;
+      T* new_top = NULL;
+      if (result != NULL) {
+        new_top = next(*result);
+      }
+      // CAS even on empty pop, for consistent membar behavior.
+      result = Atomic::cmpxchg(new_top, &_top, result);
+    } while (result != old);
+    if (result != NULL) {
+      set_next(*result, NULL);
+    }
+    return result;
+  }
+
+  // Atomically exchange the list of elements with NULL, returning the old
+  // list of elements.  Acts as a full memory barrier.
+  // postcondition: empty()
+  T* pop_all() {
+    return Atomic::xchg((T*)NULL, &_top);
+  }
+
+  // Atomically adds value to the top of this stack.  Acts as a full
+  // memory barrier.
+  void push(T& value) {
+    assert(next(value) == NULL, "precondition");
+    prepend_impl(&value, &value);
+  }
+
+  // Atomically adds the list of objects (designated by first and
+  // last) before the objects already in this stack, in the same order
+  // as in the list. Acts as a full memory barrier.
+  // precondition: next(last) == NULL.
+  // postcondition: top() == &first, next(last) == old top().
+  void prepend(T& first, T& last) {
+    assert(next(last) == NULL, "precondition");
+#ifdef ASSERT
+    for (T* p = &first; p != &last; p = next(*p)) {
+      assert(p != NULL, "invalid prepend list");
+    }
+#endif
+    prepend_impl(&first, &last);
+  }
+
+  // Atomically adds the list of objects headed by first before the
+  // objects already in this stack, in the same order as in the list.
+  // Acts as a full memory barrier.
+  // postcondition: top() == &first.
+  void prepend(T& first) {
+    T* last = &first;
+    while (true) {
+      T* step_to = next(*last);
+      if (step_to == NULL) break;
+      last = step_to;
+    }
+    prepend_impl(&first, last);
+  }
+
+  // Return true if the stack is empty.
+  bool empty() const { return top() == NULL; }
+
+  // Return the most recently pushed element, or NULL if the stack is empty.
+  // The returned element is not removed from the stack.
+  T* top() const { return Atomic::load(&_top); }
+
+  // Return the number of objects in the stack.  There must be no concurrent
+  // pops while the length is being determined.
+  size_t length() const {
+    size_t result = 0;
+    for (const T* current = top(); current != NULL; current = next(*current)) {
+      ++result;
+    }
+    return result;
+  }
+
+  // Return the entry following value in the list used by the
+  // specialized LockFreeStack class.
+  static T* next(const T& value) {
+    return Atomic::load(next_ptr(const_cast<T&>(value)));
+  }
+
+  // Set the entry following value to new_next in the list used by the
+  // specialized LockFreeStack class.  Not thread-safe; in particular,
+  // if value is in an instance of this specialization of LockFreeStack,
+  // there must be no concurrent push or pop operations on that stack.
+  static void set_next(T& value, T* new_next) {
+    Atomic::store(new_next, next_ptr(value));
+  }
+};
+
+#endif // SHARE_UTILITIES_LOCKFREESTACK_HPP
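
To illustrate the template contract described above (hypothetical element type; the gtest added below does essentially the same): an element carries the intrusive link itself and exposes it through a static accessor whose address is bound into the LockFreeStack instantiation.

#include "utilities/debug.hpp"
#include "utilities/lockFreeStack.hpp"

// Sketch only; not part of this changeset.
class WorkItem {
  WorkItem* volatile _next;
  int _payload;

  static WorkItem* volatile* next_ptr(WorkItem& w) { return &w._next; }

public:
  WorkItem(int payload = 0) : _next(NULL), _payload(payload) {}
  int payload() const { return _payload; }

  typedef LockFreeStack<WorkItem, &next_ptr> Stack;
};

void example_stack_use() {
  WorkItem::Stack stack;
  WorkItem a(1), b(2);
  stack.push(a);                     // acts as a full memory barrier
  stack.push(b);
  assert(stack.top() == &b, "LIFO");
  WorkItem* popped = stack.pop();    // ABA exposure is the caller's responsibility
  WorkItem* rest = stack.pop_all();  // detaches the remainder; stack is now empty
  assert(popped == &b && rest == &a, "LIFO order");
}
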
--- a/test/hotspot/gtest/gc/shared/test_ptrQueueBufferAllocator.cpp	Sat Jan 19 11:20:01 2019 +0100
+++ b/test/hotspot/gtest/gc/shared/test_ptrQueueBufferAllocator.cpp	Sat Jan 19 19:50:01 2019 -0500
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2018, 2019, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -24,14 +24,37 @@
 
 #include "precompiled.hpp"
 #include "gc/shared/ptrQueue.hpp"
-#include "runtime/mutex.hpp"
+#include "memory/allocation.hpp"
+#include "runtime/interfaceSupport.inline.hpp"
+#include "runtime/orderAccess.hpp"
+#include "runtime/semaphore.inline.hpp"
+#include "runtime/thread.hpp"
+#include "utilities/globalCounter.inline.hpp"
+#include "utilities/globalDefinitions.hpp"
+#include "utilities/ostream.hpp"
+#include "threadHelper.inline.hpp"
 #include "unittest.hpp"
 
+class BufferNode::TestSupport : AllStatic {
+public:
+  static bool try_transfer_pending(Allocator* allocator) {
+    return allocator->try_transfer_pending();
+  }
+
+  class CompletedList;
+  class AllocatorThread;
+  class ProcessorThread;
+};
+
+typedef BufferNode::TestSupport::CompletedList CompletedList;
+typedef BufferNode::TestSupport::AllocatorThread AllocatorThread;
+typedef BufferNode::TestSupport::ProcessorThread ProcessorThread;
+
 // Some basic testing of BufferNode::Allocator.
 TEST_VM(PtrQueueBufferAllocatorTest, test) {
-  Mutex m(Mutex::leaf, "PtrQueueBufferAllocatorTest",
-          false, Mutex::_safepoint_check_never);
-  BufferNode::Allocator allocator(256, &m);
+  const size_t buffer_size = 256;
+  BufferNode::Allocator allocator("Test Buffer Allocator", buffer_size);
+  ASSERT_EQ(buffer_size, allocator.buffer_size());
 
   // Allocate some new nodes for use in testing.
   BufferNode* nodes[10] = {};
@@ -44,8 +67,11 @@
 
   // Release the nodes, adding them to the allocator's free list.
   for (size_t i = 0; i < node_count; ++i) {
-    ASSERT_EQ(i, allocator.free_count());
     allocator.release(nodes[i]);
+  }
+  ASSERT_TRUE(BufferNode::TestSupport::try_transfer_pending(&allocator));
+  ASSERT_EQ(node_count, allocator.free_count());
+  for (size_t i = 0; i < node_count; ++i) {
     if (i == 0) {
       ASSERT_EQ((BufferNode*)NULL, nodes[i]->next());
     } else {
@@ -56,7 +82,6 @@
   // Allocate nodes from the free list.
   for (size_t i = 0; i < node_count; ++i) {
     size_t j = node_count - i;
-    ASSERT_EQ(j, allocator.free_count());
     ASSERT_EQ(nodes[j - 1], allocator.allocate());
   }
   ASSERT_EQ(0u, allocator.free_count());
@@ -65,11 +90,161 @@
   for (size_t i = 0; i < node_count; ++i) {
     allocator.release(nodes[i]);
   }
+  ASSERT_TRUE(BufferNode::TestSupport::try_transfer_pending(&allocator));
   ASSERT_EQ(node_count, allocator.free_count());
 
   // Destroy some nodes in the free list.
   // We don't have a way to verify destruction, but we can at
-  // leat verify we don't crash along the way.
-  allocator.reduce_free_list();
+  // least verify we don't crash along the way.
+  size_t count = allocator.free_count();
+  ASSERT_EQ(count, allocator.reduce_free_list(count));
   // destroy allocator.
 }
+
+// Stress test with lock-free allocator and completed buffer list.
+// Completed buffer list pop avoids ABA by also being in a critical
+// section that is synchronized by the allocator's release.
+
+class BufferNode::TestSupport::CompletedList {
+  BufferNode::Stack _completed_list;
+
+public:
+  CompletedList() : _completed_list() {}
+
+  ~CompletedList() {
+    assert(_completed_list.empty(), "completed list not empty");
+  }
+
+  void push(BufferNode* node) {
+    assert(node != NULL, "precondition");
+    _completed_list.push(*node);
+  }
+
+  BufferNode* pop() {
+    GlobalCounter::CriticalSection cs(Thread::current());
+    return _completed_list.pop();
+  }
+};
+
+// Simulate a mutator thread, allocating buffers and adding them to
+// the completed buffer list.
+class BufferNode::TestSupport::AllocatorThread : public JavaTestThread {
+  BufferNode::Allocator* _allocator;
+  CompletedList* _cbl;
+  volatile size_t* _total_allocations;
+  volatile bool* _continue_running;
+  size_t _allocations;
+
+public:
+  AllocatorThread(Semaphore* post,
+                  BufferNode::Allocator* allocator,
+                  CompletedList* cbl,
+                  volatile size_t* total_allocations,
+                  volatile bool* continue_running) :
+    JavaTestThread(post),
+    _allocator(allocator),
+    _cbl(cbl),
+    _total_allocations(total_allocations),
+    _continue_running(continue_running),
+    _allocations(0)
+  {}
+
+  virtual void main_run() {
+    while (OrderAccess::load_acquire(_continue_running)) {
+      BufferNode* node = _allocator->allocate();
+      _cbl->push(node);
+      ++_allocations;
+      ThreadBlockInVM tbiv(this); // Safepoint check.
+    }
+    tty->print_cr("allocations: " SIZE_FORMAT, _allocations);
+    Atomic::add(_allocations, _total_allocations);
+  }
+};
+
+// Simulate a GC thread, taking buffers from the completed buffer list
+// and returning them to the allocator.
+class BufferNode::TestSupport::ProcessorThread : public JavaTestThread {
+  BufferNode::Allocator* _allocator;
+  CompletedList* _cbl;
+  volatile bool* _continue_running;
+
+public:
+  ProcessorThread(Semaphore* post,
+                  BufferNode::Allocator* allocator,
+                  CompletedList* cbl,
+                  volatile bool* continue_running) :
+    JavaTestThread(post),
+    _allocator(allocator),
+    _cbl(cbl),
+    _continue_running(continue_running)
+  {}
+
+  virtual void main_run() {
+    while (true) {
+      BufferNode* node = _cbl->pop();
+      if (node != NULL) {
+        _allocator->release(node);
+      } else if (!OrderAccess::load_acquire(_continue_running)) {
+        return;
+      }
+      ThreadBlockInVM tbiv(this); // Safepoint check.
+    }
+  }
+};
+
+static void run_test(BufferNode::Allocator* allocator, CompletedList* cbl) {
+  const uint nthreads = 4;
+  const uint milliseconds_to_run = 1000;
+
+  Semaphore post;
+  volatile size_t total_allocations = 0;
+  volatile bool allocator_running = true;
+  volatile bool processor_running = true;
+
+  ProcessorThread* proc_threads[nthreads] = {};
+  for (uint i = 0; i < nthreads; ++i) {
+    proc_threads[i] = new ProcessorThread(&post,
+                                          allocator,
+                                          cbl,
+                                          &processor_running);
+    proc_threads[i]->doit();
+  }
+
+  AllocatorThread* alloc_threads[nthreads] = {};
+  for (uint i = 0; i < nthreads; ++i) {
+    alloc_threads[i] = new AllocatorThread(&post,
+                                           allocator,
+                                           cbl,
+                                           &total_allocations,
+                                           &allocator_running);
+    alloc_threads[i]->doit();
+  }
+
+  JavaThread* this_thread = JavaThread::current();
+  tty->print_cr("Stressing allocator for %u ms", milliseconds_to_run);
+  {
+    ThreadInVMfromNative invm(this_thread);
+    os::sleep(this_thread, milliseconds_to_run, true);
+  }
+  OrderAccess::release_store(&allocator_running, false);
+  for (uint i = 0; i < nthreads; ++i) {
+    ThreadInVMfromNative invm(this_thread);
+    post.wait_with_safepoint_check(this_thread);
+  }
+  OrderAccess::release_store(&processor_running, false);
+  for (uint i = 0; i < nthreads; ++i) {
+    ThreadInVMfromNative invm(this_thread);
+    post.wait_with_safepoint_check(this_thread);
+  }
+  ASSERT_TRUE(BufferNode::TestSupport::try_transfer_pending(allocator));
+  tty->print_cr("total allocations: " SIZE_FORMAT, total_allocations);
+  tty->print_cr("allocator free count: " SIZE_FORMAT, allocator->free_count());
+}
+
+const size_t buffer_size = 1024;
+
+TEST_VM(PtrQueueBufferAllocatorTest, stress_free_list_allocator) {
+  BufferNode::Allocator allocator("Test Allocator", buffer_size);
+  CompletedList completed;
+  run_test(&allocator, &completed);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/hotspot/gtest/utilities/test_lockFreeStack.cpp	Sat Jan 19 19:50:01 2019 -0500
@@ -0,0 +1,306 @@
+/*
+ * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+#include "precompiled.hpp"
+#include "memory/allocation.inline.hpp"
+#include "runtime/atomic.hpp"
+#include "runtime/orderAccess.hpp"
+#include "utilities/globalDefinitions.hpp"
+#include "utilities/lockFreeStack.hpp"
+#include "threadHelper.inline.hpp"
+#include "unittest.hpp"
+#include <new>
+
+class LockFreeStackTestElement {
+  typedef LockFreeStackTestElement Element;
+
+  Element* volatile _entry;
+  Element* volatile _entry1;
+  size_t _id;
+
+  static Element* volatile* entry_ptr(Element& e) { return &e._entry; }
+  static Element* volatile* entry1_ptr(Element& e) { return &e._entry1; }
+
+public:
+  LockFreeStackTestElement(size_t id = 0) : _entry(), _entry1(), _id(id) {}
+  size_t id() const { return _id; }
+  void set_id(size_t value) { _id = value; }
+
+  typedef LockFreeStack<Element, &entry_ptr> Stack;
+  typedef LockFreeStack<Element, &entry1_ptr> Stack1;
+};
+
+typedef LockFreeStackTestElement Element;
+typedef Element::Stack Stack;
+typedef Element::Stack1 Stack1;
+
+static void initialize_ids(Element* elements, size_t size) {
+  for (size_t i = 0; i < size; ++i) {
+    elements[i].set_id(i);
+  }
+}
+
+class LockFreeStackTestBasics : public ::testing::Test {
+public:
+  LockFreeStackTestBasics();
+
+  static const size_t nelements = 10;
+  Element elements[nelements];
+  Stack stack;
+
+private:
+  void initialize();
+};
+
+const size_t LockFreeStackTestBasics::nelements;
+
+LockFreeStackTestBasics::LockFreeStackTestBasics() : stack() {
+  initialize_ids(elements, nelements);
+  initialize();
+}
+
+void LockFreeStackTestBasics::initialize() {
+  ASSERT_TRUE(stack.empty());
+  ASSERT_EQ(0u, stack.length());
+  ASSERT_TRUE(stack.pop() == NULL);
+  ASSERT_TRUE(stack.top() == NULL);
+
+  for (size_t id = 0; id < nelements; ++id) {
+    ASSERT_EQ(id, stack.length());
+    Element* e = &elements[id];
+    ASSERT_EQ(id, e->id());
+    stack.push(*e);
+    ASSERT_FALSE(stack.empty());
+    ASSERT_EQ(e, stack.top());
+  }
+}
+
+TEST_F(LockFreeStackTestBasics, push_pop) {
+  for (size_t i = nelements; i > 0; ) {
+    ASSERT_FALSE(stack.empty());
+    ASSERT_EQ(i, stack.length());
+    --i;
+    Element* e = stack.pop();
+    ASSERT_TRUE(e != NULL);
+    ASSERT_EQ(&elements[i], e);
+    ASSERT_EQ(i, e->id());
+  }
+  ASSERT_TRUE(stack.empty());
+  ASSERT_EQ(0u, stack.length());
+  ASSERT_TRUE(stack.pop() == NULL);
+}
+
+TEST_F(LockFreeStackTestBasics, prepend_one) {
+  Stack other_stack;
+  ASSERT_TRUE(other_stack.empty());
+  ASSERT_TRUE(other_stack.pop() == NULL);
+  ASSERT_EQ(0u, other_stack.length());
+  ASSERT_TRUE(other_stack.top() == NULL);
+  ASSERT_TRUE(other_stack.pop() == NULL);
+
+  other_stack.prepend(*stack.pop_all());
+  ASSERT_EQ(nelements, other_stack.length());
+  ASSERT_TRUE(stack.empty());
+  ASSERT_EQ(0u, stack.length());
+  ASSERT_TRUE(stack.pop() == NULL);
+  ASSERT_TRUE(stack.top() == NULL);
+
+  for (size_t i = nelements; i > 0; ) {
+    ASSERT_EQ(i, other_stack.length());
+    --i;
+    Element* e = other_stack.pop();
+    ASSERT_TRUE(e != NULL);
+    ASSERT_EQ(&elements[i], e);
+    ASSERT_EQ(i, e->id());
+  }
+  ASSERT_EQ(0u, other_stack.length());
+  ASSERT_TRUE(other_stack.pop() == NULL);
+}
+
+TEST_F(LockFreeStackTestBasics, prepend_two) {
+  Stack other_stack;
+  ASSERT_TRUE(other_stack.empty());
+  ASSERT_EQ(0u, other_stack.length());
+  ASSERT_TRUE(other_stack.top() == NULL);
+  ASSERT_TRUE(other_stack.pop() == NULL);
+
+  Element* top = stack.pop_all();
+  ASSERT_EQ(top, &elements[nelements - 1]);
+  other_stack.prepend(*top, elements[0]);
+
+  for (size_t i = nelements; i > 0; ) {
+    ASSERT_EQ(i, other_stack.length());
+    --i;
+    Element* e = other_stack.pop();
+    ASSERT_TRUE(e != NULL);
+    ASSERT_EQ(&elements[i], e);
+    ASSERT_EQ(i, e->id());
+  }
+  ASSERT_EQ(0u, other_stack.length());
+  ASSERT_TRUE(other_stack.pop() == NULL);
+}
+
+TEST_F(LockFreeStackTestBasics, two_stacks) {
+  Stack1 stack1;
+  ASSERT_TRUE(stack1.pop() == NULL);
+
+  for (size_t id = 0; id < nelements; ++id) {
+    stack1.push(elements[id]);
+  }
+  ASSERT_EQ(nelements, stack1.length());
+  Element* e0 = stack.top();
+  Element* e1 = stack1.top();
+  while (true) {
+    ASSERT_EQ(e0, e1);
+    if (e0 == NULL) break;
+    e0 = stack.next(*e0);
+    e1 = stack1.next(*e1);
+  }
+
+  for (size_t i = nelements; i > 0; ) {
+    ASSERT_EQ(i, stack.length());
+    ASSERT_EQ(i, stack1.length());
+    --i;
+    Element* e = stack.pop();
+    ASSERT_TRUE(e != NULL);
+    ASSERT_EQ(&elements[i], e);
+    ASSERT_EQ(i, e->id());
+
+    Element* e1 = stack1.pop();
+    ASSERT_TRUE(e1 != NULL);
+    ASSERT_EQ(&elements[i], e1);
+    ASSERT_EQ(i, e1->id());
+
+    ASSERT_EQ(e, e1);
+  }
+  ASSERT_EQ(0u, stack.length());
+  ASSERT_EQ(0u, stack1.length());
+  ASSERT_TRUE(stack.pop() == NULL);
+  ASSERT_TRUE(stack1.pop() == NULL);
+}
+
+class LockFreeStackTestThread : public JavaTestThread {
+  uint _id;
+  Stack* _from;
+  Stack* _to;
+  volatile size_t* _processed;
+  size_t _process_limit;
+  size_t _local_processed;
+  volatile bool _ready;
+
+public:
+  LockFreeStackTestThread(Semaphore* post,
+                          uint id,
+                          Stack* from,
+                          Stack* to,
+                          volatile size_t* processed,
+                          size_t process_limit) :
+    JavaTestThread(post),
+    _id(id),
+    _from(from),
+    _to(to),
+    _processed(processed),
+    _process_limit(process_limit),
+    _local_processed(0),
+    _ready(false)
+  {}
+
+  virtual void main_run() {
+    OrderAccess::release_store_fence(&_ready, true);
+    while (true) {
+      Element* e = _from->pop();
+      if (e != NULL) {
+        _to->push(*e);
+        Atomic::inc(_processed);
+        ++_local_processed;
+      } else if (OrderAccess::load_acquire(_processed) == _process_limit) {
+        tty->print_cr("thread %u processed " SIZE_FORMAT, _id, _local_processed);
+        return;
+      }
+    }
+  }
+
+  bool ready() const { return OrderAccess::load_acquire(&_ready); }
+};
+
+TEST_VM(LockFreeStackTest, stress) {
+  Semaphore post;
+  Stack initial_stack;
+  Stack start_stack;
+  Stack middle_stack;
+  Stack final_stack;
+  volatile size_t stage1_processed = 0;
+  volatile size_t stage2_processed = 0;
+
+  const size_t nelements = 10000;
+  Element* elements = NEW_C_HEAP_ARRAY(Element, nelements, mtOther);
+  for (size_t id = 0; id < nelements; ++id) {
+    ::new (&elements[id]) Element(id);
+    initial_stack.push(elements[id]);
+  }
+  ASSERT_EQ(nelements, initial_stack.length());
+
+  // - stage1 threads pop from start_stack and push to middle_stack.
+  // - stage2 threads pop from middle_stack and push to final_stack.
+  // - all threads in a stage count the number of elements processed in
+  //   their corresponding stageN_processed counter.
+
+  const uint stage1_threads = 2;
+  const uint stage2_threads = 2;
+  const uint nthreads = stage1_threads + stage2_threads;
+  LockFreeStackTestThread* threads[nthreads] = {};
+
+  for (uint i = 0; i < ARRAY_SIZE(threads); ++i) {
+    Stack* from = &start_stack;
+    Stack* to = &middle_stack;
+    volatile size_t* processed = &stage1_processed;
+    if (i >= stage1_threads) {
+      from = &middle_stack;
+      to = &final_stack;
+      processed = &stage2_processed;
+    }
+    threads[i] =
+      new LockFreeStackTestThread(&post, i, from, to, processed, nelements);
+    threads[i]->doit();
+    while (!threads[i]->ready()) {} // Wait until ready to start test.
+  }
+
+  // Transfer elements to start_stack to start test.
+  start_stack.prepend(*initial_stack.pop_all());
+
+  // Wait for all threads to complete.
+  for (uint i = 0; i < nthreads; ++i) {
+    post.wait();
+  }
+
+  // Verify expected state.
+  ASSERT_EQ(nelements, stage1_processed);
+  ASSERT_EQ(nelements, stage2_processed);
+  ASSERT_EQ(0u, initial_stack.length());
+  ASSERT_EQ(0u, start_stack.length());
+  ASSERT_EQ(0u, middle_stack.length());
+  ASSERT_EQ(nelements, final_stack.length());
+  while (final_stack.pop() != NULL) {}
+
+  FREE_C_HEAP_ARRAY(Element, elements);
+}