8196083: Avoid locking in OopStorage::release
author kbarrett
Thu, 08 Feb 2018 17:23:43 -0500
changeset 48886 e1d09bd56d2d
parent 48885 00e159258897
child 48887 368d7a786111
8196083: Avoid locking in OopStorage::release
Summary: Defer release list updates to later allocate/delete operations.
Reviewed-by: coleenp, eosterlund
src/hotspot/share/gc/shared/oopStorage.cpp
src/hotspot/share/gc/shared/oopStorage.hpp
src/hotspot/share/gc/shared/oopStorage.inline.hpp
src/hotspot/share/runtime/mutexLocker.cpp
test/hotspot/gtest/gc/shared/test_oopStorage.cpp
--- a/src/hotspot/share/gc/shared/oopStorage.cpp	Thu Feb 08 15:21:59 2018 -0500
+++ b/src/hotspot/share/gc/shared/oopStorage.cpp	Thu Feb 08 17:23:43 2018 -0500
@@ -26,7 +26,9 @@
 #include "gc/shared/oopStorage.inline.hpp"
 #include "gc/shared/oopStorageParState.inline.hpp"
 #include "logging/log.hpp"
+#include "logging/logStream.hpp"
 #include "memory/allocation.inline.hpp"
+#include "memory/resourceArea.hpp"
 #include "runtime/atomic.hpp"
 #include "runtime/handles.inline.hpp"
 #include "runtime/mutex.hpp"
@@ -107,7 +109,7 @@
 }
 
 // Blocks start with an array of BitsPerWord oop entries.  That array
-// is divided into conceptual BytesPerWord sections of BitsPerWord
+// is divided into conceptual BytesPerWord sections of BitsPerByte
 // entries.  Blocks are allocated aligned on section boundaries, for
 // the convenience of mapping from an entry to the containing block;
 // see block_for_ptr().  Aligning on section boundary rather than on
@@ -130,7 +132,9 @@
   _owner(owner),
   _memory(memory),
   _active_entry(),
-  _allocate_entry()
+  _allocate_entry(),
+  _deferred_updates_next(NULL),
+  _release_refcount(0)
 {
   STATIC_ASSERT(_data_pos == 0);
   STATIC_ASSERT(section_size * section_count == ARRAY_SIZE(_data));
@@ -143,6 +147,8 @@
 #endif
 
 OopStorage::Block::~Block() {
+  assert(_release_refcount == 0, "deleting block while releasing");
+  assert(_deferred_updates_next == NULL, "deleting block with deferred update");
   // Clear fields used by block_for_ptr and entry validation, which
   // might help catch bugs.  Volatile to prevent dead-store elimination.
   const_cast<uintx volatile&>(_allocated_bitmask) = 0;
@@ -182,8 +188,24 @@
   return bitmask_for_index(get_index(ptr));
 }
 
-uintx OopStorage::Block::cmpxchg_allocated_bitmask(uintx new_value, uintx compare_value) {
-  return Atomic::cmpxchg(new_value, &_allocated_bitmask, compare_value);
+// A block is deletable if
+// (1) It is empty.
+// (2) There is not a release() operation currently operating on it.
+// (3) It is not in the deferred updates list.
+// The order of tests is important for proper interaction between release()
+// and concurrent deletion.
+bool OopStorage::Block::is_deletable() const {
+  return (OrderAccess::load_acquire(&_allocated_bitmask) == 0) &&
+         (OrderAccess::load_acquire(&_release_refcount) == 0) &&
+         (OrderAccess::load_acquire(&_deferred_updates_next) == NULL);
+}
+
+OopStorage::Block* OopStorage::Block::deferred_updates_next() const {
+  return _deferred_updates_next;
+}
+
+void OopStorage::Block::set_deferred_updates_next(Block* block) {
+  _deferred_updates_next = block;
 }
 
 bool OopStorage::Block::contains(const oop* ptr) const {
@@ -203,7 +225,7 @@
     assert(!is_full_bitmask(allocated), "attempt to allocate from full block");
     unsigned index = count_trailing_zeros(~allocated);
     uintx new_value = allocated | bitmask_for_index(index);
-    uintx fetched = cmpxchg_allocated_bitmask(new_value, allocated);
+    uintx fetched = Atomic::cmpxchg(new_value, &_allocated_bitmask, allocated);
     if (fetched == allocated) {
       return get_pointer(index); // CAS succeeded; return entry for index.
     }
@@ -261,20 +283,6 @@
   return NULL;
 }
 
-bool OopStorage::is_valid_block_locked_or_safepoint(const Block* check_block) const {
-  assert_locked_or_safepoint(_allocate_mutex);
-  // For now, simple linear search.  Do something more clever if this
-  // is a performance bottleneck, particularly for allocation_status.
-  for (const Block* block = _active_list.chead();
-       block != NULL;
-       block = _active_list.next(*block)) {
-    if (check_block == block) {
-      return true;
-    }
-  }
-  return false;
-}
-
 #ifdef ASSERT
 void OopStorage::assert_at_safepoint() {
   assert(SafepointSynchronize::is_at_safepoint(), "must be at safepoint");
@@ -291,39 +299,49 @@
 // kept at the end of the _allocate_list, to make it easy for empty block
 // deletion to find them.
 //
-// allocate(), release(), and delete_empty_blocks_concurrent() all lock the
+// allocate() and delete_empty_blocks_concurrent() lock the
 // _allocate_mutex while performing any list modifications.
 //
 // allocate() and release() update a block's _allocated_bitmask using CAS
-// loops.  This prevents loss of updates even though release() may perform
-// some updates without any locking.
+// loops.  This prevents loss of updates even though release() performs
+// its updates without any locking.
 //
 // allocate() obtains the entry from the first block in the _allocate_list,
 // and updates that block's _allocated_bitmask to indicate the entry is in
 // use.  If this makes the block full (all entries in use), the block is
 // removed from the _allocate_list so it won't be considered by future
-// allocations until some entries in it are relased.
+// allocations until some entries in it are released.
 //
-// release() looks up the block for the entry without locking.  Once the block
-// has been determined, its _allocated_bitmask needs to be updated, and its
-// position in the _allocate_list may need to be updated.  There are two
-// cases:
+// release() is performed lock-free. release() first looks up the block for
+// the entry, using address alignment to find the enclosing block (thereby
+// avoiding iteration over the _active_list).  Once the block has been
+// determined, its _allocated_bitmask needs to be updated, and its position in
+// the _allocate_list may need to be updated.  There are two cases:
 //
 // (a) If the block is neither full nor would become empty with the release of
 // the entry, only its _allocated_bitmask needs to be updated.  But if the CAS
 // update fails, the applicable case may change for the retry.
 //
-// (b) Otherwise, the _allocate_list will also need to be modified.  This
-// requires locking the _allocate_mutex, and then attempting to CAS the
-// _allocated_bitmask.  If the CAS fails, the applicable case may change for
-// the retry.  If the CAS succeeds, then update the _allocate_list according
-// to the the state changes.  If the block changed from full to not full, then
-// it needs to be added to the _allocate_list, for use in future allocations.
-// If the block changed from not empty to empty, then it is moved to the end
-// of the _allocate_list, for ease of empty block deletion processing.
+// (b) Otherwise, the _allocate_list also needs to be modified.  This requires
+// locking the _allocate_mutex.  To keep the release() operation lock-free,
+// rather than updating the _allocate_list itself, it instead performs a
+// lock-free push of the block onto the _deferred_updates list.  Entries on
+// that list are processed by allocate() and delete_empty_blocks_XXX(), while
+// they already hold the necessary lock.  That processing makes the block's
+// list state consistent with its current _allocated_bitmask.  The block is
+// added to the _allocate_list if not already present and the bitmask is not
+// full.  The block is moved to the end of the _allocate_list if the bitmask
+// is empty, for ease of empty block deletion processing.
 
 oop* OopStorage::allocate() {
   MutexLockerEx ml(_allocate_mutex, Mutex::_no_safepoint_check_flag);
+  // Do some deferred update processing every time we allocate.
+  // Continue processing deferred updates if _allocate_list is empty,
+  // in the hope that we'll get a block from that, rather than
+  // allocating a new block.
+  while (reduce_deferred_updates() && (_allocate_list.head() == NULL)) {}
+
+  // Use the first block in _allocate_list for the allocation.
   Block* block = _allocate_list.head();
   if (block == NULL) {
     // No available blocks; make a new one, and add to storage.
@@ -331,7 +349,17 @@
       MutexUnlockerEx mul(_allocate_mutex, Mutex::_no_safepoint_check_flag);
       block = Block::new_block(this);
     }
-    if (block != NULL) {
+    if (block == NULL) {
+      while (_allocate_list.head() == NULL) {
+        if (!reduce_deferred_updates()) {
+          // Failed to make new block, no other thread made a block
+          // available while the mutex was released, and didn't get
+          // one from a deferred update either, so return failure.
+          log_info(oopstorage, ref)("%s: failed allocation", name());
+          return NULL;
+        }
+      }
+    } else {
       // Add new block to storage.
       log_info(oopstorage, blocks)("%s: new block " PTR_FORMAT, name(), p2i(block));
 
@@ -340,22 +368,14 @@
       // to allocate from non-empty blocks, to allow empty blocks to
       // be deleted.
       _allocate_list.push_back(*block);
-      ++_empty_block_count;
       // Add to front of _active_list, and then record as the head
       // block, for concurrent iteration protocol.
       _active_list.push_front(*block);
       ++_block_count;
       // Ensure all setup of block is complete before making it visible.
       OrderAccess::release_store(&_active_head, block);
-    } else {
-      log_info(oopstorage, blocks)("%s: failed new block allocation", name());
     }
     block = _allocate_list.head();
-    if (block == NULL) {
-      // Failed to make new block, and no other thread made a block
-      // available while the mutex was released, so return failure.
-      return NULL;
-    }
   }
   // Allocate from first block.
   assert(block != NULL, "invariant");
@@ -363,7 +383,6 @@
   if (block->is_empty()) {
     // Transitioning from empty to not empty.
     log_debug(oopstorage, blocks)("%s: block not empty " PTR_FORMAT, name(), p2i(block));
-    --_empty_block_count;
   }
   oop* result = block->allocate();
   assert(result != NULL, "allocation failed");
@@ -384,72 +403,115 @@
   return Block::block_for_ptr(this, ptr);
 }
 
-void OopStorage::release_from_block(Block& block, uintx releasing) {
-  assert(releasing != 0, "invariant");
-  uintx allocated = block.allocated_bitmask();
-  while (true) {
-    assert(releasing == (allocated & releasing), "invariant");
-    uintx new_value = allocated ^ releasing;
-    // CAS new_value into block's allocated bitmask, retrying with
-    // updated allocated bitmask until the CAS succeeds.
-    uintx fetched;
-    if (!is_full_bitmask(allocated) && !is_empty_bitmask(new_value)) {
-      fetched = block.cmpxchg_allocated_bitmask(new_value, allocated);
-      if (fetched == allocated) return;
-    } else {
-      // Need special handling if transitioning from full to not full,
-      // or from not empty to empty.  For those cases, must hold the
-      // _allocation_mutex when updating the allocated bitmask, to
-      // ensure the associated list manipulations will be consistent
-      // with the allocation bitmask that is visible to other threads
-      // in allocate() or deleting empty blocks.
-      MutexLockerEx ml(_allocate_mutex, Mutex::_no_safepoint_check_flag);
-      fetched = block.cmpxchg_allocated_bitmask(new_value, allocated);
-      if (fetched == allocated) {
-        // CAS succeeded; handle special cases, which might no longer apply.
-        if (is_full_bitmask(allocated)) {
-          // Transitioning from full to not-full; add to _allocate_list.
-          log_debug(oopstorage, blocks)("%s: block not full " PTR_FORMAT, name(), p2i(&block));
-          _allocate_list.push_front(block);
-          assert(!block.is_full(), "invariant"); // Still not full.
-        }
-        if (is_empty_bitmask(new_value)) {
-          // Transitioning from not-empty to empty; move to end of
-          // _allocate_list, to make it a deletion candidate.
-          log_debug(oopstorage, blocks)("%s: block empty " PTR_FORMAT, name(), p2i(&block));
-          _allocate_list.unlink(block);
-          _allocate_list.push_back(block);
-          ++_empty_block_count;
-          assert(block.is_empty(), "invariant"); // Still empty.
-        }
-        return;                 // Successful CAS and transitions handled.
-      }
-    }
-    // CAS failed; retry with latest value.
-    allocated = fetched;
+static void log_release_transitions(uintx releasing,
+                                    uintx old_allocated,
+                                    const OopStorage* owner,
+                                    const void* block) {
+  ResourceMark rm;
+  Log(oopstorage, blocks) log;
+  LogStream ls(log.debug());
+  if (is_full_bitmask(old_allocated)) {
+    ls.print_cr("%s: block not full " PTR_FORMAT, owner->name(), p2i(block));
+  }
+  if (releasing == old_allocated) {
+    ls.print_cr("%s: block empty " PTR_FORMAT, owner->name(), p2i(block));
   }
 }
 
-#ifdef ASSERT
-void OopStorage::check_release(const Block* block, const oop* ptr) const {
-  switch (allocation_status_validating_block(block, ptr)) {
-  case INVALID_ENTRY:
-    fatal("Releasing invalid entry: " PTR_FORMAT, p2i(ptr));
-    break;
+void OopStorage::Block::release_entries(uintx releasing, Block* volatile* deferred_list) {
+  assert(releasing != 0, "preconditon");
+  // Prevent empty block deletion when transitioning to empty.
+  Atomic::inc(&_release_refcount);
+
+  // Atomically update allocated bitmask.
+  uintx old_allocated = _allocated_bitmask;
+  while (true) {
+    assert((releasing & ~old_allocated) == 0, "releasing unallocated entries");
+    uintx new_value = old_allocated ^ releasing;
+    uintx fetched = Atomic::cmpxchg(new_value, &_allocated_bitmask, old_allocated);
+    if (fetched == old_allocated) break; // Successful update.
+    old_allocated = fetched;             // Retry with updated bitmask.
+  }
 
-  case UNALLOCATED_ENTRY:
-    fatal("Releasing unallocated entry: " PTR_FORMAT, p2i(ptr));
-    break;
+  // Now that the bitmask has been updated, if we have a state transition
+  // (updated bitmask is empty or old bitmask was full), atomically push
+  // this block onto the deferred updates list.  Some future call to
+  // reduce_deferred_updates will make any needed changes related to this
+  // block and _allocate_list.  This deferral avoids list updates and the
+  // associated locking here.
+  if ((releasing == old_allocated) || is_full_bitmask(old_allocated)) {
+    // Log transitions.  Both transitions are possible in a single update.
+    if (log_is_enabled(Debug, oopstorage, blocks)) {
+      log_release_transitions(releasing, old_allocated, _owner, this);
+    }
+    // Attempt to claim responsibility for adding this block to the deferred
+    // list, by setting the link to non-NULL by self-looping.  If this fails,
+    // then someone else has made such a claim and the deferred update has not
+    // yet been processed and will include our change, so we don't need to do
+    // anything further.
+    if (Atomic::replace_if_null(this, &_deferred_updates_next)) {
+      // Successfully claimed.  Push, with self-loop for end-of-list.
+      Block* head = *deferred_list;
+      while (true) {
+        _deferred_updates_next = (head == NULL) ? this : head;
+        Block* fetched = Atomic::cmpxchg(this, deferred_list, head);
+        if (fetched == head) break; // Successful update.
+        head = fetched;             // Retry with updated head.
+      }
+      log_debug(oopstorage, blocks)("%s: deferred update " PTR_FORMAT,
+                                    _owner->name(), p2i(this));
+    }
+  }
+  // Release hold on empty block deletion.
+  Atomic::dec(&_release_refcount);
+}
 
-  case ALLOCATED_ENTRY:
-    assert(block->contains(ptr), "invariant");
-    break;
+// Process one available deferred update.  Returns true if one was processed.
+bool OopStorage::reduce_deferred_updates() {
+  assert_locked_or_safepoint(_allocate_mutex);
+  // Atomically pop a block off the list, if any available.
+  // No ABA issue because this is only called by one thread at a time.
+  // The atomicity is with respect to pushes by release().
+  Block* block = OrderAccess::load_acquire(&_deferred_updates);
+  while (true) {
+    if (block == NULL) return false;
+    // Try atomic pop of block from list.
+    Block* tail = block->deferred_updates_next();
+    if (block == tail) tail = NULL; // Handle self-loop end marker.
+    Block* fetched = Atomic::cmpxchg(tail, &_deferred_updates, block);
+    if (fetched == block) break; // Update successful.
+    block = fetched;             // Retry with updated block.
+  }
+  block->set_deferred_updates_next(NULL); // Clear tail after updating head.
+  // Ensure bitmask read after pop is complete, including clearing tail, for
+  // ordering with release().  Without this, we may be processing a stale
+  // bitmask state here while blocking a release() operation from recording
+  // the deferred update needed for its bitmask change.
+  OrderAccess::storeload();
+  // Process popped block.
+  uintx allocated = block->allocated_bitmask();
 
-  default:
-    ShouldNotReachHere();
+  // Make membership in list consistent with bitmask state.
+  if ((_allocate_list.ctail() != NULL) &&
+      ((_allocate_list.ctail() == block) ||
+       (_allocate_list.next(*block) != NULL))) {
+    // Block is in the allocate list.
+    assert(!is_full_bitmask(allocated), "invariant");
+  } else if (!is_full_bitmask(allocated)) {
+    // Block is not in the allocate list, but now should be.
+    _allocate_list.push_front(*block);
+  } // Else block is full and not in list, which is correct.
+
+  // Move empty block to end of list, for possible deletion.
+  if (is_empty_bitmask(allocated)) {
+    _allocate_list.unlink(*block);
+    _allocate_list.push_back(*block);
   }
+
+  log_debug(oopstorage, blocks)("%s: processed deferred update " PTR_FORMAT,
+                                name(), p2i(block));
+  return true;              // Processed one pending update.
 }
-#endif // ASSERT
 
 inline void check_release_entry(const oop* entry) {
   assert(entry != NULL, "Releasing NULL");
@@ -459,9 +521,9 @@
 void OopStorage::release(const oop* ptr) {
   check_release_entry(ptr);
   Block* block = find_block_or_null(ptr);
-  check_release(block, ptr);
+  assert(block != NULL, "%s: invalid release " PTR_FORMAT, name(), p2i(ptr));
   log_info(oopstorage, ref)("%s: released " PTR_FORMAT, name(), p2i(ptr));
-  release_from_block(*block, block->bitmask_for_entry(ptr));
+  block->release_entries(block->bitmask_for_entry(ptr), &_deferred_updates);
   Atomic::dec(&_allocation_count);
 }
 
@@ -470,15 +532,15 @@
   while (i < size) {
     check_release_entry(ptrs[i]);
     Block* block = find_block_or_null(ptrs[i]);
-    check_release(block, ptrs[i]);
+    assert(block != NULL, "%s: invalid release " PTR_FORMAT, name(), p2i(ptrs[i]));
     log_info(oopstorage, ref)("%s: released " PTR_FORMAT, name(), p2i(ptrs[i]));
     size_t count = 0;
     uintx releasing = 0;
     for ( ; i < size; ++i) {
       const oop* entry = ptrs[i];
+      check_release_entry(entry);
       // If entry not in block, finish block and resume outer loop with entry.
       if (!block->contains(entry)) break;
-      check_release_entry(entry);
       // Add entry to releasing bitmap.
       log_info(oopstorage, ref)("%s: released " PTR_FORMAT, name(), p2i(entry));
       uintx entry_bitmask = block->bitmask_for_entry(entry);
@@ -488,7 +550,7 @@
       ++count;
     }
     // Release the contiguous entries that are in block.
-    release_from_block(*block, releasing);
+    block->release_entries(releasing, &_deferred_updates);
     Atomic::sub(count, &_allocation_count);
   }
 }
@@ -506,11 +568,11 @@
   _active_list(&Block::get_active_entry),
   _allocate_list(&Block::get_allocate_entry),
   _active_head(NULL),
+  _deferred_updates(NULL),
   _allocate_mutex(allocate_mutex),
   _active_mutex(active_mutex),
   _allocation_count(0),
   _block_count(0),
-  _empty_block_count(0),
   _concurrent_iteration_active(false)
 {
   assert(_active_mutex->rank() < _allocate_mutex->rank(),
@@ -529,6 +591,10 @@
 
 OopStorage::~OopStorage() {
   Block* block;
+  while ((block = _deferred_updates) != NULL) {
+    _deferred_updates = block->deferred_updates_next();
+    block->set_deferred_updates_next(NULL);
+  }
   while ((block = _allocate_list.head()) != NULL) {
     _allocate_list.unlink(*block);
   }
@@ -539,43 +605,47 @@
   FREE_C_HEAP_ARRAY(char, _name);
 }
 
-void OopStorage::delete_empty_blocks_safepoint(size_t retain) {
+void OopStorage::delete_empty_blocks_safepoint() {
   assert_at_safepoint();
+  // Process any pending release updates, which may make more empty
+  // blocks available for deletion.
+  while (reduce_deferred_updates()) {}
   // Don't interfere with a concurrent iteration.
   if (_concurrent_iteration_active) return;
-  // Compute the number of blocks to remove, to minimize volatile accesses.
-  size_t empty_blocks = _empty_block_count;
-  if (retain < empty_blocks) {
-    size_t remove_count = empty_blocks - retain;
-    // Update volatile counters once.
-    _block_count -= remove_count;
-    _empty_block_count -= remove_count;
-    do {
-      const Block* block = _allocate_list.ctail();
-      assert(block != NULL, "invariant");
-      assert(block->is_empty(), "invariant");
-      // Remove block from lists, and delete it.
-      _active_list.unlink(*block);
-      _allocate_list.unlink(*block);
-      delete_empty_block(*block);
-    } while (--remove_count > 0);
-    // Update _active_head, in case current value was in deleted set.
-    _active_head = _active_list.head();
+  // Delete empty (and otherwise deletable) blocks from end of _allocate_list.
+  for (const Block* block = _allocate_list.ctail();
+       (block != NULL) && block->is_deletable();
+       block = _allocate_list.ctail()) {
+    _active_list.unlink(*block);
+    _allocate_list.unlink(*block);
+    delete_empty_block(*block);
+    --_block_count;
   }
+  // Update _active_head, in case current value was in deleted set.
+  _active_head = _active_list.head();
 }
 
-void OopStorage::delete_empty_blocks_concurrent(size_t retain) {
+void OopStorage::delete_empty_blocks_concurrent() {
   MutexLockerEx ml(_allocate_mutex, Mutex::_no_safepoint_check_flag);
   // Other threads could be adding to the empty block count while we
   // release the mutex across the block deletions.  Set an upper bound
   // on how many blocks we'll try to release, so other threads can't
   // cause an unbounded stay in this function.
-  if (_empty_block_count <= retain) return;
-  size_t limit = _empty_block_count - retain;
-  for (size_t i = 0; (i < limit) && (retain < _empty_block_count); ++i) {
+  size_t limit = _block_count;
+
+  for (size_t i = 0; i < limit; ++i) {
+    // Additional updates might become available while we dropped the
+    // lock.  But limit number processed to limit lock duration.
+    reduce_deferred_updates();
+
     const Block* block = _allocate_list.ctail();
-    assert(block != NULL, "invariant");
-    assert(block->is_empty(), "invariant");
+    if ((block == NULL) || !block->is_deletable()) {
+      // No block to delete, so done.  There could be more pending
+      // deferred updates that could give us more work to do; deal with
+      // that in some later call, to limit lock duration here.
+      return;
+    }
+
     {
       MutexLockerEx aml(_active_mutex, Mutex::_no_safepoint_check_flag);
       // Don't interfere with a concurrent iteration.
@@ -589,28 +659,31 @@
     }
     // Remove block from _allocate_list and delete it.
     _allocate_list.unlink(*block);
-    --_empty_block_count;
     // Release mutex while deleting block.
     MutexUnlockerEx ul(_allocate_mutex, Mutex::_no_safepoint_check_flag);
     delete_empty_block(*block);
   }
 }
 
-OopStorage::EntryStatus
-OopStorage::allocation_status_validating_block(const Block* block,
-                                               const oop* ptr) const {
-  MutexLockerEx ml(_allocate_mutex, Mutex::_no_safepoint_check_flag);
-  if ((block == NULL) || !is_valid_block_locked_or_safepoint(block)) {
-    return INVALID_ENTRY;
-  } else if ((block->allocated_bitmask() & block->bitmask_for_entry(ptr)) != 0) {
-    return ALLOCATED_ENTRY;
-  } else {
-    return UNALLOCATED_ENTRY;
+OopStorage::EntryStatus OopStorage::allocation_status(const oop* ptr) const {
+  const Block* block = find_block_or_null(ptr);
+  if (block != NULL) {
+    // Verify block is a real block.  For now, simple linear search.
+    // Do something more clever if this is a performance bottleneck.
+    MutexLockerEx ml(_allocate_mutex, Mutex::_no_safepoint_check_flag);
+    for (const Block* check_block = _active_list.chead();
+         check_block != NULL;
+         check_block = _active_list.next(*check_block)) {
+      if (check_block == block) {
+        if ((block->allocated_bitmask() & block->bitmask_for_entry(ptr)) != 0) {
+          return ALLOCATED_ENTRY;
+        } else {
+          return UNALLOCATED_ENTRY;
+        }
+      }
+    }
   }
-}
-
-OopStorage::EntryStatus OopStorage::allocation_status(const oop* ptr) const {
-  return allocation_status_validating_block(find_block_or_null(ptr), ptr);
+  return INVALID_ENTRY;
 }
 
 size_t OopStorage::allocation_count() const {
@@ -621,10 +694,6 @@
   return _block_count;
 }
 
-size_t OopStorage::empty_block_count() const {
-  return _empty_block_count;
-}
-
 size_t OopStorage::total_memory_usage() const {
   size_t total_size = sizeof(OopStorage);
   total_size += strlen(name()) + 1;
@@ -690,17 +759,12 @@
 void OopStorage::print_on(outputStream* st) const {
   size_t allocations = _allocation_count;
   size_t blocks = _block_count;
-  size_t empties = _empty_block_count;
-  // Comparison is being careful about racy accesses.
-  size_t used = (blocks < empties) ? 0 : (blocks - empties);
 
   double data_size = section_size * section_count;
-  double alloc_percentage = percent_of((double)allocations, used * data_size);
+  double alloc_percentage = percent_of((double)allocations, blocks * data_size);
 
-  st->print("%s: " SIZE_FORMAT " entries in " SIZE_FORMAT " blocks (%.F%%), "
-            SIZE_FORMAT " empties, " SIZE_FORMAT " bytes",
-            name(), allocations, used, alloc_percentage,
-            empties, total_memory_usage());
+  st->print("%s: " SIZE_FORMAT " entries in " SIZE_FORMAT " blocks (%.F%%), " SIZE_FORMAT " bytes",
+            name(), allocations, blocks, alloc_percentage, total_memory_usage());
   if (_concurrent_iteration_active) {
     st->print(", concurrent iteration active");
   }
--- a/src/hotspot/share/gc/shared/oopStorage.hpp	Thu Feb 08 15:21:59 2018 -0500
+++ b/src/hotspot/share/gc/shared/oopStorage.hpp	Thu Feb 08 17:23:43 2018 -0500
@@ -84,10 +84,6 @@
   // The number of blocks of entries.  Useful for sizing parallel iteration.
   size_t block_count() const;
 
-  // The number of blocks with no allocated entries.  Useful for sizing
-  // parallel iteration and scheduling block deletion.
-  size_t empty_block_count() const;
-
   // Total number of blocks * memory allocation per block, plus
   // bookkeeping overhead, including this storage object.
   size_t total_memory_usage() const;
@@ -107,14 +103,13 @@
   // postcondition: *result == NULL.
   oop* allocate();
 
-  // Deallocates ptr, after setting its value to NULL. Locks _allocate_mutex.
+  // Deallocates ptr.  No locking.
   // precondition: ptr is a valid allocated entry.
   // precondition: *ptr == NULL.
   void release(const oop* ptr);
 
   // Releases all the ptrs.  Possibly faster than individual calls to
-  // release(oop*).  Best if ptrs is sorted by address.  Locks
-  // _allocate_mutex.
+  // release(oop*).  Best if ptrs is sorted by address.  No locking.
   // precondition: All elements of ptrs are valid allocated entries.
   // precondition: *ptrs[i] == NULL, for i in [0,size).
   void release(const oop* const* ptrs, size_t size);
@@ -160,8 +155,8 @@
   // Block cleanup functions are for the exclusive use of the GC.
   // Both stop deleting if there is an in-progress concurrent iteration.
   // Concurrent deletion locks both the allocate_mutex and the active_mutex.
-  void delete_empty_blocks_safepoint(size_t retain = 1);
-  void delete_empty_blocks_concurrent(size_t retain = 1);
+  void delete_empty_blocks_safepoint();
+  void delete_empty_blocks_concurrent();
 
   // Debugging and logging support.
   const char* name() const;
@@ -231,6 +226,7 @@
   BlockList _active_list;
   BlockList _allocate_list;
   Block* volatile _active_head;
+  Block* volatile _deferred_updates;
 
   Mutex* _allocate_mutex;
   Mutex* _active_mutex;
@@ -238,16 +234,12 @@
   // Counts are volatile for racy unlocked accesses.
   volatile size_t _allocation_count;
   volatile size_t _block_count;
-  volatile size_t _empty_block_count;
   // mutable because this gets set even for const iteration.
   mutable bool _concurrent_iteration_active;
 
   Block* find_block_or_null(const oop* ptr) const;
-  bool is_valid_block_locked_or_safepoint(const Block* block) const;
-  EntryStatus allocation_status_validating_block(const Block* block, const oop* ptr) const;
-  void check_release(const Block* block, const oop* ptr) const NOT_DEBUG_RETURN;
-  void release_from_block(Block& block, uintx release_bitmask);
   void delete_empty_block(const Block& block);
+  bool reduce_deferred_updates();
 
   static void assert_at_safepoint() NOT_DEBUG_RETURN;
 
--- a/src/hotspot/share/gc/shared/oopStorage.inline.hpp	Thu Feb 08 15:21:59 2018 -0500
+++ b/src/hotspot/share/gc/shared/oopStorage.inline.hpp	Thu Feb 08 17:23:43 2018 -0500
@@ -44,6 +44,8 @@
   void* _memory;              // Unaligned storage containing block.
   BlockEntry _active_entry;
   BlockEntry _allocate_entry;
+  Block* volatile _deferred_updates_next;
+  volatile uintx _release_refcount;
 
   Block(const OopStorage* owner, void* memory);
   ~Block();
@@ -75,7 +77,10 @@
   bool is_full() const;
   bool is_empty() const;
   uintx allocated_bitmask() const;
-  uintx cmpxchg_allocated_bitmask(uintx new_value, uintx compare_value);
+  bool is_deletable() const;
+
+  Block* deferred_updates_next() const;
+  void set_deferred_updates_next(Block* new_next);
 
   bool contains(const oop* ptr) const;
 
@@ -86,6 +91,8 @@
   static Block* new_block(const OopStorage* owner);
   static void delete_block(const Block& block);
 
+  void release_entries(uintx releasing, Block* volatile* deferred_list);
+
   template<typename F> bool iterate(F f);
   template<typename F> bool iterate(F f) const;
 }; // class Block
--- a/src/hotspot/share/runtime/mutexLocker.cpp	Thu Feb 08 15:21:59 2018 -0500
+++ b/src/hotspot/share/runtime/mutexLocker.cpp	Thu Feb 08 17:23:43 2018 -0500
@@ -253,10 +253,10 @@
   // of some places which hold other locks while releasing a handle, including
   // the Patching_lock, which is of "special" rank.  As a temporary workaround,
   // lower the JNI oopstorage lock ranks to make them super-special.
-  def(JNIGlobalAlloc_lock          , PaddedMutex  , special-1,   true,  Monitor::_safepoint_check_never);
-  def(JNIGlobalActive_lock         , PaddedMutex  , special-2,   true,  Monitor::_safepoint_check_never);
-  def(JNIWeakAlloc_lock            , PaddedMutex  , special-1,   true,  Monitor::_safepoint_check_never);
-  def(JNIWeakActive_lock           , PaddedMutex  , special-2,   true,  Monitor::_safepoint_check_never);
+  def(JNIGlobalAlloc_lock          , PaddedMutex  , nonleaf,     true,  Monitor::_safepoint_check_never);
+  def(JNIGlobalActive_lock         , PaddedMutex  , nonleaf-1,   true,  Monitor::_safepoint_check_never);
+  def(JNIWeakAlloc_lock            , PaddedMutex  , nonleaf,     true,  Monitor::_safepoint_check_never);
+  def(JNIWeakActive_lock           , PaddedMutex  , nonleaf-1,   true,  Monitor::_safepoint_check_never);
   def(JNICritical_lock             , PaddedMonitor, nonleaf,     true,  Monitor::_safepoint_check_always);     // used for JNI critical regions
   def(AdapterHandlerLibrary_lock   , PaddedMutex  , nonleaf,     true,  Monitor::_safepoint_check_always);
 
--- a/test/hotspot/gtest/gc/shared/test_oopStorage.cpp	Thu Feb 08 15:21:59 2018 -0500
+++ b/test/hotspot/gtest/gc/shared/test_oopStorage.cpp	Thu Feb 08 17:23:43 2018 -0500
@@ -70,6 +70,10 @@
     return storage._allocate_mutex;
   }
 
+  static bool reduce_deferred_updates(OopStorage& storage) {  // Lets the tests invoke storage.reduce_deferred_updates().
+    return storage.reduce_deferred_updates();  // Result is passed through unchanged.
+  }
+
   static bool block_is_empty(const Block& block) {
     return block.is_empty();
   }
@@ -127,9 +131,31 @@
   return list.chead() == NULL;
 }
 
-static void release_entry(OopStorage& storage, oop* entry) {
+static bool process_deferred_updates(OopStorage& storage) {  // Drains storage's deferred updates; returns true if at least one reduction step reported work.
+  MutexLockerEx ml(TestAccess::allocate_mutex(storage), Mutex::_no_safepoint_check_flag);  // Hold the storage's allocate mutex for the whole drain.
+  bool result = false;
+  while (TestAccess::reduce_deferred_updates(storage)) {  // Keep reducing until the call returns false.
+    result = true;
+  }
+  return result;
+}
+
+static void release_entry(OopStorage& storage, oop* entry, bool process_deferred = true) {  // Clears and releases entry; by default also drains deferred updates.
   *entry = NULL;
   storage.release(entry);
+  if (process_deferred) {  // Pass false to leave deferred updates pending, e.g. to drain once after a batch of releases.
+    process_deferred_updates(storage);
+  }
+}
+
+static size_t empty_block_count(const OopStorage& storage) {  // Counts the run of consecutive empty blocks at the tail of storage's allocate list.
+  const TestAccess::BlockList& list = TestAccess::allocate_list(storage);
+  size_t count = 0;
+  for (const OopBlock* block = list.ctail();  // Start at the tail and walk toward the head via prev().
+       (block != NULL) && block->is_empty();  // Stop at the first non-empty block or when the list is exhausted.
+       ++count, block = list.prev(*block))
+  {}
+  return count;
 }
 
 class OopStorageTest : public ::testing::Test {
@@ -188,31 +214,22 @@
 class OopStorageTestWithAllocation::VM_DeleteBlocksAtSafepoint
   : public VM_GTestExecuteAtSafepoint {
 public:
-  VM_DeleteBlocksAtSafepoint(OopStorage* storage, size_t retain) :
-    _storage(storage), _retain(retain)
-  {}
+  VM_DeleteBlocksAtSafepoint(OopStorage* storage) : _storage(storage) {}  // Only the target storage is recorded; the retain count is gone.
 
   void doit() {
-    _storage->delete_empty_blocks_safepoint(_retain);
+    _storage->delete_empty_blocks_safepoint();  // Called without a retain argument now.
   }
 
 private:
   OopStorage* _storage;
-  size_t _retain;
 };
 
 static bool is_allocate_list_sorted(const OopStorage& storage) {
   // The allocate_list isn't strictly sorted.  Rather, all empty
-  // blocks are segregated to the end of the list.  And the number of
-  // empty blocks should match empty_block_count().
-  size_t expected_empty = storage.empty_block_count();
+  // blocks are segregated to the end of the list.
   const TestAccess::BlockList& list = TestAccess::allocate_list(storage);
   const OopBlock* block = list.ctail();
-  for (size_t i = 0; i < expected_empty; ++i, block = list.prev(*block)) {
-    if ((block == NULL) || !block->is_empty()) {
-      return false;
-    }
-  }
+  for ( ; (block != NULL) && block->is_empty(); block = list.prev(*block)) {}
   for ( ; block != NULL; block = list.prev(*block)) {
     if (block->is_empty()) {
       return false;
@@ -243,7 +260,7 @@
   EXPECT_EQ(1u, _storage.block_count());
   EXPECT_EQ(1u, list_length(TestAccess::allocate_list(_storage)));
 
-  EXPECT_EQ(0u, _storage.empty_block_count());
+  EXPECT_EQ(0u, empty_block_count(_storage));
 
   const OopBlock* block = TestAccess::allocate_list(_storage).chead();
   EXPECT_NE(block, (OopBlock*)NULL);
@@ -259,7 +276,7 @@
   EXPECT_EQ(1u, _storage.block_count());
   EXPECT_EQ(1u, list_length(TestAccess::allocate_list(_storage)));
 
-  EXPECT_EQ(1u, _storage.empty_block_count());
+  EXPECT_EQ(1u, empty_block_count(_storage));
 
   const OopBlock* new_block = TestAccess::allocate_list(_storage).chead();
   EXPECT_EQ(block, new_block);
@@ -322,14 +339,14 @@
   TestAccess::BlockList& active_list = TestAccess::active_list(_storage);
   TestAccess::BlockList& allocate_list = TestAccess::allocate_list(_storage);
 
-  EXPECT_EQ(0u, _storage.empty_block_count());
+  EXPECT_EQ(0u, empty_block_count(_storage));
 
   entries[0] = _storage.allocate();
   ASSERT_TRUE(entries[0] != NULL);
   EXPECT_EQ(1u, list_length(active_list));
   EXPECT_EQ(1u, _storage.block_count());
   EXPECT_EQ(1u, list_length(allocate_list));
-  EXPECT_EQ(0u, _storage.empty_block_count());
+  EXPECT_EQ(0u, empty_block_count(_storage));
 
   const OopBlock* block = active_list.chead();
   EXPECT_EQ(1u, TestAccess::block_allocation_count(*block));
@@ -339,7 +356,7 @@
     entries[i] = _storage.allocate();
     EXPECT_EQ(i + 1, _storage.allocation_count());
     ASSERT_TRUE(entries[i] != NULL);
-    EXPECT_EQ(0u, _storage.empty_block_count());
+    EXPECT_EQ(0u, empty_block_count(_storage));
 
     if (block == NULL) {
       ASSERT_FALSE(is_list_empty(allocate_list));
@@ -374,7 +391,7 @@
 
   EXPECT_EQ(list_length(active_list), list_length(allocate_list));
   EXPECT_EQ(list_length(active_list), _storage.block_count());
-  EXPECT_EQ(list_length(active_list), _storage.empty_block_count());
+  EXPECT_EQ(list_length(active_list), empty_block_count(_storage));
   for (const OopBlock* block = allocate_list.chead();
        block != NULL;
        block = allocate_list.next(*block)) {
@@ -386,7 +403,7 @@
   static const size_t step = 11;
   ASSERT_NE(0u, _max_entries % step); // max_entries and step are mutually prime
 
-  EXPECT_EQ(0u, _storage.empty_block_count());
+  EXPECT_EQ(0u, empty_block_count(_storage));
 
   TestAccess::BlockList& active_list = TestAccess::active_list(_storage);
   TestAccess::BlockList& allocate_list = TestAccess::allocate_list(_storage);
@@ -409,7 +426,7 @@
   EXPECT_EQ(list_length(active_list), list_length(allocate_list));
   EXPECT_EQ(list_length(active_list), _storage.block_count());
   EXPECT_EQ(0u, total_allocation_count(active_list));
-  EXPECT_EQ(list_length(allocate_list), _storage.empty_block_count());
+  EXPECT_EQ(list_length(allocate_list), empty_block_count(_storage));
 }
 
 TEST_VM_F(OopStorageTestWithAllocation, random_allocate_release) {
@@ -417,7 +434,7 @@
   static const size_t allocate_step = 5;
   ASSERT_NE(0u, _max_entries % release_step); // max_entries and step are mutually prime
 
-  EXPECT_EQ(0u, _storage.empty_block_count());
+  EXPECT_EQ(0u, empty_block_count(_storage));
 
   TestAccess::BlockList& active_list = TestAccess::active_list(_storage);
   TestAccess::BlockList& allocate_list = TestAccess::allocate_list(_storage);
@@ -449,7 +466,7 @@
   EXPECT_EQ(list_length(active_list), list_length(allocate_list));
   EXPECT_EQ(list_length(active_list), _storage.block_count());
   EXPECT_EQ(0u, total_allocation_count(active_list));
-  EXPECT_EQ(list_length(allocate_list), _storage.empty_block_count());
+  EXPECT_EQ(list_length(allocate_list), empty_block_count(_storage));
 }
 
 template<bool sorted>
@@ -471,11 +488,12 @@
     EXPECT_EQ(_max_entries - nrelease, _storage.allocation_count());
 
     for (size_t i = 0; i < nrelease; ++i) {
-      release_entry(_storage, _entries[2 * i + 1]);
+      release_entry(_storage, _entries[2 * i + 1], false);
       EXPECT_EQ(_max_entries - nrelease - (i + 1), _storage.allocation_count());
     }
+    EXPECT_TRUE(process_deferred_updates(_storage));
 
-    EXPECT_EQ(_storage.block_count(), _storage.empty_block_count());
+    EXPECT_EQ(_storage.block_count(), empty_block_count(_storage));
 
     FREE_C_HEAP_ARRAY(oop*, to_release);
   }
@@ -607,8 +625,9 @@
   }
 
   while (allocated > 0) {
-    release_entry(_storage, entries[--allocated]);
+    release_entry(_storage, entries[--allocated], false);
   }
+  process_deferred_updates(_storage);
 }
 
 class OopStorageTestIteration : public OopStorageTestWithAllocation {
@@ -627,16 +646,17 @@
     memset(_states, 0, sizeof(_states));
 
     size_t initial_release = 0;
-    for ( ; _storage.empty_block_count() < 2; ++initial_release) {
+    for ( ; empty_block_count(_storage) < 2; ++initial_release) {
       ASSERT_GT(_max_entries, initial_release);
       release_entry(_storage, _entries[initial_release]);
       _states[0][initial_release] = mark_released;
     }
 
     for (size_t i = initial_release; i < _max_entries; i += 3) {
-      release_entry(_storage, _entries[i]);
+      release_entry(_storage, _entries[i], false);
       _states[0][i] = mark_released;
     }
+    process_deferred_updates(_storage);
   }
 
   class VerifyState;
@@ -1006,30 +1026,21 @@
   EXPECT_EQ(initial_active_size, _storage.block_count());
   ASSERT_LE(3u, initial_active_size); // Need at least 3 blocks for test
 
-  for (size_t i = 0; _storage.empty_block_count() < 3; ++i) {
+  for (size_t i = 0; empty_block_count(_storage) < 3; ++i) {
     ASSERT_GT(_max_entries, i);
     release_entry(_storage, _entries[i]);
   }
 
   EXPECT_EQ(initial_active_size, list_length(active_list));
   EXPECT_EQ(initial_active_size, _storage.block_count());
-  EXPECT_EQ(3u, _storage.empty_block_count());
+  EXPECT_EQ(3u, empty_block_count(_storage));
 
   {
     ThreadInVMfromNative invm(JavaThread::current());
-    VM_DeleteBlocksAtSafepoint op(&_storage, 2);
+    VM_DeleteBlocksAtSafepoint op(&_storage);
     VMThread::execute(&op);
   }
-  EXPECT_EQ(2u, _storage.empty_block_count());
-  EXPECT_EQ(initial_active_size - 1, list_length(active_list));
-  EXPECT_EQ(initial_active_size - 1, _storage.block_count());
-
-  {
-    ThreadInVMfromNative invm(JavaThread::current());
-    VM_DeleteBlocksAtSafepoint op(&_storage, 0);
-    VMThread::execute(&op);
-  }
-  EXPECT_EQ(0u, _storage.empty_block_count());
+  EXPECT_EQ(0u, empty_block_count(_storage));
   EXPECT_EQ(initial_active_size - 3, list_length(active_list));
   EXPECT_EQ(initial_active_size - 3, _storage.block_count());
 }
@@ -1041,22 +1052,17 @@
   EXPECT_EQ(initial_active_size, _storage.block_count());
   ASSERT_LE(3u, initial_active_size); // Need at least 3 blocks for test
 
-  for (size_t i = 0; _storage.empty_block_count() < 3; ++i) {
+  for (size_t i = 0; empty_block_count(_storage) < 3; ++i) {
     ASSERT_GT(_max_entries, i);
     release_entry(_storage, _entries[i]);
   }
 
   EXPECT_EQ(initial_active_size, list_length(active_list));
   EXPECT_EQ(initial_active_size, _storage.block_count());
-  EXPECT_EQ(3u, _storage.empty_block_count());
+  EXPECT_EQ(3u, empty_block_count(_storage));
 
-  _storage.delete_empty_blocks_concurrent(2);
-  EXPECT_EQ(2u, _storage.empty_block_count());
-  EXPECT_EQ(initial_active_size - 1, list_length(active_list));
-  EXPECT_EQ(initial_active_size - 1, _storage.block_count());
-
-  _storage.delete_empty_blocks_concurrent(0);
-  EXPECT_EQ(0u, _storage.empty_block_count());
+  _storage.delete_empty_blocks_concurrent();
+  EXPECT_EQ(0u, empty_block_count(_storage));
   EXPECT_EQ(initial_active_size - 3, list_length(active_list));
   EXPECT_EQ(initial_active_size - 3, _storage.block_count());
 }
@@ -1075,13 +1081,14 @@
 
   for (size_t i = 0; i < _max_entries; ++i) {
     if ((_entries[i] != retained) && (_entries[i] != released)) {
-      release_entry(_storage, _entries[i]);
+      // Leave deferred release updates to block deletion.
+      release_entry(_storage, _entries[i], false);
     }
   }
 
   {
     ThreadInVMfromNative invm(JavaThread::current());
-    VM_DeleteBlocksAtSafepoint op(&_storage, 0);
+    VM_DeleteBlocksAtSafepoint op(&_storage);
     VMThread::execute(&op);
   }
   EXPECT_EQ(OopStorage::ALLOCATED_ENTRY, _storage.allocation_status(retained));
@@ -1121,12 +1128,14 @@
 TEST_VM_F(OopStorageTestWithAllocation, print_storage) {
   // Release the first 1/2
   for (size_t i = 0; i < (_max_entries / 2); ++i) {
-    release_entry(_storage, _entries[i]);
+    // Deferred updates don't affect print output.
+    release_entry(_storage, _entries[i], false);
     _entries[i] = NULL;
   }
   // Release every other remaining
   for (size_t i = _max_entries / 2; i < _max_entries; i += 2) {
-    release_entry(_storage, _entries[i]);
+    // Deferred updates don't affect print output.
+    release_entry(_storage, _entries[i], false);
     _entries[i] = NULL;
   }
 
@@ -1137,24 +1146,17 @@
   size_t expected_blocks = (_max_entries + entries_per_block - 1) / entries_per_block;
   EXPECT_EQ(expected_blocks, _storage.block_count());
 
-  size_t expected_empties = (_max_entries / 2) / entries_per_block;
-  EXPECT_EQ(expected_empties, _storage.empty_block_count());
-
-  size_t expected_used = expected_blocks - expected_empties;
-
-  double expected_usage = (100.0 * expected_entries) / (expected_used * entries_per_block);
+  double expected_usage = (100.0 * expected_entries) / (expected_blocks * entries_per_block);
 
   {
     ResourceMark rm;
     stringStream expected_st;
     expected_st.print("Test Storage: " SIZE_FORMAT
                       " entries in " SIZE_FORMAT
-                      " blocks (%.F%%), " SIZE_FORMAT
-                      " empties, " SIZE_FORMAT " bytes",
+                      " blocks (%.F%%), " SIZE_FORMAT " bytes",
                       expected_entries,
-                      expected_used,
+                      expected_blocks,
                       expected_usage,
-                      expected_empties,
                       _storage.total_memory_usage());
     stringStream st;
     _storage.print_on(&st);