src/hotspot/share/gc/shared/oopStorage.cpp
changeset 49977 9f758f0bb058
parent 49711 4a7addb5762c
child 50209 2fdce199fcb9
--- a/src/hotspot/share/gc/shared/oopStorage.cpp	Thu May 03 14:13:20 2018 -0700
+++ b/src/hotspot/share/gc/shared/oopStorage.cpp	Thu May 03 17:36:50 2018 -0400
@@ -28,20 +28,22 @@
 #include "logging/log.hpp"
 #include "logging/logStream.hpp"
 #include "memory/allocation.inline.hpp"
-#include "memory/resourceArea.hpp"
 #include "runtime/atomic.hpp"
+#include "runtime/globals.hpp"
 #include "runtime/handles.inline.hpp"
 #include "runtime/mutex.hpp"
 #include "runtime/mutexLocker.hpp"
 #include "runtime/orderAccess.inline.hpp"
 #include "runtime/safepoint.hpp"
 #include "runtime/stubRoutines.hpp"
+#include "runtime/thread.hpp"
 #include "utilities/align.hpp"
 #include "utilities/count_trailing_zeros.hpp"
 #include "utilities/debug.hpp"
 #include "utilities/globalDefinitions.hpp"
 #include "utilities/macros.hpp"
 #include "utilities/ostream.hpp"
+#include "utilities/spinYield.hpp"
 
 OopStorage::BlockEntry::BlockEntry() : _prev(NULL), _next(NULL) {}
 
@@ -108,6 +110,90 @@
   }
 }
 
+OopStorage::BlockArray::BlockArray(size_t size) :
+  _size(size),
+  _block_count(0),
+  _refcount(0)
+{}
+
+OopStorage::BlockArray::~BlockArray() {
+  assert(_refcount == 0, "precondition");
+}
+
+OopStorage::BlockArray* OopStorage::BlockArray::create(size_t size, AllocFailType alloc_fail) {
+  size_t size_in_bytes = blocks_offset() + sizeof(Block*) * size;
+  void* mem = NEW_C_HEAP_ARRAY3(char, size_in_bytes, mtGC, CURRENT_PC, alloc_fail);
+  if (mem == NULL) return NULL;
+  return new (mem) BlockArray(size);
+}
+
+void OopStorage::BlockArray::destroy(BlockArray* ba) {
+  ba->~BlockArray();
+  FREE_C_HEAP_ARRAY(char, ba);
+}
+
+size_t OopStorage::BlockArray::size() const {
+  return _size;
+}
+
+size_t OopStorage::BlockArray::block_count() const {
+  return _block_count;
+}
+
+size_t OopStorage::BlockArray::block_count_acquire() const {
+  return OrderAccess::load_acquire(&_block_count);
+}
+
+void OopStorage::BlockArray::increment_refcount() const {
+  int new_value = Atomic::add(1, &_refcount);
+  assert(new_value >= 1, "negative refcount %d", new_value - 1);
+}
+
+bool OopStorage::BlockArray::decrement_refcount() const {
+  int new_value = Atomic::sub(1, &_refcount);
+  assert(new_value >= 0, "negative refcount %d", new_value);
+  return new_value == 0;
+}
+
+bool OopStorage::BlockArray::push(Block* block) {
+  size_t index = _block_count;
+  if (index < _size) {
+    block->set_active_index(index);
+    *block_ptr(index) = block;
+    // Use a release_store to ensure all the setup is complete before
+    // making the block visible.
+    OrderAccess::release_store(&_block_count, index + 1);
+    return true;
+  } else {
+    return false;
+  }
+}
+
+void OopStorage::BlockArray::remove(Block* block) {
+  assert(_block_count > 0, "array is empty");
+  size_t index = block->active_index();
+  assert(*block_ptr(index) == block, "block not present");
+  size_t last_index = _block_count - 1;
+  Block* last_block = *block_ptr(last_index);
+  last_block->set_active_index(index);
+  *block_ptr(index) = last_block;
+  _block_count = last_index;
+}
+
+void OopStorage::BlockArray::copy_from(const BlockArray* from) {
+  assert(_block_count == 0, "array must be empty");
+  size_t count = from->_block_count;
+  assert(count <= _size, "precondition");
+  Block* const* from_ptr = from->block_ptr(0);
+  Block** to_ptr = block_ptr(0);
+  for (size_t i = 0; i < count; ++i) {
+    Block* block = *from_ptr++;
+    assert(block->active_index() == i, "invariant");
+    *to_ptr++ = block;
+  }
+  _block_count = count;
+}
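
[Editorial sketch, not part of the patch.] The push/block_count_acquire pairing above is a standard release/acquire publication pattern: the slot is written first, then the count is published with a release store, so any reader that sees the new count via an acquire load also sees the fully written slot. A minimal standalone sketch using C++11 atomics instead of HotSpot's OrderAccess (all names here are illustrative):

#include <atomic>
#include <cstddef>

// Single-writer publication of array slots, mirroring BlockArray::push
// (release store of the count) and block_count_acquire (acquire load).
struct PublishedArray {
  static const size_t capacity = 8;
  int* slots[capacity];
  std::atomic<size_t> count;

  PublishedArray() : count(0) {}

  // Called by a single writer (push runs under _allocate_mutex in the
  // patch), so a relaxed read of count suffices here.
  bool push(int* item) {
    size_t index = count.load(std::memory_order_relaxed);
    if (index >= capacity) return false;
    slots[index] = item;                                 // set up the slot
    count.store(index + 1, std::memory_order_release);   // then publish it
    return true;
  }

  // A reader that observes count == n through this acquire load is
  // guaranteed to also observe fully written slots[0..n-1].
  size_t count_acquire() const {
    return count.load(std::memory_order_acquire);
  }
};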
+
 // Blocks start with an array of BitsPerWord oop entries.  That array
 // is divided into conceptual BytesPerWord sections of BitsPerByte
 // entries.  Blocks are allocated aligned on section boundaries, for
@@ -125,7 +211,7 @@
   _allocated_bitmask(0),
   _owner(owner),
   _memory(memory),
-  _active_entry(),
+  _active_index(0),
   _allocate_entry(),
   _deferred_updates_next(NULL),
   _release_refcount(0)
@@ -146,10 +232,6 @@
   const_cast<OopStorage* volatile&>(_owner) = NULL;
 }
 
-const OopStorage::BlockEntry& OopStorage::Block::get_active_entry(const Block& block) {
-  return block._active_entry;
-}
-
 const OopStorage::BlockEntry& OopStorage::Block::get_allocate_entry(const Block& block) {
   return block._allocate_entry;
 }
@@ -204,6 +286,20 @@
   return (base <= ptr) && (ptr < (base + ARRAY_SIZE(_data)));
 }
 
+size_t OopStorage::Block::active_index() const {
+  return _active_index;
+}
+
+void OopStorage::Block::set_active_index(size_t index) {
+  _active_index = index;
+}
+
+size_t OopStorage::Block::active_index_safe(const Block* block) {
+  STATIC_ASSERT(sizeof(intptr_t) == sizeof(block->_active_index));
+  assert(CanUseSafeFetchN(), "precondition");
+  return SafeFetchN((intptr_t*)&block->_active_index, 0);
+}
+
 unsigned OopStorage::Block::get_index(const oop* ptr) const {
   assert(contains(ptr), PTR_FORMAT " not in block " PTR_FORMAT, p2i(ptr), p2i(this));
   return static_cast<unsigned>(ptr - get_pointer(0));
@@ -246,7 +342,7 @@
 
 // This can return a false positive if ptr is not contained by some
 // block.  For some uses, it is a precondition that ptr is valid,
-// e.g. contained in some block in owner's _active_list.  Other uses
+// e.g. contained in some block in owner's _active_array.  Other uses
 // require additional validation of the result.
 OopStorage::Block*
 OopStorage::Block::block_for_ptr(const OopStorage* owner, const oop* ptr) {
@@ -280,12 +376,12 @@
 // Allocation involves the _allocate_list, which contains a subset of the
 // blocks owned by a storage object.  This is a doubly-linked list, linked
 // through dedicated fields in the blocks.  Full blocks are removed from this
-// list, though they are still present in the _active_list.  Empty blocks are
+// list, though they are still present in the _active_array.  Empty blocks are
 // kept at the end of the _allocate_list, to make it easy for empty block
 // deletion to find them.
 //
 // allocate(), and delete_empty_blocks_concurrent() lock the
-// _allocate_mutex while performing any list modifications.
+// _allocate_mutex while performing any list and array modifications.
 //
 // allocate() and release() update a block's _allocated_bitmask using CAS
 // loops.  This prevents loss of updates even though release() performs
@@ -299,7 +395,7 @@
 //
 // release() is performed lock-free. release() first looks up the block for
 // the entry, using address alignment to find the enclosing block (thereby
-// avoiding iteration over the _active_list).  Once the block has been
+// avoiding iteration over the _active_array).  Once the block has been
 // determined, its _allocated_bitmask needs to be updated, and its position in
 // the _allocate_list may need to be updated.  There are two cases:
 //
@@ -340,7 +436,7 @@
           // Failed to make new block, no other thread made a block
           // available while the mutex was released, and didn't get
           // one from a deferred update either, so return failure.
-          log_info(oopstorage, ref)("%s: failed allocation", name());
+          log_info(oopstorage, ref)("%s: failed block allocation", name());
           return NULL;
         }
       }
@@ -348,17 +444,21 @@
       // Add new block to storage.
       log_info(oopstorage, blocks)("%s: new block " PTR_FORMAT, name(), p2i(block));
 
+      // Add new block to the _active_array, growing if needed.
+      if (!_active_array->push(block)) {
+        if (expand_active_array()) {
+          guarantee(_active_array->push(block), "push failed after expansion");
+        } else {
+          log_info(oopstorage, blocks)("%s: failed active array expand", name());
+          Block::delete_block(*block);
+          return NULL;
+        }
+      }
       // Add to end of _allocate_list.  The mutex release allowed
       // other threads to add blocks to the _allocate_list.  We prefer
       // to allocate from non-empty blocks, to allow empty blocks to
       // be deleted.
       _allocate_list.push_back(*block);
-      // Add to front of _active_list, and then record as the head
-      // block, for concurrent iteration protocol.
-      _active_list.push_front(*block);
-      ++_block_count;
-      // Ensure all setup of block is complete before making it visible.
-      OrderAccess::release_store(&_active_head, block);
     }
     block = _allocate_list.head();
   }
@@ -383,6 +483,123 @@
   return result;
 }
 
+// Create a new, larger active array with the same content as the
+// current array, replace the current array with it, and relinquish
+// the old array.  Return true if the array was successfully expanded,
+// false to indicate allocation failure.
+bool OopStorage::expand_active_array() {
+  assert_lock_strong(_allocate_mutex);
+  BlockArray* old_array = _active_array;
+  size_t new_size = 2 * old_array->size();
+  log_info(oopstorage, blocks)("%s: expand active array " SIZE_FORMAT,
+                               name(), new_size);
+  BlockArray* new_array = BlockArray::create(new_size, AllocFailStrategy::RETURN_NULL);
+  if (new_array == NULL) return false;
+  new_array->copy_from(old_array);
+  replace_active_array(new_array);
+  relinquish_block_array(old_array);
+  return true;
+}
+
+OopStorage::ProtectActive::ProtectActive() : _enter(0), _exit() {}
+
+// Begin read-side critical section.
+uint OopStorage::ProtectActive::read_enter() {
+  return Atomic::add(2u, &_enter);
+}
+
+// End read-side critical section.
+void OopStorage::ProtectActive::read_exit(uint enter_value) {
+  Atomic::add(2u, &_exit[enter_value & 1]);
+}
+
+// Wait until all readers that entered the critical section before
+// synchronization have exited that critical section.
+void OopStorage::ProtectActive::write_synchronize() {
+  SpinYield spinner;
+  // Determine old and new exit counters, based on bit0 of the
+  // on-entry _enter counter.
+  uint value = OrderAccess::load_acquire(&_enter);
+  volatile uint* new_ptr = &_exit[(value + 1) & 1];
+  // Atomically change the in-use exit counter to the new counter, by
+  // adding 1 to the _enter counter (flipping bit0 between 0 and 1)
+  // and initializing the new exit counter to that enter value.  Note:
+  // The new exit counter is not being used by read operations until
+  // this change succeeds.
+  uint old;
+  do {
+    old = value;
+    *new_ptr = ++value;
+    value = Atomic::cmpxchg(value, &_enter, old);
+  } while (old != value);
+  // Readers that entered the critical section before we changed the
+  // selected exit counter will use the old exit counter.  Readers
+  // entering after the change will use the new exit counter.  Wait
+  // for all the critical sections started before the change to
+  // complete, i.e. for the value of *old_ptr to catch up with old.
+  volatile uint* old_ptr = &_exit[old & 1];
+  while (old != OrderAccess::load_acquire(old_ptr)) {
+    spinner.wait();
+  }
+}
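
[Editorial sketch, not part of the patch.] The _enter/_exit[2] scheme above is a small RCU-style synchronizer: readers bump _enter by 2 on entry and, on exit, bump whichever exit counter bit0 of their entry value selects; write_synchronize flips bit0 (seeding the newly selected exit counter) and waits for the retired exit counter to catch up with the last entry value that selected it. A rough standalone version with C++11 atomics, assuming writers are serialized (in the patch, replace_active_array runs under _allocate_mutex) and using default seq_cst ordering for simplicity:

#include <atomic>
#include <thread>

class ProtectActiveSketch {
  std::atomic<unsigned> _enter;
  std::atomic<unsigned> _exit[2];

public:
  ProtectActiveSketch() : _enter(0) { _exit[0] = 0; _exit[1] = 0; }

  // Reader entry: bump _enter by 2; bit0 of the result selects which
  // exit counter this reader bumps on exit.
  unsigned read_enter() { return _enter.fetch_add(2) + 2; }
  void read_exit(unsigned ticket) { _exit[ticket & 1] += 2; }

  // Writer: flip bit0 of _enter, seeding the newly selected exit
  // counter with the flipped value, then wait for the retired exit
  // counter to reach the last entry value that selected it.
  void write_synchronize() {
    unsigned old = _enter.load();
    for (;;) {
      _exit[(old + 1) & 1] = old + 1;   // seed the new exit counter
      unsigned expected = old;
      if (_enter.compare_exchange_weak(expected, old + 1)) break;
      old = expected;                   // a reader entered; retry
    }
    while (_exit[old & 1].load() != old) {
      std::this_thread::yield();        // stand-in for SpinYield
    }
  }
};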
+
+// Make new_array the _active_array.  Increments new_array's refcount
+// to account for the new reference.  The assignment is atomic with
+// respect to obtain_active_array; once this function returns, it is
+// safe for the caller to relinquish the old array.
+void OopStorage::replace_active_array(BlockArray* new_array) {
+  // Caller has the old array that is the current value of _active_array.
+  // Update new_array refcount to account for the new reference.
+  new_array->increment_refcount();
+  // Install new_array, ensuring its initialization is complete first.
+  OrderAccess::release_store(&_active_array, new_array);
+  // Wait for any readers that could read the old array from _active_array.
+  _protect_active.write_synchronize();
+  // All obtain_active_array critical sections that could have seen the
+  // old array have completed; any that did see it have already
+  // incremented its refcount.  The caller can now safely relinquish
+  // the old array.
+}
+
+// Atomically (with respect to replace_active_array) get the active
+// array and increment its refcount.  This provides safe access to the
+// array, even if an allocate operation expands and replaces the value
+// of _active_array.  The caller must relinquish the array when done
+// using it.
+OopStorage::BlockArray* OopStorage::obtain_active_array() const {
+  uint enter_value = _protect_active.read_enter();
+  BlockArray* result = OrderAccess::load_acquire(&_active_array);
+  result->increment_refcount();
+  _protect_active.read_exit(enter_value);
+  return result;
+}
+
+// Decrement refcount of array and destroy if refcount is zero.
+void OopStorage::relinquish_block_array(BlockArray* array) const {
+  if (array->decrement_refcount()) {
+    assert(array != _active_array, "invariant");
+    BlockArray::destroy(array);
+  }
+}
+
+class OopStorage::WithActiveArray : public StackObj {
+  const OopStorage* _storage;
+  BlockArray* _active_array;
+
+public:
+  WithActiveArray(const OopStorage* storage) :
+    _storage(storage),
+    _active_array(storage->obtain_active_array())
+  {}
+
+  ~WithActiveArray() {
+    _storage->relinquish_block_array(_active_array);
+  }
+
+  BlockArray& active_array() const {
+    return *_active_array;
+  }
+};
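
[Editorial sketch, not part of the patch.] A hypothetical use of the RAII helper above, to show the intended pattern: the constructor obtains (and refcounts) the current array, the destructor relinquishes it, so the array cannot be destroyed out from under the caller even if a concurrent allocate() expands and replaces _active_array. The function name below is invented; concurrent block deletion is a separate protocol (see _concurrent_iteration_active) and is ignored here for brevity.

// Hypothetical member function, sketched only for illustration.
size_t OopStorage::empty_block_count_example() const {
  WithActiveArray wab(this);                  // obtain_active_array()
  const BlockArray& array = wab.active_array();
  size_t result = 0;
  size_t count = array.block_count_acquire();
  for (size_t i = 0; i < count; ++i) {
    if (array.at(i)->allocated_bitmask() == 0) {  // racy, like block_count()
      ++result;
    }
  }
  return result;
}                                             // ~WithActiveArray relinquishes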
+
 OopStorage::Block* OopStorage::find_block_or_null(const oop* ptr) const {
   assert(ptr != NULL, "precondition");
   return Block::block_for_ptr(this, ptr);
@@ -392,7 +609,6 @@
                                     uintx old_allocated,
                                     const OopStorage* owner,
                                     const void* block) {
-  ResourceMark rm;
   Log(oopstorage, blocks) log;
   LogStream ls(log.debug());
   if (is_full_bitmask(old_allocated)) {
@@ -546,20 +762,21 @@
   return dup;
 }
 
+const size_t initial_active_array_size = 8;
+
 OopStorage::OopStorage(const char* name,
                        Mutex* allocate_mutex,
                        Mutex* active_mutex) :
   _name(dup_name(name)),
-  _active_list(&Block::get_active_entry),
+  _active_array(BlockArray::create(initial_active_array_size)),
   _allocate_list(&Block::get_allocate_entry),
-  _active_head(NULL),
   _deferred_updates(NULL),
   _allocate_mutex(allocate_mutex),
   _active_mutex(active_mutex),
   _allocation_count(0),
-  _block_count(0),
   _concurrent_iteration_active(false)
 {
+  _active_array->increment_refcount();
   assert(_active_mutex->rank() < _allocate_mutex->rank(),
          "%s: active_mutex must have lower rank than allocate_mutex", _name);
   assert(_active_mutex->_safepoint_check_required != Mutex::_safepoint_check_always,
@@ -583,10 +800,13 @@
   while ((block = _allocate_list.head()) != NULL) {
     _allocate_list.unlink(*block);
   }
-  while ((block = _active_list.head()) != NULL) {
-    _active_list.unlink(*block);
+  bool unreferenced = _active_array->decrement_refcount();
+  assert(unreferenced, "deleting storage while _active_array is referenced");
+  for (size_t i = _active_array->block_count(); 0 < i; ) {
+    block = _active_array->at(--i);
     Block::delete_block(*block);
   }
+  BlockArray::destroy(_active_array);
   FREE_C_HEAP_ARRAY(char, _name);
 }
 
@@ -598,16 +818,13 @@
   // Don't interfere with a concurrent iteration.
   if (_concurrent_iteration_active) return;
   // Delete empty (and otherwise deletable) blocks from end of _allocate_list.
-  for (const Block* block = _allocate_list.ctail();
+  for (Block* block = _allocate_list.tail();
        (block != NULL) && block->is_deletable();
-       block = _allocate_list.ctail()) {
-    _active_list.unlink(*block);
+       block = _allocate_list.tail()) {
+    _active_array->remove(block);
     _allocate_list.unlink(*block);
     delete_empty_block(*block);
-    --_block_count;
   }
-  // Update _active_head, in case current value was in deleted set.
-  _active_head = _active_list.head();
 }
 
 void OopStorage::delete_empty_blocks_concurrent() {
@@ -616,14 +833,14 @@
   // release the mutex across the block deletions.  Set an upper bound
   // on how many blocks we'll try to release, so other threads can't
   // cause an unbounded stay in this function.
-  size_t limit = _block_count;
+  size_t limit = block_count();
 
   for (size_t i = 0; i < limit; ++i) {
     // Additional updates might become available while we dropped the
     // lock.  But limit number processed to limit lock duration.
     reduce_deferred_updates();
 
-    const Block* block = _allocate_list.ctail();
+    Block* block = _allocate_list.tail();
     if ((block == NULL) || !block->is_deletable()) {
       // No block to delete, so done.  There could be more pending
       // deferred updates that could give us more work to do; deal with
@@ -635,12 +852,7 @@
       MutexLockerEx aml(_active_mutex, Mutex::_no_safepoint_check_flag);
       // Don't interfere with a concurrent iteration.
       if (_concurrent_iteration_active) return;
-      // Remove block from _active_list, updating head if needed.
-      _active_list.unlink(*block);
-      --_block_count;
-      if (block == _active_head) {
-        _active_head = _active_list.head();
-      }
+      _active_array->remove(block);
     }
     // Remove block from _allocate_list and delete it.
     _allocate_list.unlink(*block);
@@ -653,18 +865,17 @@
 OopStorage::EntryStatus OopStorage::allocation_status(const oop* ptr) const {
   const Block* block = find_block_or_null(ptr);
   if (block != NULL) {
-    // Verify block is a real block.  For now, simple linear search.
-    // Do something more clever if this is a performance bottleneck.
+    // Prevent block deletion and _active_array modification.
     MutexLockerEx ml(_allocate_mutex, Mutex::_no_safepoint_check_flag);
-    for (const Block* check_block = _active_list.chead();
-         check_block != NULL;
-         check_block = _active_list.next(*check_block)) {
-      if (check_block == block) {
-        if ((block->allocated_bitmask() & block->bitmask_for_entry(ptr)) != 0) {
-          return ALLOCATED_ENTRY;
-        } else {
-          return UNALLOCATED_ENTRY;
-        }
+    // Block could be a false positive, so get index carefully.
+    size_t index = Block::active_index_safe(block);
+    if ((index < _active_array->block_count()) &&
+        (block == _active_array->at(index)) &&
+        block->contains(ptr)) {
+      if ((block->allocated_bitmask() & block->bitmask_for_entry(ptr)) != 0) {
+        return ALLOCATED_ENTRY;
+      } else {
+        return UNALLOCATED_ENTRY;
       }
     }
   }
@@ -676,30 +887,50 @@
 }
 
 size_t OopStorage::block_count() const {
-  return _block_count;
+  WithActiveArray wab(this);
+  // Count access is racy, but don't care.
+  return wab.active_array().block_count();
 }
 
 size_t OopStorage::total_memory_usage() const {
   size_t total_size = sizeof(OopStorage);
   total_size += strlen(name()) + 1;
-  total_size += block_count() * Block::allocation_size();
+  total_size += sizeof(BlockArray);
+  WithActiveArray wab(this);
+  const BlockArray& blocks = wab.active_array();
+  // Count access is racy, but don't care.
+  total_size += blocks.block_count() * Block::allocation_size();
+  total_size += blocks.size() * sizeof(Block*);
   return total_size;
 }
 
 // Parallel iteration support
 
-static char* not_started_marker_dummy = NULL;
-static void* const not_started_marker = &not_started_marker_dummy;
+uint OopStorage::BasicParState::default_estimated_thread_count(bool concurrent) {
+  return concurrent ? ConcGCThreads : ParallelGCThreads;
+}
 
-OopStorage::BasicParState::BasicParState(OopStorage* storage, bool concurrent) :
+OopStorage::BasicParState::BasicParState(const OopStorage* storage,
+                                         uint estimated_thread_count,
+                                         bool concurrent) :
   _storage(storage),
-  _next_block(not_started_marker),
+  _active_array(_storage->obtain_active_array()),
+  _block_count(0),              // initialized properly below
+  _next_block(0),
+  _estimated_thread_count(estimated_thread_count),
   _concurrent(concurrent)
 {
+  assert(estimated_thread_count > 0, "estimated thread count must be positive");
   update_iteration_state(true);
+  // Get the block count *after* iteration state updated, so concurrent
+  // empty block deletion is suppressed and can't reduce the count.  But
+  // ensure the count we use was written after the block with that count
+  // was fully initialized; see BlockArray::push.
+  _block_count = _active_array->block_count_acquire();
 }
 
 OopStorage::BasicParState::~BasicParState() {
+  _storage->relinquish_block_array(_active_array);
   update_iteration_state(false);
 }
 
@@ -711,29 +942,49 @@
   }
 }
 
-void OopStorage::BasicParState::ensure_iteration_started() {
-  if (!_concurrent) {
-    assert_at_safepoint();
+bool OopStorage::BasicParState::claim_next_segment(IterationData* data) {
+  data->_processed += data->_segment_end - data->_segment_start;
+  size_t start = OrderAccess::load_acquire(&_next_block);
+  if (start >= _block_count) {
+    return finish_iteration(data); // No more blocks available.
   }
-  assert(!_concurrent || _storage->_concurrent_iteration_active, "invariant");
-  // Ensure _next_block is not the not_started_marker, setting it to
-  // the _active_head to start the iteration if necessary.
-  if (OrderAccess::load_acquire(&_next_block) == not_started_marker) {
-    Atomic::cmpxchg(_storage->_active_head, &_next_block, not_started_marker);
+  // Try to claim several blocks at a time, but not *too* many.  We
+  // want to avoid a thread deciding there is a lot of work available,
+  // claiming a large chunk, getting delayed, and ending up holding
+  // most or all of the remaining work while other threads have nothing
+  // to do.  But too small a step leads to contention over _next_block,
+  // especially when the work per block is small.
+  size_t max_step = 10;
+  size_t remaining = _block_count - start;
+  size_t step = MIN2(max_step, 1 + (remaining / _estimated_thread_count));
+  // Atomic::add with possible overshoot.  This can perform better
+  // than a CAS loop on some platforms when there is contention.
+  // We can cope with the uncertainty by recomputing start/end from
+  // the result of the add, and dealing with potential overshoot.
+  size_t end = Atomic::add(step, &_next_block);
+  // _next_block may have changed, so recompute start from result of add.
+  start = end - step;
+  // _next_block may have changed so much that end has overshot.
+  end = MIN2(end, _block_count);
+  // _next_block may have changed so much that even start has overshot.
+  if (start < _block_count) {
+    // Record claimed segment for iteration.
+    data->_segment_start = start;
+    data->_segment_end = end;
+    return true;                // Success.
+  } else {
+    // No more blocks to claim.
+    return finish_iteration(data);
   }
-  assert(_next_block != not_started_marker, "postcondition");
 }
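
[Editorial sketch, not part of the patch.] The overshoot handling in claim_next_segment is easiest to see in isolation. Below is a self-contained sketch in plain C++11 atomics (names and types are illustrative, not HotSpot's): each claimer bumps a shared cursor and derives its segment from the result of the add, clamping the end and rejecting a start that has already run past the limit. A short worked trace follows.

#include <atomic>
#include <algorithm>
#include <cstddef>

struct SegmentClaimer {
  std::atomic<size_t> next{0};
  size_t limit;

  explicit SegmentClaimer(size_t limit_) : limit(limit_) {}

  // Returns true and sets [start, end) if a non-empty segment was claimed.
  bool claim(size_t step, size_t* start, size_t* end) {
    if (next.load() >= limit) return false;   // nothing left; skip the add
    // fetch_add returns the pre-add value; (pre + step) mirrors the
    // post-add result that Atomic::add returns in the patch.
    size_t post = next.fetch_add(step) + step;
    size_t s = post - step;                   // recompute start from the result
    size_t e = std::min(post, limit);         // end may have overshot the limit
    if (s >= limit) return false;             // even start overshot: no work
    *start = s;
    *end = e;
    return true;
  }
};

// Example trace: limit = 25, two threads racing with step = 10:
//   A: post = 10 -> claims [0, 10)
//   B: post = 20 -> claims [10, 20)
//   A: post = 30 -> end clamps to 25 -> claims [20, 25)
//   B: sees next (30) >= limit (25) -> no more work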
 
-OopStorage::Block* OopStorage::BasicParState::claim_next_block() {
-  assert(_next_block != not_started_marker, "Iteration not started");
-  void* next = _next_block;
-  while (next != NULL) {
-    void* new_next = _storage->_active_list.next(*static_cast<Block*>(next));
-    void* fetched = Atomic::cmpxchg(new_next, &_next_block, next);
-    if (fetched == next) break; // Claimed.
-    next = fetched;
-  }
-  return static_cast<Block*>(next);
+bool OopStorage::BasicParState::finish_iteration(const IterationData* data) const {
+  log_debug(oopstorage, blocks, stats)
+           ("Parallel iteration on %s: blocks = " SIZE_FORMAT
+            ", processed = " SIZE_FORMAT " (%2.f%%)",
+            _storage->name(), _block_count, data->_processed,
+            percent_of(data->_processed, _block_count));
+  return false;
 }
 
 const char* OopStorage::name() const { return _name; }
@@ -742,7 +993,7 @@
 
 void OopStorage::print_on(outputStream* st) const {
   size_t allocations = _allocation_count;
-  size_t blocks = _block_count;
+  size_t blocks = _active_array->block_count();
 
   double data_size = section_size * section_count;
   double alloc_percentage = percent_of((double)allocations, blocks * data_size);