8159422: Very high Concurrent Mark mark stack contention
author tschatzl
Thu, 15 Sep 2016 16:44:19 +0200
changeset 41176 ff9f64534cff
parent 41175 68d3c7c39153
child 41177 3869072fc2e1
8159422: Very high Concurrent Mark mark stack contention Summary: Decrease contention on mark stack by splitting locks, and minimizing the amount of time these locks are held. Improve mark stack management. Reviewed-by: kbarrett, mgerdin, eosterlund
hotspot/src/share/vm/gc/g1/g1CollectedHeap.cpp
hotspot/src/share/vm/gc/g1/g1ConcurrentMark.cpp
hotspot/src/share/vm/gc/g1/g1ConcurrentMark.hpp
hotspot/src/share/vm/gc/g1/g1ConcurrentMark.inline.hpp
hotspot/src/share/vm/gc/g1/g1OopClosures.hpp
hotspot/src/share/vm/memory/allocation.hpp
hotspot/src/share/vm/memory/allocation.inline.hpp
hotspot/src/share/vm/runtime/mutexLocker.cpp
hotspot/src/share/vm/runtime/mutexLocker.hpp
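The heart of the change, before the per-file diffs: the old mark stack was a single oop array guarded by ParGCRareEvent_lock, held across every bulk element copy (see par_push_arr/par_pop_arr below). The new stack hands out fixed-size chunks and splits synchronization across two leaf locks, one per list, each held only long enough to swap a pointer; the bulk copy happens outside any lock. A minimal standalone sketch of that scheme, with std::mutex in place of HotSpot's Mutex and void* in place of oop (simplified: the patch's bump-allocation fallback, out-of-memory flag, and element-atomic copy are omitted):

#include <cstddef>
#include <cstring>
#include <mutex>

static const size_t OopsPerChunk = 1024 - 1; // one slot pays for the 'next' pointer

struct OopChunk {
  OopChunk* next;
  void* data[OopsPerChunk];
};

class ChunkedMarkStack {
  std::mutex _free_list_lock;       // protects _free_list only
  std::mutex _chunk_list_lock;      // protects _chunk_list only
  OopChunk* _free_list = nullptr;   // empty chunks available for reuse
  OopChunk* _chunk_list = nullptr;  // chunks currently holding data

  OopChunk* pop_list(std::mutex& m, OopChunk*& head) {
    std::lock_guard<std::mutex> x(m);  // lock held for a pointer swap only
    OopChunk* result = head;
    if (result != nullptr) head = result->next;
    return result;
  }
  void push_list(std::mutex& m, OopChunk*& head, OopChunk* elem) {
    std::lock_guard<std::mutex> x(m);
    elem->next = head;
    head = elem;
  }

 public:
  bool par_push_chunk(void** buf) {
    OopChunk* c = pop_list(_free_list_lock, _free_list);
    if (c == nullptr) return false;  // the real code falls back to bump allocation
    std::memcpy(c->data, buf, sizeof(c->data));  // bulk copy outside any lock
    push_list(_chunk_list_lock, _chunk_list, c);
    return true;
  }
  bool par_pop_chunk(void** buf) {
    OopChunk* c = pop_list(_chunk_list_lock, _chunk_list);
    if (c == nullptr) return false;
    std::memcpy(buf, c->data, sizeof(c->data));  // bulk copy outside any lock
    push_list(_free_list_lock, _free_list, c);
    return true;
  }
};

With one lock per list, a task returning a drained chunk to the free list no longer contends with a task taking the next full chunk off the chunk list, and neither lock is ever held across the 1023-element copy.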
--- a/hotspot/src/share/vm/gc/g1/g1CollectedHeap.cpp	Wed Sep 14 16:20:54 2016 +0300
+++ b/hotspot/src/share/vm/gc/g1/g1CollectedHeap.cpp	Thu Sep 15 16:44:19 2016 +0200
@@ -3165,7 +3165,6 @@
 
         assert(_verifier->check_cset_fast_test(), "Inconsistency in the InCSetState table.");
 
-        _cm->note_start_of_gc();
         // We call this after finalize_cset() to
         // ensure that the CSet has been finalized.
         _cm->verify_no_cset_oops();
@@ -3251,7 +3250,6 @@
         // We redo the verification but now wrt to the new CSet which
         // has just got initialized after the previous CSet was freed.
         _cm->verify_no_cset_oops();
-        _cm->note_end_of_gc();
 
         // This timing is only used by the ergonomics to handle our pause target.
         // It is unclear why this should not include the full pause. We will
--- a/hotspot/src/share/vm/gc/g1/g1ConcurrentMark.cpp	Wed Sep 14 16:20:54 2016 +0300
+++ b/hotspot/src/share/vm/gc/g1/g1ConcurrentMark.cpp	Thu Sep 15 16:44:19 2016 +0200
@@ -133,129 +133,184 @@
 }
 
 G1CMMarkStack::G1CMMarkStack() :
-  _reserved_space(),
+  _max_chunk_capacity(0),
   _base(NULL),
-  _capacity(0),
-  _saved_index((size_t)AllBits),
+  _chunk_capacity(0),
+  _out_of_memory(false),
   _should_expand(false) {
   set_empty();
 }
 
 bool G1CMMarkStack::resize(size_t new_capacity) {
   assert(is_empty(), "Only resize when stack is empty.");
-  assert(new_capacity <= MarkStackSizeMax,
-         "Trying to resize stack to " SIZE_FORMAT " elements when the maximum is " SIZE_FORMAT, new_capacity, MarkStackSizeMax);
-
-  size_t reservation_size = ReservedSpace::allocation_align_size_up(new_capacity * sizeof(oop));
-
-  ReservedSpace rs(reservation_size);
-  if (!rs.is_reserved()) {
-    log_warning(gc)("Failed to reserve memory for new overflow mark stack with " SIZE_FORMAT " elements and size " SIZE_FORMAT "B.", new_capacity, reservation_size);
+  assert(new_capacity <= _max_chunk_capacity,
+         "Trying to resize stack to " SIZE_FORMAT " chunks when the maximum is " SIZE_FORMAT, new_capacity, _max_chunk_capacity);
+
+  OopChunk* new_base = MmapArrayAllocator<OopChunk, mtGC>::allocate_or_null(new_capacity);
+
+  if (new_base == NULL) {
+    log_warning(gc)("Failed to reserve memory for new overflow mark stack with " SIZE_FORMAT " chunks and size " SIZE_FORMAT "B.", new_capacity, new_capacity * sizeof(OopChunk));
     return false;
   }
-
-  VirtualSpace vs;
-
-  if (!vs.initialize(rs, rs.size())) {
-    rs.release();
-    log_warning(gc)("Failed to commit memory for new overflow mark stack of size " SIZE_FORMAT "B.", rs.size());
-    return false;
+  // Release old mapping.
+  if (_base != NULL) {
+    MmapArrayAllocator<OopChunk, mtGC>::free(_base, _chunk_capacity);
   }
 
-  assert(vs.committed_size() == rs.size(), "Failed to commit all of the mark stack.");
-
-  // Release old mapping.
-  _reserved_space.release();
-
-  // Save new mapping for future unmapping.
-  _reserved_space = rs;
-
-  MemTracker::record_virtual_memory_type((address)_reserved_space.base(), mtGC);
-
-  _base = (oop*) vs.low();
-  _capacity = new_capacity;
+  _base = new_base;
+  _chunk_capacity = new_capacity;
   set_empty();
   _should_expand = false;
 
   return true;
 }
 
-bool G1CMMarkStack::allocate(size_t capacity) {
-  return resize(capacity);
+size_t G1CMMarkStack::capacity_alignment() {
+  return (size_t)lcm(os::vm_allocation_granularity(), sizeof(OopChunk)) / sizeof(void*);
+}
+
+bool G1CMMarkStack::initialize(size_t initial_capacity, size_t max_capacity) {
+  guarantee(_max_chunk_capacity == 0, "G1CMMarkStack already initialized.");
+
+  size_t const OopChunkSizeInVoidStar = sizeof(OopChunk) / sizeof(void*);
+
+  _max_chunk_capacity = (size_t)align_size_up(max_capacity, capacity_alignment()) / OopChunkSizeInVoidStar;
+  size_t initial_chunk_capacity = (size_t)align_size_up(initial_capacity, capacity_alignment()) / OopChunkSizeInVoidStar;
+
+  guarantee(initial_chunk_capacity <= _max_chunk_capacity,
+            "Maximum chunk capacity " SIZE_FORMAT " smaller than initial capacity " SIZE_FORMAT,
+            _max_chunk_capacity,
+            initial_chunk_capacity);
+
+  log_debug(gc)("Initialize mark stack with " SIZE_FORMAT " chunks, maximum " SIZE_FORMAT,
+                initial_chunk_capacity, _max_chunk_capacity);
+
+  return resize(initial_chunk_capacity);
 }
 
 void G1CMMarkStack::expand() {
   // Clear expansion flag
   _should_expand = false;
 
-  if (_capacity == MarkStackSizeMax) {
-    log_debug(gc)("Can not expand overflow mark stack further, already at maximum capacity of " SIZE_FORMAT " elements.", _capacity);
+  if (_chunk_capacity == _max_chunk_capacity) {
+    log_debug(gc)("Can not expand overflow mark stack further, already at maximum capacity of " SIZE_FORMAT " chunks.", _chunk_capacity);
     return;
   }
-  size_t old_capacity = _capacity;
+  size_t old_capacity = _chunk_capacity;
   // Double capacity if possible
-  size_t new_capacity = MIN2(old_capacity * 2, MarkStackSizeMax);
+  size_t new_capacity = MIN2(old_capacity * 2, _max_chunk_capacity);
 
   if (resize(new_capacity)) {
-    log_debug(gc)("Expanded marking stack capacity from " SIZE_FORMAT " to " SIZE_FORMAT " elements",
+    log_debug(gc)("Expanded mark stack capacity from " SIZE_FORMAT " to " SIZE_FORMAT " chunks",
                   old_capacity, new_capacity);
   } else {
-    log_warning(gc)("Failed to expand marking stack capacity from " SIZE_FORMAT " to " SIZE_FORMAT " elements",
+    log_warning(gc)("Failed to expand mark stack capacity from " SIZE_FORMAT " to " SIZE_FORMAT " chunks",
                     old_capacity, new_capacity);
   }
 }
 
 G1CMMarkStack::~G1CMMarkStack() {
   if (_base != NULL) {
-    _base = NULL;
-    _reserved_space.release();
-  }
-}
-
-void G1CMMarkStack::par_push_arr(oop* buffer, size_t n) {
-  MutexLockerEx x(ParGCRareEvent_lock, Mutex::_no_safepoint_check_flag);
-  size_t start = _index;
-  size_t next_index = start + n;
-  if (next_index > _capacity) {
-    _overflow = true;
-    return;
-  }
-  // Otherwise.
-  _index = next_index;
-  for (size_t i = 0; i < n; i++) {
-    size_t ind = start + i;
-    assert(ind < _capacity, "By overflow test above.");
-    _base[ind] = buffer[i];
+    MmapArrayAllocator<OopChunk, mtGC>::free(_base, _chunk_capacity);
   }
 }
 
-bool G1CMMarkStack::par_pop_arr(oop* buffer, size_t max, size_t* n) {
-  MutexLockerEx x(ParGCRareEvent_lock, Mutex::_no_safepoint_check_flag);
-  size_t index = _index;
-  if (index == 0) {
-    *n = 0;
+void G1CMMarkStack::add_chunk_to_list(OopChunk* volatile* list, OopChunk* elem) {
+  elem->next = *list;
+  *list = elem;
+}
+
+void G1CMMarkStack::add_chunk_to_chunk_list(OopChunk* elem) {
+  MutexLockerEx x(MarkStackChunkList_lock, Mutex::_no_safepoint_check_flag);
+  add_chunk_to_list(&_chunk_list, elem);
+  _chunks_in_chunk_list++;
+}
+
+void G1CMMarkStack::add_chunk_to_free_list(OopChunk* elem) {
+  MutexLockerEx x(MarkStackFreeList_lock, Mutex::_no_safepoint_check_flag);
+  add_chunk_to_list(&_free_list, elem);
+}
+
+G1CMMarkStack::OopChunk* G1CMMarkStack::remove_chunk_from_list(OopChunk* volatile* list) {
+  OopChunk* result = *list;
+  if (result != NULL) {
+    *list = (*list)->next;
+  }
+  return result;
+}
+
+G1CMMarkStack::OopChunk* G1CMMarkStack::remove_chunk_from_chunk_list() {
+  MutexLockerEx x(MarkStackChunkList_lock, Mutex::_no_safepoint_check_flag);
+  OopChunk* result = remove_chunk_from_list(&_chunk_list);
+  if (result != NULL) {
+    _chunks_in_chunk_list--;
+  }
+  return result;
+}
+
+G1CMMarkStack::OopChunk* G1CMMarkStack::remove_chunk_from_free_list() {
+  MutexLockerEx x(MarkStackFreeList_lock, Mutex::_no_safepoint_check_flag);
+  return remove_chunk_from_list(&_free_list);
+}
+
+G1CMMarkStack::OopChunk* G1CMMarkStack::allocate_new_chunk() {
+  // This dirty read of _hwm is okay because we only ever increase the _hwm in parallel code.
+  // Further, this limits _hwm to a value of _chunk_capacity + #threads, avoiding
+  // wraparound of _hwm.
+  if (_hwm >= _chunk_capacity) {
+    return NULL;
+  }
+
+  size_t cur_idx = Atomic::add(1, &_hwm) - 1;
+  if (cur_idx >= _chunk_capacity) {
+    return NULL;
+  }
+
+  OopChunk* result = ::new (&_base[cur_idx]) OopChunk;
+  result->next = NULL;
+  return result;
+}
+
+bool G1CMMarkStack::par_push_chunk(oop* ptr_arr) {
+  // Get a new chunk.
+  OopChunk* new_chunk = remove_chunk_from_free_list();
+
+  if (new_chunk == NULL) {
+    // Did not get a chunk from the free list. Allocate from backing memory.
+    new_chunk = allocate_new_chunk();
+  }
+
+  if (new_chunk == NULL) {
+    _out_of_memory = true;
     return false;
-  } else {
-    size_t k = MIN2(max, index);
-    size_t new_ind = index - k;
-    for (size_t j = 0; j < k; j++) {
-      buffer[j] = _base[new_ind + j];
-    }
-    _index = new_ind;
-    *n = k;
-    return true;
   }
+
+  Copy::conjoint_oops_atomic(ptr_arr, new_chunk->data, OopsPerChunk);
+
+  add_chunk_to_chunk_list(new_chunk);
+
+  return true;
 }
 
-void G1CMMarkStack::note_start_of_gc() {
-  assert(_saved_index == (size_t)AllBits, "note_start_of_gc()/end_of_gc() calls bracketed incorrectly");
-  _saved_index = _index;
+bool G1CMMarkStack::par_pop_chunk(oop* ptr_arr) {
+  OopChunk* cur = remove_chunk_from_chunk_list();
+
+  if (cur == NULL) {
+    return false;
+  }
+
+  Copy::conjoint_oops_atomic(cur->data, ptr_arr, OopsPerChunk);
+
+  add_chunk_to_free_list(cur);
+  return true;
 }
 
-void G1CMMarkStack::note_end_of_gc() {
-  guarantee(!stack_modified(), "Saved index " SIZE_FORMAT " must be the same as " SIZE_FORMAT, _saved_index, _index);
-
-  _saved_index = (size_t)AllBits;
+void G1CMMarkStack::set_empty() {
+  _chunks_in_chunk_list = 0;
+  _hwm = 0;
+  clear_out_of_memory();
+  _chunk_list = NULL;
+  _free_list = NULL;
 }
 
 G1CMRootRegions::G1CMRootRegions() :
@@ -483,9 +538,8 @@
     }
   }
 
-  if (!_global_mark_stack.allocate(MarkStackSize)) {
+  if (!_global_mark_stack.initialize(MarkStackSize, MarkStackSizeMax)) {
     vm_exit_during_initialization("Failed to allocate initial concurrent mark overflow mark stack.");
-    return;
   }
 
   _tasks = NEW_C_HEAP_ARRAY(G1CMTask*, _max_worker_id, mtGC);
@@ -1695,10 +1749,10 @@
     // oop closures will set the has_overflown flag if we overflow the
     // global marking stack.
 
-    assert(_global_mark_stack.overflow() || _global_mark_stack.is_empty(),
-            "mark stack should be empty (unless it overflowed)");
-
-    if (_global_mark_stack.overflow()) {
+    assert(_global_mark_stack.is_out_of_memory() || _global_mark_stack.is_empty(),
+            "Mark stack should be empty (unless it is out of memory)");
+
+    if (_global_mark_stack.is_out_of_memory()) {
       // This should have been done already when we tried to push an
       // entry on to the global mark stack. But let's do it again.
       set_has_overflown();
@@ -2343,49 +2397,54 @@
 }
 
 void G1CMTask::move_entries_to_global_stack() {
-  // local array where we'll store the entries that will be popped
-  // from the local queue
-  oop buffer[global_stack_transfer_size];
-
-  int n = 0;
+  // Local array where we'll store the entries that will be popped
+  // from the local queue.
+  oop buffer[G1CMMarkStack::OopsPerChunk];
+
+  size_t n = 0;
   oop obj;
-  while (n < global_stack_transfer_size && _task_queue->pop_local(obj)) {
+  while (n < G1CMMarkStack::OopsPerChunk && _task_queue->pop_local(obj)) {
     buffer[n] = obj;
     ++n;
   }
+  if (n < G1CMMarkStack::OopsPerChunk) {
+    buffer[n] = NULL;
+  }
 
   if (n > 0) {
-    // we popped at least one entry from the local queue
-
-    if (!_cm->mark_stack_push(buffer, n)) {
+    if (!_cm->mark_stack_push(buffer)) {
       set_has_aborted();
     }
   }
 
-  // this operation was quite expensive, so decrease the limits
+  // This operation was quite expensive, so decrease the limits.
   decrease_limits();
 }
 
-void G1CMTask::get_entries_from_global_stack() {
-  // local array where we'll store the entries that will be popped
+bool G1CMTask::get_entries_from_global_stack() {
+  // Local array where we'll store the entries that will be popped
   // from the global stack.
-  oop buffer[global_stack_transfer_size];
-  size_t n;
-  _cm->mark_stack_pop(buffer, global_stack_transfer_size, &n);
-  assert(n <= global_stack_transfer_size,
-         "we should not pop more than the given limit");
-  if (n > 0) {
-    // yes, we did actually pop at least one entry
-    for (size_t i = 0; i < n; ++i) {
-      bool success = _task_queue->push(buffer[i]);
-      // We only call this when the local queue is empty or under a
-      // given target limit. So, we do not expect this push to fail.
-      assert(success, "invariant");
+  oop buffer[G1CMMarkStack::OopsPerChunk];
+
+  if (!_cm->mark_stack_pop(buffer)) {
+    return false;
+  }
+
+  // We did actually pop at least one entry.
+  for (size_t i = 0; i < G1CMMarkStack::OopsPerChunk; ++i) {
+    oop elem = buffer[i];
+    if (elem == NULL) {
+      break;
     }
+    bool success = _task_queue->push(elem);
+    // We only call this when the local queue is empty or under a
+    // given target limit. So, we do not expect this push to fail.
+    assert(success, "invariant");
   }
 
-  // this operation was quite expensive, so decrease the limits
+  // This operation was quite expensive, so decrease the limits
   decrease_limits();
+  return true;
 }
 
 void G1CMTask::drain_local_queue(bool partially) {
@@ -2429,20 +2488,21 @@
 
   // Decide what the target size is, depending whether we're going to
   // drain it partially (so that other tasks can steal if they run out
-  // of things to do) or totally (at the very end).  Notice that,
-  // because we move entries from the global stack in chunks or
-  // because another task might be doing the same, we might in fact
-  // drop below the target. But, this is not a problem.
-  size_t target_size;
+  // of things to do) or totally (at the very end).
+  // Notice that when draining the global mark stack partially, due to the raciness
+  // of the mark stack size update we might in fact drop below the target. But,
+  // this is not a problem.
+  // In case of total draining, we simply process until the global mark stack is
+  // totally empty, disregarding the size counter.
   if (partially) {
-    target_size = _cm->partial_mark_stack_size_target();
+    size_t const target_size = _cm->partial_mark_stack_size_target();
+    while (!has_aborted() && _cm->mark_stack_size() > target_size) {
+      if (get_entries_from_global_stack()) {
+        drain_local_queue(partially);
+      }
+    }
   } else {
-    target_size = 0;
-  }
-
-  if (_cm->mark_stack_size() > target_size) {
-    while (!has_aborted() && _cm->mark_stack_size() > target_size) {
-      get_entries_from_global_stack();
+    while (!has_aborted() && get_entries_from_global_stack()) {
       drain_local_queue(partially);
     }
   }
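allocate_new_chunk() above bump-allocates chunks from the mapped backing array using a racy pre-check plus an atomic increment. A sketch of why that is safe, with std::atomic standing in for HotSpot's Atomic::add; hypothetical standalone code, not the patch itself:

#include <atomic>
#include <cstddef>

static const size_t OopsPerChunk = 1024 - 1;

struct OopChunk {
  OopChunk* next;
  void* data[OopsPerChunk];
};

std::atomic<size_t> hwm{0};  // index of the next never-allocated chunk
size_t chunk_capacity;       // number of OopChunk slots in the backing array
OopChunk* base;              // start of the mapped backing array

OopChunk* allocate_new_chunk() {
  // Dirty read: hwm only ever grows, so once it reaches capacity every caller
  // bails out here without incrementing. At most one increment per thread can
  // slip past this check, which bounds hwm by capacity + #threads and rules
  // out wraparound of the counter.
  if (hwm.load(std::memory_order_relaxed) >= chunk_capacity) {
    return nullptr;
  }
  // fetch_add returns the old value, like Atomic::add(1, &_hwm) - 1.
  size_t cur_idx = hwm.fetch_add(1);
  if (cur_idx >= chunk_capacity) {
    return nullptr;  // lost a race near the end; backing memory is exhausted
  }
  return &base[cur_idx];  // the real code placement-news an OopChunk here
}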
--- a/hotspot/src/share/vm/gc/g1/g1ConcurrentMark.hpp	Wed Sep 14 16:20:54 2016 +0300
+++ b/hotspot/src/share/vm/gc/g1/g1ConcurrentMark.hpp	Thu Sep 15 16:44:19 2016 +0200
@@ -149,42 +149,98 @@
 //
 // Stores oops in a huge buffer in virtual memory that is always fully committed.
 // Resizing may only happen during a STW pause when the stack is empty.
+//
+// Memory is allocated on a "chunk" basis, i.e. a set of oops. For this, the mark
+// stack memory is split into evenly sized chunks of oops. Users can only
+// add or remove entries on that basis.
+// Chunks are filled in increasing address order. Not completely filled chunks
+// have a NULL element as a terminating element.
+//
+// Every chunk has a header containing a single pointer element used for memory
+// management. This wastes some space, but is negligible (< 0.1% with current sizing).
+//
+// Memory management is done using a mix of tracking a high water-mark indicating
+// that all chunks at a lower address are valid chunks, and a singly linked free
+// list connecting all empty chunks.
 class G1CMMarkStack VALUE_OBJ_CLASS_SPEC {
-  ReservedSpace _reserved_space; // Space currently reserved for the mark stack.
+public:
+  // Number of oops that can fit in a single chunk.
+  static const size_t OopsPerChunk = 1024 - 1 /* One reference for the next pointer */;
+private:
+  struct OopChunk {
+    OopChunk* next;
+    oop data[OopsPerChunk];
+  };
+
+  size_t _max_chunk_capacity;    // Maximum number of OopChunk elements on the stack.
+
+  OopChunk* _base;               // Bottom address of allocated memory area.
+  size_t _chunk_capacity;        // Current maximum number of OopChunk elements.
 
-  oop* _base;                    // Bottom address of allocated memory area.
-  size_t _capacity;              // Maximum number of elements.
-  size_t _index;                 // One more than last occupied index.
+  char _pad0[DEFAULT_CACHE_LINE_SIZE];
+  OopChunk* volatile _free_list;  // Linked list of free chunks that can be allocated by users.
+  char _pad1[DEFAULT_CACHE_LINE_SIZE - sizeof(OopChunk*)];
+  OopChunk* volatile _chunk_list; // List of chunks currently containing data.
+  volatile size_t _chunks_in_chunk_list;
+  char _pad2[DEFAULT_CACHE_LINE_SIZE - sizeof(OopChunk*) - sizeof(size_t)];
+
+  volatile size_t _hwm;          // High water mark within the reserved space.
+  char _pad4[DEFAULT_CACHE_LINE_SIZE - sizeof(size_t)];
+
+  // Allocate a new chunk from the reserved memory, using the high water mark. Returns
+  // NULL if out of memory.
+  OopChunk* allocate_new_chunk();
 
-  size_t _saved_index;           // Value of _index saved at start of GC to detect mark stack modifications during that time.
+  volatile bool _out_of_memory;
 
-  bool  _overflow;
+  // Atomically add the given chunk to the list.
+  void add_chunk_to_list(OopChunk* volatile* list, OopChunk* elem);
+  // Atomically remove and return a chunk from the given list. Returns NULL if the
+  // list is empty.
+  OopChunk* remove_chunk_from_list(OopChunk* volatile* list);
+
+  void add_chunk_to_chunk_list(OopChunk* elem);
+  void add_chunk_to_free_list(OopChunk* elem);
+
+  OopChunk* remove_chunk_from_chunk_list();
+  OopChunk* remove_chunk_from_free_list();
+
   bool  _should_expand;
 
   // Resizes the mark stack to the given new capacity. Releases any previous
   // memory if successful.
   bool resize(size_t new_capacity);
 
-  bool stack_modified() const { return _index != _saved_index; }
  public:
   G1CMMarkStack();
   ~G1CMMarkStack();
 
-  bool allocate(size_t capacity);
+  // Alignment and minimum capacity of this mark stack in number of oops.
+  static size_t capacity_alignment();
+
+  // Allocate and initialize the mark stack with the given number of oops.
+  bool initialize(size_t initial_capacity, size_t max_capacity);
 
-  // Pushes the first "n" elements of the given buffer on the stack.
-  void par_push_arr(oop* buffer, size_t n);
+  // Pushes the given buffer containing at most OopsPerChunk elements on the mark
+  // stack. If fewer than OopsPerChunk elements are to be pushed, the array must
+  // be terminated with a NULL.
+  // Returns whether the buffer contents were successfully pushed to the global mark
+  // stack.
+  bool par_push_chunk(oop* buffer);
 
-  // Moves up to max elements from the stack into the given buffer. Returns
-  // the number of elements pushed, and false if the array has been empty.
-  // Returns true if the buffer contains at least one element.
-  bool par_pop_arr(oop* buffer, size_t max, size_t* n);
+  // Pops a chunk from this mark stack, copying its contents into the given buffer. This
+  // chunk may contain up to OopsPerChunk elements. If there are fewer, the last
+  // element in the array is a NULL pointer.
+  bool par_pop_chunk(oop* buffer);
 
-  bool is_empty() const { return _index == 0; }
-  size_t capacity() const  { return _capacity; }
+  // Return whether the chunk list is empty. Racy due to unsynchronized access to
+  // _chunk_list.
+  bool is_empty() const { return _chunk_list == NULL; }
 
-  bool overflow() const { return _overflow; }
-  void clear_overflow() { _overflow = false; }
+  size_t capacity() const  { return _chunk_capacity; }
+
+  bool is_out_of_memory() const { return _out_of_memory; }
+  void clear_out_of_memory() { _out_of_memory = false; }
 
   bool should_expand() const { return _should_expand; }
   void set_should_expand(bool value) { _should_expand = value; }
@@ -192,20 +248,15 @@
   // Expand the stack, typically in response to an overflow condition
   void expand();
 
-  size_t size() const { return _index; }
-
-  void set_empty() { _index = 0; clear_overflow(); }
-
-  // Record the current index.
-  void note_start_of_gc();
+  // Return the approximate number of oops on this mark stack. Racy due to
+  // unsynchronized access to _chunks_in_chunk_list.
+  size_t size() const { return _chunks_in_chunk_list * OopsPerChunk; }
 
-  // Make sure that we have not added any entries to the stack during GC.
-  void note_end_of_gc();
+  void set_empty();
 
-  // Apply fn to each oop in the mark stack, up to the bound recorded
-  // via one of the above "note" functions.  The mark stack must not
+  // Apply Fn to every oop on the mark stack. The mark stack must not
   // be modified while iterating.
-  template<typename Fn> void iterate(Fn fn);
+  template<typename Fn> void iterate(Fn fn) const PRODUCT_RETURN;
 };
 
 // Root Regions are regions that are not empty at the beginning of a
@@ -278,7 +329,6 @@
   friend class G1CMDrainMarkingStackClosure;
   friend class G1CMBitMapClosure;
   friend class G1CMConcurrentMarkingTask;
-  friend class G1CMMarkStack;
   friend class G1CMRemarkTask;
   friend class G1CMTask;
 
@@ -479,22 +529,20 @@
 public:
   // Manipulation of the global mark stack.
   // The push and pop operations are used by tasks for transfers
-  // between task-local queues and the global mark stack, and use
-  // locking for concurrency safety.
-  bool mark_stack_push(oop* arr, size_t n) {
-    _global_mark_stack.par_push_arr(arr, n);
-    if (_global_mark_stack.overflow()) {
+  // between task-local queues and the global mark stack.
+  bool mark_stack_push(oop* arr) {
+    if (!_global_mark_stack.par_push_chunk(arr)) {
       set_has_overflown();
       return false;
     }
     return true;
   }
-  void mark_stack_pop(oop* arr, size_t max, size_t* n) {
-    _global_mark_stack.par_pop_arr(arr, max, n);
+  bool mark_stack_pop(oop* arr) {
+    return _global_mark_stack.par_pop_chunk(arr);
   }
   size_t mark_stack_size()                { return _global_mark_stack.size(); }
   size_t partial_mark_stack_size_target() { return _global_mark_stack.capacity()/3; }
-  bool mark_stack_overflow()              { return _global_mark_stack.overflow(); }
+  bool mark_stack_overflow()              { return _global_mark_stack.is_out_of_memory(); }
   bool mark_stack_empty()                 { return _global_mark_stack.is_empty(); }
 
   G1CMRootRegions* root_regions() { return &_root_regions; }
@@ -599,16 +647,6 @@
   // read-only, so use this carefully!
   void clearRangePrevBitmap(MemRegion mr);
 
-  // Notify data structures that a GC has started.
-  void note_start_of_gc() {
-    _global_mark_stack.note_start_of_gc();
-  }
-
-  // Notify data structures that a GC is finished.
-  void note_end_of_gc() {
-    _global_mark_stack.note_end_of_gc();
-  }
-
   // Verify that there are no CSet oops on the stacks (taskqueues /
   // global mark stack) and fingers (global / per-task).
   // If marking is not in progress, it's a no-op.
@@ -670,10 +708,7 @@
     // references reaches this limit
     refs_reached_period           = 384,
     // Initial value for the hash seed, used in the work stealing code
-    init_hash_seed                = 17,
-    // How many entries will be transferred between global stack and
-    // local queues at once.
-    global_stack_transfer_size    = 1024
+    init_hash_seed                = 17
   };
 
   uint                        _worker_id;
@@ -858,9 +893,10 @@
   // It pushes an object on the local queue.
   inline void push(oop obj);
 
-  // These two move entries to/from the global stack.
+  // Move entries to the global stack.
   void move_entries_to_global_stack();
-  void get_entries_from_global_stack();
+  // Move entries from the global stack, returning true if we were successful in doing so.
+  bool get_entries_from_global_stack();
 
   // It pops and scans objects from the local queue. If partially is
   // true, then it stops when the queue size is of a given limit. If
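Two sizing details in this header are easy to miss. OopsPerChunk is 1024 - 1 so that the data array plus the one-pointer header occupy exactly 1024 words, letting chunks tile the mapped backing array without per-chunk padding; and the _pad0.._pad4 members size each gap so that _free_list, _chunk_list (with its counter), and _hwm land on separate cache lines, avoiding false sharing between threads hitting different lists. A hypothetical standalone check of the chunk geometry, assuming oop is pointer-sized:

#include <cstddef>

typedef void* oop;  // stand-in; the real oop type is a pointer here too

static const size_t OopsPerChunk = 1024 - 1;

struct OopChunk {
  OopChunk* next;          // the single header word
  oop data[OopsPerChunk];
};

static_assert(sizeof(OopChunk) == 1024 * sizeof(void*),
              "header plus data fill exactly 1024 words");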
--- a/hotspot/src/share/vm/gc/g1/g1ConcurrentMark.inline.hpp	Wed Sep 14 16:20:54 2016 +0300
+++ b/hotspot/src/share/vm/gc/g1/g1ConcurrentMark.inline.hpp	Thu Sep 15 16:44:19 2016 +0200
@@ -89,14 +89,28 @@
 
 #undef check_mark
 
+#ifndef PRODUCT
 template<typename Fn>
-inline void G1CMMarkStack::iterate(Fn fn) {
+inline void G1CMMarkStack::iterate(Fn fn) const {
   assert_at_safepoint(true);
-  assert(!stack_modified(), "Saved index " SIZE_FORMAT " must be the same as " SIZE_FORMAT, _saved_index, _index);
-  for (size_t i = 0; i < _index; ++i) {
-    fn(_base[i]);
+
+  size_t num_chunks = 0;
+
+  OopChunk* cur = _chunk_list;
+  while (cur != NULL) {
+    guarantee(num_chunks <= _chunks_in_chunk_list, "Found " SIZE_FORMAT " oop chunks which is more than there should be", num_chunks);
+
+    for (size_t i = 0; i < OopsPerChunk; ++i) {
+      if (cur->data[i] == NULL) {
+        break;
+      }
+      fn(cur->data[i]);
+    }
+    cur = cur->next;
+    num_chunks++;
   }
 }
+#endif
 
 // It scans an object and visits its children.
 inline void G1CMTask::scan_object(oop obj) { process_grey_object<true>(obj); }
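iterate() relies on the same NULL-termination convention as move_entries_to_global_stack() and get_entries_from_global_stack() in the .cpp diff: a chunk that is not completely filled carries a NULL immediately after its last valid element, while a completely full chunk has no terminator. A hypothetical standalone sketch of the writer and reader sides (void* standing in for oop):

#include <cstddef>

typedef void* oop;

static const size_t OopsPerChunk = 1024 - 1;

// Writer side: after filling n <= OopsPerChunk slots, terminate short chunks.
void fill_buffer(oop* buffer, oop* src, size_t n) {
  for (size_t i = 0; i < n; i++) {
    buffer[i] = src[i];
  }
  if (n < OopsPerChunk) {
    buffer[n] = NULL;  // terminator; a full chunk needs (and has room for) none
  }
}

// Reader side: consume until the terminator or the chunk's end, whichever
// comes first -- the same loop shape as iterate() above.
template<typename Fn>
void for_each_entry(const oop* buffer, Fn fn) {
  for (size_t i = 0; i < OopsPerChunk; ++i) {
    if (buffer[i] == NULL) {
      break;
    }
    fn(buffer[i]);
  }
}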
--- a/hotspot/src/share/vm/gc/g1/g1OopClosures.hpp	Wed Sep 14 16:20:54 2016 +0300
+++ b/hotspot/src/share/vm/gc/g1/g1OopClosures.hpp	Thu Sep 15 16:44:19 2016 +0200
@@ -34,7 +34,6 @@
 class G1ConcurrentMark;
 class DirtyCardToOopClosure;
 class G1CMBitMap;
-class G1CMMarkStack;
 class G1ParScanThreadState;
 class G1CMTask;
 class ReferenceProcessor;
--- a/hotspot/src/share/vm/memory/allocation.hpp	Wed Sep 14 16:20:54 2016 +0300
+++ b/hotspot/src/share/vm/memory/allocation.hpp	Thu Sep 15 16:44:19 2016 +0200
@@ -738,6 +738,7 @@
   static size_t size_for(size_t length);
 
  public:
+  static E* allocate_or_null(size_t length);
   static E* allocate(size_t length);
   static void free(E* addr, size_t length);
 };
--- a/hotspot/src/share/vm/memory/allocation.inline.hpp	Wed Sep 14 16:20:54 2016 +0300
+++ b/hotspot/src/share/vm/memory/allocation.inline.hpp	Thu Sep 15 16:44:19 2016 +0200
@@ -153,6 +153,24 @@
 }
 
 template <class E, MEMFLAGS F>
+E* MmapArrayAllocator<E, F>::allocate_or_null(size_t length) {
+  size_t size = size_for(length);
+  int alignment = os::vm_allocation_granularity();
+
+  char* addr = os::reserve_memory(size, NULL, alignment, F);
+  if (addr == NULL) {
+    return NULL;
+  }
+
+  if (os::commit_memory(addr, size, !ExecMem)) {
+    return (E*)addr;
+  } else {
+    os::release_memory(addr, size);
+    return NULL;
+  }
+}
+
+template <class E, MEMFLAGS F>
 E* MmapArrayAllocator<E, F>::allocate(size_t length) {
   size_t size = size_for(length);
   int alignment = os::vm_allocation_granularity();
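allocate_or_null() mirrors allocate() directly below it, but reports failure to the caller instead of exiting the VM: reserve address space, try to commit it, and release the reservation again if the commit fails, so nothing leaks on the failure path. The same pattern sketched with POSIX mmap/mprotect as stand-ins for os::reserve_memory and os::commit_memory; hypothetical Linux-flavored code, not the HotSpot implementation:

#include <cstddef>
#include <sys/mman.h>

void* allocate_or_null(size_t size) {
  // Reserve address space only; no accountable backing store yet.
  void* addr = mmap(NULL, size, PROT_NONE,
                    MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);
  if (addr == MAP_FAILED) {
    return NULL;  // report failure instead of aborting
  }
  // Commit: make the range readable and writable.
  if (mprotect(addr, size, PROT_READ | PROT_WRITE) == 0) {
    return addr;
  }
  munmap(addr, size);  // commit failed: give the reservation back
  return NULL;
}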
--- a/hotspot/src/share/vm/runtime/mutexLocker.cpp	Wed Sep 14 16:20:54 2016 +0300
+++ b/hotspot/src/share/vm/runtime/mutexLocker.cpp	Thu Sep 15 16:44:19 2016 +0200
@@ -77,6 +77,8 @@
 Mutex*   DirtyCardQ_FL_lock           = NULL;
 Monitor* DirtyCardQ_CBL_mon           = NULL;
 Mutex*   Shared_DirtyCardQ_lock       = NULL;
+Mutex*   MarkStackFreeList_lock       = NULL;
+Mutex*   MarkStackChunkList_lock      = NULL;
 Mutex*   ParGCRareEvent_lock          = NULL;
 Mutex*   DerivedPointerTableGC_lock   = NULL;
 Mutex*   Compile_lock                 = NULL;
@@ -194,6 +196,9 @@
 
     def(StringDedupQueue_lock      , Monitor, leaf,        true,  Monitor::_safepoint_check_never);
     def(StringDedupTable_lock      , Mutex  , leaf,        true,  Monitor::_safepoint_check_never);
+
+    def(MarkStackFreeList_lock     , Mutex , leaf      ,   true,  Monitor::_safepoint_check_never);
+    def(MarkStackChunkList_lock    , Mutex , leaf      ,   true,  Monitor::_safepoint_check_never);
   }
   def(ParGCRareEvent_lock          , Mutex  , leaf     ,   true,  Monitor::_safepoint_check_sometimes);
   def(DerivedPointerTableGC_lock   , Mutex,   leaf,        true,  Monitor::_safepoint_check_never);
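Both new locks are leaf-ranked and defined with _safepoint_check_never, which obliges every acquisition to use the no-safepoint-check form, exactly what the MutexLockerEx(..., Mutex::_no_safepoint_check_flag) call sites in g1ConcurrentMark.cpp do; debug builds assert on a mismatch. A conceptual standalone model of that contract (hypothetical, not the HotSpot Mutex class):

#include <cassert>
#include <mutex>

enum SafepointCheck { safepoint_check_never, safepoint_check_always };

struct RankedMutex {
  std::mutex impl;
  SafepointCheck policy;
};

// Analogue of MutexLockerEx(lock, Mutex::_no_safepoint_check_flag): legal
// only on locks whose definition promises never to check for safepoints.
struct LockerNoSafepointCheck {
  RankedMutex& m;
  explicit LockerNoSafepointCheck(RankedMutex& mu) : m(mu) {
    assert(m.policy == safepoint_check_never && "lock policy mismatch");
    m.impl.lock();
  }
  ~LockerNoSafepointCheck() { m.impl.unlock(); }
};

RankedMutex free_list_lock_model = { {}, safepoint_check_never };

void add_chunk_model() {
  LockerNoSafepointCheck x(free_list_lock_model);
  // ... pointer swap only; a GC worker never blocks for a safepoint here ...
}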
--- a/hotspot/src/share/vm/runtime/mutexLocker.hpp	Wed Sep 14 16:20:54 2016 +0300
+++ b/hotspot/src/share/vm/runtime/mutexLocker.hpp	Thu Sep 15 16:44:19 2016 +0200
@@ -81,7 +81,8 @@
 extern Mutex*   Shared_DirtyCardQ_lock;          // Lock protecting dirty card
                                                  // queue shared by
                                                  // non-Java threads.
-                                                 // (see option ExplicitGCInvokesConcurrent)
+extern Mutex*   MarkStackFreeList_lock;          // Protects access to the global mark stack free list.
+extern Mutex*   MarkStackChunkList_lock;         // Protects access to the global mark stack chunk list.
 extern Mutex*   ParGCRareEvent_lock;             // Synchronizes various (rare) parallel GC ops.
 extern Mutex*   Compile_lock;                    // a lock held when Compilation is updating code (used to block CodeCache traversal, CHA updates, etc)
 extern Monitor* MethodCompileQueue_lock;         // a lock held when method compilations are enqueued, dequeued