8197850: Calculate liveness in regions during marking
author tschatzl
Mon, 26 Mar 2018 16:51:43 +0200
changeset 49606 9ae8719efcae
parent 49605 784f3f2dea14
child 49607 acffe6ff3ae7
8197850: Calculate liveness in regions during marking Reviewed-by: sjohanss, sangheki
src/hotspot/share/gc/g1/g1CardLiveData.cpp
src/hotspot/share/gc/g1/g1CollectedHeap.cpp
src/hotspot/share/gc/g1/g1ConcurrentMark.cpp
src/hotspot/share/gc/g1/g1ConcurrentMark.hpp
src/hotspot/share/gc/g1/g1ConcurrentMark.inline.hpp
src/hotspot/share/gc/g1/g1EvacFailure.cpp
src/hotspot/share/gc/g1/g1OopClosures.hpp
src/hotspot/share/gc/g1/g1OopClosures.inline.hpp
src/hotspot/share/gc/g1/g1RegionMarkStatsCache.cpp
src/hotspot/share/gc/g1/g1RegionMarkStatsCache.hpp
src/hotspot/share/gc/g1/g1RegionMarkStatsCache.inline.hpp
--- a/src/hotspot/share/gc/g1/g1CardLiveData.cpp	Mon Mar 26 16:51:43 2018 +0200
+++ b/src/hotspot/share/gc/g1/g1CardLiveData.cpp	Mon Mar 26 16:51:43 2018 +0200
@@ -290,6 +290,9 @@
       size_t marked_bytes = _helper.mark_marked_during_marking(_mark_bitmap, hr);
       if (marked_bytes > 0) {
         hr->add_to_marked_bytes(marked_bytes);
+        assert(!hr->is_old() || marked_bytes == (_cm->liveness(hr->hrm_index()) * HeapWordSize),
+               "Marked bytes " SIZE_FORMAT " for region %u do not match liveness during mark " SIZE_FORMAT,
+               marked_bytes, hr->hrm_index(), _cm->liveness(hr->hrm_index()) * HeapWordSize);
       }
 
       return (_cm->do_yield_check() && _cm->has_aborted());
--- a/src/hotspot/share/gc/g1/g1CollectedHeap.cpp	Mon Mar 26 16:51:43 2018 +0200
+++ b/src/hotspot/share/gc/g1/g1CollectedHeap.cpp	Mon Mar 26 16:51:43 2018 +0200
@@ -4214,7 +4214,8 @@
   if (collector_state()->during_initial_mark_pause()) {
     oop pll_head = Universe::reference_pending_list();
     if (pll_head != NULL) {
-      _cm->mark_in_next_bitmap(pll_head);
+      // Any valid worker id is fine here as we are in the VM thread and single-threaded.
+      _cm->mark_in_next_bitmap(0 /* worker_id */, pll_head);
     }
   }
 
--- a/src/hotspot/share/gc/g1/g1ConcurrentMark.cpp	Mon Mar 26 16:51:43 2018 +0200
+++ b/src/hotspot/share/gc/g1/g1ConcurrentMark.cpp	Mon Mar 26 16:51:43 2018 +0200
@@ -34,6 +34,7 @@
 #include "gc/g1/g1OopClosures.inline.hpp"
 #include "gc/g1/g1CardLiveData.inline.hpp"
 #include "gc/g1/g1Policy.hpp"
+#include "gc/g1/g1RegionMarkStatsCache.inline.hpp"
 #include "gc/g1/g1StringDedup.hpp"
 #include "gc/g1/heapRegion.inline.hpp"
 #include "gc/g1/heapRegionRemSet.hpp"
@@ -389,7 +390,9 @@
 
   _concurrent_workers(NULL),
   _num_concurrent_workers(0),
-  _max_concurrent_workers(0)
+  _max_concurrent_workers(0),
+
+  _region_mark_stats(NEW_C_HEAP_ARRAY(G1RegionMarkStats, _g1h->max_regions(), mtGC))
 {
   _mark_bitmap_1.initialize(g1h->reserved_region(), prev_bitmap_storage);
   _mark_bitmap_2.initialize(g1h->reserved_region(), next_bitmap_storage);
@@ -479,7 +482,7 @@
     task_queue->initialize();
     _task_queues->register_queue(i, task_queue);
 
-    _tasks[i] = new G1CMTask(i, this, task_queue);
+    _tasks[i] = new G1CMTask(i, this, task_queue, _region_mark_stats, _g1h->max_regions());
 
     _accum_task_vtime[i] = 0.0;
   }
@@ -503,18 +506,29 @@
   // Reset all the marking data structures and any necessary flags
   reset_marking_state();
 
-  // We reset all of them, since different phases will use
-  // different number of active threads. So, it's easiest to have all
-  // of them ready.
+  // Reset all tasks, since different phases will use different number of active
+  // threads. So, it's easiest to have all of them ready.
   for (uint i = 0; i < _max_num_tasks; ++i) {
     _tasks[i]->reset(_next_mark_bitmap);
   }
 
+  uint max_regions = _g1h->max_regions();
+  for (uint i = 0; i < max_regions; i++) {
+    _region_mark_stats[i].clear();
+  }
+
   // we need this to make sure that the flag is on during the evac
   // pause with initial mark piggy-backed
   set_concurrent_marking_in_progress();
 }
 
+void G1ConcurrentMark::clear_statistics_in_region(uint region_idx) {
+  for (uint task_idx = 0; task_idx != _max_num_tasks; task_idx++) {  // Per-task cached values go first ...
+    _tasks[task_idx]->clear_mark_stats_cache(region_idx);
+  }
+  _region_mark_stats[region_idx].clear();                            // ... then the global entry itself.
+}
+
 void G1ConcurrentMark::humongous_object_eagerly_reclaimed(HeapRegion* r) {
   assert(SafepointSynchronize::is_at_safepoint(), "May only be called at a safepoint.");
 
@@ -522,6 +536,18 @@
   if (_next_mark_bitmap->is_marked(r->bottom())) {
     _next_mark_bitmap->clear(r->bottom());
   }
+
+  // Clear any statistics about the region gathered so far.
+  uint const region_idx = r->hrm_index();
+  if (r->is_humongous()) {
+    assert(r->is_starts_humongous(), "Got humongous continues region here");
+    uint const size_in_regions = (uint)_g1h->humongous_obj_size_in_regions(oop(r->humongous_start_region()->bottom())->size());
+    for (uint j = region_idx; j < (region_idx + size_in_regions); j++) {
+      clear_statistics_in_region(j);
+    }
+  } else {
+    clear_statistics_in_region(region_idx);
+  }
 }
 
 void G1ConcurrentMark::reset_marking_state() {
@@ -530,6 +556,11 @@
   // Expand the marking stack, if we have to and if we can.
   if (has_overflown()) {
     _global_mark_stack.expand();
+
+    uint max_regions = _g1h->max_regions();
+    for (uint i = 0; i < max_regions; i++) {
+      _region_mark_stats[i].clear_during_overflow();
+    }
   }
 
   clear_has_overflown();
@@ -583,6 +614,7 @@
 }
 
 G1ConcurrentMark::~G1ConcurrentMark() {
+  FREE_C_HEAP_ARRAY(G1RegionMarkStats, _region_mark_stats);
   // The G1ConcurrentMark instance is never freed.
   ShouldNotReachHere();
 }
@@ -801,29 +833,6 @@
     // just abort the whole marking phase as quickly as possible.
     return;
   }
-
-  // If we're executing the concurrent phase of marking, reset the marking
-  // state; otherwise the marking state is reset after reference processing,
-  // during the remark pause.
-  // If we reset here as a result of an overflow during the remark we will
-  // see assertion failures from any subsequent set_concurrency_and_phase()
-  // calls.
-  if (concurrent()) {
-    // let the task associated with with worker 0 do this
-    if (worker_id == 0) {
-      // task 0 is responsible for clearing the global data structures
-      // We should be here because of an overflow. During STW we should
-      // not clear the overflow flag since we rely on it being true when
-      // we exit this method to abort the pause and restart concurrent
-      // marking.
-      reset_marking_state();
-
-      log_info(gc, marking)("Concurrent Mark reset for overflow");
-    }
-  }
-
-  // after this, each task should reset its own data structures then
-  // then go into the second barrier
 }
 
 void G1ConcurrentMark::enter_second_sync_barrier(uint worker_id) {
@@ -897,10 +906,10 @@
   return result;
 }
 
-void G1ConcurrentMark::scan_root_region(HeapRegion* hr) {
+void G1ConcurrentMark::scan_root_region(HeapRegion* hr, uint worker_id) {
   // Currently, only survivors can be root regions.
   assert(hr->next_top_at_mark_start() == hr->bottom(), "invariant");
-  G1RootRegionScanClosure cl(_g1h, this);
+  G1RootRegionScanClosure cl(_g1h, this, worker_id);
 
   const uintx interval = PrefetchScanIntervalInBytes;
   HeapWord* curr = hr->bottom();
@@ -929,7 +938,7 @@
     G1CMRootRegions* root_regions = _cm->root_regions();
     HeapRegion* hr = root_regions->claim_next();
     while (hr != NULL) {
-      _cm->scan_root_region(hr);
+      _cm->scan_root_region(hr, worker_id);
       hr = root_regions->claim_next();
     }
   }
@@ -1058,6 +1067,11 @@
     satb_mq_set.set_active_all_threads(false, /* new active value */
                                        true /* expected_active */);
 
+    {
+      GCTraceTime(Debug, gc, phases)("Flush Task Caches");
+      flush_all_task_caches();
+    }
+
     if (VerifyDuringGC) {
       g1h->verifier()->verify(G1HeapVerifier::G1VerifyRemark, VerifyOption_G1UseNextMarking, "During GC (after)");
     }
@@ -1377,8 +1391,7 @@
 
   template <class T> void do_oop_work(T* p) {
     if (!_cm->has_overflown()) {
-      oop obj = RawAccess<>::oop_load(p);
-      _task->deal_with_reference(obj);
+      _task->deal_with_reference(p);
       _ref_counter--;
 
       if (_ref_counter == 0) {
@@ -1816,6 +1829,19 @@
   print_stats();
 }
 
+void G1ConcurrentMark::flush_all_task_caches() {
+  size_t total_hits = 0;
+  size_t total_misses = 0;
+  for (uint task_idx = 0; task_idx != _max_num_tasks; ++task_idx) {
+    // Flushing a task's cache hands back its (hits, misses) counters.
+    Pair<size_t, size_t> counters = _tasks[task_idx]->flush_mark_stats_cache();
+    total_hits += counters.first;
+    total_misses += counters.second;
+  }
+  log_debug(gc, stats)("Mark stats cache hits " SIZE_FORMAT " misses " SIZE_FORMAT " ratio %1.3lf",
+                       total_hits, total_misses, percent_of(total_hits, total_hits + total_misses));
+}
+
 void G1ConcurrentMark::clear_range_in_prev_bitmap(MemRegion mr) {
   _prev_mark_bitmap->clear_range(mr);
 }
@@ -2148,6 +2174,8 @@
   _elapsed_time_ms               = 0.0;
   _termination_time_ms           = 0.0;
   _termination_start_time_ms     = 0.0;
+
+  _mark_stats_cache.reset();
 }
 
 bool G1CMTask::should_exit_termination() {
@@ -2385,16 +2413,28 @@
   decrease_limits();
 }
 
+void G1CMTask::clear_mark_stats_cache(uint region_idx) {
+  _mark_stats_cache.reset(region_idx);  // Drops this task's cached entry for the region, if it holds one; nothing is flushed.
+}
+
+Pair<size_t, size_t> G1CMTask::flush_mark_stats_cache() {
+  return _mark_stats_cache.evict_all();  // Evicts every cache entry; the result is (cache hits, cache misses).
+}
+
 void G1CMTask::print_stats() {
-  log_debug(gc, stats)("Marking Stats, task = %u, calls = %u",
-                       _worker_id, _calls);
+  log_debug(gc, stats)("Marking Stats, task = %u, calls = %u", _worker_id, _calls);
   log_debug(gc, stats)("  Elapsed time = %1.2lfms, Termination time = %1.2lfms",
                        _elapsed_time_ms, _termination_time_ms);
-  log_debug(gc, stats)("  Step Times (cum): num = %d, avg = %1.2lfms, sd = %1.2lfms",
-                       _step_times_ms.num(), _step_times_ms.avg(),
-                       _step_times_ms.sd());
-  log_debug(gc, stats)("                    max = %1.2lfms, total = %1.2lfms",
-                       _step_times_ms.maximum(), _step_times_ms.sum());
+  log_debug(gc, stats)("  Step Times (cum): num = %d, avg = %1.2lfms, sd = %1.2lfms max = %1.2lfms, total = %1.2lfms",
+                       _step_times_ms.num(),
+                       _step_times_ms.avg(),
+                       _step_times_ms.sd(),
+                       _step_times_ms.maximum(),
+                       _step_times_ms.sum());
+  size_t const hits = _mark_stats_cache.hits();
+  size_t const misses = _mark_stats_cache.misses();
+  log_debug(gc, stats)("  Mark Stats Cache: hits " SIZE_FORMAT " misses " SIZE_FORMAT " ratio %.3f",
+                       hits, misses, percent_of(hits, hits + misses));
 }
 
 bool G1ConcurrentMark::try_stealing(uint worker_id, int* hash_seed, G1TaskQueueEntry& task_entry) {
@@ -2800,14 +2840,29 @@
 
         // When we exit this sync barrier we know that all tasks have
         // stopped doing marking work. So, it's now safe to
-        // re-initialize our data structures. At the end of this method,
-        // task 0 will clear the global data structures.
+        // re-initialize our data structures.
       }
 
-      // We clear the local state of this task...
       clear_region_fields();
+      flush_mark_stats_cache();
 
       if (!is_serial) {
+        // If we're executing the concurrent phase of marking, reset the marking
+        // state; otherwise the marking state is reset after reference processing,
+        // during the remark pause.
+        // If we reset here as a result of an overflow during the remark we will
+        // see assertion failures from any subsequent set_concurrency_and_phase()
+        // calls.
+        if (_cm->concurrent() && _worker_id == 0) {
+          // Worker 0 is responsible for clearing the global data structures because
+          // of an overflow. During STW we should not clear the overflow flag (in
+          // G1ConcurrentMark::reset_marking_state()) since we rely on it being true when we exit
+          // this method to abort the pause and restart concurrent marking.
+          _cm->reset_marking_state();
+
+          log_info(gc, marking)("Concurrent Mark reset for overflow");
+        }
+
         // ...and enter the second barrier.
         _cm->enter_second_sync_barrier(_worker_id);
       }
@@ -2818,13 +2873,18 @@
   }
 }
 
-G1CMTask::G1CMTask(uint worker_id, G1ConcurrentMark* cm, G1CMTaskQueue* task_queue) :
+G1CMTask::G1CMTask(uint worker_id,
+                   G1ConcurrentMark* cm,
+                   G1CMTaskQueue* task_queue,
+                   G1RegionMarkStats* mark_stats,
+                   uint max_regions) :
   _objArray_processor(this),
   _worker_id(worker_id),
   _g1h(G1CollectedHeap::heap()),
   _cm(cm),
   _next_mark_bitmap(NULL),
   _task_queue(task_queue),
+  _mark_stats_cache(mark_stats, max_regions, RegionMarkStatsCacheSize),
   _calls(0),
   _time_target_ms(0.0),
   _start_time_ms(0.0),
--- a/src/hotspot/share/gc/g1/g1ConcurrentMark.hpp	Mon Mar 26 16:51:43 2018 +0200
+++ b/src/hotspot/share/gc/g1/g1ConcurrentMark.hpp	Mon Mar 26 16:51:43 2018 +0200
@@ -27,6 +27,7 @@
 
 #include "gc/g1/g1ConcurrentMarkBitMap.hpp"
 #include "gc/g1/g1ConcurrentMarkObjArrayProcessor.hpp"
+#include "gc/g1/g1RegionMarkStatsCache.hpp"
 #include "gc/g1/heapRegionSet.hpp"
 #include "gc/shared/taskqueue.hpp"
 #include "memory/allocation.hpp"
@@ -428,7 +429,9 @@
 
   // Returns the task with the given id
   G1CMTask* task(uint id) {
-    assert(id < _num_active_tasks, "Task id %u not within active bounds up to %u", id, _num_active_tasks);
+    // During initial mark we use the parallel gc threads to do some work, so
+    // we can only compare against _max_num_tasks.
+    assert(id < _max_num_tasks, "Task id %u not within bounds up to %u", id, _max_num_tasks);
     return _tasks[id];
   }
 
@@ -446,7 +449,18 @@
   // Clear the given bitmap in parallel using the given WorkGang. If may_yield is
   // true, periodically insert checks to see if this method should exit prematurely.
   void clear_bitmap(G1CMBitMap* bitmap, WorkGang* workers, bool may_yield);
+
+  // Clear statistics gathered during the concurrent cycle for the given region after
+  // it has been reclaimed.
+  void clear_statistics_in_region(uint region_idx);
+  // Region statistics gathered during marking.
+  G1RegionMarkStats* _region_mark_stats;
 public:
+  void add_to_liveness(uint worker_id, oop const obj, size_t size);
+  // Liveness of the given region as determined by concurrent marking, i.e. the amount of
+  // live words between bottom and nTAMS.
+  size_t liveness(uint region)  { return _region_mark_stats[region]._live_words; }
+
   // Notification for eagerly reclaimed regions to clean up.
   void humongous_object_eagerly_reclaimed(HeapRegion* r);
   // Manipulation of the global mark stack.
@@ -508,6 +522,8 @@
   // Calculates the number of concurrent GC threads to be used in the marking phase.
   uint calc_active_marking_workers();
 
+  // Moves all per-task cached data into global state.
+  void flush_all_task_caches();
   // Prepare internal data structures for the next mark cycle. This includes clearing
   // the next mark bitmap and some internal data structures. This method is intended
   // to be called concurrently to the mutator. It will yield to safepoint requests.
@@ -534,7 +550,7 @@
   void scan_root_regions();
 
   // Scan a single root region and mark everything reachable from it.
-  void scan_root_region(HeapRegion* hr);
+  void scan_root_region(HeapRegion* hr, uint worker_id);
 
   // Do concurrent phase of marking, to a tentative transitive closure.
   void mark_from_roots();
@@ -576,8 +592,10 @@
   void print_on_error(outputStream* st) const;
 
   // Mark the given object on the next bitmap if it is below nTAMS.
-  inline bool mark_in_next_bitmap(HeapRegion* const hr, oop const obj);
-  inline bool mark_in_next_bitmap(oop const obj);
+  // If the passed obj_size is zero, it is recalculated from the given object if
+  // needed. This is to be as lazy as possible with accessing the object's size.
+  inline bool mark_in_next_bitmap(uint worker_id, HeapRegion* const hr, oop const obj, size_t const obj_size = 0);
+  inline bool mark_in_next_bitmap(uint worker_id, oop const obj, size_t const obj_size = 0);
 
   // Returns true if initialization was successfully completed.
   bool completed_initialization() const {
@@ -619,6 +637,10 @@
     init_hash_seed                = 17
   };
 
+  // Number of entries in the per-task stats cache. This seems enough to have a very
+  // low cache miss rate.
+  static const uint RegionMarkStatsCacheSize = 1024;
+
   G1CMObjArrayProcessor       _objArray_processor;
 
   uint                        _worker_id;
@@ -628,6 +650,7 @@
   // the task queue of this task
   G1CMTaskQueue*              _task_queue;
 
+  G1RegionMarkStatsCache      _mark_stats_cache;
   // Number of calls to this task
   uint                        _calls;
 
@@ -786,7 +809,8 @@
   // Grey the object (by calling make_grey_reference) if required,
   // e.g. obj is below its containing region's NTAMS.
   // Precondition: obj is a valid heap object.
-  inline void deal_with_reference(oop obj);
+  template <class T>
+  inline void deal_with_reference(T* p);
 
   // Scans an object and visits its children.
   inline void scan_task_entry(G1TaskQueueEntry task_entry);
@@ -820,8 +844,17 @@
 
   G1CMTask(uint worker_id,
            G1ConcurrentMark *cm,
-           G1CMTaskQueue* task_queue);
+           G1CMTaskQueue* task_queue,
+           G1RegionMarkStats* mark_stats,
+           uint max_regions);
+
+  inline void update_liveness(oop const obj, size_t const obj_size);
 
+  // Clear (without flushing) the mark cache entry for the given region.
+  void clear_mark_stats_cache(uint region_idx);
+  // Evict the whole statistics cache into the global statistics. Returns the
+  // number of cache hits and misses so far.
+  Pair<size_t, size_t> flush_mark_stats_cache();
   // Prints statistics associated with this task
   void print_stats();
 };
--- a/src/hotspot/share/gc/g1/g1ConcurrentMark.inline.hpp	Mon Mar 26 16:51:43 2018 +0200
+++ b/src/hotspot/share/gc/g1/g1ConcurrentMark.inline.hpp	Mon Mar 26 16:51:43 2018 +0200
@@ -29,16 +29,18 @@
 #include "gc/g1/g1ConcurrentMark.hpp"
 #include "gc/g1/g1ConcurrentMarkBitMap.inline.hpp"
 #include "gc/g1/g1ConcurrentMarkObjArrayProcessor.inline.hpp"
+#include "gc/g1/g1RegionMarkStatsCache.inline.hpp"
+#include "gc/g1/heapRegion.hpp"
 #include "gc/shared/suspendibleThreadSet.hpp"
 #include "gc/shared/taskqueue.inline.hpp"
 #include "utilities/bitMap.inline.hpp"
 
-inline bool G1ConcurrentMark::mark_in_next_bitmap(oop const obj) {
+inline bool G1ConcurrentMark::mark_in_next_bitmap(uint const worker_id, oop const obj, size_t const obj_size) {
   HeapRegion* const hr = _g1h->heap_region_containing(obj);
-  return mark_in_next_bitmap(hr, obj);
+  return mark_in_next_bitmap(worker_id, hr, obj, obj_size);
 }
 
-inline bool G1ConcurrentMark::mark_in_next_bitmap(HeapRegion* const hr, oop const obj) {
+inline bool G1ConcurrentMark::mark_in_next_bitmap(uint const worker_id, HeapRegion* const hr, oop const obj, size_t const obj_size) {
   assert(hr != NULL, "just checking");
   assert(hr->is_in_reserved(obj), "Attempting to mark object at " PTR_FORMAT " that is not contained in the given region %u", p2i(obj), hr->hrm_index());
 
@@ -52,7 +54,11 @@
 
   HeapWord* const obj_addr = (HeapWord*)obj;
 
-  return _next_mark_bitmap->par_mark(obj_addr);
+  bool success = _next_mark_bitmap->par_mark(obj_addr);
+  if (success) {
+    add_to_liveness(worker_id, obj, obj_size == 0 ? obj->size() : obj_size);
+  }
+  return success;
 }
 
 #ifndef PRODUCT
@@ -157,8 +163,16 @@
   return mr.word_size();
 }
 
+inline void G1CMTask::update_liveness(oop const obj, const size_t obj_size) {
+  _mark_stats_cache.add_live_words(_g1h->addr_to_region((HeapWord*)obj), obj_size);  // Accumulates in this task's cache under obj's region index.
+}
+
+inline void G1ConcurrentMark::add_to_liveness(uint worker_id, oop const obj, size_t size) {
+  task(worker_id)->update_liveness(obj, size);  // Delegates to the task stored at index worker_id.
+}
+
 inline void G1CMTask::make_reference_grey(oop obj) {
-  if (!_cm->mark_in_next_bitmap(obj)) {
+  if (!_cm->mark_in_next_bitmap(_worker_id, obj)) {
     return;
   }
 
@@ -199,8 +213,10 @@
   }
 }
 
-inline void G1CMTask::deal_with_reference(oop obj) {
+template <class T>
+inline void G1CMTask::deal_with_reference(T* p) {
   increment_refs_reached();
+  oop const obj = RawAccess<MO_VOLATILE>::oop_load(p);
   if (obj == NULL) {
     return;
   }
--- a/src/hotspot/share/gc/g1/g1EvacFailure.cpp	Mon Mar 26 16:51:43 2018 +0200
+++ b/src/hotspot/share/gc/g1/g1EvacFailure.cpp	Mon Mar 26 16:51:43 2018 +0200
@@ -126,7 +126,7 @@
         // explicitly and all objects in the CSet are considered
         // (implicitly) live. So, we won't mark them explicitly and
         // we'll leave them over NTAMS.
-        _cm->mark_in_next_bitmap(_hr, obj);
+        _cm->mark_in_next_bitmap(_worker_id, obj);
       }
       size_t obj_size = obj->size();
 
--- a/src/hotspot/share/gc/g1/g1OopClosures.hpp	Mon Mar 26 16:51:43 2018 +0200
+++ b/src/hotspot/share/gc/g1/g1OopClosures.hpp	Mon Mar 26 16:51:43 2018 +0200
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2017, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2018, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -181,9 +181,10 @@
 private:
   G1CollectedHeap* _g1h;
   G1ConcurrentMark* _cm;
+  uint _worker_id;
 public:
-  G1RootRegionScanClosure(G1CollectedHeap* g1h, G1ConcurrentMark* cm) :
-    _g1h(g1h), _cm(cm) { }
+  G1RootRegionScanClosure(G1CollectedHeap* g1h, G1ConcurrentMark* cm, uint worker_id) :
+    _g1h(g1h), _cm(cm), _worker_id(worker_id) { }
   template <class T> void do_oop_nv(T* p);
   virtual void do_oop(      oop* p) { do_oop_nv(p); }
   virtual void do_oop(narrowOop* p) { do_oop_nv(p); }
--- a/src/hotspot/share/gc/g1/g1OopClosures.inline.hpp	Mon Mar 26 16:51:43 2018 +0200
+++ b/src/hotspot/share/gc/g1/g1OopClosures.inline.hpp	Mon Mar 26 16:51:43 2018 +0200
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2017, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2018, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -88,8 +88,7 @@
 
 template <class T>
 inline void G1CMOopClosure::do_oop_nv(T* p) {
-  oop obj = RawAccess<MO_VOLATILE>::oop_load(p);
-  _task->deal_with_reference(obj);
+  _task->deal_with_reference(p);
 }
 
 template <class T>
@@ -99,7 +98,7 @@
     return;
   }
   oop obj = CompressedOops::decode_not_null(heap_oop);
-  _cm->mark_in_next_bitmap(obj);
+  _cm->mark_in_next_bitmap(_worker_id, obj);
 }
 
 template <class T>
@@ -204,7 +203,8 @@
 void G1ParCopyHelper::mark_object(oop obj) {
   assert(!_g1->heap_region_containing(obj)->in_collection_set(), "should not mark objects in the CSet");
 
-  _cm->mark_in_next_bitmap(obj);
+  // We know that the object is not moving so it's safe to read its size.
+  _cm->mark_in_next_bitmap(_worker_id, obj);
 }
 
 void G1ParCopyHelper::mark_forwarded_object(oop from_obj, oop to_obj) {
@@ -215,7 +215,11 @@
   assert(_g1->heap_region_containing(from_obj)->in_collection_set(), "from obj should be in the CSet");
   assert(!_g1->heap_region_containing(to_obj)->in_collection_set(), "should not mark objects in the CSet");
 
-  _cm->mark_in_next_bitmap(to_obj);
+  // The object might be in the process of being copied by another
+  // worker so we cannot trust that its to-space image is
+  // well-formed. So we have to read its size from its from-space
+  // image which we know should not be changing.
+  _cm->mark_in_next_bitmap(_worker_id, to_obj, from_obj->size());
 }
 
 template <G1Barrier barrier, G1Mark do_mark_object>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/hotspot/share/gc/g1/g1RegionMarkStatsCache.cpp	Mon Mar 26 16:51:43 2018 +0200
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "gc/g1/g1RegionMarkStatsCache.inline.hpp"
+#include "memory/allocation.inline.hpp"
+
+G1RegionMarkStatsCache::G1RegionMarkStatsCache(G1RegionMarkStats* target, uint max_regions, uint num_cache_entries) :
+  _target(target),                                 // Initializers are listed in member declaration order,
+  _num_stats(max_regions),                         // which is the order in which they actually run.
+  _cache(NULL),
+  _num_cache_entries(num_cache_entries),
+  _cache_hits(0),
+  _cache_misses(0),
+  _num_cache_entries_mask(num_cache_entries - 1) { // hash() ands with this mask, hence the power-of-two check.
+  guarantee(is_power_of_2(num_cache_entries),
+            "Number of cache entries must be power of two, but is %u", num_cache_entries);
+  _cache = NEW_C_HEAP_ARRAY(G1RegionMarkStatsCacheEntry, _num_cache_entries, mtGC);
+  for (uint i = 0; i < _num_cache_entries; i++) {
+    _cache[i].clear();
+  }
+}
+
+G1RegionMarkStatsCache::~G1RegionMarkStatsCache() {
+  FREE_C_HEAP_ARRAY(G1RegionMarkStatsCacheEntry, _cache);
+}
+
+// Flush every cache slot to the global statistics; returns the (hits, misses) counters.
+Pair<size_t, size_t> G1RegionMarkStatsCache::evict_all() {
+  for (uint slot = 0; slot != _num_cache_entries; ++slot) {
+    evict(slot);
+  }
+  return Pair<size_t, size_t>(_cache_hits, _cache_misses);
+}
+
+// Discard all cached entries and zero the hit/miss counters; nothing is flushed.
+void G1RegionMarkStatsCache::reset() {
+  for (uint slot = 0; slot != _num_cache_entries; ++slot) {
+    _cache[slot].clear();
+  }
+
+  _cache_hits = 0;
+  _cache_misses = 0;
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/hotspot/share/gc/g1/g1RegionMarkStatsCache.hpp	Mon Mar 26 16:51:43 2018 +0200
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef SHARE_VM_GC_G1_G1REGIONMARKSTATSCACHE_HPP
+#define SHARE_VM_GC_G1_G1REGIONMARKSTATSCACHE_HPP
+
+#include "memory/allocation.hpp"
+#include "utilities/debug.hpp"
+#include "utilities/globalDefinitions.hpp"
+#include "utilities/pair.hpp"
+
+// Per-Region statistics gathered during marking.
+//
+// This includes
+// * the number of live words gathered during marking for the area from bottom
+// to ntams. This is an exact measure.
+// The code corrects later for the live data between ntams and top.
+struct G1RegionMarkStats {
+  size_t _live_words;
+
+  // Clear all members.
+  void clear() {
+    _live_words = 0;
+  }
+  // Clear all members after a marking overflow. Nothing to do as the live words
+  // are updated by the atomic mark. We do not remark objects after overflow.
+  void clear_during_overflow() {
+  }
+
+  bool is_clear() const { return _live_words == 0; }
+};
+
+// Per-marking thread cache for the region mark statistics.
+//
+// Each cache is a largish map of region-idx -> G1RegionMarkStats entries that cache
+// currently gathered statistics; entries are evicted to the global statistics array
+// on every collision. This minimizes synchronization overhead which would be required
+// every time statistics change, as marking is very localized.
+// The map entry number is a power of two to allow simple and fast hashing using
+// logical and.
+class G1RegionMarkStatsCache {
+private:
+  // The array of statistics entries to evict to; the global array.
+  G1RegionMarkStats* _target;
+  // Number of entries in the eviction target.
+  uint _num_stats;
+
+  // An entry of the statistics cache.
+  struct G1RegionMarkStatsCacheEntry {
+    uint _region_idx;
+    G1RegionMarkStats _stats;
+
+    void clear() {
+      _region_idx = 0;
+      _stats.clear();
+    }
+
+    bool is_clear() const {
+      return _region_idx == 0 && _stats.is_clear();
+    }
+  };
+
+  // The actual cache and its number of entries.
+  G1RegionMarkStatsCacheEntry* _cache;
+  uint _num_cache_entries;
+
+  // Cache hits/miss counters.
+  size_t _cache_hits;
+  size_t _cache_misses;
+
+  // Evict a given element of the statistics cache.
+  void evict(uint idx);
+
+  size_t _num_cache_entries_mask;
+  // Maps a region index to its cache slot; relies on the entry count being a power of two.
+  uint hash(uint idx) const {
+    return (uint)(idx & _num_cache_entries_mask);
+  }
+
+  G1RegionMarkStatsCacheEntry* find_for_add(uint region_idx);
+  G1RegionMarkStatsCache(const G1RegionMarkStatsCache&);             // Not copyable (declared, never defined): the
+  G1RegionMarkStatsCache& operator=(const G1RegionMarkStatsCache&);  // destructor frees _cache, a copy would free it twice.
+public:
+  G1RegionMarkStatsCache(G1RegionMarkStats* target, uint max_regions, uint num_cache_entries);
+  ~G1RegionMarkStatsCache();
+
+  void add_live_words(uint region_idx, size_t live_words) {
+    G1RegionMarkStatsCacheEntry* const cur = find_for_add(region_idx);
+    cur->_stats._live_words += live_words;
+  }
+
+  void reset(uint region_idx) {
+    G1RegionMarkStatsCacheEntry* const cur = &_cache[hash(region_idx)];
+    if (cur->_region_idx == region_idx) {
+      cur->clear();
+    }
+  }
+
+  // Evict all remaining statistics, returning cache hits and misses.
+  Pair<size_t, size_t> evict_all();
+
+  // Reset all cache entries to their default values.
+  void reset();
+
+  size_t hits() const { return _cache_hits; }
+  size_t misses() const { return _cache_misses; }
+};
+
+#endif // SHARE_VM_GC_G1_G1REGIONMARKSTATSCACHE_HPP
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/hotspot/share/gc/g1/g1RegionMarkStatsCache.inline.hpp	Mon Mar 26 16:51:43 2018 +0200
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef SHARE_VM_GC_G1_G1REGIONMARKSTATSCACHE_INLINE_HPP
+#define SHARE_VM_GC_G1_G1REGIONMARKSTATSCACHE_INLINE_HPP
+
+#include "gc/g1/g1RegionMarkStatsCache.hpp"
+#include "runtime/atomic.hpp"
+
+inline G1RegionMarkStatsCache::G1RegionMarkStatsCacheEntry* G1RegionMarkStatsCache::find_for_add(uint region_idx) {
+  // The entry is later flushed to _target[region_idx]; catch bad indices here in debug builds.
+  assert(region_idx < _num_stats, "Region index %u out of bounds (%u regions)", region_idx, _num_stats);
+  uint const cache_idx = hash(region_idx);
+  G1RegionMarkStatsCacheEntry* cur = &_cache[cache_idx];
+  if (cur->_region_idx != region_idx) {
+    evict(cache_idx);                // Collision: flush the previous occupant before reusing the slot.
+    cur->_region_idx = region_idx;
+    _cache_misses++;
+  } else {
+    _cache_hits++;
+  }
+  return cur;
+}
+
+inline void G1RegionMarkStatsCache::evict(uint idx) {
+  size_t const pending_words = _cache[idx]._stats._live_words;
+  if (pending_words != 0) {  // Empty entries leave the shared target untouched.
+    Atomic::add(pending_words, &_target[_cache[idx]._region_idx]._live_words);
+  }
+  _cache[idx].clear();       // The slot is free for whichever region hashes here next.
+}
+
+#endif // SHARE_VM_GC_G1_G1REGIONMARKSTATSCACHE_INLINE_HPP