8177707: Specialize G1RemSet::refine_card for concurrent/during safepoint refinement
author: tschatzl
Fri, 02 Jun 2017 13:47:54 +0200
changeset 46519 40c9c132f961
parent 46518 69f8479862a2
child 46520 de5cb3eed39b
8177707: Specialize G1RemSet::refine_card for concurrent/during safepoint refinement
Reviewed-by: ehelin, kbarrett
hotspot/src/share/vm/gc/g1/g1CollectedHeap.cpp
hotspot/src/share/vm/gc/g1/g1OopClosures.hpp
hotspot/src/share/vm/gc/g1/g1OopClosures.inline.hpp
hotspot/src/share/vm/gc/g1/g1RemSet.cpp
hotspot/src/share/vm/gc/g1/g1RemSet.hpp
hotspot/src/share/vm/gc/g1/g1_specialized_oop_closures.hpp
--- a/hotspot/src/share/vm/gc/g1/g1CollectedHeap.cpp	Fri Jun 02 13:45:21 2017 +0200
+++ b/hotspot/src/share/vm/gc/g1/g1CollectedHeap.cpp	Fri Jun 02 13:47:54 2017 +0200
@@ -101,11 +101,7 @@
   RefineCardTableEntryClosure() : _concurrent(true) { }
 
   bool do_card_ptr(jbyte* card_ptr, uint worker_i) {
-    bool oops_into_cset = G1CollectedHeap::heap()->g1_rem_set()->refine_card(card_ptr, worker_i, NULL);
-    // This path is executed by the concurrent refine or mutator threads,
-    // concurrently, and so we do not care if card_ptr contains references
-    // that point into the collection set.
-    assert(!oops_into_cset, "should be");
+    G1CollectedHeap::heap()->g1_rem_set()->refine_card_concurrently(card_ptr, worker_i);
 
     if (_concurrent && SuspendibleThreadSet::should_yield()) {
       // Caller will actually yield.
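The closure above is now the only caller of the concurrent path. For orientation, here is a minimal sketch of how a CardTableEntryClosure like this one is driven over a buffer of enqueued cards; the buffer representation and the loop are simplified assumptions, not the real DirtyCardQueue code:

    // Sketch only: drive a card closure over one dirty-card buffer.
    static bool apply_closure_to_buffer_sketch(CardTableEntryClosure* cl,
                                               jbyte** cards, size_t count,
                                               uint worker_i) {
      for (size_t i = 0; i < count; i++) {
        if (!cl->do_card_ptr(cards[i], worker_i)) {
          return false;  // the closure requested a yield (e.g. pending safepoint)
        }
      }
      return true;
    }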
--- a/hotspot/src/share/vm/gc/g1/g1OopClosures.hpp	Fri Jun 02 13:45:21 2017 +0200
+++ b/hotspot/src/share/vm/gc/g1/g1OopClosures.hpp	Fri Jun 02 13:47:54 2017 +0200
@@ -171,6 +171,24 @@
   virtual void do_oop(narrowOop* p) { do_oop_nv(p); }
 };
 
+class G1ConcurrentRefineOopClosure: public ExtendedOopClosure {
+  G1CollectedHeap* _g1;
+  uint _worker_i;
+
+public:
+  G1ConcurrentRefineOopClosure(G1CollectedHeap* g1h, uint worker_i) :
+    _g1(g1h),
+    _worker_i(worker_i) {
+  }
+
+  // This closure needs special handling for InstanceRefKlass.
+  virtual ReferenceIterationMode reference_iteration_mode() { return DO_DISCOVERED_AND_DISCOVERY; }
+
+  template <class T> void do_oop_nv(T* p);
+  virtual void do_oop(narrowOop* p) { do_oop_nv(p); }
+  virtual void do_oop(oop* p) { do_oop_nv(p); }
+};
+
 class G1UpdateRSOrPushRefOopClosure: public ExtendedOopClosure {
   G1CollectedHeap* _g1;
   HeapRegion* _from;
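The new closure follows HotSpot's usual "_nv" pattern: the virtual do_oop methods forward to the non-virtual template do_oop_nv, and the closure is registered in g1_specialized_oop_closures.hpp (last file in this changeset) so iteration code that knows the concrete closure type can bind do_oop_nv statically. A self-contained sketch of the pattern, using stand-in types rather than the real ExtendedOopClosure/oopDesc machinery:

    typedef void* oop_t;  // stand-in for HotSpot's oop

    class ClosureBaseSketch {
    public:
      virtual void do_oop(oop_t* p) = 0;  // virtual fallback path
    };

    class ExampleClosure : public ClosureBaseSketch {
    public:
      template <class T> void do_oop_nv(T* p) { /* the actual work */ }
      virtual void do_oop(oop_t* p) { do_oop_nv(p); }
    };

    // With the concrete closure type known at compile time, iteration can
    // call do_oop_nv directly and avoid one virtual call per reference:
    template <class ClosureType>
    void iterate_sketch(oop_t* begin, oop_t* end, ClosureType* cl) {
      for (oop_t* p = begin; p < end; p++) {
        cl->do_oop_nv(p);  // statically bound
      }
    }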
--- a/hotspot/src/share/vm/gc/g1/g1OopClosures.inline.hpp	Fri Jun 02 13:45:21 2017 +0200
+++ b/hotspot/src/share/vm/gc/g1/g1OopClosures.inline.hpp	Fri Jun 02 13:47:54 2017 +0200
@@ -111,6 +111,54 @@
 }
 
 template <class T>
+inline static void check_obj_during_refinement(T* p, oop const obj) {
+#ifdef ASSERT
+  G1CollectedHeap* g1 = G1CollectedHeap::heap();
+  // We can't check obj->is_oop() here because of races with concurrent mutators:
+  // assert(obj == NULL || obj->is_oop(), "expected an oop");
+  assert(check_obj_alignment(obj), "not oop aligned");
+  assert(g1->is_in_reserved(obj), "must be in heap");
+
+  HeapRegion* from = g1->heap_region_containing(p);
+
+  assert(from != NULL, "from region must be non-NULL");
+  assert(from->is_in_reserved(p) ||
+         (from->is_humongous() &&
+          g1->heap_region_containing(p)->is_humongous() &&
+          from->humongous_start_region() == g1->heap_region_containing(p)->humongous_start_region()),
+         "p " PTR_FORMAT " is not in the same region %u or part of the correct humongous object starting at region %u.",
+         p2i(p), from->hrm_index(), from->humongous_start_region()->hrm_index());
+#endif // ASSERT
+}
+
+template <class T>
+inline void G1ConcurrentRefineOopClosure::do_oop_nv(T* p) {
+  T o = oopDesc::load_heap_oop(p);
+  if (oopDesc::is_null(o)) {
+    return;
+  }
+  oop obj = oopDesc::decode_heap_oop_not_null(o);
+
+  check_obj_during_refinement(p, obj);
+
+  if (HeapRegion::is_in_same_region(p, obj)) {
+    // Normally this closure should only be called with cross-region references.
+    // But since Java threads manipulate these references concurrently and we
+    // reload the values, things may have changed since the card was enqueued.
+    // Note that this check lets references from a continues humongous region to
+    // its humongous start region slip through, because those are different
+    // regions; a remembered set entry is then added for them. This is benign
+    // (apart from memory usage), as this closure is never called during evacuation.
+    return;
+  }
+
+  HeapRegion* to = _g1->heap_region_containing(obj);
+
+  assert(to->rem_set() != NULL, "Need per-region 'into' remsets.");
+  to->rem_set()->add_reference(p, _worker_i);
+}
+
+template <class T>
 inline void G1UpdateRSOrPushRefOopClosure::do_oop_nv(T* p) {
   oop obj = oopDesc::load_decode_heap_oop(p);
   if (obj == NULL) {
@@ -161,9 +209,9 @@
     // we have already visited/tried to copy this object
     // there is no need to retry.
     if (!self_forwarded(obj)) {
-      assert(_push_ref_cl != NULL, "should not be null");
-      // Push the reference in the refs queue of the G1ParScanThreadState
-      // instance for this worker thread.
+      assert(_push_ref_cl != NULL, "should not be null");
+      // Push the reference in the refs queue of the G1ParScanThreadState
+      // instance for this worker thread.
       _push_ref_cl->do_oop(p);
     }
     _has_refs_into_cset = true;
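The same-region filter in G1ConcurrentRefineOopClosure::do_oop_nv above works because G1 regions are power-of-two sized and aligned: two addresses lie in the same region exactly when their high-order bits agree. A sketch of the check (the real code is HeapRegion::is_in_same_region and LogOfHRGrainBytes is its real constant; the value 20, i.e. 1 MB regions, is only an example):

    #include <stdint.h>

    static bool is_in_same_region_sketch(const void* p, const void* obj) {
      const int LogOfHRGrainBytes = 20;  // illustrative: log2 of the region size
      return (((uintptr_t)p ^ (uintptr_t)obj) >> LogOfHRGrainBytes) == 0;
    }

This is also why the comment above calls out humongous objects: a reference from a continues humongous region to its start region compares unequal here even though both regions back a single object, so it receives a remembered set entry.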
--- a/hotspot/src/share/vm/gc/g1/g1RemSet.cpp	Fri Jun 02 13:45:21 2017 +0200
+++ b/hotspot/src/share/vm/gc/g1/g1RemSet.cpp	Fri Jun 02 13:47:54 2017 +0200
@@ -413,7 +413,7 @@
     assert(SafepointSynchronize::is_at_safepoint(), "not during an evacuation pause");
     assert(worker_i < ParallelGCThreads, "should be a GC worker");
 
-    if (_g1rs->refine_card(card_ptr, worker_i, _cl)) {
+    if (_g1rs->refine_card_during_gc(card_ptr, worker_i, _cl)) {
       // 'card_ptr' contains references that point into the collection
       // set. We need to record the card in the DCQS
       // (_into_cset_dirty_card_queue_set)
@@ -523,6 +523,18 @@
   _g1->heap_region_par_iterate(&scrub_cl, worker_num, hrclaimer);
 }
 
+inline void check_card_ptr(jbyte* card_ptr, CardTableModRefBS* ct_bs) {
+#ifdef ASSERT
+  G1CollectedHeap* g1 = G1CollectedHeap::heap();
+  assert(g1->is_in_exact(ct_bs->addr_for(card_ptr)),
+         "Card at " PTR_FORMAT " index " SIZE_FORMAT " representing heap at " PTR_FORMAT " (%u) must be in committed heap",
+         p2i(card_ptr),
+         ct_bs->index_for(ct_bs->addr_for(card_ptr)),
+         p2i(ct_bs->addr_for(card_ptr)),
+         g1->addr_to_region(ct_bs->addr_for(card_ptr)));
+#endif
+}
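check_card_ptr above validates that a card maps back into the committed heap. The mapping behind addr_for()/index_for() is plain arithmetic over a byte map with one byte per card; a sketch assuming the usual 512-byte G1 card size (byte_map and heap_base are stand-ins for the real CardTableModRefBS members):

    static const int card_shift_sketch = 9;  // log2(512): one card byte covers 512 heap bytes

    static char* addr_for_sketch(jbyte* card_ptr, jbyte* byte_map, char* heap_base) {
      size_t index = (size_t)(card_ptr - byte_map);
      return heap_base + (index << card_shift_sketch);  // first heap byte the card covers
    }

    static jbyte* card_for_sketch(char* addr, jbyte* byte_map, char* heap_base) {
      return byte_map + ((size_t)(addr - heap_base) >> card_shift_sketch);
    }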
+
 G1UpdateRSOrPushRefOopClosure::G1UpdateRSOrPushRefOopClosure(G1CollectedHeap* g1h,
                                                              G1ParPushHeapRSClosure* push_ref_cl,
                                                              bool record_refs_into_cset,
@@ -534,24 +546,165 @@
   _push_ref_cl(push_ref_cl),
   _worker_i(worker_i) { }
 
-// Returns true if the given card contains references that point
-// into the collection set, if we're checking for such references;
-// false otherwise.
+void G1RemSet::refine_card_concurrently(jbyte* card_ptr,
+                                        uint worker_i) {
+  assert(!_g1->is_gc_active(), "Only call concurrently");
 
-bool G1RemSet::refine_card(jbyte* card_ptr,
-                           uint worker_i,
-                           G1ParPushHeapRSClosure*  oops_in_heap_closure) {
-  assert(_g1->is_in_exact(_ct_bs->addr_for(card_ptr)),
-         "Card at " PTR_FORMAT " index " SIZE_FORMAT " representing heap at " PTR_FORMAT " (%u) must be in committed heap",
-         p2i(card_ptr),
-         _ct_bs->index_for(_ct_bs->addr_for(card_ptr)),
-         p2i(_ct_bs->addr_for(card_ptr)),
-         _g1->addr_to_region(_ct_bs->addr_for(card_ptr)));
-
-  bool check_for_refs_into_cset = oops_in_heap_closure != NULL;
+  check_card_ptr(card_ptr, _ct_bs);
 
   // If the card is no longer dirty, nothing to do.
   if (*card_ptr != CardTableModRefBS::dirty_card_val()) {
+    return;
+  }
+
+  // Construct the region representing the card.
+  HeapWord* start = _ct_bs->addr_for(card_ptr);
+  // And find the region containing it.
+  HeapRegion* r = _g1->heap_region_containing(start);
+
+  // This check is needed for some uncommon cases where we should
+  // ignore the card.
+  //
+  // The region could be young.  Cards for young regions are
+  // distinctly marked (set to g1_young_gen), so the post-barrier will
+  // filter them out.  However, that marking is performed
+  // concurrently.  A write to a young object could occur before the
+  // card has been marked young, slipping past the filter.
+  //
+  // The card could be stale, because the region has been freed since
+  // the card was recorded. In this case the region type could be
+  // anything.  If (still) free or (reallocated) young, just ignore
+  // it.  If (reallocated) old or humongous, the later card trimming
+  // and additional checks in iteration may detect staleness.  At
+  // worst, we end up processing a stale card unnecessarily.
+  //
+  // In the normal (non-stale) case, the synchronization between the
+  // enqueueing of the card and processing it here will have ensured
+  // we see the up-to-date region type here.
+  if (!r->is_old_or_humongous()) {
+    return;
+  }
+
+  // While we are processing RSet buffers during the collection, we
+  // actually don't want to scan any cards on the collection set,
+  // since we don't want to update remembered sets with entries that
+  // point into the collection set, given that live objects from the
+  // collection set are about to move and such entries will be stale
+  // very soon. This change also deals with a reliability issue which
+  // involves scanning a card in the collection set and coming across
+  // an array that was being chunked and looking malformed. Note,
+  // however, that if evacuation fails, we have to scan any objects
+  // that were not moved and create any missing entries.
+  if (r->in_collection_set()) {
+    return;
+  }
+
+  // The result from the hot card cache insert call is either:
+  //   * pointer to the current card
+  //     (implying that the current card is not 'hot'),
+  //   * null
+  //     (meaning we had inserted the card ptr into the "hot" card cache,
+  //     which had some headroom),
+  //   * a pointer to a "hot" card that was evicted from the "hot" cache.
+  //
+
+  if (_hot_card_cache->use_cache()) {
+    assert(!SafepointSynchronize::is_at_safepoint(), "sanity");
+
+    const jbyte* orig_card_ptr = card_ptr;
+    card_ptr = _hot_card_cache->insert(card_ptr);
+    if (card_ptr == NULL) {
+      // There was no eviction. Nothing to do.
+      return;
+    } else if (card_ptr != orig_card_ptr) {
+      // Original card was inserted and an old card was evicted.
+      start = _ct_bs->addr_for(card_ptr);
+      r = _g1->heap_region_containing(start);
+
+      // Check whether the region formerly in the cache should be
+      // ignored, as discussed earlier for the original card.  The
+      // region could have been freed while in the cache.  The cset is
+      // not relevant here, since we're in concurrent phase.
+      if (!r->is_old_or_humongous()) {
+        return;
+      }
+    } // Else we still have the original card.
+  }
+
+  // Trim the region designated by the card to what's been allocated
+  // in the region.  The card could be stale, or the card could cover
+  // (part of) an object at the end of the allocated space and extend
+  // beyond the end of allocation.
+
+  // Non-humongous objects are only allocated in the old-gen during
+  // GC, so if region is old then top is stable.  Humongous object
+  // allocation sets top last; if top has not yet been set, this is
+  // a stale card and we'll end up with an empty intersection.  If
+  // this is not a stale card, the synchronization between the
+  // enqueuing of the card and processing it here will have ensured
+  // we see the up-to-date top here.
+  HeapWord* scan_limit = r->top();
+
+  if (scan_limit <= start) {
+    // If the trimmed region is empty, the card must be stale.
+    return;
+  }
+
+  // Okay to clean and process the card now.  There are still some
+  // stale card cases that may be detected by iteration and dealt with
+  // as iteration failure.
+  *const_cast<volatile jbyte*>(card_ptr) = CardTableModRefBS::clean_card_val();
+
+  // This fence serves two purposes.  First, the card must be cleaned
+  // before processing the contents.  Second, we can't proceed with
+  // processing until after the read of top, for synchronization with
+  // possibly concurrent humongous object allocation.  It's okay that
+  // reading top and reading type were racy with respect to each other.  We need
+  // both set, in any order, to proceed.
+  OrderAccess::fence();
+
+  // Don't use addr_for(card_ptr + 1) which can ask for
+  // a card beyond the heap.
+  HeapWord* end = start + CardTableModRefBS::card_size_in_words;
+  MemRegion dirty_region(start, MIN2(scan_limit, end));
+  assert(!dirty_region.is_empty(), "sanity");
+
+  G1ConcurrentRefineOopClosure conc_refine_cl(_g1, worker_i);
+
+  bool card_processed =
+    r->oops_on_card_seq_iterate_careful<false>(dirty_region, &conc_refine_cl);
+
+  // If unable to process the card then we encountered an unparsable
+  // part of the heap (e.g. a partially allocated object) while
+  // processing a stale card.  Despite the card being stale, redirty
+  // and re-enqueue, because we've already cleaned the card.  Without
+  // this we could incorrectly discard a non-stale card.
+  if (!card_processed) {
+    // The card might have gotten re-dirtied and re-enqueued while we
+    // worked.  (In fact, it's pretty likely.)
+    if (*card_ptr != CardTableModRefBS::dirty_card_val()) {
+      *card_ptr = CardTableModRefBS::dirty_card_val();
+      MutexLockerEx x(Shared_DirtyCardQ_lock,
+                      Mutex::_no_safepoint_check_flag);
+      DirtyCardQueue* sdcq =
+        JavaThread::dirty_card_queue_set().shared_dirty_card_queue();
+      sdcq->enqueue(card_ptr);
+    }
+  } else {
+    _conc_refine_cards++;
+  }
+}
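The clean-then-fence protocol in refine_card_concurrently only works in concert with G1's post-write barrier on the mutator side. A rough model of that interplay; this is a simplification of the real G1SATBCardTableLoggingModRefBS barrier, with the queue handling elided:

    // Mutator side, after storing a cross-region reference into a field:
    static void post_write_barrier_sketch(jbyte* card_for_field) {
      OrderAccess::storeload();  // order the reference store before the card check
      if (*card_for_field != CardTableModRefBS::dirty_card_val()) {
        *card_for_field = CardTableModRefBS::dirty_card_val();
        // ... enqueue card_for_field on the thread's dirty card queue ...
      }
    }

Refinement cleans the card, fences, then scans; any reference stored after the scan began re-dirties the cleaned card and re-enqueues it, so the late store is not lost. That ordering is the reason the card must be cleaned before its contents are processed.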
+
+bool G1RemSet::refine_card_during_gc(jbyte* card_ptr,
+                                     uint worker_i,
+                                     G1ParPushHeapRSClosure* oops_in_heap_closure) {
+  assert(_g1->is_gc_active(), "Only call during GC");
+
+  check_card_ptr(card_ptr, _ct_bs);
+
+  // If the card is no longer dirty, there is nothing to do. This covers cards that
+  // have already been scanned as part of the remembered sets.
+  if (*card_ptr != CardTableModRefBS::dirty_card_val()) {
     // No need to return that this card contains refs that point
     // into the collection set.
     return false;
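The hot card cache is now consulted only on the concurrent path; the next hunk removes it from the GC path. Its insert() contract, as relied upon above, can be modeled as a bounded buffer (a sketch only; the real class is G1HotCardCache, which additionally tracks per-card refinement counts and can also return the input card itself when it is not considered hot):

    // Model of the insert() contract, not the real implementation. Returns
    // NULL if the card was absorbed, else an evicted card that the caller
    // must refine instead.
    static jbyte* hot_cache_insert_sketch(jbyte** cache, size_t capacity,
                                          size_t* next, size_t* used,
                                          jbyte* card_ptr) {
      if (*used < capacity) {          // headroom: absorb the card
        cache[(*used)++] = card_ptr;
        return NULL;
      }
      jbyte* evicted = cache[*next];   // full: evict the oldest entry
      cache[*next] = card_ptr;
      *next = (*next + 1) % capacity;
      return evicted;
    }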
@@ -599,59 +752,16 @@
     return false;
   }
 
-  // The result from the hot card cache insert call is either:
-  //   * pointer to the current card
-  //     (implying that the current card is not 'hot'),
-  //   * null
-  //     (meaning we had inserted the card ptr into the "hot" card cache,
-  //     which had some headroom),
-  //   * a pointer to a "hot" card that was evicted from the "hot" cache.
-  //
-
-  if (_hot_card_cache->use_cache()) {
-    assert(!check_for_refs_into_cset, "sanity");
-    assert(!SafepointSynchronize::is_at_safepoint(), "sanity");
-
-    const jbyte* orig_card_ptr = card_ptr;
-    card_ptr = _hot_card_cache->insert(card_ptr);
-    if (card_ptr == NULL) {
-      // There was no eviction. Nothing to do.
-      return false;
-    } else if (card_ptr != orig_card_ptr) {
-      // Original card was inserted and an old card was evicted.
-      start = _ct_bs->addr_for(card_ptr);
-      r = _g1->heap_region_containing(start);
-
-      // Check whether the region formerly in the cache should be
-      // ignored, as discussed earlier for the original card.  The
-      // region could have been freed while in the cache.  The cset is
-      // not relevant here, since we're in concurrent phase.
-      if (!r->is_old_or_humongous()) {
-        return false;
-      }
-    } // Else we still have the original card.
-  }
-
   // Trim the region designated by the card to what's been allocated
   // in the region.  The card could be stale, or the card could cover
   // (part of) an object at the end of the allocated space and extend
   // beyond the end of allocation.
-  HeapWord* scan_limit;
-  if (_g1->is_gc_active()) {
-    // If we're in a STW GC, then a card might be in a GC alloc region
-    // and extend onto a GC LAB, which may not be parsable.  Stop such
-    // at the "scan_top" of the region.
-    scan_limit = r->scan_top();
-  } else {
-    // Non-humongous objects are only allocated in the old-gen during
-    // GC, so if region is old then top is stable.  Humongous object
-    // allocation sets top last; if top has not yet been set, this is
-    // a stale card and we'll end up with an empty intersection.  If
-    // this is not a stale card, the synchronization between the
-    // enqueuing of the card and processing it here will have ensured
-    // we see the up-to-date top here.
-    scan_limit = r->top();
-  }
+
+  // If we're in a STW GC, then a card might be in a GC alloc region
+  // and extend onto a GC LAB, which may not be parsable.  Stop such
+  // at the "scan_top" of the region.
+  HeapWord* scan_limit = r->scan_top();
+
   if (scan_limit <= start) {
     // If the trimmed region is empty, the card must be stale.
     return false;
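With the split, the GC-time path can use scan_top() unconditionally. scan_top is captured before GC allocation into old regions begins, so refinement during the pause never walks into a not-yet-parsable GC LAB. A sketch of the idea, with simplified stand-in fields and method names:

    class HeapRegionSketch {
      HeapWord* _top;       // advances as objects are allocated
      HeapWord* _scan_top;  // snapshot taken when the pause starts
    public:
      void record_scan_top_at_pause_start() { _scan_top = _top; }
      HeapWord* scan_top() const { return _scan_top; }  // stable limit during GC
      HeapWord* top() const      { return _top; }       // safe limit when concurrent
    };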
@@ -662,14 +772,6 @@
   // as iteration failure.
   *const_cast<volatile jbyte*>(card_ptr) = CardTableModRefBS::clean_card_val();
 
-  // This fence serves two purposes.  First, the card must be cleaned
-  // before processing the contents.  Second, we can't proceed with
-  // processing until after the read of top, for synchronization with
-  // possibly concurrent humongous object allocation.  It's okay that
-  // reading top and reading type were racy wrto each other.  We need
-  // both set, in any order, to proceed.
-  OrderAccess::fence();
-
   // Don't use addr_for(card_ptr + 1) which can ask for
   // a card beyond the heap.
   HeapWord* end = start + CardTableModRefBS::card_size_in_words;
@@ -678,49 +780,17 @@
 
   G1UpdateRSOrPushRefOopClosure update_rs_oop_cl(_g1,
                                                  oops_in_heap_closure,
-                                                 check_for_refs_into_cset,
+                                                 true,
                                                  worker_i);
   update_rs_oop_cl.set_from(r);
 
-  bool card_processed;
-  if (_g1->is_gc_active()) {
-    card_processed = r->oops_on_card_seq_iterate_careful<true>(dirty_region, &update_rs_oop_cl);
-  } else {
-    card_processed = r->oops_on_card_seq_iterate_careful<false>(dirty_region, &update_rs_oop_cl);
-  }
+  bool card_processed =
+    r->oops_on_card_seq_iterate_careful<true>(dirty_region,
+                                              &update_rs_oop_cl);
+  assert(card_processed, "must be");
+  _conc_refine_cards++;
 
-  // If unable to process the card then we encountered an unparsable
-  // part of the heap (e.g. a partially allocated object) while
-  // processing a stale card.  Despite the card being stale, redirty
-  // and re-enqueue, because we've already cleaned the card.  Without
-  // this we could incorrectly discard a non-stale card.
-  if (!card_processed) {
-    assert(!_g1->is_gc_active(), "Unparsable heap during GC");
-    // The card might have gotten re-dirtied and re-enqueued while we
-    // worked.  (In fact, it's pretty likely.)
-    if (*card_ptr != CardTableModRefBS::dirty_card_val()) {
-      *card_ptr = CardTableModRefBS::dirty_card_val();
-      MutexLockerEx x(Shared_DirtyCardQ_lock,
-                      Mutex::_no_safepoint_check_flag);
-      DirtyCardQueue* sdcq =
-        JavaThread::dirty_card_queue_set().shared_dirty_card_queue();
-      sdcq->enqueue(card_ptr);
-    }
-  } else {
-    _conc_refine_cards++;
-  }
-
-  // This gets set to true if the card being refined has references that point
-  // into the collection set.
-  bool has_refs_into_cset = update_rs_oop_cl.has_refs_into_cset();
-
-  // We should only be detecting that the card contains references
-  // that point into the collection set if the current thread is
-  // a GC worker thread.
-  assert(!has_refs_into_cset || SafepointSynchronize::is_at_safepoint(),
-         "invalid result at non safepoint");
-
-  return has_refs_into_cset;
+  return update_rs_oop_cl.has_refs_into_cset();
 }
 
 void G1RemSet::print_periodic_summary_info(const char* header, uint period_count) {
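Tying the GC-time path together: the RefineCardClosure hunk at the top of this file enqueues cards whose references point into the collection set so that, on evacuation failure, the remembered sets can be rebuilt from them. A condensed sketch of that driver (the helper name is hypothetical; the queue wiring is simplified):

    static bool refine_one_card_during_gc_sketch(G1RemSet* rs,
                                                 DirtyCardQueue* into_cset_dcq,
                                                 jbyte* card_ptr, uint worker_i,
                                                 G1ParPushHeapRSClosure* cl) {
      if (rs->refine_card_during_gc(card_ptr, worker_i, cl)) {
        // Card has references into the collection set: record it for possible
        // reprocessing after an evacuation failure.
        into_cset_dcq->enqueue(card_ptr);
      }
      return true;  // GC workers never yield here
    }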
--- a/hotspot/src/share/vm/gc/g1/g1RemSet.hpp	Fri Jun 02 13:45:21 2017 +0200
+++ b/hotspot/src/share/vm/gc/g1/g1RemSet.hpp	Fri Jun 02 13:47:54 2017 +0200
@@ -146,12 +146,14 @@
   void scrub(uint worker_num, HeapRegionClaimer* hrclaimer);
 
   // Refine the card corresponding to "card_ptr".
-  // If oops_in_heap_closure is not NULL, a true result is returned
-  // if the given card contains oops that have references into the
-  // current collection set.
-  bool refine_card(jbyte* card_ptr,
-                   uint worker_i,
-                   G1ParPushHeapRSClosure* oops_in_heap_closure);
+  void refine_card_concurrently(jbyte* card_ptr,
+                                uint worker_i);
+
+  // Refine the card corresponding to "card_ptr". Returns "true" if the given card contains
+  // oops that have references into the current collection set.
+  bool refine_card_during_gc(jbyte* card_ptr,
+                             uint worker_i,
+                             G1ParPushHeapRSClosure* oops_in_heap_closure);
 
   // Print accumulated summary info from the start of the VM.
   void print_summary_info();
--- a/hotspot/src/share/vm/gc/g1/g1_specialized_oop_closures.hpp	Fri Jun 02 13:45:21 2017 +0200
+++ b/hotspot/src/share/vm/gc/g1/g1_specialized_oop_closures.hpp	Fri Jun 02 13:47:54 2017 +0200
@@ -36,6 +36,8 @@
 class G1ParPushHeapRSClosure;
 
 class G1UpdateRSOrPushRefOopClosure;
+class G1ConcurrentRefineOopClosure;
+
 class G1CMOopClosure;
 class G1RootRegionScanClosure;
 
@@ -43,6 +45,7 @@
       f(G1ParScanClosure,_nv)                      \
       f(G1ParPushHeapRSClosure,_nv)                \
       f(G1UpdateRSOrPushRefOopClosure,_nv)         \
+      f(G1ConcurrentRefineOopClosure,_nv)          \
       f(G1CMOopClosure,_nv)                        \
       f(G1RootRegionScanClosure,_nv)
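Registering G1ConcurrentRefineOopClosure in the f-macro list above is what makes the do_oop_nv specialization effective: consumer macros expand the list to stamp out one statically bound scanning routine per closure type. A sketch of such a consumer (illustrative only, not the exact HotSpot macro):

    // Hypothetical consumer of the f(ClosureType, nv_suffix) list above:
    #define DECLARE_SPECIALIZED_SCAN(ClosureType, nv_suffix)           \
      void scan_card##nv_suffix(oop obj, ClosureType* cl);

    // The real consumers use lists like this to declare and define
    // non-virtual oop_oop_iterate bodies per closure, so scanning a card
    // with G1ConcurrentRefineOopClosure pays no virtual call per reference.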