8213108: Improve work distribution during remembered set scan
author tschatzl
Thu, 27 Jun 2019 11:48:32 +0200
changeset 55510:3e31a8beaae4
parent 55509:d58442b8abc1
child 55511:91b38bfb9079
8213108: Improve work distribution during remembered set scan
Summary: Before scanning the heap for roots into the collection set, merge them into a single remembered set (card table) and do work distribution based on location like other collectors do.
Reviewed-by: kbarrett, lkorinth
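
The summary above describes the core idea: instead of having each GC worker walk per-region remembered set data structures directly, all root sources are first merged onto the card table, and workers then divide the heap by location. The following is a minimal, hypothetical sketch of that two-phase shape; none of these names (merge_sources, scan_worker, kClean, kDirty) appear in the patch:

// Illustrative sketch only: phase 1 merges card indices from all sources
// onto one byte-per-card table; phase 2 has workers claim chunks of that
// table, so work is distributed by heap location, not by remembered set.
#include <algorithm>
#include <atomic>
#include <cstddef>
#include <cstdint>
#include <vector>

static const uint8_t kClean = 0xff;
static const uint8_t kDirty = 0x00;

void merge_sources(std::vector<uint8_t>& card_table,
                   const std::vector<size_t>& source_card_indices) {
  for (size_t idx : source_card_indices) {
    if (card_table[idx] == kClean) {
      card_table[idx] = kDirty;  // analogous to mark_clean_as_dirty()
    }
  }
}

void scan_worker(std::vector<uint8_t>& card_table,
                 std::atomic<size_t>& next_chunk, size_t cards_per_chunk) {
  for (;;) {
    size_t start = next_chunk.fetch_add(1) * cards_per_chunk;
    if (start >= card_table.size()) return;
    size_t end = std::min(start + cards_per_chunk, card_table.size());
    for (size_t i = start; i < end; i++) {
      if (card_table[i] == kDirty) {
        // scan the objects covered by card i for collection set references
      }
    }
  }
}
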
src/hotspot/share/gc/g1/g1Analytics.cpp
src/hotspot/share/gc/g1/g1Analytics.hpp
src/hotspot/share/gc/g1/g1CardTable.cpp
src/hotspot/share/gc/g1/g1CardTable.hpp
src/hotspot/share/gc/g1/g1CardTable.inline.hpp
src/hotspot/share/gc/g1/g1CollectedHeap.cpp
src/hotspot/share/gc/g1/g1CollectedHeap.hpp
src/hotspot/share/gc/g1/g1CollectedHeap.inline.hpp
src/hotspot/share/gc/g1/g1CollectionSet.cpp
src/hotspot/share/gc/g1/g1CollectionSet.hpp
src/hotspot/share/gc/g1/g1ConcurrentRefine.cpp
src/hotspot/share/gc/g1/g1ConcurrentRefine.hpp
src/hotspot/share/gc/g1/g1EvacFailure.cpp
src/hotspot/share/gc/g1/g1GCPhaseTimes.cpp
src/hotspot/share/gc/g1/g1GCPhaseTimes.hpp
src/hotspot/share/gc/g1/g1HeterogeneousHeapPolicy.cpp
src/hotspot/share/gc/g1/g1HeterogeneousHeapPolicy.hpp
src/hotspot/share/gc/g1/g1ParScanThreadState.cpp
src/hotspot/share/gc/g1/g1ParScanThreadState.hpp
src/hotspot/share/gc/g1/g1Policy.cpp
src/hotspot/share/gc/g1/g1Policy.hpp
src/hotspot/share/gc/g1/g1RemSet.cpp
src/hotspot/share/gc/g1/g1RemSet.hpp
src/hotspot/share/gc/g1/heapRegion.cpp
src/hotspot/share/gc/g1/heapRegion.hpp
src/hotspot/share/gc/g1/heapRegionRemSet.cpp
src/hotspot/share/gc/g1/heapRegionRemSet.hpp
src/hotspot/share/gc/g1/heapRegionRemSet.inline.hpp
src/hotspot/share/gc/g1/sparsePRT.cpp
src/hotspot/share/gc/g1/sparsePRT.hpp
src/hotspot/share/gc/shared/cardTable.hpp
src/hotspot/share/gc/shared/workerDataArray.hpp
src/hotspot/share/gc/shared/workerDataArray.inline.hpp
test/hotspot/jtreg/gc/g1/TestGCLogMessages.java
test/jdk/jdk/jfr/event/gc/collection/TestG1ParallelPhases.java
--- a/src/hotspot/share/gc/g1/g1Analytics.cpp	Thu Jun 27 03:33:44 2019 +0200
+++ b/src/hotspot/share/gc/g1/g1Analytics.cpp	Thu Jun 27 11:48:32 2019 +0200
@@ -38,7 +38,7 @@
   0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
 };
 
-static double cost_per_card_ms_defaults[] = {
+static double cost_per_log_buffer_entry_ms_defaults[] = {
   0.01, 0.005, 0.005, 0.003, 0.003, 0.002, 0.002, 0.0015
 };
 
@@ -47,7 +47,7 @@
   1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
 };
 
-static double cost_per_entry_ms_defaults[] = {
+static double young_only_cost_per_remset_card_ms_defaults[] = {
   0.015, 0.01, 0.01, 0.008, 0.008, 0.0055, 0.0055, 0.005
 };
 
@@ -77,12 +77,12 @@
     _alloc_rate_ms_seq(new TruncatedSeq(TruncatedSeqLength)),
     _prev_collection_pause_end_ms(0.0),
     _rs_length_diff_seq(new TruncatedSeq(TruncatedSeqLength)),
-    _cost_per_card_ms_seq(new TruncatedSeq(TruncatedSeqLength)),
+    _cost_per_log_buffer_entry_ms_seq(new TruncatedSeq(TruncatedSeqLength)),
     _cost_scan_hcc_seq(new TruncatedSeq(TruncatedSeqLength)),
     _young_cards_per_entry_ratio_seq(new TruncatedSeq(TruncatedSeqLength)),
     _mixed_cards_per_entry_ratio_seq(new TruncatedSeq(TruncatedSeqLength)),
-    _cost_per_entry_ms_seq(new TruncatedSeq(TruncatedSeqLength)),
-    _mixed_cost_per_entry_ms_seq(new TruncatedSeq(TruncatedSeqLength)),
+    _young_only_cost_per_remset_card_ms_seq(new TruncatedSeq(TruncatedSeqLength)),
+    _mixed_cost_per_remset_card_ms_seq(new TruncatedSeq(TruncatedSeqLength)),
     _cost_per_byte_ms_seq(new TruncatedSeq(TruncatedSeqLength)),
     _constant_other_time_ms_seq(new TruncatedSeq(TruncatedSeqLength)),
     _young_other_cost_per_region_ms_seq(new TruncatedSeq(TruncatedSeqLength)),
@@ -101,10 +101,10 @@
   int index = MIN2(ParallelGCThreads - 1, 7u);
 
   _rs_length_diff_seq->add(rs_length_diff_defaults[index]);
-  _cost_per_card_ms_seq->add(cost_per_card_ms_defaults[index]);
+  _cost_per_log_buffer_entry_ms_seq->add(cost_per_log_buffer_entry_ms_defaults[index]);
   _cost_scan_hcc_seq->add(0.0);
   _young_cards_per_entry_ratio_seq->add(young_cards_per_entry_ratio_defaults[index]);
-  _cost_per_entry_ms_seq->add(cost_per_entry_ms_defaults[index]);
+  _young_only_cost_per_remset_card_ms_seq->add(young_only_cost_per_remset_card_ms_defaults[index]);
   _cost_per_byte_ms_seq->add(cost_per_byte_ms_defaults[index]);
   _constant_other_time_ms_seq->add(constant_other_time_ms_defaults[index]);
   _young_other_cost_per_region_ms_seq->add(young_other_cost_per_region_ms_defaults[index]);
@@ -158,19 +158,19 @@
     (pause_time_ms * _recent_prev_end_times_for_all_gcs_sec->num()) / interval_ms;
 }
 
-void G1Analytics::report_cost_per_card_ms(double cost_per_card_ms) {
-  _cost_per_card_ms_seq->add(cost_per_card_ms);
+void G1Analytics::report_cost_per_log_buffer_entry_ms(double cost_per_log_buffer_entry_ms) {
+  _cost_per_log_buffer_entry_ms_seq->add(cost_per_log_buffer_entry_ms);
 }
 
 void G1Analytics::report_cost_scan_hcc(double cost_scan_hcc) {
   _cost_scan_hcc_seq->add(cost_scan_hcc);
 }
 
-void G1Analytics::report_cost_per_entry_ms(double cost_per_entry_ms, bool for_young_gc) {
+void G1Analytics::report_cost_per_remset_card_ms(double cost_per_remset_card_ms, bool for_young_gc) {
   if (for_young_gc) {
-    _cost_per_entry_ms_seq->add(cost_per_entry_ms);
+    _young_only_cost_per_remset_card_ms_seq->add(cost_per_remset_card_ms);
   } else {
-    _mixed_cost_per_entry_ms_seq->add(cost_per_entry_ms);
+    _mixed_cost_per_remset_card_ms_seq->add(cost_per_remset_card_ms);
   }
 }
 
@@ -222,8 +222,8 @@
   return get_new_prediction(_alloc_rate_ms_seq);
 }
 
-double G1Analytics::predict_cost_per_card_ms() const {
-  return get_new_prediction(_cost_per_card_ms_seq);
+double G1Analytics::predict_cost_per_log_buffer_entry_ms() const {
+  return get_new_prediction(_cost_per_log_buffer_entry_ms_seq);
 }
 
 double G1Analytics::predict_scan_hcc_ms() const {
@@ -231,7 +231,7 @@
 }
 
 double G1Analytics::predict_rs_update_time_ms(size_t pending_cards) const {
-  return pending_cards * predict_cost_per_card_ms() + predict_scan_hcc_ms();
+  return pending_cards * predict_cost_per_log_buffer_entry_ms() + predict_scan_hcc_ms();
 }
 
 double G1Analytics::predict_young_cards_per_entry_ratio() const {
@@ -256,17 +256,17 @@
 
 double G1Analytics::predict_rs_scan_time_ms(size_t card_num, bool for_young_gc) const {
   if (for_young_gc) {
-    return card_num * get_new_prediction(_cost_per_entry_ms_seq);
+    return card_num * get_new_prediction(_young_only_cost_per_remset_card_ms_seq);
   } else {
     return predict_mixed_rs_scan_time_ms(card_num);
   }
 }
 
 double G1Analytics::predict_mixed_rs_scan_time_ms(size_t card_num) const {
-  if (_mixed_cost_per_entry_ms_seq->num() < 3) {
-    return card_num * get_new_prediction(_cost_per_entry_ms_seq);
+  if (_mixed_cost_per_remset_card_ms_seq->num() < 3) {
+    return card_num * get_new_prediction(_young_only_cost_per_remset_card_ms_seq);
   } else {
-    return card_num * get_new_prediction(_mixed_cost_per_entry_ms_seq);
+    return card_num * get_new_prediction(_mixed_cost_per_remset_card_ms_seq);
   }
 }
 
--- a/src/hotspot/share/gc/g1/g1Analytics.hpp	Thu Jun 27 03:33:44 2019 +0200
+++ b/src/hotspot/share/gc/g1/g1Analytics.hpp	Thu Jun 27 11:48:32 2019 +0200
@@ -46,12 +46,12 @@
   double        _prev_collection_pause_end_ms;
 
   TruncatedSeq* _rs_length_diff_seq;
-  TruncatedSeq* _cost_per_card_ms_seq;
+  TruncatedSeq* _cost_per_log_buffer_entry_ms_seq;
   TruncatedSeq* _cost_scan_hcc_seq;
   TruncatedSeq* _young_cards_per_entry_ratio_seq;
   TruncatedSeq* _mixed_cards_per_entry_ratio_seq;
-  TruncatedSeq* _cost_per_entry_ms_seq;
-  TruncatedSeq* _mixed_cost_per_entry_ms_seq;
+  TruncatedSeq* _young_only_cost_per_remset_card_ms_seq;
+  TruncatedSeq* _mixed_cost_per_remset_card_ms_seq;
   TruncatedSeq* _cost_per_byte_ms_seq;
   TruncatedSeq* _constant_other_time_ms_seq;
   TruncatedSeq* _young_other_cost_per_region_ms_seq;
@@ -99,9 +99,9 @@
   void report_concurrent_mark_remark_times_ms(double ms);
   void report_concurrent_mark_cleanup_times_ms(double ms);
   void report_alloc_rate_ms(double alloc_rate);
-  void report_cost_per_card_ms(double cost_per_card_ms);
+  void report_cost_per_log_buffer_entry_ms(double cost_per_log_buffer_entry_ms);
   void report_cost_scan_hcc(double cost_scan_hcc);
-  void report_cost_per_entry_ms(double cost_per_entry_ms, bool for_young_gc);
+  void report_cost_per_remset_card_ms(double cost_per_remset_card_ms, bool for_young_gc);
   void report_cards_per_entry_ratio(double cards_per_entry_ratio, bool for_young_gc);
   void report_rs_length_diff(double rs_length_diff);
   void report_cost_per_byte_ms(double cost_per_byte_ms, bool mark_or_rebuild_in_progress);
@@ -116,7 +116,7 @@
   double predict_alloc_rate_ms() const;
   int num_alloc_rate_ms() const;
 
-  double predict_cost_per_card_ms() const;
+  double predict_cost_per_log_buffer_entry_ms() const;
 
   double predict_scan_hcc_ms() const;
 
--- a/src/hotspot/share/gc/g1/g1CardTable.cpp	Thu Jun 27 03:33:44 2019 +0200
+++ b/src/hotspot/share/gc/g1/g1CardTable.cpp	Thu Jun 27 11:48:32 2019 +0200
@@ -30,28 +30,6 @@
 #include "runtime/atomic.hpp"
 #include "runtime/orderAccess.hpp"
 
-bool G1CardTable::mark_card_deferred(size_t card_index) {
-  CardValue val = _byte_map[card_index];
-  // It's already processed
-  if ((val & (clean_card_mask_val() | deferred_card_val())) == deferred_card_val()) {
-    return false;
-  }
-
-  // Cached bit can be installed either on a clean card or on a claimed card.
-  CardValue new_val = val;
-  if (val == clean_card_val()) {
-    new_val = deferred_card_val();
-  } else {
-    if (val & claimed_card_val()) {
-      new_val = val | deferred_card_val();
-    }
-  }
-  if (new_val != val) {
-    Atomic::cmpxchg(new_val, &_byte_map[card_index], val);
-  }
-  return true;
-}
-
 void G1CardTable::g1_mark_as_young(const MemRegion& mr) {
   CardValue *const first = byte_for(mr.start());
   CardValue *const last = byte_after(mr.last());
--- a/src/hotspot/share/gc/g1/g1CardTable.hpp	Thu Jun 27 03:33:44 2019 +0200
+++ b/src/hotspot/share/gc/g1/g1CardTable.hpp	Thu Jun 27 11:48:32 2019 +0200
@@ -44,55 +44,65 @@
   virtual void on_commit(uint start_idx, size_t num_regions, bool zero_filled);
 };
 
-class G1CardTable: public CardTable {
+class G1CardTable : public CardTable {
   friend class VMStructs;
   friend class G1CardTableChangedListener;
 
   G1CardTableChangedListener _listener;
 
+public:
   enum G1CardValues {
-    g1_young_gen = CT_MR_BS_last_reserved << 1
+    g1_young_gen = CT_MR_BS_last_reserved << 1,
+
+    // During evacuation we use the card table to consolidate the cards we need
+    // to scan for roots from the various sources onto the card table. It is
+    // further used to record cards that have already been completely scanned,
+    // to avoid re-scanning them when incrementally evacuating the old gen
+    // regions of a collection set. This means that already scanned cards must
+    // be preserved.
+    //
+    // The merge at the start of each evacuation round simply sets clean cards
+    // to dirty; cards are set to 0x1 as they are scanned.
+    //
+    // This means that the LSB determines what to do with the card during evacuation
+    // given the following possible values:
+    //
+    // 11111111 - clean, do not scan
+    // 00000001 - already scanned, do not scan
+    // 00000000 - dirty, needs to be scanned.
+    //
+    g1_card_already_scanned = 0x1
   };
 
-public:
+  static const size_t WordAllClean = SIZE_MAX;
+  static const size_t WordAllDirty = 0;
+
+  STATIC_ASSERT(BitsPerByte == 8);
+  static const size_t WordAlreadyScanned = (SIZE_MAX / 255) * g1_card_already_scanned;
+
   G1CardTable(MemRegion whole_heap): CardTable(whole_heap, /* scanned concurrently */ true), _listener() {
     _listener.set_card_table(this);
   }
-  bool is_card_dirty(size_t card_index) {
-    return _byte_map[card_index] == dirty_card_val();
-  }
 
   static CardValue g1_young_card_val() { return g1_young_gen; }
 
-/*
-   Claimed and deferred bits are used together in G1 during the evacuation
-   pause. These bits can have the following state transitions:
-   1. The claimed bit can be put over any other card state. Except that
-      the "dirty -> dirty and claimed" transition is checked for in
-      G1 code and is not used.
-   2. Deferred bit can be set only if the previous state of the card
-      was either clean or claimed. mark_card_deferred() is wait-free.
-      We do not care if the operation is be successful because if
-      it does not it will only result in duplicate entry in the update
-      buffer because of the "cache-miss". So it's not worth spinning.
- */
-
-  bool is_card_claimed(size_t card_index) {
-    CardValue val = _byte_map[card_index];
-    return (val & (clean_card_mask_val() | claimed_card_val())) == claimed_card_val();
-  }
-
-  inline void set_card_claimed(size_t card_index);
-
   void verify_g1_young_region(MemRegion mr) PRODUCT_RETURN;
   void g1_mark_as_young(const MemRegion& mr);
 
-  bool mark_card_deferred(size_t card_index);
+  size_t index_for_cardvalue(CardValue const* p) const {
+    return pointer_delta(p, _byte_map, sizeof(CardValue));
+  }
+
+  // Mark the given card as Dirty if it is Clean.
+  inline void mark_clean_as_dirty(size_t card_index);
 
-  bool is_card_deferred(size_t card_index) {
-    CardValue val = _byte_map[card_index];
-    return (val & (clean_card_mask_val() | deferred_card_val())) == deferred_card_val();
-  }
+  // Change Clean cards in a (large) area of the card table to Dirty, preserving
+  // already scanned cards. Assumes that most cards in that area are Clean.
+  inline void mark_region_dirty(size_t start_card_index, size_t num_cards);
+
+  // Mark the given range of cards as Scanned. All of these cards must be Dirty.
+  inline void mark_as_scanned(size_t start_card_index, size_t num_cards);
+
+  inline uint region_idx_for(CardValue* p);
 
   static size_t compute_size(size_t mem_region_size_in_words) {
     size_t number_of_slots = (mem_region_size_in_words / card_size_in_words);
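
The card value comment in the hunk above encodes three states so that the low bit alone decides whether a card still needs scanning, and WordAlreadyScanned broadcasts the scanned value into every byte of a word. A small standalone check of both properties (the constants are copied from the hunk; everything else is local to this example):

#include <cassert>
#include <cstddef>
#include <cstdint>

int main() {
  const uint8_t clean = 0xff, scanned = 0x01, dirty = 0x00;
  // LSB set: do not scan (clean or already scanned); LSB clear: scan.
  assert((clean & 1) == 1 && (scanned & 1) == 1 && (dirty & 1) == 0);
  // SIZE_MAX / 255 is 0x0101...01, so multiplying by a byte value
  // replicates it into every byte of a size_t, as WordAlreadyScanned does.
  const size_t word_already_scanned = (SIZE_MAX / 255) * scanned;
  assert((word_already_scanned & 0xff) == scanned);
  assert(((word_already_scanned >> 8) & 0xff) == scanned);
  return 0;
}
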
--- a/src/hotspot/share/gc/g1/g1CardTable.inline.hpp	Thu Jun 27 03:33:44 2019 +0200
+++ b/src/hotspot/share/gc/g1/g1CardTable.inline.hpp	Thu Jun 27 11:48:32 2019 +0200
@@ -26,15 +26,58 @@
 #define SHARE_GC_G1_G1CARDTABLE_INLINE_HPP
 
 #include "gc/g1/g1CardTable.hpp"
+#include "gc/g1/heapRegion.hpp"
 
-void G1CardTable::set_card_claimed(size_t card_index) {
-  jbyte val = _byte_map[card_index];
-  if (val == clean_card_val()) {
-    val = (jbyte)claimed_card_val();
-  } else {
-    val |= (jbyte)claimed_card_val();
+inline uint G1CardTable::region_idx_for(CardValue* p) {
+  size_t const card_idx = pointer_delta(p, _byte_map, sizeof(CardValue));
+  return (uint)(card_idx >> (HeapRegion::LogOfHRGrainBytes - card_shift));
+}
+
+inline void G1CardTable::mark_clean_as_dirty(size_t card_index) {
+  CardValue value = _byte_map[card_index];
+  if (value == clean_card_val()) {
+    _byte_map[card_index] = dirty_card_val();
   }
-  _byte_map[card_index] = val;
 }
 
-#endif // SHARE_GC_G1_G1CARDTABLE_INLINE_HPP
+inline void G1CardTable::mark_region_dirty(size_t start_card_index, size_t num_cards) {
+  assert(is_aligned(start_card_index, sizeof(size_t)), "Start card index must be aligned.");
+  assert(is_aligned(num_cards, sizeof(size_t)), "Number of cards to change must be evenly divisible.");
+
+  size_t const num_chunks = num_cards / sizeof(size_t);
+
+  size_t* cur_word = (size_t*)&_byte_map[start_card_index];
+  size_t* const end_word_map = cur_word + num_chunks;
+  while (cur_word < end_word_map) {
+    size_t value = *cur_word;
+    if (value == WordAllClean) {
+      *cur_word = WordAllDirty;
+    } else if (value == WordAllDirty) {
+      // do nothing.
+    } else {
+      // There is a mix of cards in there. Tread slowly.
+      CardValue* cur = (CardValue*)cur_word;
+      for (size_t i = 0; i < sizeof(size_t); i++) {
+        CardValue value = *cur;
+        if (value == clean_card_val()) {
+          *cur = dirty_card_val();
+        }
+        cur++;
+      }
+    }
+    cur_word++;
+  }
+}
+
+inline void G1CardTable::mark_as_scanned(size_t start_card_index, size_t num_cards) {
+  CardValue* start = &_byte_map[start_card_index];
+  CardValue* const end = start + num_cards;
+  while (start < end) {
+    CardValue value = *start;
+    assert(value == dirty_card_val(),
+           "Must have been dirty %d start " PTR_FORMAT " " PTR_FORMAT, value, p2i(start), p2i(end));
+    *start++ = g1_card_already_scanned;
+  }
+}
+
+#endif /* SHARE_GC_G1_G1CARDTABLE_INLINE_HPP */
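
mark_region_dirty above works a word (sizeof(size_t) cards) at a time and only drops to byte granularity for mixed words, while region_idx_for maps a card index to a region index with a single shift. A worked example of that shift under assumed defaults of 1 MB regions (LogOfHRGrainBytes = 20) and 512-byte cards (card_shift = 9); the actual values depend on heap configuration:

#include <cassert>
#include <cstddef>

int main() {
  const unsigned log_region_bytes = 20; // assumed 1 MB regions
  const unsigned card_shift = 9;        // assumed 512-byte cards
  // 2^(20 - 9) = 2048 cards per region, so card_idx >> 11 is the region.
  size_t card_idx = 5000;
  unsigned region = (unsigned)(card_idx >> (log_region_bytes - card_shift));
  assert(region == 2); // cards 4096..6143 belong to region 2
  return 0;
}
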
--- a/src/hotspot/share/gc/g1/g1CollectedHeap.cpp	Thu Jun 27 03:33:44 2019 +0200
+++ b/src/hotspot/share/gc/g1/g1CollectedHeap.cpp	Thu Jun 27 11:48:32 2019 +0200
@@ -1954,7 +1954,7 @@
     n_completed_buffers++;
   }
   assert(dcqs.completed_buffers_num() == 0, "Completed buffers exist!");
-  phase_times()->record_thread_work_item(G1GCPhaseTimes::UpdateRS, worker_i, n_completed_buffers, G1GCPhaseTimes::UpdateRSProcessedBuffers);
+  phase_times()->record_thread_work_item(G1GCPhaseTimes::MergeLB, worker_i, n_completed_buffers, G1GCPhaseTimes::MergeLBProcessedBuffers);
 }
 
 // Computes the sum of the storage used by the various regions.
@@ -2238,8 +2238,8 @@
   _collection_set.iterate(cl);
 }
 
-void G1CollectedHeap::collection_set_iterate_increment_from(HeapRegionClosure *cl, uint worker_id) {
-  _collection_set.iterate_incremental_part_from(cl, worker_id, workers()->active_workers());
+void G1CollectedHeap::collection_set_iterate_increment_from(HeapRegionClosure *cl, HeapRegionClaimer* hr_claimer, uint worker_id) {
+  _collection_set.iterate_incremental_part_from(cl, hr_claimer, worker_id, workers()->active_workers());
 }
 
 HeapWord* G1CollectedHeap::block_start(const void* addr) const {
@@ -2630,8 +2630,6 @@
   size_t _total_humongous;
   size_t _candidate_humongous;
 
-  G1DirtyCardQueue _dcq;
-
   bool humongous_region_is_candidate(G1CollectedHeap* g1h, HeapRegion* region) const {
     assert(region->is_starts_humongous(), "Must start a humongous object");
 
@@ -2691,8 +2689,7 @@
  public:
   RegisterRegionsWithRegionAttrTableClosure()
   : _total_humongous(0),
-    _candidate_humongous(0),
-    _dcq(&G1BarrierSet::dirty_card_queue_set()) {
+    _candidate_humongous(0) {
   }
 
   virtual bool do_heap_region(HeapRegion* r) {
@@ -2707,49 +2704,9 @@
     uint rindex = r->hrm_index();
     g1h->set_humongous_reclaim_candidate(rindex, is_candidate);
     if (is_candidate) {
+      g1h->register_humongous_region_with_region_attr(rindex);
       _candidate_humongous++;
-      g1h->register_humongous_region_with_region_attr(rindex);
-      // Is_candidate already filters out humongous object with large remembered sets.
-      // If we have a humongous object with a few remembered sets, we simply flush these
-      // remembered set entries into the DCQS. That will result in automatic
-      // re-evaluation of their remembered set entries during the following evacuation
-      // phase.
-      if (!r->rem_set()->is_empty()) {
-        guarantee(r->rem_set()->occupancy_less_or_equal_than(G1RSetSparseRegionEntries),
-                  "Found a not-small remembered set here. This is inconsistent with previous assumptions.");
-        G1CardTable* ct = g1h->card_table();
-        HeapRegionRemSetIterator hrrs(r->rem_set());
-        size_t card_index;
-        while (hrrs.has_next(card_index)) {
-          CardTable::CardValue* card_ptr = ct->byte_for_index(card_index);
-          // The remembered set might contain references to already freed
-          // regions. Filter out such entries to avoid failing card table
-          // verification.
-          if (g1h->is_in(ct->addr_for(card_ptr))) {
-            if (*card_ptr != G1CardTable::dirty_card_val()) {
-              *card_ptr = G1CardTable::dirty_card_val();
-              _dcq.enqueue(card_ptr);
-            }
-          }
-        }
-        assert(hrrs.n_yielded() == r->rem_set()->occupied(),
-               "Remembered set hash maps out of sync, cur: " SIZE_FORMAT " entries, next: " SIZE_FORMAT " entries",
-               hrrs.n_yielded(), r->rem_set()->occupied());
-        // We should only clear the card based remembered set here as we will not
-        // implicitly rebuild anything else during eager reclaim. Note that at the moment
-        // (and probably never) we do not enter this path if there are other kind of
-        // remembered sets for this region.
-        r->rem_set()->clear_locked(true /* only_cardset */);
-        // Clear_locked() above sets the state to Empty. However we want to continue
-        // collecting remembered set entries for humongous regions that were not
-        // reclaimed.
-        r->rem_set()->set_state_complete();
-#ifdef ASSERT
-        G1HeapRegionAttr region_attr = g1h->region_attr(oop(r->bottom()));
-        assert(region_attr.needs_remset_update(), "must be");
-#endif
-      }
-      assert(r->rem_set()->is_empty(), "At this point any humongous candidate remembered set must be empty.");
+      // We will later handle the remembered sets of these regions.
     } else {
       g1h->register_region_with_region_attr(r);
     }
@@ -2760,8 +2717,6 @@
 
   size_t total_humongous() const { return _total_humongous; }
   size_t candidate_humongous() const { return _candidate_humongous; }
-
-  void flush_rem_set_entries() { _dcq.flush(); }
 };
 
 void G1CollectedHeap::register_regions_with_region_attr() {
@@ -2774,9 +2729,6 @@
                                          cl.total_humongous(),
                                          cl.candidate_humongous());
   _has_humongous_reclaim_candidates = cl.candidate_humongous() > 0;
-
-  // Finally flush all remembered set entries to re-check into the global DCQS.
-  cl.flush_rem_set_entries();
 }
 
 #ifndef PRODUCT
@@ -3071,7 +3023,7 @@
                                                   workers()->active_workers(),
                                                   collection_set()->young_region_length(),
                                                   collection_set()->optional_region_length());
-        pre_evacuate_collection_set(evacuation_info);
+        pre_evacuate_collection_set(evacuation_info, &per_thread_states);
 
         // Actually do the work...
         evacuate_initial_collection_set(&per_thread_states);
@@ -3104,9 +3056,7 @@
 
         double sample_end_time_sec = os::elapsedTime();
         double pause_time_ms = (sample_end_time_sec - sample_start_time_sec) * MILLIUNITS;
-        size_t total_cards_scanned = phase_times()->sum_thread_work_items(G1GCPhaseTimes::ScanRS, G1GCPhaseTimes::ScanRSScannedCards) +
-                                     phase_times()->sum_thread_work_items(G1GCPhaseTimes::OptScanRS, G1GCPhaseTimes::ScanRSScannedCards);
-        policy()->record_collection_pause_end(pause_time_ms, total_cards_scanned, heap_used_bytes_before_gc);
+        policy()->record_collection_pause_end(pause_time_ms, heap_used_bytes_before_gc);
       }
 
       verify_after_young_collection(verify_type);
@@ -3580,7 +3530,7 @@
   phase_times()->record_merge_pss_time_ms((os::elapsedTime() - merge_pss_time_start) * 1000.0);
 }
 
-void G1CollectedHeap::pre_evacuate_collection_set(G1EvacuationInfo& evacuation_info) {
+void G1CollectedHeap::pre_evacuate_collection_set(G1EvacuationInfo& evacuation_info, G1ParScanThreadStateSet* per_thread_states) {
   _expand_heap_after_alloc_failure = true;
   _evacuation_failed = false;
 
@@ -3591,10 +3541,15 @@
   // Initialize the GC alloc regions.
   _allocator->init_gc_alloc_regions(evacuation_info);
 
+  {
+    Ticks start = Ticks::now();
+    rem_set()->prepare_for_scan_heap_roots();
+    phase_times()->record_prepare_heap_roots_time_ms((Ticks::now() - start).seconds() * 1000.0);
+  }
+
   register_regions_with_region_attr();
   assert(_verifier->check_region_attr_table(), "Inconsistency in the region attributes table.");
 
-  rem_set()->prepare_for_scan_rem_set();
   _preserved_marks_set.assert_empty();
 
 #if COMPILER2_OR_JVMCI
@@ -3696,8 +3651,8 @@
 
   void scan_roots(G1ParScanThreadState* pss, uint worker_id) {
     _root_processor->evacuate_roots(pss, worker_id);
-    _g1h->rem_set()->update_rem_set(pss, worker_id);
-    _g1h->rem_set()->scan_rem_set(pss, worker_id, G1GCPhaseTimes::ScanRS, G1GCPhaseTimes::ObjCopy, G1GCPhaseTimes::CodeRoots);
+    _g1h->rem_set()->scan_heap_roots(pss, worker_id, G1GCPhaseTimes::ScanHR, G1GCPhaseTimes::ObjCopy);
+    _g1h->rem_set()->scan_collection_set_regions(pss, worker_id, G1GCPhaseTimes::ScanHR, G1GCPhaseTimes::CodeRoots, G1GCPhaseTimes::ObjCopy);
   }
 
   void evacuate_live_objects(G1ParScanThreadState* pss, uint worker_id) {
@@ -3724,6 +3679,14 @@
 };
 
 void G1CollectedHeap::evacuate_initial_collection_set(G1ParScanThreadStateSet* per_thread_states) {
+  G1GCPhaseTimes* p = phase_times();
+
+  {
+    Ticks start = Ticks::now();
+    rem_set()->merge_heap_roots(false /* remset_only */, G1GCPhaseTimes::MergeRS);
+    p->record_merge_heap_roots_time((Ticks::now() - start).seconds() * 1000.0);
+  }
+
   Tickspan task_time;
   const uint num_workers = workers()->active_workers();
 
@@ -3738,7 +3701,6 @@
   }
   Tickspan total_processing = Ticks::now() - start_processing;
 
-  G1GCPhaseTimes* p = phase_times();
   p->record_initial_evac_time(task_time.seconds() * 1000.0);
   p->record_or_add_code_root_fixup_time((total_processing - task_time).seconds() * 1000.0);
 }
@@ -3746,7 +3708,8 @@
 class G1EvacuateOptionalRegionsTask : public G1EvacuateRegionsBaseTask {
 
   void scan_roots(G1ParScanThreadState* pss, uint worker_id) {
-    _g1h->rem_set()->scan_rem_set(pss, worker_id, G1GCPhaseTimes::OptScanRS, G1GCPhaseTimes::OptObjCopy, G1GCPhaseTimes::OptCodeRoots);
+    _g1h->rem_set()->scan_heap_roots(pss, worker_id, G1GCPhaseTimes::OptScanHR, G1GCPhaseTimes::OptObjCopy);
+    _g1h->rem_set()->scan_collection_set_regions(pss, worker_id, G1GCPhaseTimes::OptScanHR, G1GCPhaseTimes::OptCodeRoots, G1GCPhaseTimes::OptObjCopy);
   }
 
   void evacuate_live_objects(G1ParScanThreadState* pss, uint worker_id) {
@@ -3782,8 +3745,6 @@
 void G1CollectedHeap::evacuate_optional_collection_set(G1ParScanThreadStateSet* per_thread_states) {
   const double gc_start_time_ms = phase_times()->cur_collection_start_sec() * 1000.0;
 
-  Ticks start = Ticks::now();
-
   while (!evacuation_failed() && _collection_set.optional_region_length() > 0) {
 
     double time_used_ms = os::elapsedTime() * 1000.0 - gc_start_time_ms;
@@ -3796,18 +3757,24 @@
       break;
     }
 
-    evacuate_next_optional_regions(per_thread_states);
+    {
+      Ticks start = Ticks::now();
+      rem_set()->merge_heap_roots(true /* remset_only */, G1GCPhaseTimes::OptMergeRS);
+      phase_times()->record_or_add_optional_merge_heap_roots_time((Ticks::now() - start).seconds() * 1000.0);
+    }
+
+    {
+      Ticks start = Ticks::now();
+      evacuate_next_optional_regions(per_thread_states);
+      phase_times()->record_or_add_optional_evac_time((Ticks::now() - start).seconds() * 1000.0);
+    }
   }
 
   _collection_set.abandon_optional_collection_set(per_thread_states);
-
-  phase_times()->record_or_add_optional_evac_time((Ticks::now() - start).seconds() * 1000.0);
 }
 
 void G1CollectedHeap::post_evacuate_collection_set(G1EvacuationInfo& evacuation_info, G1ParScanThreadStateSet* per_thread_states) {
-  // Also cleans the card table from temporary duplicate detection information used
-  // during UpdateRS/ScanRS.
-  rem_set()->cleanup_after_scan_rem_set();
+  rem_set()->cleanup_after_scan_heap_roots();
 
   // Process any discovered reference objects - we have
   // to do this _before_ we retire the GC alloc regions
--- a/src/hotspot/share/gc/g1/g1CollectedHeap.hpp	Thu Jun 27 03:33:44 2019 +0200
+++ b/src/hotspot/share/gc/g1/g1CollectedHeap.hpp	Thu Jun 27 11:48:32 2019 +0200
@@ -78,7 +78,6 @@
 class G1HotCardCache;
 class G1RemSet;
 class G1YoungRemSetSamplingThread;
-class HeapRegionRemSetIterator;
 class G1ConcurrentMark;
 class G1ConcurrentMarkThread;
 class G1ConcurrentRefine;
@@ -757,7 +756,7 @@
   void evacuate_next_optional_regions(G1ParScanThreadStateSet* per_thread_states);
 
 public:
-  void pre_evacuate_collection_set(G1EvacuationInfo& evacuation_info);
+  void pre_evacuate_collection_set(G1EvacuationInfo& evacuation_info, G1ParScanThreadStateSet* pss);
   void post_evacuate_collection_set(G1EvacuationInfo& evacuation_info, G1ParScanThreadStateSet* pss);
 
   void expand_heap_after_young_collection();
@@ -1115,7 +1114,8 @@
 
  public:
 
-  inline G1HeapRegionAttr region_attr(const void* obj);
+  inline G1HeapRegionAttr region_attr(const void* obj) const;
+  inline G1HeapRegionAttr region_attr(uint idx) const;
 
   // Return "TRUE" iff the given object address is in the reserved
   // region of g1.
@@ -1182,7 +1182,12 @@
   // Starts the iteration so that the start regions of a given worker id over the
   // set active_workers are evenly spread across the set of collection set regions
   // to be iterated.
-  void collection_set_iterate_increment_from(HeapRegionClosure *blk, uint worker_id);
+  // The variant with the HeapRegionClaimer guarantees that the closure will be
+  // applied to a particular region exactly once.
+  void collection_set_iterate_increment_from(HeapRegionClosure *blk, uint worker_id) {
+    collection_set_iterate_increment_from(blk, NULL, worker_id);
+  }
+  void collection_set_iterate_increment_from(HeapRegionClosure *blk, HeapRegionClaimer* hr_claimer, uint worker_id);
 
   // Returns the HeapRegion that contains addr. addr must not be NULL.
   template <class T>
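
The new comment notes that the HeapRegionClaimer variant guarantees the closure is applied to each region exactly once. A minimal sketch of such exactly-once claiming using one CAS per region; this is an illustration, not the actual HeapRegionClaimer implementation:

#include <atomic>
#include <cstddef>
#include <vector>

class RegionClaimerSketch {
  std::vector<std::atomic<bool>> _claimed;
public:
  explicit RegionClaimerSketch(size_t num_regions) : _claimed(num_regions) {}
  // Returns true for exactly one caller per region index, so concurrent
  // workers starting at overlapping positions never process a region twice.
  bool claim_region(size_t idx) {
    bool expected = false;
    return _claimed[idx].compare_exchange_strong(expected, true);
  }
};
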
--- a/src/hotspot/share/gc/g1/g1CollectedHeap.inline.hpp	Thu Jun 27 03:33:44 2019 +0200
+++ b/src/hotspot/share/gc/g1/g1CollectedHeap.inline.hpp	Thu Jun 27 11:48:32 2019 +0200
@@ -163,10 +163,14 @@
   return _region_attr.is_in_cset_or_humongous((HeapWord*)obj);
 }
 
-G1HeapRegionAttr G1CollectedHeap::region_attr(const void* addr) {
+G1HeapRegionAttr G1CollectedHeap::region_attr(const void* addr) const {
   return _region_attr.at((HeapWord*)addr);
 }
 
+G1HeapRegionAttr G1CollectedHeap::region_attr(uint idx) const {
+  return _region_attr.get_by_index(idx);
+}
+
 void G1CollectedHeap::register_humongous_region_with_region_attr(uint index) {
   _region_attr.set_humongous(index, region_at(index)->rem_set()->is_tracked());
 }
@@ -177,7 +181,7 @@
 
 void G1CollectedHeap::register_old_region_with_region_attr(HeapRegion* r) {
   _region_attr.set_in_old(r->hrm_index(), r->rem_set()->is_tracked());
-  _rem_set->prepare_for_scan_rem_set(r->hrm_index());
+  _rem_set->prepare_for_scan_heap_roots(r->hrm_index());
 }
 
 void G1CollectedHeap::register_optional_region_with_region_attr(HeapRegion* r) {
--- a/src/hotspot/share/gc/g1/g1CollectionSet.cpp	Thu Jun 27 03:33:44 2019 +0200
+++ b/src/hotspot/share/gc/g1/g1CollectionSet.cpp	Thu Jun 27 11:48:32 2019 +0200
@@ -217,10 +217,13 @@
   }
 }
 
-void G1CollectionSet::iterate_incremental_part_from(HeapRegionClosure* cl, uint worker_id, uint total_workers) const {
+void G1CollectionSet::iterate_incremental_part_from(HeapRegionClosure* cl,
+                                                    HeapRegionClaimer* hr_claimer,
+                                                    uint worker_id,
+                                                    uint total_workers) const {
   assert_at_safepoint();
 
-  size_t len = _collection_set_cur_length - _inc_part_start;
+  size_t len = increment_length();
   if (len == 0) {
     return;
   }
@@ -229,9 +232,12 @@
   size_t cur_pos = start_pos;
 
   do {
-    HeapRegion* r = _g1h->region_at(_collection_set_regions[cur_pos + _inc_part_start]);
-    bool result = cl->do_heap_region(r);
-    guarantee(!result, "Must not cancel iteration");
+    uint region_idx = _collection_set_regions[cur_pos + _inc_part_start];
+    if (hr_claimer == NULL || hr_claimer->claim_region(region_idx)) {
+      HeapRegion* r = _g1h->region_at(region_idx);
+      bool result = cl->do_heap_region(r);
+      guarantee(!result, "Must not cancel iteration");
+    }
 
     cur_pos++;
     if (cur_pos == len) {
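
The loop above starts each worker at a staggered position in the current collection set increment and wraps around, with the optional claimer filtering out regions another worker already took. A hedged sketch of that traversal; the start position formula here is an assumption, since it lies outside the context lines shown:

#include <cstddef>

// Visit indices [0, len) starting at an evenly spread offset and wrapping,
// in the spirit of iterate_incremental_part_from. claim() stands in for the
// optional HeapRegionClaimer; len must be non-zero (callers return early
// otherwise, as the hunk above shows).
template <typename ClaimFn, typename VisitFn>
void iterate_spread_sketch(size_t len, unsigned worker_id,
                           unsigned total_workers,
                           ClaimFn claim, VisitFn visit) {
  size_t start = (size_t)worker_id * len / total_workers; // assumed spread rule
  size_t cur = start;
  do {
    if (claim(cur)) visit(cur);
    if (++cur == len) cur = 0; // wrap around
  } while (cur != start);
}
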
--- a/src/hotspot/share/gc/g1/g1CollectionSet.hpp	Thu Jun 27 03:33:44 2019 +0200
+++ b/src/hotspot/share/gc/g1/g1CollectionSet.hpp	Thu Jun 27 11:48:32 2019 +0200
@@ -36,6 +36,7 @@
 class G1Policy;
 class G1SurvivorRegions;
 class HeapRegion;
+class HeapRegionClaimer;
 class HeapRegionClosure;
 
 // The collection set.
@@ -279,7 +280,12 @@
 
   // Iterate over the current collection set increment applying the given HeapRegionClosure
   // from a starting position determined by the given worker id.
-  void iterate_incremental_part_from(HeapRegionClosure* cl, uint worker_id, uint total_workers) const;
+  void iterate_incremental_part_from(HeapRegionClosure* cl, HeapRegionClaimer* hr_claimer, uint worker_id, uint total_workers) const;
+
+  // Returns the length of the current increment in number of regions.
+  size_t increment_length() const { return _collection_set_cur_length - _inc_part_start; }
+  // Returns the length of the whole current collection set in number of regions
+  size_t cur_length() const { return _collection_set_cur_length; }
 
   // Iterate over the entire collection set (all increments calculated so far), applying
   // the given HeapRegionClosure on all of them.
--- a/src/hotspot/share/gc/g1/g1ConcurrentRefine.cpp	Thu Jun 27 03:33:44 2019 +0200
+++ b/src/hotspot/share/gc/g1/g1ConcurrentRefine.cpp	Thu Jun 27 11:48:32 2019 +0200
@@ -206,7 +206,7 @@
     // available buffers near green_zone value.  When yellow_size is
     // large we don't want to allow a full step to accumulate before
     // doing any processing, as that might lead to significantly more
-    // than green_zone buffers to be processed by update_rs.
+    // than green_zone buffers to be processed during scanning.
     step = MIN2(step, ParallelGCThreads / 2.0);
   }
   size_t activate_offset = static_cast<size_t>(ceil(step * (worker_i + 1)));
@@ -322,18 +322,18 @@
 }
 
 static size_t calc_new_green_zone(size_t green,
-                                  double update_rs_time,
-                                  size_t update_rs_processed_buffers,
+                                  double log_buffer_scan_time,
+                                  size_t processed_log_buffers,
                                   double goal_ms) {
   // Adjust green zone based on whether we're meeting the time goal.
   // Limit to max_green_zone.
   const double inc_k = 1.1, dec_k = 0.9;
-  if (update_rs_time > goal_ms) {
+  if (log_buffer_scan_time > goal_ms) {
     if (green > 0) {
       green = static_cast<size_t>(green * dec_k);
     }
-  } else if (update_rs_time < goal_ms &&
-             update_rs_processed_buffers > green) {
+  } else if (log_buffer_scan_time < goal_ms &&
+             processed_log_buffers > green) {
     green = static_cast<size_t>(MAX2(green * inc_k, green + 1.0));
     green = MIN2(green, max_green_zone);
   }
@@ -350,20 +350,20 @@
   return MIN2(yellow + (yellow - green), max_red_zone);
 }
 
-void G1ConcurrentRefine::update_zones(double update_rs_time,
-                                      size_t update_rs_processed_buffers,
+void G1ConcurrentRefine::update_zones(double log_buffer_scan_time,
+                                      size_t processed_log_buffers,
                                       double goal_ms) {
   log_trace( CTRL_TAGS )("Updating Refinement Zones: "
-                         "update_rs time: %.3fms, "
-                         "update_rs buffers: " SIZE_FORMAT ", "
-                         "update_rs goal time: %.3fms",
-                         update_rs_time,
-                         update_rs_processed_buffers,
+                         "log buffer scan time: %.3fms, "
+                         "processed buffers: " SIZE_FORMAT ", "
+                         "goal time: %.3fms",
+                         log_buffer_scan_time,
+                         processed_log_buffers,
                          goal_ms);
 
   _green_zone = calc_new_green_zone(_green_zone,
-                                    update_rs_time,
-                                    update_rs_processed_buffers,
+                                    log_buffer_scan_time,
+                                    processed_log_buffers,
                                     goal_ms);
   _yellow_zone = calc_new_yellow_zone(_green_zone, _min_yellow_zone_size);
   _red_zone = calc_new_red_zone(_green_zone, _yellow_zone);
@@ -376,13 +376,13 @@
             _green_zone, _yellow_zone, _red_zone);
 }
 
-void G1ConcurrentRefine::adjust(double update_rs_time,
-                                size_t update_rs_processed_buffers,
+void G1ConcurrentRefine::adjust(double log_buffer_scan_time,
+                                size_t processed_log_buffers,
                                 double goal_ms) {
   G1DirtyCardQueueSet& dcqs = G1BarrierSet::dirty_card_queue_set();
 
   if (G1UseAdaptiveConcRefinement) {
-    update_zones(update_rs_time, update_rs_processed_buffers, goal_ms);
+    update_zones(log_buffer_scan_time, processed_log_buffers, goal_ms);
 
     // Change the barrier params
     if (max_num_threads() == 0) {
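
calc_new_green_zone above is a small multiplicative-increase/decrease controller: scanning slower than the goal shrinks the green zone, and meeting the goal while processing more buffers than the zone grows it. A standalone rendering of that rule, with the inc_k/dec_k constants copied from the hunk and max_green_zone replaced by a stand-in value:

#include <algorithm>
#include <cstddef>

size_t calc_new_green_zone_sketch(size_t green, double scan_time_ms,
                                  size_t processed_buffers, double goal_ms) {
  const double inc_k = 1.1, dec_k = 0.9;
  const size_t max_green_zone = 1024; // stand-in cap for this example
  if (scan_time_ms > goal_ms) {
    // Too slow: shrink the zone so refinement threads activate sooner.
    if (green > 0) green = static_cast<size_t>(green * dec_k);
  } else if (scan_time_ms < goal_ms && processed_buffers > green) {
    // Headroom and real demand: grow the zone, capped at the maximum.
    green = static_cast<size_t>(std::max(green * inc_k, green + 1.0));
    green = std::min(green, max_green_zone);
  }
  return green;
}
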
--- a/src/hotspot/share/gc/g1/g1ConcurrentRefine.hpp	Thu Jun 27 03:33:44 2019 +0200
+++ b/src/hotspot/share/gc/g1/g1ConcurrentRefine.hpp	Thu Jun 27 11:48:32 2019 +0200
@@ -97,8 +97,8 @@
                      size_t min_yellow_zone_size);
 
   // Update green/yellow/red zone values based on how well goals are being met.
-  void update_zones(double update_rs_time,
-                    size_t update_rs_processed_buffers,
+  void update_zones(double log_buffer_scan_time,
+                    size_t processed_log_buffers,
                     double goal_ms);
 
   static uint worker_id_offset();
@@ -115,7 +115,7 @@
   void stop();
 
   // Adjust refinement thresholds based on work done during the pause and the goal time.
-  void adjust(double update_rs_time, size_t update_rs_processed_buffers, double goal_ms);
+  void adjust(double log_buffer_scan_time, size_t processed_log_buffers, double goal_ms);
 
   size_t activation_threshold(uint worker_id) const;
   size_t deactivation_threshold(uint worker_id) const;
--- a/src/hotspot/share/gc/g1/g1EvacFailure.cpp	Thu Jun 27 03:33:44 2019 +0200
+++ b/src/hotspot/share/gc/g1/g1EvacFailure.cpp	Thu Jun 27 11:48:32 2019 +0200
@@ -37,15 +37,19 @@
 #include "oops/compressedOops.inline.hpp"
 #include "oops/oop.inline.hpp"
 
-class UpdateRSetDeferred : public BasicOopIterateClosure {
+class UpdateLogBuffersDeferred : public BasicOopIterateClosure {
 private:
   G1CollectedHeap* _g1h;
   G1DirtyCardQueue* _dcq;
   G1CardTable*    _ct;
 
+  // Remember the last enqueued card to avoid enqueuing the same card over and over;
+  // since we only ever handle a card once, this is sufficient.
+  size_t _last_enqueued_card;
+
 public:
-  UpdateRSetDeferred(G1DirtyCardQueue* dcq) :
-    _g1h(G1CollectedHeap::heap()), _dcq(dcq), _ct(_g1h->card_table()) {}
+  UpdateLogBuffersDeferred(G1DirtyCardQueue* dcq) :
+    _g1h(G1CollectedHeap::heap()), _dcq(dcq), _ct(_g1h->card_table()), _last_enqueued_card(SIZE_MAX) {}
 
   virtual void do_oop(narrowOop* p) { do_oop_work(p); }
   virtual void do_oop(      oop* p) { do_oop_work(p); }
@@ -62,8 +66,9 @@
       return;
     }
     size_t card_index = _ct->index_for(p);
-    if (_ct->mark_card_deferred(card_index)) {
+    if (card_index != _last_enqueued_card) {
       _dcq->enqueue(_ct->byte_for_index(card_index));
+      _last_enqueued_card = card_index;
     }
   }
 };
@@ -73,21 +78,21 @@
   G1ConcurrentMark* _cm;
   HeapRegion* _hr;
   size_t _marked_bytes;
-  UpdateRSetDeferred* _update_rset_cl;
+  UpdateLogBuffersDeferred* _log_buffer_cl;
   bool _during_initial_mark;
   uint _worker_id;
   HeapWord* _last_forwarded_object_end;
 
 public:
   RemoveSelfForwardPtrObjClosure(HeapRegion* hr,
-                                 UpdateRSetDeferred* update_rset_cl,
+                                 UpdateLogBuffersDeferred* log_buffer_cl,
                                  bool during_initial_mark,
                                  uint worker_id) :
     _g1h(G1CollectedHeap::heap()),
     _cm(_g1h->concurrent_mark()),
     _hr(hr),
     _marked_bytes(0),
-    _update_rset_cl(update_rset_cl),
+    _log_buffer_cl(log_buffer_cl),
     _during_initial_mark(during_initial_mark),
     _worker_id(worker_id),
     _last_forwarded_object_end(hr->bottom()) { }
@@ -144,7 +149,7 @@
       // The problem is that, if evacuation fails, we might have
       // remembered set entries missing given that we skipped cards on
       // the collection set. So, we'll recreate such entries now.
-      obj->oop_iterate(_update_rset_cl);
+      obj->oop_iterate(_log_buffer_cl);
 
       HeapWord* obj_end = obj_addr + obj_size;
       _last_forwarded_object_end = obj_end;
@@ -193,25 +198,22 @@
 class RemoveSelfForwardPtrHRClosure: public HeapRegionClosure {
   G1CollectedHeap* _g1h;
   uint _worker_id;
-  HeapRegionClaimer* _hrclaimer;
 
   G1DirtyCardQueue _dcq;
-  UpdateRSetDeferred _update_rset_cl;
+  UpdateLogBuffersDeferred _log_buffer_cl;
 
 public:
-  RemoveSelfForwardPtrHRClosure(uint worker_id,
-                                HeapRegionClaimer* hrclaimer) :
+  RemoveSelfForwardPtrHRClosure(uint worker_id) :
     _g1h(G1CollectedHeap::heap()),
     _worker_id(worker_id),
-    _hrclaimer(hrclaimer),
     _dcq(&_g1h->dirty_card_queue_set()),
-    _update_rset_cl(&_dcq){
+    _log_buffer_cl(&_dcq) {
   }
 
   size_t remove_self_forward_ptr_by_walking_hr(HeapRegion* hr,
                                                bool during_initial_mark) {
     RemoveSelfForwardPtrObjClosure rspc(hr,
-                                        &_update_rset_cl,
+                                        &_log_buffer_cl,
                                         during_initial_mark,
                                         _worker_id);
     hr->object_iterate(&rspc);
@@ -225,26 +227,24 @@
     assert(!hr->is_pinned(), "Unexpected pinned region at index %u", hr->hrm_index());
     assert(hr->in_collection_set(), "bad CS");
 
-    if (_hrclaimer->claim_region(hr->hrm_index())) {
-      if (hr->evacuation_failed()) {
-        hr->clear_index_in_opt_cset();
+    if (hr->evacuation_failed()) {
+      hr->clear_index_in_opt_cset();
 
-        bool during_initial_mark = _g1h->collector_state()->in_initial_mark_gc();
-        bool during_conc_mark = _g1h->collector_state()->mark_or_rebuild_in_progress();
+      bool during_initial_mark = _g1h->collector_state()->in_initial_mark_gc();
+      bool during_conc_mark = _g1h->collector_state()->mark_or_rebuild_in_progress();
 
-        hr->note_self_forwarding_removal_start(during_initial_mark,
+      hr->note_self_forwarding_removal_start(during_initial_mark,
                                                during_conc_mark);
-        _g1h->verifier()->check_bitmaps("Self-Forwarding Ptr Removal", hr);
+      _g1h->verifier()->check_bitmaps("Self-Forwarding Ptr Removal", hr);
 
-        hr->reset_bot();
-
-        size_t live_bytes = remove_self_forward_ptr_by_walking_hr(hr, during_initial_mark);
+      hr->reset_bot();
 
-        hr->rem_set()->clean_strong_code_roots(hr);
-        hr->rem_set()->clear_locked(true);
+      size_t live_bytes = remove_self_forward_ptr_by_walking_hr(hr, during_initial_mark);
 
-        hr->note_self_forwarding_removal_end(live_bytes);
-      }
+      hr->rem_set()->clean_strong_code_roots(hr);
+      hr->rem_set()->clear_locked(true);
+
+      hr->note_self_forwarding_removal_end(live_bytes);
     }
     return false;
   }
@@ -256,7 +256,7 @@
   _hrclaimer(_g1h->workers()->active_workers()) { }
 
 void G1ParRemoveSelfForwardPtrsTask::work(uint worker_id) {
-  RemoveSelfForwardPtrHRClosure rsfp_cl(worker_id, &_hrclaimer);
+  RemoveSelfForwardPtrHRClosure rsfp_cl(worker_id);
 
-  _g1h->collection_set_iterate_increment_from(&rsfp_cl, worker_id);
+  _g1h->collection_set_iterate_increment_from(&rsfp_cl, &_hrclaimer, worker_id);
 }
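
UpdateLogBuffersDeferred replaces the removed card table deferred bit with a one-entry cache: the closure walks objects in address order, so once iteration moves past a card it never returns to it, and comparing against the last enqueued card index is enough to suppress duplicates. A sketch of that filter, where enqueue() is a placeholder for the dirty card queue:

#include <cstddef>
#include <cstdint>

class DeferredCardEnqueueSketch {
  size_t _last_enqueued_card = SIZE_MAX; // sentinel: nothing enqueued yet
  void enqueue(size_t /*card*/) { /* placeholder for G1DirtyCardQueue */ }
public:
  // Cards are encountered in ascending order during the region walk, so
  // remembering only the previous index deduplicates repeated hits on the
  // same card without any card table state.
  void maybe_enqueue(size_t card_index) {
    if (card_index != _last_enqueued_card) {
      enqueue(card_index);
      _last_enqueued_card = card_index;
    }
  }
};
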
--- a/src/hotspot/share/gc/g1/g1GCPhaseTimes.cpp	Thu Jun 27 03:33:44 2019 +0200
+++ b/src/hotspot/share/gc/g1/g1GCPhaseTimes.cpp	Thu Jun 27 11:48:32 2019 +0200
@@ -66,14 +66,30 @@
   _gc_par_phases[WaitForStrongCLD] = new WorkerDataArray<double>(max_gc_threads, "Wait For Strong CLD (ms):");
   _gc_par_phases[WeakCLDRoots] = new WorkerDataArray<double>(max_gc_threads, "Weak CLD Roots (ms):");
 
-  _gc_par_phases[UpdateRS] = new WorkerDataArray<double>(max_gc_threads, "Update RS (ms):");
+  _gc_par_phases[MergeRS] = new WorkerDataArray<double>(max_gc_threads, "Remembered Sets (ms):");
+  _merge_rs_merged_sparse = new WorkerDataArray<size_t>(max_gc_threads, "Merged Sparse:");
+  _gc_par_phases[MergeRS]->link_thread_work_items(_merge_rs_merged_sparse, MergeRSMergedSparse);
+  _merge_rs_merged_fine = new WorkerDataArray<size_t>(max_gc_threads, "Merged Fine:");
+  _gc_par_phases[MergeRS]->link_thread_work_items(_merge_rs_merged_fine, MergeRSMergedFine);
+  _merge_rs_merged_coarse = new WorkerDataArray<size_t>(max_gc_threads, "Merged Coarse:");
+  _gc_par_phases[MergeRS]->link_thread_work_items(_merge_rs_merged_coarse, MergeRSMergedCoarse);
+
+  _gc_par_phases[OptMergeRS] = new WorkerDataArray<double>(max_gc_threads, "Optional Remembered Sets (ms):");
+  _opt_merge_rs_merged_sparse = new WorkerDataArray<size_t>(max_gc_threads, "Merged Sparse:");
+  _gc_par_phases[OptMergeRS]->link_thread_work_items(_opt_merge_rs_merged_sparse, MergeRSMergedSparse);
+  _opt_merge_rs_merged_fine = new WorkerDataArray<size_t>(max_gc_threads, "Merged Fine:");
+  _gc_par_phases[OptMergeRS]->link_thread_work_items(_opt_merge_rs_merged_fine, MergeRSMergedFine);
+  _opt_merge_rs_merged_coarse = new WorkerDataArray<size_t>(max_gc_threads, "Merged Coarse:");
+  _gc_par_phases[OptMergeRS]->link_thread_work_items(_opt_merge_rs_merged_coarse, MergeRSMergedCoarse);
+
+  _gc_par_phases[MergeLB] = new WorkerDataArray<double>(max_gc_threads, "Log Buffers (ms):");
   if (G1HotCardCache::default_use_cache()) {
-    _gc_par_phases[ScanHCC] = new WorkerDataArray<double>(max_gc_threads, "Scan HCC (ms):");
+    _gc_par_phases[MergeHCC] = new WorkerDataArray<double>(max_gc_threads, "Hot Card Cache (ms):");
   } else {
-    _gc_par_phases[ScanHCC] = NULL;
+    _gc_par_phases[MergeHCC] = NULL;
   }
-  _gc_par_phases[ScanRS] = new WorkerDataArray<double>(max_gc_threads, "Scan RS (ms):");
-  _gc_par_phases[OptScanRS] = new WorkerDataArray<double>(max_gc_threads, "Optional Scan RS (ms):");
+  _gc_par_phases[ScanHR] = new WorkerDataArray<double>(max_gc_threads, "Scan Heap Roots (ms):");
+  _gc_par_phases[OptScanHR] = new WorkerDataArray<double>(max_gc_threads, "Optional Scan Heap Roots (ms):");
   _gc_par_phases[CodeRoots] = new WorkerDataArray<double>(max_gc_threads, "Code Root Scan (ms):");
   _gc_par_phases[OptCodeRoots] = new WorkerDataArray<double>(max_gc_threads, "Optional Code Root Scan (ms):");
   _gc_par_phases[ObjCopy] = new WorkerDataArray<double>(max_gc_threads, "Object Copy (ms):");
@@ -84,30 +100,30 @@
   _gc_par_phases[GCWorkerEnd] = new WorkerDataArray<double>(max_gc_threads, "GC Worker End (ms):");
   _gc_par_phases[Other] = new WorkerDataArray<double>(max_gc_threads, "GC Worker Other (ms):");
 
-  _scan_rs_scanned_cards = new WorkerDataArray<size_t>(max_gc_threads, "Scanned Cards:");
-  _gc_par_phases[ScanRS]->link_thread_work_items(_scan_rs_scanned_cards, ScanRSScannedCards);
-  _scan_rs_claimed_cards = new WorkerDataArray<size_t>(max_gc_threads, "Claimed Cards:");
-  _gc_par_phases[ScanRS]->link_thread_work_items(_scan_rs_claimed_cards, ScanRSClaimedCards);
-  _scan_rs_skipped_cards = new WorkerDataArray<size_t>(max_gc_threads, "Skipped Cards:");
-  _gc_par_phases[ScanRS]->link_thread_work_items(_scan_rs_skipped_cards, ScanRSSkippedCards);
+  _scan_hr_scanned_cards = new WorkerDataArray<size_t>(max_gc_threads, "Scanned Cards:");
+  _gc_par_phases[ScanHR]->link_thread_work_items(_scan_hr_scanned_cards, ScanHRScannedCards);
+  _scan_hr_scanned_blocks = new WorkerDataArray<size_t>(max_gc_threads, "Scanned Blocks:");
+  _gc_par_phases[ScanHR]->link_thread_work_items(_scan_hr_scanned_blocks, ScanHRScannedBlocks);
+  _scan_hr_claimed_chunks = new WorkerDataArray<size_t>(max_gc_threads, "Claimed Chunks:");
+  _gc_par_phases[ScanHR]->link_thread_work_items(_scan_hr_claimed_chunks, ScanHRClaimedChunks);
 
-  _opt_scan_rs_scanned_cards = new WorkerDataArray<size_t>(max_gc_threads, "Scanned Cards:");
-  _gc_par_phases[OptScanRS]->link_thread_work_items(_opt_scan_rs_scanned_cards, ScanRSScannedCards);
-  _opt_scan_rs_claimed_cards = new WorkerDataArray<size_t>(max_gc_threads, "Claimed Cards:");
-  _gc_par_phases[OptScanRS]->link_thread_work_items(_opt_scan_rs_claimed_cards, ScanRSClaimedCards);
-  _opt_scan_rs_skipped_cards = new WorkerDataArray<size_t>(max_gc_threads, "Skipped Cards:");
-  _gc_par_phases[OptScanRS]->link_thread_work_items(_opt_scan_rs_skipped_cards, ScanRSSkippedCards);
-  _opt_scan_rs_scanned_opt_refs = new WorkerDataArray<size_t>(max_gc_threads, "Scanned Refs:");
-  _gc_par_phases[OptScanRS]->link_thread_work_items(_opt_scan_rs_scanned_opt_refs, ScanRSScannedOptRefs);
-  _opt_scan_rs_used_memory = new WorkerDataArray<size_t>(max_gc_threads, "Used Memory:");
-  _gc_par_phases[OptScanRS]->link_thread_work_items(_opt_scan_rs_used_memory, ScanRSUsedMemory);
+  _opt_scan_hr_scanned_cards = new WorkerDataArray<size_t>(max_gc_threads, "Scanned Cards:");
+  _gc_par_phases[OptScanHR]->link_thread_work_items(_opt_scan_hr_scanned_cards, ScanHRScannedCards);
+  _opt_scan_hr_scanned_blocks = new WorkerDataArray<size_t>(max_gc_threads, "Scanned Blocks:");
+  _gc_par_phases[OptScanHR]->link_thread_work_items(_opt_scan_hr_scanned_blocks, ScanHRScannedBlocks);
+  _opt_scan_hr_claimed_chunks = new WorkerDataArray<size_t>(max_gc_threads, "Claimed Chunks:");
+  _gc_par_phases[OptScanHR]->link_thread_work_items(_opt_scan_hr_claimed_chunks, ScanHRClaimedChunks);
+  _opt_scan_hr_scanned_opt_refs = new WorkerDataArray<size_t>(max_gc_threads, "Scanned Refs:");
+  _gc_par_phases[OptScanHR]->link_thread_work_items(_opt_scan_hr_scanned_opt_refs, ScanHRScannedOptRefs);
+  _opt_scan_hr_used_memory = new WorkerDataArray<size_t>(max_gc_threads, "Used Memory:");
+  _gc_par_phases[OptScanHR]->link_thread_work_items(_opt_scan_hr_used_memory, ScanHRUsedMemory);
 
-  _update_rs_processed_buffers = new WorkerDataArray<size_t>(max_gc_threads, "Processed Buffers:");
-  _gc_par_phases[UpdateRS]->link_thread_work_items(_update_rs_processed_buffers, UpdateRSProcessedBuffers);
-  _update_rs_scanned_cards = new WorkerDataArray<size_t>(max_gc_threads, "Scanned Cards:");
-  _gc_par_phases[UpdateRS]->link_thread_work_items(_update_rs_scanned_cards, UpdateRSScannedCards);
-  _update_rs_skipped_cards = new WorkerDataArray<size_t>(max_gc_threads, "Skipped Cards:");
-  _gc_par_phases[UpdateRS]->link_thread_work_items(_update_rs_skipped_cards, UpdateRSSkippedCards);
+  _merge_lb_processed_buffers = new WorkerDataArray<size_t>(max_gc_threads, "Processed Buffers:");
+  _gc_par_phases[MergeLB]->link_thread_work_items(_merge_lb_processed_buffers, MergeLBProcessedBuffers);
+  _merge_lb_dirty_cards = new WorkerDataArray<size_t>(max_gc_threads, "Dirty Cards:");
+  _gc_par_phases[MergeLB]->link_thread_work_items(_merge_lb_dirty_cards, MergeLBDirtyCards);
+  _merge_lb_skipped_cards = new WorkerDataArray<size_t>(max_gc_threads, "Skipped Cards:");
+  _gc_par_phases[MergeLB]->link_thread_work_items(_merge_lb_skipped_cards, MergeLBSkippedCards);
 
   _obj_copy_lab_waste = new WorkerDataArray<size_t>(max_gc_threads, "LAB Waste");
   _gc_par_phases[ObjCopy]->link_thread_work_items(_obj_copy_lab_waste, ObjCopyLABWaste);
@@ -148,6 +164,8 @@
   _cur_optional_evac_ms = 0.0;
   _cur_collection_code_root_fixup_time_ms = 0.0;
   _cur_strong_code_root_purge_time_ms = 0.0;
+  _cur_merge_heap_roots_time_ms = 0.0;
+  _cur_optional_merge_heap_roots_time_ms = 0.0;
   _cur_evac_fail_recalc_used = 0.0;
   _cur_evac_fail_remove_self_forwards = 0.0;
   _cur_string_deduplication_time_ms = 0.0;
@@ -160,6 +178,7 @@
   _cur_collection_start_sec = 0.0;
   _root_region_scan_wait_time_ms = 0.0;
   _external_accounted_time_ms = 0.0;
+  _recorded_prepare_heap_roots_time_ms = 0.0;
   _recorded_clear_claimed_marks_time_ms = 0.0;
   _recorded_young_cset_choice_time_ms = 0.0;
   _recorded_non_young_cset_choice_time_ms = 0.0;
@@ -219,9 +238,7 @@
       record_time_secs(GCWorkerTotal, i , total_worker_time);
 
       double worker_known_time = worker_time(ExtRootScan, i) +
-                                 worker_time(ScanHCC, i) +
-                                 worker_time(UpdateRS, i) +
-                                 worker_time(ScanRS, i) +
+                                 worker_time(ScanHR, i) +
                                  worker_time(CodeRoots, i) +
                                  worker_time(ObjCopy, i) +
                                  worker_time(Termination, i);
@@ -231,11 +248,15 @@
       // Make sure all slots are uninitialized since this thread did not seem to have been started
       ASSERT_PHASE_UNINITIALIZED(GCWorkerEnd);
       ASSERT_PHASE_UNINITIALIZED(ExtRootScan);
-      ASSERT_PHASE_UNINITIALIZED(ScanHCC);
-      ASSERT_PHASE_UNINITIALIZED(UpdateRS);
-      ASSERT_PHASE_UNINITIALIZED(ScanRS);
+      ASSERT_PHASE_UNINITIALIZED(MergeHCC);
+      ASSERT_PHASE_UNINITIALIZED(MergeRS);
+      ASSERT_PHASE_UNINITIALIZED(OptMergeRS);
+      ASSERT_PHASE_UNINITIALIZED(MergeLB);
+      ASSERT_PHASE_UNINITIALIZED(ScanHR);
       ASSERT_PHASE_UNINITIALIZED(CodeRoots);
+      ASSERT_PHASE_UNINITIALIZED(OptCodeRoots);
       ASSERT_PHASE_UNINITIALIZED(ObjCopy);
+      ASSERT_PHASE_UNINITIALIZED(OptObjCopy);
       ASSERT_PHASE_UNINITIALIZED(Termination);
     }
   }
@@ -365,6 +386,7 @@
                         _recorded_young_cset_choice_time_ms +
                         _recorded_non_young_cset_choice_time_ms +
                         _cur_region_register_time +
+                        _recorded_prepare_heap_roots_time_ms +
                         _recorded_clear_claimed_marks_time_ms;
 
   info_time("Pre Evacuate Collection Set", sum_ms);
@@ -380,6 +402,7 @@
     trace_count("Humongous Candidate", _cur_fast_reclaim_humongous_candidates);
   }
 
+  debug_time("Prepare Heap Roots", _recorded_prepare_heap_roots_time_ms);
   if (_recorded_clear_claimed_marks_time_ms > 0.0) {
     debug_time("Clear Claimed Marks", _recorded_clear_claimed_marks_time_ms);
   }
@@ -387,10 +410,13 @@
 }
 
 double G1GCPhaseTimes::print_evacuate_optional_collection_set() const {
-  const double sum_ms = _cur_optional_evac_ms;
+  const double sum_ms = _cur_optional_evac_ms + _cur_optional_merge_heap_roots_time_ms;
   if (sum_ms > 0) {
-    info_time("Evacuate Optional Collection Set", sum_ms);
-    debug_phase(_gc_par_phases[OptScanRS]);
+    info_time("Merge Optional Heap Roots", _cur_optional_merge_heap_roots_time_ms);
+    debug_phase(_gc_par_phases[OptMergeRS]);
+
+    info_time("Evacuate Optional Collection Set", _cur_optional_evac_ms);
+    debug_phase(_gc_par_phases[OptScanHR]);
     debug_phase(_gc_par_phases[OptObjCopy]);
     debug_phase(_gc_par_phases[OptCodeRoots]);
     debug_phase(_gc_par_phases[OptTermination]);
@@ -398,21 +424,23 @@
   return sum_ms;
 }
 
-double G1GCPhaseTimes::print_evacuate_collection_set() const {
-  const double sum_ms = _cur_collection_initial_evac_time_ms;
+double G1GCPhaseTimes::print_evacuate_initial_collection_set() const {
+  info_time("Merge Heap Roots", _cur_merge_heap_roots_time_ms);
 
-  info_time("Evacuate Collection Set", sum_ms);
+  debug_phase(_gc_par_phases[MergeRS]);
+  if (G1HotCardCache::default_use_cache()) {
+    debug_phase(_gc_par_phases[MergeHCC]);
+  }
+  debug_phase(_gc_par_phases[MergeLB]);
+
+  info_time("Evacuate Collection Set", _cur_collection_initial_evac_time_ms);
 
   trace_phase(_gc_par_phases[GCWorkerStart], false);
   debug_phase(_gc_par_phases[ExtRootScan]);
   for (int i = ExtRootScanSubPhasesFirst; i <= ExtRootScanSubPhasesLast; i++) {
     trace_phase(_gc_par_phases[i]);
   }
-  if (G1HotCardCache::default_use_cache()) {
-    debug_phase(_gc_par_phases[ScanHCC]);
-  }
-  debug_phase(_gc_par_phases[UpdateRS]);
-  debug_phase(_gc_par_phases[ScanRS]);
+  debug_phase(_gc_par_phases[ScanHR]);
   debug_phase(_gc_par_phases[CodeRoots]);
   debug_phase(_gc_par_phases[ObjCopy]);
   debug_phase(_gc_par_phases[Termination]);
@@ -420,7 +448,7 @@
   debug_phase(_gc_par_phases[GCWorkerTotal]);
   trace_phase(_gc_par_phases[GCWorkerEnd], false);
 
-  return sum_ms;
+  return _cur_collection_initial_evac_time_ms + _cur_merge_heap_roots_time_ms;
 }
 
 double G1GCPhaseTimes::print_post_evacuate_collection_set() const {
@@ -503,7 +531,7 @@
 
   double accounted_ms = 0.0;
   accounted_ms += print_pre_evacuate_collection_set();
-  accounted_ms += print_evacuate_collection_set();
+  accounted_ms += print_evacuate_initial_collection_set();
   accounted_ms += print_evacuate_optional_collection_set();
   accounted_ms += print_post_evacuate_collection_set();
   print_other(accounted_ms);
@@ -530,10 +558,12 @@
       "CMRefRoots",
       "WaitForStrongCLD",
       "WeakCLDRoots",
-      "UpdateRS",
-      "ScanHCC",
-      "ScanRS",
-      "OptScanRS",
+      "MergeRS",
+      "OptMergeRS",
+      "MergeLB",
+      "MergeHCC",
+      "ScanHR",
+      "OptScanHR",
       "CodeRoots",
       "OptCodeRoots",
       "ObjCopy",
@@ -580,8 +610,8 @@
   _stopped = true;
 }
 
-G1GCParPhaseTimesTracker::G1GCParPhaseTimesTracker(G1GCPhaseTimes* phase_times, G1GCPhaseTimes::GCParPhases phase, uint worker_id) :
-  _start_time(), _phase(phase), _phase_times(phase_times), _worker_id(worker_id), _event() {
+G1GCParPhaseTimesTracker::G1GCParPhaseTimesTracker(G1GCPhaseTimes* phase_times, G1GCPhaseTimes::GCParPhases phase, uint worker_id, bool must_record) :
+  _start_time(), _phase(phase), _phase_times(phase_times), _worker_id(worker_id), _event(), _must_record(must_record) {
   if (_phase_times != NULL) {
     _start_time = Ticks::now();
   }
@@ -589,7 +619,11 @@
 
 G1GCParPhaseTimesTracker::~G1GCParPhaseTimesTracker() {
   if (_phase_times != NULL) {
-    _phase_times->record_time_secs(_phase, _worker_id, (Ticks::now() - _start_time).seconds());
+    if (_must_record) {
+      _phase_times->record_time_secs(_phase, _worker_id, (Ticks::now() - _start_time).seconds());
+    } else {
+      _phase_times->record_or_add_time_secs(_phase, _worker_id, (Ticks::now() - _start_time).seconds());
+    }
     _event.commit(GCId::current(), _worker_id, G1GCPhaseTimes::phase_name(_phase));
   }
 }
--- a/src/hotspot/share/gc/g1/g1GCPhaseTimes.hpp	Thu Jun 27 03:33:44 2019 +0200
+++ b/src/hotspot/share/gc/g1/g1GCPhaseTimes.hpp	Thu Jun 27 11:48:32 2019 +0200
@@ -60,10 +60,12 @@
     CMRefRoots,
     WaitForStrongCLD,
     WeakCLDRoots,
-    UpdateRS,
-    ScanHCC,
-    ScanRS,
-    OptScanRS,
+    MergeRS,
+    OptMergeRS,
+    MergeLB,
+    MergeHCC,
+    ScanHR,
+    OptScanHR,
     CodeRoots,
     OptCodeRoots,
     ObjCopy,
@@ -84,18 +86,24 @@
   static const GCParPhases ExtRootScanSubPhasesFirst = ThreadRoots;
   static const GCParPhases ExtRootScanSubPhasesLast = WeakCLDRoots;
 
-  enum GCScanRSWorkItems {
-    ScanRSScannedCards,
-    ScanRSClaimedCards,
-    ScanRSSkippedCards,
-    ScanRSScannedOptRefs,
-    ScanRSUsedMemory
+  enum GCMergeRSWorkTimes {
+    MergeRSMergedSparse,
+    MergeRSMergedFine,
+    MergeRSMergedCoarse
   };
 
-  enum GCUpdateRSWorkItems {
-    UpdateRSProcessedBuffers,
-    UpdateRSScannedCards,
-    UpdateRSSkippedCards
+  enum GCScanHRWorkItems {
+    ScanHRScannedCards,
+    ScanHRScannedBlocks,
+    ScanHRClaimedChunks,
+    ScanHRScannedOptRefs,
+    ScanHRUsedMemory
+  };
+
+  enum GCMergeLBWorkItems {
+    MergeLBProcessedBuffers,
+    MergeLBDirtyCards,
+    MergeLBSkippedCards
   };
 
   enum GCObjCopyWorkItems {
@@ -109,19 +117,27 @@
 
   WorkerDataArray<double>* _gc_par_phases[GCParPhasesSentinel];
 
-  WorkerDataArray<size_t>* _update_rs_processed_buffers;
-  WorkerDataArray<size_t>* _update_rs_scanned_cards;
-  WorkerDataArray<size_t>* _update_rs_skipped_cards;
+  WorkerDataArray<size_t>* _merge_rs_merged_sparse;
+  WorkerDataArray<size_t>* _merge_rs_merged_fine;
+  WorkerDataArray<size_t>* _merge_rs_merged_coarse;
+
+  WorkerDataArray<size_t>* _merge_lb_processed_buffers;
+  WorkerDataArray<size_t>* _merge_lb_dirty_cards;
+  WorkerDataArray<size_t>* _merge_lb_skipped_cards;
 
-  WorkerDataArray<size_t>* _scan_rs_scanned_cards;
-  WorkerDataArray<size_t>* _scan_rs_claimed_cards;
-  WorkerDataArray<size_t>* _scan_rs_skipped_cards;
+  WorkerDataArray<size_t>* _scan_hr_scanned_cards;
+  WorkerDataArray<size_t>* _scan_hr_scanned_blocks;
+  WorkerDataArray<size_t>* _scan_hr_claimed_chunks;
 
-  WorkerDataArray<size_t>* _opt_scan_rs_scanned_cards;
-  WorkerDataArray<size_t>* _opt_scan_rs_claimed_cards;
-  WorkerDataArray<size_t>* _opt_scan_rs_skipped_cards;
-  WorkerDataArray<size_t>* _opt_scan_rs_scanned_opt_refs;
-  WorkerDataArray<size_t>* _opt_scan_rs_used_memory;
+  WorkerDataArray<size_t>* _opt_merge_rs_merged_sparse;
+  WorkerDataArray<size_t>* _opt_merge_rs_merged_fine;
+  WorkerDataArray<size_t>* _opt_merge_rs_merged_coarse;
+
+  WorkerDataArray<size_t>* _opt_scan_hr_scanned_cards;
+  WorkerDataArray<size_t>* _opt_scan_hr_scanned_blocks;
+  WorkerDataArray<size_t>* _opt_scan_hr_claimed_chunks;
+  WorkerDataArray<size_t>* _opt_scan_hr_scanned_opt_refs;
+  WorkerDataArray<size_t>* _opt_scan_hr_used_memory;
 
   WorkerDataArray<size_t>* _obj_copy_lab_waste;
   WorkerDataArray<size_t>* _obj_copy_lab_undo_waste;
@@ -145,6 +161,9 @@
 
   double _cur_string_deduplication_time_ms;
 
+  double _cur_merge_heap_roots_time_ms;
+  double _cur_optional_merge_heap_roots_time_ms;
+
   double _cur_prepare_tlab_time_ms;
   double _cur_resize_tlab_time_ms;
 
@@ -159,6 +178,8 @@
 
   double _external_accounted_time_ms;
 
+  double _recorded_prepare_heap_roots_time_ms;
+
   double _recorded_clear_claimed_marks_time_ms;
 
   double _recorded_young_cset_choice_time_ms;
@@ -208,7 +229,8 @@
   void trace_count(const char* name, size_t value) const;
 
   double print_pre_evacuate_collection_set() const;
-  double print_evacuate_collection_set() const;
+  double print_merge_heap_roots_time() const;
+  double print_evacuate_initial_collection_set() const;
   double print_evacuate_optional_collection_set() const;
   double print_post_evacuate_collection_set() const;
   void print_other(double accounted_ms) const;
@@ -278,6 +300,14 @@
     _cur_strong_code_root_purge_time_ms = ms;
   }
 
+  void record_merge_heap_roots_time(double ms) {
+    _cur_merge_heap_roots_time_ms += ms;
+  }
+
+  void record_or_add_optional_merge_heap_roots_time(double ms) {
+    _cur_optional_merge_heap_roots_time_ms += ms;
+  }
+
   void record_evac_fail_recalc_used_time(double ms) {
     _cur_evac_fail_recalc_used = ms;
   }
@@ -357,6 +387,10 @@
     _external_accounted_time_ms += time_ms;
   }
 
+  void record_prepare_heap_roots_time_ms(double recorded_prepare_heap_roots_time_ms) {
+    _recorded_prepare_heap_roots_time_ms = recorded_prepare_heap_roots_time_ms;
+  }
+
   void record_clear_claimed_marks_time_ms(double recorded_clear_claimed_marks_time_ms) {
     _recorded_clear_claimed_marks_time_ms = recorded_clear_claimed_marks_time_ms;
   }
@@ -397,6 +431,10 @@
     return _cur_fast_reclaim_humongous_time_ms;
   }
 
+  size_t fast_reclaim_humongous_candidates() const {
+    return _cur_fast_reclaim_humongous_candidates;
+  }
+
   ReferenceProcessorPhaseTimes* ref_phase_times() { return &_ref_phase_times; }
 
   WeakProcessorPhaseTimes* weak_phase_times() { return &_weak_phase_times; }
@@ -424,8 +462,10 @@
   G1GCPhaseTimes* _phase_times;
   uint _worker_id;
   EventGCPhaseParallel _event;
+  bool _must_record;
+
 public:
-  G1GCParPhaseTimesTracker(G1GCPhaseTimes* phase_times, G1GCPhaseTimes::GCParPhases phase, uint worker_id);
+  G1GCParPhaseTimesTracker(G1GCPhaseTimes* phase_times, G1GCPhaseTimes::GCParPhases phase, uint worker_id, bool must_record = true);
   virtual ~G1GCParPhaseTimesTracker();
 };
 
--- a/src/hotspot/share/gc/g1/g1HeterogeneousHeapPolicy.cpp	Thu Jun 27 03:33:44 2019 +0200
+++ b/src/hotspot/share/gc/g1/g1HeterogeneousHeapPolicy.cpp	Thu Jun 27 11:48:32 2019 +0200
@@ -39,8 +39,8 @@
 }
 
 // After a collection pause, young list target length is updated. So we need to make sure we have enough regions in dram for young gen.
-void G1HeterogeneousHeapPolicy::record_collection_pause_end(double pause_time_ms, size_t cards_scanned, size_t heap_used_bytes_before_gc) {
-  G1Policy::record_collection_pause_end(pause_time_ms, cards_scanned, heap_used_bytes_before_gc);
+void G1HeterogeneousHeapPolicy::record_collection_pause_end(double pause_time_ms, size_t heap_used_bytes_before_gc) {
+  G1Policy::record_collection_pause_end(pause_time_ms, heap_used_bytes_before_gc);
   _manager->adjust_dram_regions((uint)young_list_target_length(), G1CollectedHeap::heap()->workers());
 }
 
--- a/src/hotspot/share/gc/g1/g1HeterogeneousHeapPolicy.hpp	Thu Jun 27 03:33:44 2019 +0200
+++ b/src/hotspot/share/gc/g1/g1HeterogeneousHeapPolicy.hpp	Thu Jun 27 11:48:32 2019 +0200
@@ -38,7 +38,7 @@
   // initialize policy
   virtual void init(G1CollectedHeap* g1h, G1CollectionSet* collection_set);
   // Record end of an evacuation pause.
-  virtual void record_collection_pause_end(double pause_time_ms, size_t cards_scanned, size_t heap_used_bytes_before_gc);
+  virtual void record_collection_pause_end(double pause_time_ms, size_t heap_used_bytes_before_gc);
   // Record the end of full collection.
   virtual void record_full_collection_end();
 
--- a/src/hotspot/share/gc/g1/g1ParScanThreadState.cpp	Thu Jun 27 03:33:44 2019 +0200
+++ b/src/hotspot/share/gc/g1/g1ParScanThreadState.cpp	Thu Jun 27 11:48:32 2019 +0200
@@ -51,6 +51,7 @@
     _tenuring_threshold(g1h->policy()->tenuring_threshold()),
     _scanner(g1h, this),
     _worker_id(worker_id),
+    _last_enqueued_card(SIZE_MAX),
     _stack_trim_upper_threshold(GCDrainStackTargetSize * 2 + 1),
     _stack_trim_lower_threshold(GCDrainStackTargetSize),
     _trim_ticks(),
@@ -371,7 +372,7 @@
     }
 
     size_t used_memory = pss->oops_into_optional_region(hr)->used_memory();
-    _g1h->phase_times()->record_or_add_thread_work_item(G1GCPhaseTimes::OptScanRS, worker_index, used_memory, G1GCPhaseTimes::ScanRSUsedMemory);
+    _g1h->phase_times()->record_or_add_thread_work_item(G1GCPhaseTimes::OptScanHR, worker_index, used_memory, G1GCPhaseTimes::ScanHRUsedMemory);
   }
 }
 
--- a/src/hotspot/share/gc/g1/g1ParScanThreadState.hpp	Thu Jun 27 03:33:44 2019 +0200
+++ b/src/hotspot/share/gc/g1/g1ParScanThreadState.hpp	Thu Jun 27 11:48:32 2019 +0200
@@ -60,6 +60,10 @@
 
   uint _worker_id;
 
+  // Remember the last enqueued card to avoid enqueuing the same card over and over;
+  // since we only ever scan a card once, this is sufficient.
+  size_t _last_enqueued_card;
+
   // Upper and lower threshold to start and end work queue draining.
   uint const _stack_trim_upper_threshold;
   uint const _stack_trim_lower_threshold;
@@ -128,8 +132,9 @@
     }
     size_t card_index = ct()->index_for(p);
     // If the card hasn't been added to the buffer, do it.
-    if (ct()->mark_card_deferred(card_index)) {
+    if (_last_enqueued_card != card_index) {
       dirty_card_queue().enqueue(ct()->byte_for_index(card_index));
+      _last_enqueued_card = card_index;
     }
   }
 
--- a/src/hotspot/share/gc/g1/g1Policy.cpp	Thu Jun 27 03:33:44 2019 +0200
+++ b/src/hotspot/share/gc/g1/g1Policy.cpp	Thu Jun 27 11:48:32 2019 +0200
@@ -572,10 +572,24 @@
   return result;
 }
 
+double G1Policy::log_buffer_processing_time() const {
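+  // Estimate of the time spent on log buffer cards: the share of the total card
+  // scan time proportional to the number of dirty cards contributed by the log
+  // buffers, plus the time to merge the log buffers themselves.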
+  double all_cards_processing_time = average_time_ms(G1GCPhaseTimes::ScanHR) + average_time_ms(G1GCPhaseTimes::OptScanHR);
+  size_t log_buffer_dirty_cards = phase_times()->sum_thread_work_items(G1GCPhaseTimes::MergeLB, G1GCPhaseTimes::MergeLBDirtyCards);
+  size_t scan_heap_roots_cards = phase_times()->sum_thread_work_items(G1GCPhaseTimes::ScanHR, G1GCPhaseTimes::ScanHRScannedCards) +
+                                 phase_times()->sum_thread_work_items(G1GCPhaseTimes::OptScanHR, G1GCPhaseTimes::ScanHRScannedCards);
+  // This can happen if there are duplicate cards in different log buffers; in that
+  // case attribute all of the card scan time to the log buffers.
+  if (log_buffer_dirty_cards > scan_heap_roots_cards) {
+    return all_cards_processing_time + average_time_ms(G1GCPhaseTimes::MergeLB);
+  }
+  return (all_cards_processing_time * log_buffer_dirty_cards / scan_heap_roots_cards) + average_time_ms(G1GCPhaseTimes::MergeLB);
+}
+
 // Anything below that is considered to be zero
 #define MIN_TIMER_GRANULARITY 0.0000001
 
-void G1Policy::record_collection_pause_end(double pause_time_ms, size_t cards_scanned, size_t heap_used_bytes_before_gc) {
+void G1Policy::record_collection_pause_end(double pause_time_ms, size_t heap_used_bytes_before_gc) {
+  G1GCPhaseTimes* p = phase_times();
+
   double end_time_sec = os::elapsedTime();
 
   assert_used_and_recalculate_used_equal(_g1h);
@@ -645,29 +659,40 @@
   _short_lived_surv_rate_group->start_adding_regions();
   // Do that for any other surv rate groups
 
-  double scan_hcc_time_ms = G1HotCardCache::default_use_cache() ? average_time_ms(G1GCPhaseTimes::ScanHCC) : 0.0;
+  double scan_hcc_time_ms = G1HotCardCache::default_use_cache() ? average_time_ms(G1GCPhaseTimes::MergeHCC) : 0.0;
 
   if (update_stats) {
-    double cost_per_card_ms = 0.0;
-    if (_pending_cards > 0) {
-      cost_per_card_ms = (average_time_ms(G1GCPhaseTimes::UpdateRS)) / (double) _pending_cards;
-      _analytics->report_cost_per_card_ms(cost_per_card_ms);
+    double cost_per_log_buffer_entry = 0.0;
+    size_t const pending_log_buffer_entries = p->sum_thread_work_items(G1GCPhaseTimes::MergeLB, G1GCPhaseTimes::MergeLBDirtyCards);
+    if (pending_log_buffer_entries > 0) {
+      cost_per_log_buffer_entry = log_buffer_processing_time() / pending_log_buffer_entries;
+      _analytics->report_cost_per_log_buffer_entry_ms(cost_per_log_buffer_entry);
     }
     _analytics->report_cost_scan_hcc(scan_hcc_time_ms);
 
-    double cost_per_entry_ms = 0.0;
-    if (cards_scanned > 10) {
-      double avg_time_scan_rs = average_time_ms(G1GCPhaseTimes::ScanRS);
-      if (this_pause_was_young_only) {
-        avg_time_scan_rs += average_time_ms(G1GCPhaseTimes::OptScanRS);
-      }
-      cost_per_entry_ms = avg_time_scan_rs / cards_scanned;
-      _analytics->report_cost_per_entry_ms(cost_per_entry_ms, this_pause_was_young_only);
+    size_t const total_cards_scanned = p->sum_thread_work_items(G1GCPhaseTimes::ScanHR, G1GCPhaseTimes::ScanHRScannedCards) +
+                                       p->sum_thread_work_items(G1GCPhaseTimes::OptScanHR, G1GCPhaseTimes::ScanHRScannedCards);
+    size_t remset_cards_scanned = 0;
+    // There might have been duplicate log buffer entries in the queues which could
+    // increase this value beyond the cards scanned. In this case attribute all cards
+    // to the log buffers.
+    if (pending_log_buffer_entries <= total_cards_scanned) {
+      remset_cards_scanned = total_cards_scanned - pending_log_buffer_entries;
+    }
+
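+    // Cost per remembered set card: the proportional share of the card scan time
+    // attributed to remembered set cards, plus the remembered set merge time,
+    // divided by the number of remembered set cards scanned.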
+    double cost_per_remset_card_ms = 0.0;
+    if (remset_cards_scanned > 10) {
+      double avg_time_remset_scan = ((average_time_ms(G1GCPhaseTimes::ScanHR) + average_time_ms(G1GCPhaseTimes::OptScanHR)) *
+                                     remset_cards_scanned / total_cards_scanned) +
+                                    average_time_ms(G1GCPhaseTimes::MergeRS);
+
+      cost_per_remset_card_ms = avg_time_remset_scan / remset_cards_scanned;
+      _analytics->report_cost_per_remset_card_ms(cost_per_remset_card_ms, this_pause_was_young_only);
     }
 
     if (_max_rs_lengths > 0) {
       double cards_per_entry_ratio =
-        (double) cards_scanned / (double) _max_rs_lengths;
+        (double) remset_cards_scanned / (double) _max_rs_lengths;
       _analytics->report_cards_per_entry_ratio(cards_per_entry_ratio, this_pause_was_young_only);
     }
 
@@ -759,20 +784,26 @@
   }
 
   // Note that _mmu_tracker->max_gc_time() returns the time in seconds.
-  double update_rs_time_goal_ms = _mmu_tracker->max_gc_time() * MILLIUNITS * G1RSetUpdatingPauseTimePercent / 100.0;
+  double scan_log_buffer_time_goal_ms = _mmu_tracker->max_gc_time() * MILLIUNITS * G1RSetUpdatingPauseTimePercent / 100.0;
 
-  if (update_rs_time_goal_ms < scan_hcc_time_ms) {
+  if (scan_log_buffer_time_goal_ms < scan_hcc_time_ms) {
     log_debug(gc, ergo, refine)("Adjust concurrent refinement thresholds (scanning the HCC expected to take longer than Update RS time goal)."
-                                "Update RS time goal: %1.2fms Scan HCC time: %1.2fms",
-                                update_rs_time_goal_ms, scan_hcc_time_ms);
+                                " Log Buffer Scan time goal: %1.2fms Scan HCC time: %1.2fms",
+                                scan_log_buffer_time_goal_ms, scan_hcc_time_ms);
 
-    update_rs_time_goal_ms = 0;
+    scan_log_buffer_time_goal_ms = 0;
   } else {
-    update_rs_time_goal_ms -= scan_hcc_time_ms;
+    scan_log_buffer_time_goal_ms -= scan_hcc_time_ms;
   }
-  _g1h->concurrent_refine()->adjust(average_time_ms(G1GCPhaseTimes::UpdateRS),
-                                    phase_times()->sum_thread_work_items(G1GCPhaseTimes::UpdateRS),
-                                    update_rs_time_goal_ms);
+
+  double const log_buffer_time = log_buffer_processing_time();
+
+  log_debug(gc, ergo, refine)("Concurrent refinement times: Log Buffer Scan time goal: %1.2fms Log Buffer Scan time: %1.2fms HCC time: %1.2fms",
+                              scan_log_buffer_time_goal_ms, log_buffer_time, scan_hcc_time_ms);
+
+  _g1h->concurrent_refine()->adjust(log_buffer_time,
+                                    phase_times()->sum_thread_work_items(G1GCPhaseTimes::MergeLB, G1GCPhaseTimes::MergeLBProcessedBuffers),
+                                    scan_log_buffer_time_goal_ms);
 }
 
 G1IHOPControl* G1Policy::create_ihop_control(const G1Predictions* predictor){
--- a/src/hotspot/share/gc/g1/g1Policy.hpp	Thu Jun 27 03:33:44 2019 +0200
+++ b/src/hotspot/share/gc/g1/g1Policy.hpp	Thu Jun 27 11:48:32 2019 +0200
@@ -111,6 +111,8 @@
   bool should_update_surv_rate_group_predictors() {
     return collector_state()->in_young_only_phase() && !collector_state()->mark_or_rebuild_in_progress();
   }
+
+  double log_buffer_processing_time() const;
 public:
   const G1Predictions& predictor() const { return _predictor; }
   const G1Analytics* analytics()   const { return const_cast<const G1Analytics*>(_analytics); }
@@ -311,7 +313,7 @@
 
   // Record the start and end of an evacuation pause.
   void record_collection_pause_start(double start_time_sec);
-  virtual void record_collection_pause_end(double pause_time_ms, size_t cards_scanned, size_t heap_used_bytes_before_gc);
+  virtual void record_collection_pause_end(double pause_time_ms, size_t heap_used_bytes_before_gc);
 
   // Record the start and end of a full collection.
   void record_full_collection_start();
--- a/src/hotspot/share/gc/g1/g1RemSet.cpp	Thu Jun 27 03:33:44 2019 +0200
+++ b/src/hotspot/share/gc/g1/g1RemSet.cpp	Thu Jun 27 11:48:32 2019 +0200
@@ -38,7 +38,8 @@
 #include "gc/g1/g1SharedDirtyCardQueue.hpp"
 #include "gc/g1/heapRegion.inline.hpp"
 #include "gc/g1/heapRegionManager.inline.hpp"
-#include "gc/g1/heapRegionRemSet.hpp"
+#include "gc/g1/heapRegionRemSet.inline.hpp"
+#include "gc/g1/sparsePRT.hpp"
 #include "gc/shared/gcTraceTime.inline.hpp"
 #include "gc/shared/suspendibleThreadSet.hpp"
 #include "jfr/jfrEvents.hpp"
@@ -52,40 +53,453 @@
 #include "utilities/stack.inline.hpp"
 #include "utilities/ticks.hpp"
 
-// Collects information about the overall remembered set scan progress during an evacuation.
+// Collects information about the overall heap root scan progress during an evacuation.
+//
+// Scanning the remembered sets works by first merging all sources of cards to be
+// scanned (log buffers, hcc, remembered sets) into a single data structure to remove
+// duplicates and simplify work distribution.
+//
+// During the following card scanning we not only scan this combined set of cards, but
+// also remember which of them have been completely scanned. Subsequent evacuation
+// passes must not scan these cards again, so this information needs to be preserved
+// across increments.
+//
+// The representation for all the cards to scan is the card table: cards can have
+// one of three states during GC:
+// - clean: these cards will not be scanned in this pass
+// - dirty: these cards will be scanned in this pass
+// - scanned: these cards have already been scanned in a previous pass
+//
+// After all evacuation is done, we reset the card table to clean.
+//
+// Work distribution occurs on a "chunk" basis, i.e. contiguous ranges of cards. As an
+// additional optimization, during card merging we remember which regions and which
+// chunks actually contain cards to be scanned. Threads iterate only across these
+// regions, and only compete for chunks containing any cards.
+//
+// Within these chunks, a worker scans the card table in "blocks" of cards, i.e.
+// contiguous ranges of dirty cards to be scanned. These blocks are converted to
+// memory ranges and then passed on for actual scanning.
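+//
+// A rough sketch of the per-pause flow (illustrative only, details elided):
+//
+//   prepare();                        // snapshot top() values, set up dirty region sets
+//   merge_heap_roots(workers, ...);   // remembered sets + log buffers -> card table
+//   /* evacuate, scanning the dirty cards recorded above */
+//   cleanup(workers);                 // clear the card table of all dirty regions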
 class G1RemSetScanState : public CHeapObj<mtGC> {
+  class G1DirtyRegions;
+
+  size_t _max_regions;
+
+  // Tracks for each region in the collection set whether it has already been processed.
+  typedef bool G1RemsetIterState;
+
+  G1RemsetIterState volatile* _collection_set_iter_state;
+
+  // Card table iteration claim for each heap region, from 0 (completely unscanned)
+  // to (>=) HeapRegion::CardsPerRegion (completely scanned).
+  uint volatile* _card_table_scan_state;
+
+  // An arbitrary power-of-two number of cards we want to claim per thread. This
+  // corresponds to a 64k memory work chunk area for every thread.
+  // We use the same claim size as Parallel GC. No particular measurements have been
+  // performed to determine an optimal number.
+  static const uint CardsPerChunk = 128;
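+  // With 512 byte cards, 128 cards cover 64k of heap per claimed chunk.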
+
+  uint _scan_chunks_per_region;
+  bool* _region_scan_chunks;
+  uint8_t _scan_chunks_shift;
+public:
+  uint scan_chunk_size() const { return (uint)1 << _scan_chunks_shift; }
+
+  // Returns whether the chunk corresponding to the given region/card within that
+  // region contains a dirty card, i.e. actually needs scanning.
+  bool chunk_needs_scan(uint const region_idx, uint const card_in_region) const {
+    size_t const idx = (size_t)region_idx * _scan_chunks_per_region + (card_in_region >> _scan_chunks_shift);
+    assert(idx < (_max_regions * _scan_chunks_per_region), "Index " SIZE_FORMAT " out of bounds " SIZE_FORMAT,
+           idx, _max_regions * _scan_chunks_per_region);
+    return _region_scan_chunks[idx];
+  }
+
 private:
+  // The complete set of regions whose card table needs to be cleared at the end of
+  // GC because we scribbled all over them.
+  G1DirtyRegions* _all_dirty_regions;
+  // The set of regions whose card table needs to be scanned for new dirty cards
+  // in the current evacuation pass.
+  G1DirtyRegions* _next_dirty_regions;
+
+  // Set of (unique) regions that can be added to concurrently.
+  class G1DirtyRegions : public CHeapObj<mtGC> {
+    uint* _buffer;
+    uint _cur_idx;
+    size_t _max_regions;
+
+    bool* _contains;
+
+  public:
+    G1DirtyRegions(size_t max_regions) :
+      _buffer(NEW_C_HEAP_ARRAY(uint, max_regions, mtGC)),
+      _cur_idx(0),
+      _max_regions(max_regions),
+      _contains(NEW_C_HEAP_ARRAY(bool, max_regions, mtGC)) {
+
+      reset();
+    }
+
+    static size_t chunk_size() { return M; }
+
+    ~G1DirtyRegions() {
+      FREE_C_HEAP_ARRAY(uint, _buffer);
+      FREE_C_HEAP_ARRAY(bool, _contains);
+    }
+
+    void reset() {
+      _cur_idx = 0;
+      ::memset(_contains, false, _max_regions * sizeof(bool));
+    }
+
+    uint size() const { return _cur_idx; }
+
+    uint at(uint idx) const {
+      assert(idx < _cur_idx, "Index %u beyond valid regions", idx);
+      return _buffer[idx];
+    }
+
+    void add_dirty_region(uint region) {
+      if (_contains[region]) {
+        return;
+      }
+
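+      // The check above is only a racy fast path; the CAS below decides the actual
+      // winner, and only that thread appends the region to the buffer.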
+      bool marked_as_dirty = Atomic::cmpxchg(true, &_contains[region], false) == false;
+      if (marked_as_dirty) {
+        uint allocated = Atomic::add(1u, &_cur_idx) - 1;
+        _buffer[allocated] = region;
+      }
+    }
+
+    // Creates the union of this and the other G1DirtyRegions.
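+    // Not thread-safe; only used outside of the parallel merge and scan phases.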
+    void merge(const G1DirtyRegions* other) {
+      for (uint i = 0; i < other->size(); i++) {
+        uint region = other->at(i);
+        if (!_contains[region]) {
+          _buffer[_cur_idx++] = region;
+          _contains[region] = true;
+        }
+      }
+    }
+  };
+
+  // Returns whether the given region contains cards we need to scan. The remembered
+  // set and other sources may contain cards that
+  // - are in uncommitted regions
+  // - are located in the collection set
+  // - are located in free regions
+  // as we do not clean up remembered sets before merging heap roots.
+  bool contains_cards_to_process(uint const region_idx) const {
+    HeapRegion* hr = G1CollectedHeap::heap()->region_at_or_null(region_idx);
+    return (hr != NULL && !hr->in_collection_set() && hr->is_old_or_humongous_or_archive());
+  }
+
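+  // Visitor for remembered sets, merging their contained cards into the card table
+  // while recording the regions and chunks that received dirty cards, and keeping
+  // per-granularity (sparse, fine, coarse) merge statistics.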
+  class G1MergeCardSetClosure : public HeapRegionClosure {
+    G1RemSetScanState* _scan_state;
+    G1CardTable* _ct;
+
+    uint _merged_sparse;
+    uint _merged_fine;
+    uint _merged_coarse;
+
+    // Returns whether the region contains cards we need to scan. If so, remembers
+    // that region in the current set of dirty regions.
+    bool remember_if_interesting(uint const region_idx) {
+      if (!_scan_state->contains_cards_to_process(region_idx)) {
+        return false;
+      }
+      _scan_state->add_dirty_region(region_idx);
+      return true;
+    }
+  public:
+    G1MergeCardSetClosure(G1RemSetScanState* scan_state) :
+      _scan_state(scan_state),
+      _ct(G1CollectedHeap::heap()->card_table()),
+      _merged_sparse(0),
+      _merged_fine(0),
+      _merged_coarse(0) { }
+
+    void next_coarse_prt(uint const region_idx) {
+      if (!remember_if_interesting(region_idx)) {
+        return;
+      }
+
+      _merged_coarse++;
+
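+      // A coarse entry means the whole referencing region may contain cards of
+      // interest, so dirty the region's entire card range.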
+      size_t region_base_idx = (size_t)region_idx << HeapRegion::LogCardsPerRegion;
+      _ct->mark_region_dirty(region_base_idx, HeapRegion::CardsPerRegion);
+      _scan_state->set_chunk_region_dirty(region_base_idx);
+    }
+
+    void next_fine_prt(uint const region_idx, BitMap* bm) {
+      if (!remember_if_interesting(region_idx)) {
+        return;
+      }
+
+      _merged_fine++;
+
+      size_t const region_base_idx = (size_t)region_idx << HeapRegion::LogCardsPerRegion;
+      BitMap::idx_t cur = bm->get_next_one_offset(0);
+      while (cur != bm->size()) {
+        _ct->mark_clean_as_dirty(region_base_idx + cur);
+        _scan_state->set_chunk_dirty(region_base_idx + cur);
+        cur = bm->get_next_one_offset(cur + 1);
+      }
+    }
+
+    void next_sparse_prt(uint const region_idx, SparsePRTEntry::card_elem_t* cards, uint const num_cards) {
+      if (!remember_if_interesting(region_idx)) {
+        return;
+      }
+
+      _merged_sparse++;
+
+      size_t const region_base_idx = (size_t)region_idx << HeapRegion::LogCardsPerRegion;
+      for (uint i = 0; i < num_cards; i++) {
+        size_t card_idx = region_base_idx + cards[i];
+        _ct->mark_clean_as_dirty(card_idx);
+        _scan_state->set_chunk_dirty(card_idx);
+      }
+    }
+
+    virtual bool do_heap_region(HeapRegion* r) {
+      assert(r->in_collection_set() || r->is_starts_humongous(), "must be");
+
+      HeapRegionRemSet* rem_set = r->rem_set();
+      if (!rem_set->is_empty()) {
+        rem_set->iterate_prts(*this);
+      }
+
+      return false;
+    }
+
+    size_t merged_sparse() const { return _merged_sparse; }
+    size_t merged_fine() const { return _merged_fine; }
+    size_t merged_coarse() const { return _merged_coarse; }
+  };
+
+  // Visitor for the remembered sets of humongous candidate regions to merge their
+  // remembered set into the card table.
+  class G1FlushHumongousCandidateRemSets : public HeapRegionClosure {
+    G1MergeCardSetClosure _cl;
+
+  public:
+    G1FlushHumongousCandidateRemSets(G1RemSetScanState* scan_state) : _cl(scan_state) { }
+
+    virtual bool do_heap_region(HeapRegion* r) {
+      G1CollectedHeap* g1h = G1CollectedHeap::heap();
+
+      if (!r->is_starts_humongous() ||
+          !g1h->region_attr(r->hrm_index()).is_humongous() ||
+          r->rem_set()->is_empty()) {
+        return false;
+      }
+
+      guarantee(r->rem_set()->occupancy_less_or_equal_than(G1RSetSparseRegionEntries),
+                "Found a not-small remembered set here. This is inconsistent with previous assumptions.");
+
+      _cl.do_heap_region(r);
+
+      // We should only clear the card based remembered set here as we will not
+      // implicitly rebuild anything else during eager reclaim. Note that at the moment
+      // (and probably never) we do not enter this path if there are other kinds of
+      // remembered sets for this region.
+      r->rem_set()->clear_locked(true /* only_cardset */);
+      // The clear_locked() call above sets the state to Empty. However we want to continue
+      // collecting remembered set entries for humongous regions that were not
+      // reclaimed.
+      r->rem_set()->set_state_complete();
+#ifdef ASSERT
+      G1HeapRegionAttr region_attr = g1h->region_attr(r->hrm_index());
+      assert(region_attr.needs_remset_update(), "must be");
+#endif
+      assert(r->rem_set()->is_empty(), "At this point any humongous candidate remembered set must be empty.");
+
+      return false;
+    }
+
+    size_t merged_sparse() const { return _cl.merged_sparse(); }
+    size_t merged_fine() const { return _cl.merged_fine(); }
+    size_t merged_coarse() const { return _cl.merged_coarse(); }
+  };
+
+  // Visitor for the log buffer entries to merge them into the card table.
+  class G1MergeLogBufferCardsClosure : public G1CardTableEntryClosure {
+    G1RemSetScanState* _scan_state;
+    G1CardTable* _ct;
+
+    size_t _cards_dirty;
+    size_t _cards_skipped;
+  public:
+    G1MergeLogBufferCardsClosure(G1CollectedHeap* g1h, G1RemSetScanState* scan_state) :
+      _scan_state(scan_state), _ct(g1h->card_table()), _cards_dirty(0), _cards_skipped(0)
+    {}
+
+    bool do_card_ptr(CardValue* card_ptr, uint worker_i) {
+      // The only time we care about recording cards that
+      // contain references that point into the collection set
+      // is during RSet updating within an evacuation pause.
+      // In this case worker_i should be the id of a GC worker thread.
+      assert(SafepointSynchronize::is_at_safepoint(), "not during an evacuation pause");
+
+      uint const region_idx = _ct->region_idx_for(card_ptr);
+
+      // The second clause must come after the first: the log buffers might contain
+      // cards for uncommitted regions.
+      // This code may count duplicate entries in the log buffers (even if rare)
+      // multiple times.
+      if (_scan_state->contains_cards_to_process(region_idx) && (*card_ptr == G1CardTable::dirty_card_val())) {
+        _scan_state->add_dirty_region(region_idx);
+        _scan_state->set_chunk_dirty(_ct->index_for_cardvalue(card_ptr));
+        _cards_dirty++;
+      } else {
+        // We may have had dirty cards in the (initial) collection set (or the
+        // young regions which are always in the initial collection set). We do
+        // not fix their cards here: we already added these regions to the set of
+        // regions to clear the card table at the end during the prepare() phase.
+        _cards_skipped++;
+      }
+      return true;
+    }
+
+    size_t cards_dirty() const { return _cards_dirty; }
+    size_t cards_skipped() const { return _cards_skipped; }
+  };
+
+  class G1MergeHeapRootsTask : public AbstractGangTask {
+    HeapRegionClaimer _hr_claimer;
+    G1RemSetScanState* _scan_state;
+    bool _remembered_set_only;
+
+    G1GCPhaseTimes::GCParPhases _merge_phase;
+
+    volatile bool _fast_reclaim_handled;
+
+  public:
+    G1MergeHeapRootsTask(G1RemSetScanState* scan_state, uint num_workers, bool remembered_set_only, G1GCPhaseTimes::GCParPhases merge_phase) :
+      AbstractGangTask("G1 Merge Heap Roots"),
+      _hr_claimer(num_workers),
+      _scan_state(scan_state),
+      _remembered_set_only(remembered_set_only),
+      _merge_phase(merge_phase),
+      _fast_reclaim_handled(false) { }
+
+    virtual void work(uint worker_id) {
+      G1CollectedHeap* g1h = G1CollectedHeap::heap();
+      G1GCPhaseTimes* p = g1h->phase_times();
+
+      // We schedule flushing the remembered sets of humongous fast reclaim candidates
+      // onto the card table first to allow the remaining parallelized tasks to hide it.
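+      // The CAS on _fast_reclaim_handled ensures exactly one worker does the flush.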
+      if (!_remembered_set_only &&
+          p->fast_reclaim_humongous_candidates() > 0 &&
+          !_fast_reclaim_handled &&
+          !Atomic::cmpxchg(true, &_fast_reclaim_handled, false)) {
+
+        G1FlushHumongousCandidateRemSets cl(_scan_state);
+        g1h->heap_region_iterate(&cl);
+
+        p->record_or_add_thread_work_item(_merge_phase, worker_id, cl.merged_sparse(), G1GCPhaseTimes::MergeRSMergedSparse);
+        p->record_or_add_thread_work_item(_merge_phase, worker_id, cl.merged_fine(), G1GCPhaseTimes::MergeRSMergedFine);
+        p->record_or_add_thread_work_item(_merge_phase, worker_id, cl.merged_coarse(), G1GCPhaseTimes::MergeRSMergedCoarse);
+      }
+
+      // Merge remembered sets of current candidates.
+      {
+        G1GCParPhaseTimesTracker x(p, _merge_phase, worker_id, !_remembered_set_only /* must_record */);
+        G1MergeCardSetClosure cl(_scan_state);
+        g1h->collection_set_iterate_increment_from(&cl, &_hr_claimer, worker_id);
+
+        p->record_or_add_thread_work_item(_merge_phase, worker_id, cl.merged_sparse(), G1GCPhaseTimes::MergeRSMergedSparse);
+        p->record_or_add_thread_work_item(_merge_phase, worker_id, cl.merged_fine(), G1GCPhaseTimes::MergeRSMergedFine);
+        p->record_or_add_thread_work_item(_merge_phase, worker_id, cl.merged_coarse(), G1GCPhaseTimes::MergeRSMergedCoarse);
+      }
+
+      // Apply closure to log entries in the HCC.
+      if (!_remembered_set_only && G1HotCardCache::default_use_cache()) {
+        assert(_merge_phase == G1GCPhaseTimes::MergeRS, "Wrong merge phase");
+        G1GCParPhaseTimesTracker x(p, G1GCPhaseTimes::MergeHCC, worker_id);
+        G1MergeLogBufferCardsClosure cl(g1h, _scan_state);
+        g1h->iterate_hcc_closure(&cl, worker_id);
+      }
+
+      // Now apply the closure to all remaining log entries.
+      if (!_remembered_set_only) {
+        assert(_merge_phase == G1GCPhaseTimes::MergeRS, "Wrong merge phase");
+        G1GCParPhaseTimesTracker x(p, G1GCPhaseTimes::MergeLB, worker_id);
+
+        G1MergeLogBufferCardsClosure cl(g1h, _scan_state);
+        g1h->iterate_dirty_card_closure(&cl, worker_id);
+
+        p->record_thread_work_item(G1GCPhaseTimes::MergeLB, worker_id, cl.cards_dirty(), G1GCPhaseTimes::MergeLBDirtyCards);
+        p->record_thread_work_item(G1GCPhaseTimes::MergeLB, worker_id, cl.cards_skipped(), G1GCPhaseTimes::MergeLBSkippedCards);
+      }
+    }
+  };
+
+  // Creates a snapshot of the current _top values at the start of collection to
+  // filter out card marks that we do not want to scan.
+  class G1ResetScanTopClosure : public HeapRegionClosure {
+    G1RemSetScanState* _scan_state;
+
+  public:
+    G1ResetScanTopClosure(G1RemSetScanState* scan_state) : _scan_state(scan_state) { }
+
+    virtual bool do_heap_region(HeapRegion* r) {
+      uint hrm_index = r->hrm_index();
+      if (r->in_collection_set()) {
+        // Young regions had their card table marked as young at their allocation;
+        // we need to make sure that these marks are cleared at the end of GC, *but*
+        // they should not be scanned for cards.
+        // So directly add them to the "all_dirty_regions".
+        // Same for regions in the (initial) collection set: they may contain cards from
+        // the log buffers, make sure they are cleaned.
+        _scan_state->add_all_dirty_region(hrm_index);
+      } else if (r->is_old_or_humongous_or_archive()) {
+        _scan_state->set_scan_top(hrm_index, r->top());
+      }
+      return false;
+    }
+  };
+
+  // For each region, contains the maximum top() value to be used during this garbage
+  // collection. Subsumes common checks like filtering out everything but old and
+  // humongous regions outside the collection set.
+  // This is valid because we are not interested in scanning stray remembered set
+  // entries from free or archive regions.
+  HeapWord** _scan_top;
+
   class G1ClearCardTableTask : public AbstractGangTask {
     G1CollectedHeap* _g1h;
-    uint* _dirty_region_list;
-    size_t _num_dirty_regions;
-    size_t _chunk_length;
+    G1DirtyRegions* _regions;
+    uint _chunk_length;
 
-    size_t volatile _cur_dirty_regions;
+    uint volatile _cur_dirty_regions;
+
+    G1RemSetScanState* _scan_state;
+
   public:
     G1ClearCardTableTask(G1CollectedHeap* g1h,
-                         uint* dirty_region_list,
-                         size_t num_dirty_regions,
-                         size_t chunk_length) :
+                         G1DirtyRegions* regions,
+                         uint chunk_length,
+                         G1RemSetScanState* scan_state) :
       AbstractGangTask("G1 Clear Card Table Task"),
       _g1h(g1h),
-      _dirty_region_list(dirty_region_list),
-      _num_dirty_regions(num_dirty_regions),
+      _regions(regions),
       _chunk_length(chunk_length),
-      _cur_dirty_regions(0) {
+      _cur_dirty_regions(0),
+      _scan_state(scan_state) {
 
       assert(chunk_length > 0, "must be");
     }
 
-    static size_t chunk_size() { return M; }
+    static uint chunk_size() { return M; }
 
     void work(uint worker_id) {
-      while (_cur_dirty_regions < _num_dirty_regions) {
-        size_t next = Atomic::add(_chunk_length, &_cur_dirty_regions) - _chunk_length;
-        size_t max = MIN2(next + _chunk_length, _num_dirty_regions);
+      while (_cur_dirty_regions < _regions->size()) {
+        uint next = Atomic::add(_chunk_length, &_cur_dirty_regions) - _chunk_length;
+        uint max = MIN2(next + _chunk_length, _regions->size());
 
-        for (size_t i = next; i < max; i++) {
-          HeapRegion* r = _g1h->region_at(_dirty_region_list[i]);
+        for (uint i = next; i < max; i++) {
+          HeapRegion* r = _g1h->region_at(_regions->at(i));
           if (!r->is_survivor()) {
             r->clear_cardtable();
           }
@@ -94,159 +508,222 @@
     }
   };
 
-  size_t _max_regions;
-
-  // Scan progress for the remembered set of a single region. Transitions from
-  // Unclaimed -> Claimed -> Complete.
-  // At each of the transitions the thread that does the transition needs to perform
-  // some special action once. This is the reason for the extra "Claimed" state.
-  typedef jint G1RemsetIterState;
-
-  static const G1RemsetIterState Unclaimed = 0; // The remembered set has not been scanned yet.
-  static const G1RemsetIterState Claimed = 1;   // The remembered set is currently being scanned.
-  static const G1RemsetIterState Complete = 2;  // The remembered set has been completely scanned.
+  // Clear the card table of "dirty" regions.
+  void clear_card_table(WorkGang* workers) {
+    uint num_regions = _all_dirty_regions->size();
 
-  G1RemsetIterState volatile* _iter_states;
-  // The current location where the next thread should continue scanning in a region's
-  // remembered set.
-  size_t volatile* _iter_claims;
+    if (num_regions == 0) {
+      return;
+    }
 
-  // Temporary buffer holding the regions we used to store remembered set scan duplicate
-  // information. These are also called "dirty". Valid entries are from [0.._cur_dirty_region)
-  uint* _dirty_region_buffer;
-
-  // Flag for every region whether it is in the _dirty_region_buffer already
-  // to avoid duplicates.
-  bool volatile* _in_dirty_region_buffer;
-  size_t _cur_dirty_region;
+    uint const num_chunks = (uint)(align_up((size_t)num_regions << HeapRegion::LogCardsPerRegion, G1ClearCardTableTask::chunk_size()) / G1ClearCardTableTask::chunk_size());
+    uint const num_workers = MIN2(num_chunks, workers->active_workers());
+    uint const chunk_length = G1ClearCardTableTask::chunk_size() / (uint)HeapRegion::CardsPerRegion;
 
-  // Creates a snapshot of the current _top values at the start of collection to
-  // filter out card marks that we do not want to scan.
-  class G1ResetScanTopClosure : public HeapRegionClosure {
-  private:
-    HeapWord** _scan_top;
-  public:
-    G1ResetScanTopClosure(HeapWord** scan_top) : _scan_top(scan_top) { }
+    // Iterate over the dirty cards region list.
+    G1ClearCardTableTask cl(G1CollectedHeap::heap(), _all_dirty_regions, chunk_length, this);
 
-    virtual bool do_heap_region(HeapRegion* r) {
-      uint hrm_index = r->hrm_index();
-      if (!r->in_collection_set() && r->is_old_or_humongous_or_archive() && !r->is_empty()) {
-        _scan_top[hrm_index] = r->top();
-      } else {
-        _scan_top[hrm_index] = NULL;
-      }
-      return false;
-    }
-  };
+    log_debug(gc, ergo)("Running %s using %u workers for %u "
+                        "units of work for %u regions.",
+                        cl.name(), num_workers, num_chunks, num_regions);
+    workers->run_task(&cl, num_workers);
 
-  // For each region, contains the maximum top() value to be used during this garbage
-  // collection. Subsumes common checks like filtering out everything but old and
-  // humongous regions outside the collection set.
-  // This is valid because we are not interested in scanning stray remembered set
-  // entries from free or archive regions.
-  HeapWord** _scan_top;
+#ifndef PRODUCT
+    G1CollectedHeap::heap()->verifier()->verify_card_table_cleanup();
+#endif
+  }
+
 public:
   G1RemSetScanState() :
     _max_regions(0),
-    _iter_states(NULL),
-    _iter_claims(NULL),
-    _dirty_region_buffer(NULL),
-    _in_dirty_region_buffer(NULL),
-    _cur_dirty_region(0),
+    _collection_set_iter_state(NULL),
+    _card_table_scan_state(NULL),
+    _scan_chunks_per_region((uint)(HeapRegion::CardsPerRegion / CardsPerChunk)),
+    _region_scan_chunks(NULL),
+    _scan_chunks_shift(0),
+    _all_dirty_regions(NULL),
+    _next_dirty_regions(NULL),
     _scan_top(NULL) {
   }
 
   ~G1RemSetScanState() {
-    if (_iter_states != NULL) {
-      FREE_C_HEAP_ARRAY(G1RemsetIterState, _iter_states);
-    }
-    if (_iter_claims != NULL) {
-      FREE_C_HEAP_ARRAY(size_t, _iter_claims);
-    }
-    if (_dirty_region_buffer != NULL) {
-      FREE_C_HEAP_ARRAY(uint, _dirty_region_buffer);
-    }
-    if (_in_dirty_region_buffer != NULL) {
-      FREE_C_HEAP_ARRAY(bool, _in_dirty_region_buffer);
-    }
-    if (_scan_top != NULL) {
-      FREE_C_HEAP_ARRAY(HeapWord*, _scan_top);
-    }
+    FREE_C_HEAP_ARRAY(G1RemsetIterState, _collection_set_iter_state);
+    FREE_C_HEAP_ARRAY(uint, _card_table_scan_state);
+    FREE_C_HEAP_ARRAY(bool, _region_scan_chunks);
+    FREE_C_HEAP_ARRAY(HeapWord*, _scan_top);
   }
 
-  void initialize(uint max_regions) {
-    assert(_iter_states == NULL, "Must not be initialized twice");
-    assert(_iter_claims == NULL, "Must not be initialized twice");
+  void initialize(size_t max_regions) {
+    assert(_collection_set_iter_state == NULL, "Must not be initialized twice");
     _max_regions = max_regions;
-    _iter_states = NEW_C_HEAP_ARRAY(G1RemsetIterState, max_regions, mtGC);
-    _iter_claims = NEW_C_HEAP_ARRAY(size_t, max_regions, mtGC);
-    _dirty_region_buffer = NEW_C_HEAP_ARRAY(uint, max_regions, mtGC);
-    _in_dirty_region_buffer = NEW_C_HEAP_ARRAY(bool, max_regions, mtGC);
+    _collection_set_iter_state = NEW_C_HEAP_ARRAY(G1RemsetIterState, max_regions, mtGC);
+    _card_table_scan_state = NEW_C_HEAP_ARRAY(uint, max_regions, mtGC);
+    _region_scan_chunks = NEW_C_HEAP_ARRAY(bool, max_regions * _scan_chunks_per_region, mtGC);
+
+    _scan_chunks_shift = (uint8_t)log2_intptr(HeapRegion::CardsPerRegion / _scan_chunks_per_region);
     _scan_top = NEW_C_HEAP_ARRAY(HeapWord*, max_regions, mtGC);
   }
 
-  void reset() {
-    for (uint i = 0; i < _max_regions; i++) {
-      _iter_states[i] = Unclaimed;
-      clear_scan_top(i);
+  void prepare() {
+    for (size_t i = 0; i < _max_regions; i++) {
+      _collection_set_iter_state[i] = false;
+      clear_scan_top((uint)i);
     }
 
-    G1ResetScanTopClosure cl(_scan_top);
+    _all_dirty_regions = new G1DirtyRegions(_max_regions);
+
+    G1ResetScanTopClosure cl(this);
     G1CollectedHeap::heap()->heap_region_iterate(&cl);
 
-    memset((void*)_iter_claims, 0, _max_regions * sizeof(size_t));
-    memset((void*)_in_dirty_region_buffer, false, _max_regions * sizeof(bool));
-    _cur_dirty_region = 0;
+    _next_dirty_regions = new G1DirtyRegions(_max_regions);
   }
 
-  // Attempt to claim the remembered set of the region for iteration. Returns true
-  // if this call caused the transition from Unclaimed to Claimed.
-  inline bool claim_iter(uint region) {
-    assert(region < _max_regions, "Tried to access invalid region %u", region);
-    if (_iter_states[region] != Unclaimed) {
-      return false;
+  void print_merge_heap_roots_stats() {
+    size_t num_scan_chunks = 0;
+    for (uint i = 0; i < _max_regions * _scan_chunks_per_region; i++) {
+      if (_region_scan_chunks[i]) {
+        num_scan_chunks++;
+      }
     }
-    G1RemsetIterState res = Atomic::cmpxchg(Claimed, &_iter_states[region], Unclaimed);
-    return (res == Unclaimed);
+    size_t num_visited_cards = num_scan_chunks * CardsPerChunk;
+    size_t total_dirty_region_cards = _next_dirty_regions->size() * HeapRegion::CardsPerRegion;
+
+    G1CollectedHeap* g1h = G1CollectedHeap::heap();
+    size_t total_old_region_cards =
+      (g1h->num_regions() - (g1h->num_free_regions() - g1h->collection_set()->cur_length())) * HeapRegion::CardsPerRegion;
+
+    log_debug(gc, remset)("Visited cards " SIZE_FORMAT " Total dirty " SIZE_FORMAT " (%.2lf%%) Total old " SIZE_FORMAT " (%.2lf%%)",
+                         num_visited_cards,
+                         total_dirty_region_cards,
+                         percent_of(num_visited_cards, total_dirty_region_cards),
+                         total_old_region_cards,
+                         percent_of(num_visited_cards, total_old_region_cards));
   }
 
-  // Try to atomically sets the iteration state to "complete". Returns true for the
-  // thread that caused the transition.
-  inline bool set_iter_complete(uint region) {
-    if (iter_is_complete(region)) {
-      return false;
+  void merge_heap_roots(WorkGang* workers, bool remembered_set_only, G1GCPhaseTimes::GCParPhases merge_phase) {
+    {
+      _all_dirty_regions->merge(_next_dirty_regions);
+      _next_dirty_regions->reset();
+      for (size_t i = 0; i < _max_regions; i++) {
+        _card_table_scan_state[i] = 0;
+      }
+
+      ::memset(_region_scan_chunks, false, _max_regions * _scan_chunks_per_region * sizeof(*_region_scan_chunks));
     }
-    G1RemsetIterState res = Atomic::cmpxchg(Complete, &_iter_states[region], Claimed);
-    return (res == Claimed);
+
+    size_t const increment_length = G1CollectedHeap::heap()->collection_set()->increment_length();
+
+    uint const num_workers = !remembered_set_only ? workers->active_workers() :
+                                                    MIN2(workers->active_workers(), (uint)increment_length);
+
+    {
+      G1MergeHeapRootsTask cl(this, num_workers, remembered_set_only, merge_phase);
+      log_debug(gc, ergo)("Running %s using %u workers for " SIZE_FORMAT " regions",
+                          cl.name(), num_workers, increment_length);
+      workers->run_task(&cl, num_workers);
+    }
+
+    if (log_is_enabled(Debug, gc, remset)) {
+      print_merge_heap_roots_stats();
+    }
   }
 
-  // Returns true if the region's iteration is complete.
-  inline bool iter_is_complete(uint region) const {
-    assert(region < _max_regions, "Tried to access invalid region %u", region);
-    return _iter_states[region] == Complete;
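+  // Marks all chunks of the given region as needing a scan; used when merging
+  // coarsened remembered set entries that dirty a whole region.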
+  void set_chunk_region_dirty(size_t const region_card_idx) {
+    size_t chunk_idx = region_card_idx >> _scan_chunks_shift;
+    for (uint i = 0; i < _scan_chunks_per_region; i++) {
+      _region_scan_chunks[chunk_idx++] = true;
+    }
+  }
+
+  void set_chunk_dirty(size_t const card_idx) {
+    assert((card_idx >> _scan_chunks_shift) < (_max_regions * _scan_chunks_per_region),
+           "Trying to access index " SIZE_FORMAT " out of bounds " SIZE_FORMAT,
+           card_idx >> _scan_chunks_shift, _max_regions * _scan_chunks_per_region);
+    size_t const chunk_idx = card_idx >> _scan_chunks_shift;
+    if (!_region_scan_chunks[chunk_idx]) {
+      _region_scan_chunks[chunk_idx] = true;
+    }
   }
 
-  // The current position within the remembered set of the given region.
-  inline size_t iter_claimed(uint region) const {
-    assert(region < _max_regions, "Tried to access invalid region %u", region);
-    return _iter_claims[region];
+  void cleanup(WorkGang* workers) {
+    _all_dirty_regions->merge(_next_dirty_regions);
+
+    clear_card_table(workers);
+
+    delete _all_dirty_regions;
+    _all_dirty_regions = NULL;
+
+    delete _next_dirty_regions;
+    _next_dirty_regions = NULL;
   }
 
-  // Claim the next block of cards within the remembered set of the region with
-  // step size.
-  inline size_t iter_claimed_next(uint region, size_t step) {
-    return Atomic::add(step, &_iter_claims[region]) - step;
-  }
+  void iterate_dirty_regions_from(HeapRegionClosure* cl, uint worker_id) {
+    uint num_regions = _next_dirty_regions->size();
 
-  void add_dirty_region(uint region) {
-    if (_in_dirty_region_buffer[region]) {
+    if (num_regions == 0) {
       return;
     }
 
-    if (!Atomic::cmpxchg(true, &_in_dirty_region_buffer[region], false)) {
-      size_t allocated = Atomic::add(1u, &_cur_dirty_region) - 1;
-      _dirty_region_buffer[allocated] = region;
+    G1CollectedHeap* g1h = G1CollectedHeap::heap();
+
+    WorkGang* workers = g1h->workers();
+    uint const max_workers = workers->active_workers();
+
+    uint const start_pos = num_regions * worker_id / max_workers;
+    uint cur = start_pos;
+
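+    // Each worker starts at its own offset into the dirty region list and wraps
+    // around, so that workers begin on disjoint regions and spread out the work.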
+    do {
+      bool result = cl->do_heap_region(g1h->region_at(_next_dirty_regions->at(cur)));
+      guarantee(!result, "Not allowed to ask for early termination.");
+      cur++;
+      if (cur == _next_dirty_regions->size()) {
+        cur = 0;
+      }
+    } while (cur != start_pos);
+  }
+
+  // Attempt to claim the given region in the collection set for iteration. Returns true
+  // if this call caused the transition from unclaimed to claimed.
+  inline bool claim_collection_set_region(uint region) {
+    assert(region < _max_regions, "Tried to access invalid region %u", region);
+    if (_collection_set_iter_state[region]) {
+      return false;
     }
+    return !Atomic::cmpxchg(true, &_collection_set_iter_state[region], false);
+  }
+
+  bool has_cards_to_scan(uint region) {
+    assert(region < _max_regions, "Tried to access invalid region %u", region);
+    return _card_table_scan_state[region] < HeapRegion::CardsPerRegion;
+  }
+
+  uint claim_cards_to_scan(uint region, uint increment) {
+    assert(region < _max_regions, "Tried to access invalid region %u", region);
+    return Atomic::add(increment, &_card_table_scan_state[region]) - increment;
+  }
+
+  void add_dirty_region(uint const region) {
+#ifdef ASSERT
+    HeapRegion* hr = G1CollectedHeap::heap()->region_at(region);
+    assert(!hr->in_collection_set() && hr->is_old_or_humongous_or_archive(),
+           "Region %u is not suitable for scanning, is %sin collection set or %s",
+           hr->hrm_index(), hr->in_collection_set() ? "" : "not ", hr->get_short_type_str());
+#endif
+    _next_dirty_regions->add_dirty_region(region);
+  }
+
+  void add_all_dirty_region(uint region) {
+#ifdef ASSERT
+    HeapRegion* hr = G1CollectedHeap::heap()->region_at(region);
+    assert(hr->in_collection_set(),
+           "Only add young regions to all dirty regions directly but %u is %s",
+           hr->hrm_index(), hr->get_short_type_str());
+#endif
+    _all_dirty_regions->add_dirty_region(region);
+  }
+
+  void set_scan_top(uint region_idx, HeapWord* value) {
+    _scan_top[region_idx] = value;
   }
 
   HeapWord* scan_top(uint region_idx) const {
@@ -254,30 +731,7 @@
   }
 
   void clear_scan_top(uint region_idx) {
-    _scan_top[region_idx] = NULL;
-  }
-
-  // Clear the card table of "dirty" regions.
-  void clear_card_table(WorkGang* workers) {
-    if (_cur_dirty_region == 0) {
-      return;
-    }
-
-    size_t const num_chunks = align_up(_cur_dirty_region * HeapRegion::CardsPerRegion, G1ClearCardTableTask::chunk_size()) / G1ClearCardTableTask::chunk_size();
-    uint const num_workers = (uint)MIN2(num_chunks, (size_t)workers->active_workers());
-    size_t const chunk_length = G1ClearCardTableTask::chunk_size() / HeapRegion::CardsPerRegion;
-
-    // Iterate over the dirty cards region list.
-    G1ClearCardTableTask cl(G1CollectedHeap::heap(), _dirty_region_buffer, _cur_dirty_region, chunk_length);
-
-    log_debug(gc, ergo)("Running %s using %u workers for " SIZE_FORMAT " "
-                        "units of work for " SIZE_FORMAT " regions.",
-                        cl.name(), num_workers, num_chunks, _cur_dirty_region);
-    workers->run_task(&cl, num_workers);
-
-#ifndef PRODUCT
-    G1CollectedHeap::heap()->verifier()->verify_card_table_cleanup();
-#endif
+    set_scan_top(region_idx, NULL);
   }
 };
 
@@ -294,9 +748,7 @@
 }
 
 G1RemSet::~G1RemSet() {
-  if (_scan_state != NULL) {
-    delete _scan_state;
-  }
+  delete _scan_state;
 }
 
 uint G1RemSet::num_par_rem_sets() {
@@ -308,181 +760,252 @@
   _scan_state->initialize(max_regions);
 }
 
-class G1ScanRSForRegionClosure : public HeapRegionClosure {
+// Helper class to find, on the card table, contiguous ranges of dirty cards that
+// need to be scanned.
+class G1CardTableScanner : public StackObj {
+public:
+  typedef CardTable::CardValue CardValue;
+
+private:
+  CardValue* const _base_addr;
+
+  CardValue* _cur_addr;
+  CardValue* const _end_addr;
+
+  static const size_t ToScanMask = G1CardTable::g1_card_already_scanned;
+  static const size_t ExpandedToScanMask = G1CardTable::WordAlreadyScanned;
+
+  bool cur_addr_aligned() const {
+    return ((uintptr_t)_cur_addr) % sizeof(size_t) == 0;
+  }
+
+  bool cur_card_is_dirty() const {
+    CardValue value = *_cur_addr;
+    return (value & ToScanMask) == 0;
+  }
+
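+  // Checking one size_t worth of card table entries at a time lets us skip, on
+  // 64 bit platforms, eight clean (or already scanned) cards per comparison.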
+  bool cur_word_of_cards_contains_any_dirty_card() const {
+    assert(cur_addr_aligned(), "Current address should be aligned");
+    size_t const value = *(size_t*)_cur_addr;
+    return (~value & ExpandedToScanMask) != 0;
+  }
+
+  bool cur_word_of_cards_all_dirty_cards() const {
+    size_t const value = *(size_t*)_cur_addr;
+    return value == G1CardTable::WordAllDirty;
+  }
+
+  size_t get_and_advance_pos() {
+    _cur_addr++;
+    return pointer_delta(_cur_addr, _base_addr, sizeof(CardValue)) - 1;
+  }
+
+public:
+  G1CardTableScanner(CardValue* start_card, size_t size) :
+    _base_addr(start_card),
+    _cur_addr(start_card),
+    _end_addr(start_card + size) {
+
+    assert(is_aligned(start_card, sizeof(size_t)), "Unaligned start addr " PTR_FORMAT, p2i(start_card));
+    assert(is_aligned(size, sizeof(size_t)), "Unaligned size " SIZE_FORMAT, size);
+  }
+
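+  // Returns the index of the next dirty card at or after the current position,
+  // or the size of the area if there is none; advances the cursor past it.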
+  size_t find_next_dirty() {
+    while (!cur_addr_aligned()) {
+      if (cur_card_is_dirty()) {
+        return get_and_advance_pos();
+      }
+      _cur_addr++;
+    }
+
+    assert(cur_addr_aligned(), "Current address should be aligned now.");
+    while (_cur_addr != _end_addr) {
+      if (cur_word_of_cards_contains_any_dirty_card()) {
+        for (size_t i = 0; i < sizeof(size_t); i++) {
+          if (cur_card_is_dirty()) {
+            return get_and_advance_pos();
+          }
+          _cur_addr++;
+        }
+        assert(false, "Should not reach here given we detected a dirty card in the word.");
+      }
+      _cur_addr += sizeof(size_t);
+    }
+    return get_and_advance_pos();
+  }
+
+  size_t find_next_non_dirty() {
+    assert(_cur_addr <= _end_addr, "Not allowed to search for marks after area.");
+
+    while (!cur_addr_aligned()) {
+      if (!cur_card_is_dirty()) {
+        return get_and_advance_pos();
+      }
+      _cur_addr++;
+    }
+
+    assert(cur_addr_aligned(), "Current address should be aligned now.");
+    while (_cur_addr != _end_addr) {
+      if (!cur_word_of_cards_all_dirty_cards()) {
+        for (size_t i = 0; i < sizeof(size_t); i++) {
+          if (!cur_card_is_dirty()) {
+            return get_and_advance_pos();
+          }
+          _cur_addr++;
+        }
+        assert(false, "Should not reach here given we detected a non-dirty card in the word.");
+      }
+      _cur_addr += sizeof(size_t);
+    }
+    return get_and_advance_pos();
+  }
+};
+
+// Helper class to claim dirty chunks within the card table.
+class G1CardTableChunkClaimer {
+  G1RemSetScanState* _scan_state;
+  uint _region_idx;
+  uint _cur_claim;
+
+public:
+  G1CardTableChunkClaimer(G1RemSetScanState* scan_state, uint region_idx) :
+    _scan_state(scan_state),
+    _region_idx(region_idx),
+    _cur_claim(0) {
+    guarantee(size() <= HeapRegion::CardsPerRegion, "Should not claim more space than possible.");
+  }
+
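+  // Repeatedly claims the next chunk of the region via atomic add until a chunk
+  // containing dirty cards is found or the region is exhausted.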
+  bool has_next() {
+    while (true) {
+      _cur_claim = _scan_state->claim_cards_to_scan(_region_idx, size());
+      if (_cur_claim >= HeapRegion::CardsPerRegion) {
+        return false;
+      }
+      if (_scan_state->chunk_needs_scan(_region_idx, _cur_claim)) {
+        return true;
+      }
+    }
+  }
+
+  uint value() const { return _cur_claim; }
+  uint size() const { return _scan_state->scan_chunk_size(); }
+};
+
+// Scans a heap region for dirty cards.
+class G1ScanHRForRegionClosure : public HeapRegionClosure {
   G1CollectedHeap* _g1h;
-  G1CardTable *_ct;
+  G1CardTable* _ct;
+  G1BlockOffsetTable* _bot;
 
   G1ParScanThreadState* _pss;
-  G1ScanCardClosure* _scan_objs_on_card_cl;
 
   G1RemSetScanState* _scan_state;
 
   G1GCPhaseTimes::GCParPhases _phase;
 
-  uint   _worker_i;
-
-  size_t _opt_refs_scanned;
-  size_t _opt_refs_memory_used;
+  uint   _worker_id;
 
   size_t _cards_scanned;
-  size_t _cards_claimed;
-  size_t _cards_skipped;
+  size_t _blocks_scanned;
+  size_t _chunks_claimed;
 
   Tickspan _rem_set_root_scan_time;
   Tickspan _rem_set_trim_partially_time;
 
-  Tickspan _strong_code_root_scan_time;
-  Tickspan _strong_code_trim_partially_time;
-
-  void claim_card(size_t card_index, const uint region_idx_for_card) {
-    _ct->set_card_claimed(card_index);
-    _scan_state->add_dirty_region(region_idx_for_card);
-  }
-
-  void scan_card(MemRegion mr, uint region_idx_for_card) {
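+  // Applies the scan closure to all objects on the given memory region of the
+  // specified heap region and partially trims the task queue afterwards.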
+  void scan_memregion(uint region_idx_for_card, MemRegion mr) {
     HeapRegion* const card_region = _g1h->region_at(region_idx_for_card);
-    assert(!card_region->is_young(), "Should not scan card in young region %u", region_idx_for_card);
-    card_region->oops_on_card_seq_iterate_careful<true>(mr, _scan_objs_on_card_cl);
-    _scan_objs_on_card_cl->trim_queue_partially();
-    _cards_scanned++;
+    G1ScanCardClosure card_cl(_g1h, _pss);
+    card_region->oops_on_card_seq_iterate_careful<true>(mr, &card_cl);
+    _pss->trim_queue_partially();
   }
 
-  void scan_opt_rem_set_roots(HeapRegion* r) {
-    EventGCPhaseParallel event;
-
-    G1OopStarChunkedList* opt_rem_set_list = _pss->oops_into_optional_region(r);
-
-    G1ScanCardClosure scan_cl(_g1h, _pss);
-    G1ScanRSForOptionalClosure cl(_g1h, &scan_cl);
-    _opt_refs_scanned += opt_rem_set_list->oops_do(&cl, _pss->closures()->raw_strong_oops());
-    _opt_refs_memory_used += opt_rem_set_list->used_memory();
-
-    event.commit(GCId::current(), _worker_i, G1GCPhaseTimes::phase_name(_phase));
-  }
-
-  void scan_rem_set_roots(HeapRegion* r) {
-    EventGCPhaseParallel event;
-    uint const region_idx = r->hrm_index();
-
-    if (_scan_state->claim_iter(region_idx)) {
-      // If we ever free the collection set concurrently, we should also
-      // clear the card table concurrently therefore we won't need to
-      // add regions of the collection set to the dirty cards region.
-      _scan_state->add_dirty_region(region_idx);
-    }
-
-    if (r->rem_set()->cardset_is_empty()) {
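+  // Scans the given claimed range of cards, clipping the scan at the region's
+  // scan limit (scan_top).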
+  void do_claimed_block(uint const region_idx_for_card, size_t const first_card, size_t const num_cards) {
+    HeapWord* const card_start = _bot->address_for_index_raw(first_card);
+#ifdef ASSERT
+    HeapRegion* hr = _g1h->region_at_or_null(region_idx_for_card);
+    assert(hr == NULL || hr->is_in_reserved(card_start),
+             "Card start " PTR_FORMAT " to scan outside of region %u", p2i(card_start), _g1h->region_at(region_idx_for_card)->hrm_index());
+#endif
+    HeapWord* const top = _scan_state->scan_top(region_idx_for_card);
+    if (card_start >= top) {
       return;
     }
 
-    // We claim cards in blocks so as to reduce the contention.
-    size_t const block_size = G1RSetScanBlockSize;
-
-    HeapRegionRemSetIterator iter(r->rem_set());
-    size_t card_index;
-
-    size_t claimed_card_block = _scan_state->iter_claimed_next(region_idx, block_size);
-    for (size_t current_card = 0; iter.has_next(card_index); current_card++) {
-      if (current_card >= claimed_card_block + block_size) {
-        claimed_card_block = _scan_state->iter_claimed_next(region_idx, block_size);
-      }
-      if (current_card < claimed_card_block) {
-        _cards_skipped++;
-        continue;
-      }
-      _cards_claimed++;
-
-      HeapWord* const card_start = _g1h->bot()->address_for_index_raw(card_index);
-      uint const region_idx_for_card = _g1h->addr_to_region(card_start);
+    MemRegion mr(card_start, MIN2(card_start + ((size_t)num_cards << BOTConstants::LogN_words), top));
+    scan_memregion(region_idx_for_card, mr);
 
-#ifdef ASSERT
-      HeapRegion* hr = _g1h->region_at_or_null(region_idx_for_card);
-      assert(hr == NULL || hr->is_in_reserved(card_start),
-             "Card start " PTR_FORMAT " to scan outside of region %u", p2i(card_start), _g1h->region_at(region_idx_for_card)->hrm_index());
-#endif
-      HeapWord* const top = _scan_state->scan_top(region_idx_for_card);
-      if (card_start >= top) {
-        continue;
-      }
+    _cards_scanned += num_cards;
+  }
 
-      // If the card is dirty, then G1 will scan it during Update RS.
-      if (_ct->is_card_claimed(card_index) || _ct->is_card_dirty(card_index)) {
-        continue;
-      }
-
-      // We claim lazily (so races are possible but they're benign), which reduces the
-      // number of duplicate scans (the rsets of the regions in the cset can intersect).
-      // Claim the card after checking bounds above: the remembered set may contain
-      // random cards into current survivor, and we would then have an incorrectly
-      // claimed card in survivor space. Card table clear does not reset the card table
-      // of survivor space regions.
-      claim_card(card_index, region_idx_for_card);
-
-      MemRegion const mr(card_start, MIN2(card_start + BOTConstants::N_words, top));
-
-      scan_card(mr, region_idx_for_card);
-    }
-    event.commit(GCId::current(), _worker_i, G1GCPhaseTimes::phase_name(_phase));
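+  // Marks the given block of cards as scanned on the card table, then scans it.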
+  ALWAYSINLINE void do_card_block(uint const region_idx, size_t const first_card, size_t const num_cards) {
+    _ct->mark_as_scanned(first_card, num_cards);
+    do_claimed_block(region_idx, first_card, num_cards);
+    _blocks_scanned++;
   }
 
-  void scan_strong_code_roots(HeapRegion* r) {
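+  // Scans the dirty cards of the given region by claiming chunks of its card
+  // table and scanning the ranges of dirty cards within each claimed chunk.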
+  void scan_heap_roots(HeapRegion* r) {
     EventGCPhaseParallel event;
-    // We pass a weak code blobs closure to the remembered set scanning because we want to avoid
-    // treating the nmethods visited to act as roots for concurrent marking.
-    // We only want to make sure that the oops in the nmethods are adjusted with regard to the
-    // objects copied by the current evacuation.
-    r->strong_code_roots_do(_pss->closures()->weak_codeblobs());
-    event.commit(GCId::current(), _worker_i, G1GCPhaseTimes::phase_name(G1GCPhaseTimes::CodeRoots));
+    uint const region_idx = r->hrm_index();
+
+    ResourceMark rm;
+
+    G1CardTableChunkClaimer claim(_scan_state, region_idx);
+
+    while (claim.has_next()) {
+      size_t const region_card_base_idx = ((size_t)region_idx << HeapRegion::LogCardsPerRegion) + claim.value();
+      CardTable::CardValue* const base_addr = _ct->byte_for_index(region_card_base_idx);
+
+      G1CardTableScanner scan(base_addr, claim.size());
+
+      size_t first_scan_idx = scan.find_next_dirty();
+      while (first_scan_idx != claim.size()) {
+        assert(*_ct->byte_for_index(region_card_base_idx + first_scan_idx) <= 0x1, "is %d at region %u idx " SIZE_FORMAT, *_ct->byte_for_index(region_card_base_idx + first_scan_idx), region_idx, first_scan_idx);
+
+        size_t const last_scan_idx = scan.find_next_non_dirty();
+        size_t const len = last_scan_idx - first_scan_idx;
+
+        do_card_block(region_idx, region_card_base_idx + first_scan_idx, len);
+
+        if (last_scan_idx == claim.size()) {
+          break;
+        }
+
+        first_scan_idx = scan.find_next_dirty();
+      }
+      _chunks_claimed++;
+    }
+
+    event.commit(GCId::current(), _worker_id, G1GCPhaseTimes::phase_name(G1GCPhaseTimes::ScanHR));
   }
 
 public:
-  G1ScanRSForRegionClosure(G1RemSetScanState* scan_state,
-                           G1ScanCardClosure* scan_obj_on_card,
+  G1ScanHRForRegionClosure(G1RemSetScanState* scan_state,
                            G1ParScanThreadState* pss,
-                           G1GCPhaseTimes::GCParPhases phase,
-                           uint worker_i) :
+                           uint worker_id,
+                           G1GCPhaseTimes::GCParPhases phase) :
     _g1h(G1CollectedHeap::heap()),
     _ct(_g1h->card_table()),
+    _bot(_g1h->bot()),
     _pss(pss),
-    _scan_objs_on_card_cl(scan_obj_on_card),
     _scan_state(scan_state),
     _phase(phase),
-    _worker_i(worker_i),
-    _opt_refs_scanned(0),
-    _opt_refs_memory_used(0),
+    _worker_id(worker_id),
     _cards_scanned(0),
-    _cards_claimed(0),
-    _cards_skipped(0),
+    _blocks_scanned(0),
+    _chunks_claimed(0),
     _rem_set_root_scan_time(),
-    _rem_set_trim_partially_time(),
-    _strong_code_root_scan_time(),
-    _strong_code_trim_partially_time() { }
+    _rem_set_trim_partially_time() {
+  }
 
   bool do_heap_region(HeapRegion* r) {
-    assert(r->in_collection_set(), "Region %u is not in the collection set.", r->hrm_index());
+    assert(!r->in_collection_set() && r->is_old_or_humongous_or_archive(),
+           "Should only be called on old gen non-collection set regions but region %u is not.",
+           r->hrm_index());
     uint const region_idx = r->hrm_index();
 
-    // The individual references for the optional remembered set are per-worker, so we
-    // always need to scan them.
-    if (r->has_index_in_opt_cset()) {
+    if (_scan_state->has_cards_to_scan(region_idx)) {
       G1EvacPhaseWithTrimTimeTracker timer(_pss, _rem_set_root_scan_time, _rem_set_trim_partially_time);
-      scan_opt_rem_set_roots(r);
-    }
-
-    // Do an early out if we know we are complete.
-    if (_scan_state->iter_is_complete(region_idx)) {
-      return false;
-    }
-
-    {
-      G1EvacPhaseWithTrimTimeTracker timer(_pss, _rem_set_root_scan_time, _rem_set_trim_partially_time);
-      scan_rem_set_roots(r);
-    }
-
-    if (_scan_state->set_iter_complete(region_idx)) {
-      G1EvacPhaseWithTrimTimeTracker timer(_pss, _strong_code_root_scan_time, _strong_code_trim_partially_time);
-      // Scan the strong code root list attached to the current region
-      scan_strong_code_roots(r);
+      scan_heap_roots(r);
     }
     return false;
   }
@@ -490,120 +1013,156 @@
   Tickspan rem_set_root_scan_time() const { return _rem_set_root_scan_time; }
   Tickspan rem_set_trim_partially_time() const { return _rem_set_trim_partially_time; }
 
+  size_t cards_scanned() const { return _cards_scanned; }
+  size_t blocks_scanned() const { return _blocks_scanned; }
+  size_t chunks_claimed() const { return _chunks_claimed; }
+};
+
+void G1RemSet::scan_heap_roots(G1ParScanThreadState* pss,
+                               uint worker_id,
+                               G1GCPhaseTimes::GCParPhases scan_phase,
+                               G1GCPhaseTimes::GCParPhases objcopy_phase) {
+  G1ScanHRForRegionClosure cl(_scan_state, pss, worker_id, scan_phase);
+  _scan_state->iterate_dirty_regions_from(&cl, worker_id);
+
+  G1GCPhaseTimes* p = _g1p->phase_times();
+
+  p->record_or_add_time_secs(objcopy_phase, worker_id, cl.rem_set_trim_partially_time().seconds());
+
+  p->record_or_add_time_secs(scan_phase, worker_id, cl.rem_set_root_scan_time().seconds());
+  p->record_or_add_thread_work_item(scan_phase, worker_id, cl.cards_scanned(), G1GCPhaseTimes::ScanHRScannedCards);
+  p->record_or_add_thread_work_item(scan_phase, worker_id, cl.blocks_scanned(), G1GCPhaseTimes::ScanHRScannedBlocks);
+  p->record_or_add_thread_work_item(scan_phase, worker_id, cl.chunks_claimed(), G1GCPhaseTimes::ScanHRClaimedChunks);
+}
+
+// Heap region closure to be applied to all regions in the current collection set
+// increment to scan the non-card based roots: the per-worker lists of references
+// into the optional collection set and the strong code roots of each region.
+class G1ScanCollectionSetRegionClosure : public HeapRegionClosure {
+  G1ParScanThreadState* _pss;
+  G1RemSetScanState* _scan_state;
+
+  G1GCPhaseTimes::GCParPhases _scan_phase;
+  G1GCPhaseTimes::GCParPhases _code_roots_phase;
+
+  uint _worker_id;
+
+  size_t _opt_refs_scanned;
+  size_t _opt_refs_memory_used;
+
+  Tickspan _strong_code_root_scan_time;
+  Tickspan _strong_code_trim_partially_time;
+
+  Tickspan _rem_set_opt_root_scan_time;
+  Tickspan _rem_set_opt_trim_partially_time;
+
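+  // Scans the references into this optional collection set region that were
+  // recorded by this worker during evacuation of the initial collection set.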
+  void scan_opt_rem_set_roots(HeapRegion* r) {
+    EventGCPhaseParallel event;
+
+    G1OopStarChunkedList* opt_rem_set_list = _pss->oops_into_optional_region(r);
+
+    G1ScanCardClosure scan_cl(G1CollectedHeap::heap(), _pss);
+    G1ScanRSForOptionalClosure cl(G1CollectedHeap::heap(), &scan_cl);
+    _opt_refs_scanned += opt_rem_set_list->oops_do(&cl, _pss->closures()->raw_strong_oops());
+    _opt_refs_memory_used += opt_rem_set_list->used_memory();
+
+    event.commit(GCId::current(), _worker_id, G1GCPhaseTimes::phase_name(_scan_phase));
+  }
+
+public:
+  G1ScanCollectionSetRegionClosure(G1RemSetScanState* scan_state,
+                                   G1ParScanThreadState* pss,
+                                   uint worker_id,
+                                   G1GCPhaseTimes::GCParPhases scan_phase,
+                                   G1GCPhaseTimes::GCParPhases code_roots_phase) :
+    _pss(pss),
+    _scan_state(scan_state),
+    _scan_phase(scan_phase),
+    _code_roots_phase(code_roots_phase),
+    _worker_id(worker_id),
+    _opt_refs_scanned(0),
+    _opt_refs_memory_used(0),
+    _strong_code_root_scan_time(),
+    _strong_code_trim_partially_time(),
+    _rem_set_opt_root_scan_time(),
+    _rem_set_opt_trim_partially_time() { }
+
+  bool do_heap_region(HeapRegion* r) {
+    uint const region_idx = r->hrm_index();
+
+    // The individual references for the optional remembered set are per-worker, so we
+    // always need to scan them.
+    if (r->has_index_in_opt_cset()) {
+      G1EvacPhaseWithTrimTimeTracker timer(_pss, _rem_set_opt_root_scan_time, _rem_set_opt_trim_partially_time);
+      scan_opt_rem_set_roots(r);
+    }
+
+    if (_scan_state->claim_collection_set_region(region_idx)) {
+      EventGCPhaseParallel event;
+
+      G1EvacPhaseWithTrimTimeTracker timer(_pss, _strong_code_root_scan_time, _strong_code_trim_partially_time);
+      // Scan the strong code root list attached to the current region
+      r->strong_code_roots_do(_pss->closures()->weak_codeblobs());
+
+      event.commit(GCId::current(), _worker_id, G1GCPhaseTimes::phase_name(_code_roots_phase));
+    }
+
+    return false;
+  }
+
   Tickspan strong_code_root_scan_time() const { return _strong_code_root_scan_time;  }
   Tickspan strong_code_root_trim_partially_time() const { return _strong_code_trim_partially_time; }
 
-  size_t cards_scanned() const { return _cards_scanned; }
-  size_t cards_claimed() const { return _cards_claimed; }
-  size_t cards_skipped() const { return _cards_skipped; }
+  Tickspan rem_set_opt_root_scan_time() const { return _rem_set_opt_root_scan_time; }
+  Tickspan rem_set_opt_trim_partially_time() const { return _rem_set_opt_trim_partially_time; }
 
   size_t opt_refs_scanned() const { return _opt_refs_scanned; }
   size_t opt_refs_memory_used() const { return _opt_refs_memory_used; }
 };
 
-void G1RemSet::scan_rem_set(G1ParScanThreadState* pss,
-                            uint worker_i,
-                            G1GCPhaseTimes::GCParPhases scan_phase,
-                            G1GCPhaseTimes::GCParPhases objcopy_phase,
-                            G1GCPhaseTimes::GCParPhases coderoots_phase) {
-  assert(pss->trim_ticks().value() == 0, "Queues must have been trimmed before entering.");
-
-  G1ScanCardClosure scan_cl(_g1h, pss);
-  G1ScanRSForRegionClosure cl(_scan_state, &scan_cl, pss, scan_phase, worker_i);
-  _g1h->collection_set_iterate_increment_from(&cl, worker_i);
-
-  G1GCPhaseTimes* p = _g1p->phase_times();
-
-  p->record_or_add_time_secs(objcopy_phase, worker_i, cl.rem_set_trim_partially_time().seconds());
+void G1RemSet::scan_collection_set_regions(G1ParScanThreadState* pss,
+                                           uint worker_id,
+                                           G1GCPhaseTimes::GCParPhases scan_phase,
+                                           G1GCPhaseTimes::GCParPhases coderoots_phase,
+                                           G1GCPhaseTimes::GCParPhases objcopy_phase) {
+  G1ScanCollectionSetRegionClosure cl(_scan_state, pss, worker_id, scan_phase, coderoots_phase);
+  _g1h->collection_set_iterate_increment_from(&cl, worker_id);
 
-  p->record_or_add_time_secs(scan_phase, worker_i, cl.rem_set_root_scan_time().seconds());
-  p->record_or_add_thread_work_item(scan_phase, worker_i, cl.cards_scanned(), G1GCPhaseTimes::ScanRSScannedCards);
-  p->record_or_add_thread_work_item(scan_phase, worker_i, cl.cards_claimed(), G1GCPhaseTimes::ScanRSClaimedCards);
-  p->record_or_add_thread_work_item(scan_phase, worker_i, cl.cards_skipped(), G1GCPhaseTimes::ScanRSSkippedCards);
-  // At this time we only record some metrics for the optional remembered set.
-  if (scan_phase == G1GCPhaseTimes::OptScanRS) {
-    p->record_or_add_thread_work_item(scan_phase, worker_i, cl.opt_refs_scanned(), G1GCPhaseTimes::ScanRSScannedOptRefs);
-    p->record_or_add_thread_work_item(scan_phase, worker_i, cl.opt_refs_memory_used(), G1GCPhaseTimes::ScanRSUsedMemory);
-  }
-
-  p->record_or_add_time_secs(coderoots_phase, worker_i, cl.strong_code_root_scan_time().seconds());
-  p->add_time_secs(objcopy_phase, worker_i, cl.strong_code_root_trim_partially_time().seconds());
-}
-
-// Closure used for updating rem sets. Only called during an evacuation pause.
-class G1RefineCardClosure: public G1CardTableEntryClosure {
-  G1RemSet* _g1rs;
-  G1ScanCardClosure* _update_rs_cl;
-
-  size_t _cards_scanned;
-  size_t _cards_skipped;
-public:
-  G1RefineCardClosure(G1CollectedHeap* g1h, G1ScanCardClosure* update_rs_cl) :
-    _g1rs(g1h->rem_set()), _update_rs_cl(update_rs_cl), _cards_scanned(0), _cards_skipped(0)
-  {}
+  G1GCPhaseTimes* p = _g1h->phase_times();
 
-  bool do_card_ptr(CardValue* card_ptr, uint worker_i) {
-    // The only time we care about recording cards that
-    // contain references that point into the collection set
-    // is during RSet updating within an evacuation pause.
-    // In this case worker_i should be the id of a GC worker thread.
-    assert(SafepointSynchronize::is_at_safepoint(), "not during an evacuation pause");
-
-    bool card_scanned = _g1rs->refine_card_during_gc(card_ptr, _update_rs_cl);
-
-    if (card_scanned) {
-      _update_rs_cl->trim_queue_partially();
-      _cards_scanned++;
-    } else {
-      _cards_skipped++;
-    }
-    return true;
-  }
-
-  size_t cards_scanned() const { return _cards_scanned; }
-  size_t cards_skipped() const { return _cards_skipped; }
-};
+  p->record_or_add_time_secs(scan_phase, worker_id, cl.rem_set_opt_root_scan_time().seconds());
+  p->record_or_add_time_secs(scan_phase, worker_id, cl.rem_set_opt_trim_partially_time().seconds());
 
-void G1RemSet::update_rem_set(G1ParScanThreadState* pss, uint worker_i) {
-  G1GCPhaseTimes* p = _g1p->phase_times();
-
-  // Apply closure to log entries in the HCC.
-  if (G1HotCardCache::default_use_cache()) {
-    G1EvacPhaseTimesTracker x(p, pss, G1GCPhaseTimes::ScanHCC, worker_i);
+  p->record_or_add_time_secs(coderoots_phase, worker_id, cl.strong_code_root_scan_time().seconds());
+  p->add_time_secs(objcopy_phase, worker_id, cl.strong_code_root_trim_partially_time().seconds());
 
-    G1ScanCardClosure scan_hcc_cl(_g1h, pss);
-    G1RefineCardClosure refine_card_cl(_g1h, &scan_hcc_cl);
-    _g1h->iterate_hcc_closure(&refine_card_cl, worker_i);
-  }
-
-  // Now apply the closure to all remaining log entries.
-  {
-    G1EvacPhaseTimesTracker x(p, pss, G1GCPhaseTimes::UpdateRS, worker_i);
-
-    G1ScanCardClosure update_rs_cl(_g1h, pss);
-    G1RefineCardClosure refine_card_cl(_g1h, &update_rs_cl);
-    _g1h->iterate_dirty_card_closure(&refine_card_cl, worker_i);
-
-    p->record_thread_work_item(G1GCPhaseTimes::UpdateRS, worker_i, refine_card_cl.cards_scanned(), G1GCPhaseTimes::UpdateRSScannedCards);
-    p->record_thread_work_item(G1GCPhaseTimes::UpdateRS, worker_i, refine_card_cl.cards_skipped(), G1GCPhaseTimes::UpdateRSSkippedCards);
+  // At this time we record some metrics only for the evacuations after the initial one.
+  if (scan_phase == G1GCPhaseTimes::OptScanHR) {
+    p->record_or_add_thread_work_item(scan_phase, worker_id, cl.opt_refs_scanned(), G1GCPhaseTimes::ScanHRScannedOptRefs);
+    p->record_or_add_thread_work_item(scan_phase, worker_id, cl.opt_refs_memory_used(), G1GCPhaseTimes::ScanHRUsedMemory);
   }
 }
 
-void G1RemSet::prepare_for_scan_rem_set() {
-  G1BarrierSet::dirty_card_queue_set().concatenate_logs();
-  _scan_state->reset();
+void G1RemSet::prepare_for_scan_heap_roots() {
+  G1DirtyCardQueueSet& dcqs = G1BarrierSet::dirty_card_queue_set();
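+  // Flush the thread-local log buffers to the global list so that all pending
+  // cards are visible when merging heap roots.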
+  dcqs.concatenate_logs();
+
+  _scan_state->prepare();
 }
 
-void G1RemSet::prepare_for_scan_rem_set(uint region_idx) {
+void G1RemSet::merge_heap_roots(bool remembered_set_only, G1GCPhaseTimes::GCParPhases merge_phase) {
+  _scan_state->merge_heap_roots(_g1h->workers(), remembered_set_only, merge_phase);
+}
+
+void G1RemSet::prepare_for_scan_heap_roots(uint region_idx) {
   _scan_state->clear_scan_top(region_idx);
 }
 
-void G1RemSet::cleanup_after_scan_rem_set() {
+void G1RemSet::cleanup_after_scan_heap_roots() {
   G1GCPhaseTimes* phase_times = _g1h->phase_times();
 
   // Set all cards back to clean.
   double start = os::elapsedTime();
-  _scan_state->clear_card_table(_g1h->workers());
+  _scan_state->cleanup(_g1h->workers());
   phase_times->record_clear_ct_time((os::elapsedTime() - start) * 1000.0);
 }
 
@@ -759,53 +1318,6 @@
   G1BarrierSet::shared_dirty_card_queue().enqueue(card_ptr);
 }
 
-bool G1RemSet::refine_card_during_gc(CardValue* card_ptr,
-                                     G1ScanCardClosure* update_rs_cl) {
-  assert(_g1h->is_gc_active(), "Only call during GC");
-
-  // Construct the region representing the card.
-  HeapWord* card_start = _ct->addr_for(card_ptr);
-  // And find the region containing it.
-  uint const card_region_idx = _g1h->addr_to_region(card_start);
-
-  HeapWord* scan_limit = _scan_state->scan_top(card_region_idx);
-  if (scan_limit == NULL) {
-    // This is a card into an uncommitted region. We need to bail out early as we
-    // should not access the corresponding card table entry.
-    return false;
-  }
-
-  check_card_ptr(card_ptr, _ct);
-
-  // If the card is no longer dirty, nothing to do. This covers cards that were already
-  // scanned as parts of the remembered sets.
-  if (*card_ptr != G1CardTable::dirty_card_val()) {
-    return false;
-  }
-
-  // We claim lazily (so races are possible but they're benign), which reduces the
-  // number of potential duplicate scans (multiple threads may enqueue the same card twice).
-  *card_ptr = G1CardTable::clean_card_val() | G1CardTable::claimed_card_val();
-
-  _scan_state->add_dirty_region(card_region_idx);
-  if (scan_limit <= card_start) {
-    // If the card starts above the area in the region containing objects to scan, skip it.
-    return false;
-  }
-
-  // Don't use addr_for(card_ptr + 1) which can ask for
-  // a card beyond the heap.
-  HeapWord* card_end = card_start + G1CardTable::card_size_in_words;
-  MemRegion dirty_region(card_start, MIN2(scan_limit, card_end));
-  assert(!dirty_region.is_empty(), "sanity");
-
-  HeapRegion* const card_region = _g1h->region_at(card_region_idx);
-  assert(!card_region->is_young(), "Should not scan card in young region %u", card_region_idx);
-  bool card_processed = card_region->oops_on_card_seq_iterate_careful<true>(dirty_region, update_rs_cl);
-  assert(card_processed, "must be");
-  return true;
-}
-
 void G1RemSet::print_periodic_summary_info(const char* header, uint period_count) {
   if ((G1SummarizeRSetStatsPeriod > 0) && log_is_enabled(Trace, gc, remset) &&
       (period_count % G1SummarizeRSetStatsPeriod == 0)) {
--- a/src/hotspot/share/gc/g1/g1RemSet.hpp	Thu Jun 27 03:33:44 2019 +0200
+++ b/src/hotspot/share/gc/g1/g1RemSet.hpp	Thu Jun 27 11:48:32 2019 +0200
@@ -46,6 +46,7 @@
 class G1HotCardCache;
 class G1RemSetScanState;
 class G1ParScanThreadState;
+class G1ParScanThreadStateSet;
 class G1Policy;
 class G1ScanCardClosure;
 class HeapRegionClaimer;
@@ -84,39 +85,39 @@
            G1HotCardCache* hot_card_cache);
   ~G1RemSet();
 
-  // Scan all remembered sets of the collection set for references into the collection
-  // set.
-  // Further applies heap_region_codeblobs on the oops of the unmarked nmethods on the strong code
-  // roots list for each region in the collection set.
-  void scan_rem_set(G1ParScanThreadState* pss,
-                    uint worker_i,
-                    G1GCPhaseTimes::GCParPhases scan_phase,
-                    G1GCPhaseTimes::GCParPhases objcopy_phase,
-                    G1GCPhaseTimes::GCParPhases coderoots_phase);
+  // Scan all cards in the non-collection set regions that potentially contain
+  // references into the current whole collection set.
+  void scan_heap_roots(G1ParScanThreadState* pss,
+                       uint worker_id,
+                       G1GCPhaseTimes::GCParPhases scan_phase,
+                       G1GCPhaseTimes::GCParPhases objcopy_phase);
+
+  // Merge cards from various sources (remembered sets, hot card cache, log buffers)
+  // and calculate the cards that need to be scanned later (via scan_heap_roots()).
+  // If remembered_set_only is set, only merge remembered set cards.
+  void merge_heap_roots(bool remembered_set_only, G1GCPhaseTimes::GCParPhases merge_phase);
 
-  // Flush remaining refinement buffers for cross-region references to either evacuate references
-  // into the collection set or update the remembered set.
-  void update_rem_set(G1ParScanThreadState* pss, uint worker_i);
-
-  // Prepare for and cleanup after scanning the remembered sets. Must be called
+  // Prepare for and clean up after scanning the heap roots. Must be called
   // once before and after in sequential code.
-  void prepare_for_scan_rem_set();
-  void cleanup_after_scan_rem_set();
-  // Prepares the given region for remembered set scanning.
-  void prepare_for_scan_rem_set(uint region_idx);
+  void prepare_for_scan_heap_roots();
+  // Removes the temporary duplicate detection information from the card table.
+  void cleanup_after_scan_heap_roots();
+  // Prepares the given region for heap root scanning.
+  void prepare_for_scan_heap_roots(uint region_idx);
 
-  G1RemSetScanState* scan_state() const { return _scan_state; }
+  // Do work for regions in the current increment of the collection set, scanning
+  // non-card based (heap) roots.
+  void scan_collection_set_regions(G1ParScanThreadState* pss,
+                                   uint worker_id,
+                                   G1GCPhaseTimes::GCParPhases scan_phase,
+                                   G1GCPhaseTimes::GCParPhases coderoots_phase,
+                                   G1GCPhaseTimes::GCParPhases objcopy_phase);
 
   // Refine the card corresponding to "card_ptr". Safe to be called concurrently
   // to the mutator.
   void refine_card_concurrently(CardValue* card_ptr,
                                 uint worker_i);
 
-  // Refine the card corresponding to "card_ptr", applying the given closure to
-  // all references found. Must only be called during gc.
-  // Returns whether the card has been scanned.
-  bool refine_card_during_gc(CardValue* card_ptr, G1ScanCardClosure* update_rs_cl);
-
   // Print accumulated summary info from the start of the VM.
   void print_summary_info();
 
--- a/src/hotspot/share/gc/g1/heapRegion.cpp	Thu Jun 27 03:33:44 2019 +0200
+++ b/src/hotspot/share/gc/g1/heapRegion.cpp	Thu Jun 27 11:48:32 2019 +0200
@@ -49,6 +49,7 @@
 
 int    HeapRegion::LogOfHRGrainBytes = 0;
 int    HeapRegion::LogOfHRGrainWords = 0;
+int    HeapRegion::LogCardsPerRegion = 0;
 size_t HeapRegion::GrainBytes        = 0;
 size_t HeapRegion::GrainWords        = 0;
 size_t HeapRegion::CardsPerRegion    = 0;
@@ -105,6 +106,8 @@
   guarantee(CardsPerRegion == 0, "we should only set it once");
   CardsPerRegion = GrainBytes >> G1CardTable::card_shift;
 
+  LogCardsPerRegion = log2_long((jlong) CardsPerRegion);
+
   if (G1HeapRegionSize != GrainBytes) {
     FLAG_SET_ERGO(G1HeapRegionSize, GrainBytes);
   }
--- a/src/hotspot/share/gc/g1/heapRegion.hpp	Thu Jun 27 03:33:44 2019 +0200
+++ b/src/hotspot/share/gc/g1/heapRegion.hpp	Thu Jun 27 11:48:32 2019 +0200
@@ -60,7 +60,6 @@
 class G1CMBitMap;
 class G1IsAliveAndApplyClosure;
 class HeapRegionRemSet;
-class HeapRegionRemSetIterator;
 class HeapRegion;
 class HeapRegionSetBase;
 class nmethod;
@@ -315,6 +314,7 @@
 
   static int    LogOfHRGrainBytes;
   static int    LogOfHRGrainWords;
+  static int    LogCardsPerRegion;
 
   static size_t GrainBytes;
   static size_t GrainWords;
--- a/src/hotspot/share/gc/g1/heapRegionRemSet.cpp	Thu Jun 27 03:33:44 2019 +0200
+++ b/src/hotspot/share/gc/g1/heapRegionRemSet.cpp	Thu Jun 27 11:48:32 2019 +0200
@@ -27,7 +27,7 @@
 #include "gc/g1/g1CollectedHeap.inline.hpp"
 #include "gc/g1/g1ConcurrentRefine.hpp"
 #include "gc/g1/heapRegionManager.inline.hpp"
-#include "gc/g1/heapRegionRemSet.hpp"
+#include "gc/g1/heapRegionRemSet.inline.hpp"
 #include "gc/shared/space.inline.hpp"
 #include "memory/allocation.hpp"
 #include "memory/padded.inline.hpp"
@@ -42,195 +42,21 @@
 const char* HeapRegionRemSet::_state_strings[] =  {"Untracked", "Updating", "Complete"};
 const char* HeapRegionRemSet::_short_state_strings[] =  {"UNTRA", "UPDAT", "CMPLT"};
 
-class PerRegionTable: public CHeapObj<mtGC> {
-  friend class OtherRegionsTable;
-  friend class HeapRegionRemSetIterator;
-
-  HeapRegion*     _hr;
-  CHeapBitMap     _bm;
-  jint            _occupied;
-
-  // next pointer for free/allocated 'all' list
-  PerRegionTable* _next;
-
-  // prev pointer for the allocated 'all' list
-  PerRegionTable* _prev;
-
-  // next pointer in collision list
-  PerRegionTable * _collision_list_next;
-
-  // Global free list of PRTs
-  static PerRegionTable* volatile _free_list;
-
-protected:
-  // We need access in order to union things into the base table.
-  BitMap* bm() { return &_bm; }
-
-  PerRegionTable(HeapRegion* hr) :
-    _hr(hr),
-    _bm(HeapRegion::CardsPerRegion, mtGC),
-    _occupied(0),
-    _next(NULL), _prev(NULL),
-    _collision_list_next(NULL)
-  {}
-
-  void add_card_work(CardIdx_t from_card, bool par) {
-    if (!_bm.at(from_card)) {
-      if (par) {
-        if (_bm.par_at_put(from_card, 1)) {
-          Atomic::inc(&_occupied);
-        }
-      } else {
-        _bm.at_put(from_card, 1);
-        _occupied++;
-      }
-    }
-  }
-
-  void add_reference_work(OopOrNarrowOopStar from, bool par) {
-    // Must make this robust in case "from" is not in "_hr", because of
-    // concurrency.
-
-    HeapRegion* loc_hr = hr();
-    // If the test below fails, then this table was reused concurrently
-    // with this operation.  This is OK, since the old table was coarsened,
-    // and adding a bit to the new table is never incorrect.
-    if (loc_hr->is_in_reserved(from)) {
-      CardIdx_t from_card = OtherRegionsTable::card_within_region(from, loc_hr);
-      add_card_work(from_card, par);
+PerRegionTable* PerRegionTable::alloc(HeapRegion* hr) {
+  PerRegionTable* fl = _free_list;
+  while (fl != NULL) {
+    PerRegionTable* nxt = fl->next();
+    PerRegionTable* res = Atomic::cmpxchg(nxt, &_free_list, fl);
+    if (res == fl) {
+      fl->init(hr, true);
+      return fl;
+    } else {
+      fl = _free_list;
     }
   }
-
-public:
-
-  HeapRegion* hr() const { return OrderAccess::load_acquire(&_hr); }
-
-  jint occupied() const {
-    // Overkill, but if we ever need it...
-    // guarantee(_occupied == _bm.count_one_bits(), "Check");
-    return _occupied;
-  }
-
-  void init(HeapRegion* hr, bool clear_links_to_all_list) {
-    if (clear_links_to_all_list) {
-      set_next(NULL);
-      set_prev(NULL);
-    }
-    _collision_list_next = NULL;
-    _occupied = 0;
-    _bm.clear();
-    // Make sure that the bitmap clearing above has been finished before publishing
-    // this PRT to concurrent threads.
-    OrderAccess::release_store(&_hr, hr);
-  }
-
-  void add_reference(OopOrNarrowOopStar from) {
-    add_reference_work(from, /*parallel*/ true);
-  }
-
-  void seq_add_reference(OopOrNarrowOopStar from) {
-    add_reference_work(from, /*parallel*/ false);
-  }
-
-  void add_card(CardIdx_t from_card_index) {
-    add_card_work(from_card_index, /*parallel*/ true);
-  }
-
-  void seq_add_card(CardIdx_t from_card_index) {
-    add_card_work(from_card_index, /*parallel*/ false);
-  }
-
-  // (Destructively) union the bitmap of the current table into the given
-  // bitmap (which is assumed to be of the same size.)
-  void union_bitmap_into(BitMap* bm) {
-    bm->set_union(_bm);
-  }
-
-  // Mem size in bytes.
-  size_t mem_size() const {
-    return sizeof(PerRegionTable) + _bm.size_in_words() * HeapWordSize;
-  }
-
-  // Requires "from" to be in "hr()".
-  bool contains_reference(OopOrNarrowOopStar from) const {
-    assert(hr()->is_in_reserved(from), "Precondition.");
-    size_t card_ind = pointer_delta(from, hr()->bottom(),
-                                    G1CardTable::card_size);
-    return _bm.at(card_ind);
-  }
-
-  // Bulk-free the PRTs from prt to last, assumes that they are
-  // linked together using their _next field.
-  static void bulk_free(PerRegionTable* prt, PerRegionTable* last) {
-    while (true) {
-      PerRegionTable* fl = _free_list;
-      last->set_next(fl);
-      PerRegionTable* res = Atomic::cmpxchg(prt, &_free_list, fl);
-      if (res == fl) {
-        return;
-      }
-    }
-    ShouldNotReachHere();
-  }
-
-  static void free(PerRegionTable* prt) {
-    bulk_free(prt, prt);
-  }
-
-  // Returns an initialized PerRegionTable instance.
-  static PerRegionTable* alloc(HeapRegion* hr) {
-    PerRegionTable* fl = _free_list;
-    while (fl != NULL) {
-      PerRegionTable* nxt = fl->next();
-      PerRegionTable* res = Atomic::cmpxchg(nxt, &_free_list, fl);
-      if (res == fl) {
-        fl->init(hr, true);
-        return fl;
-      } else {
-        fl = _free_list;
-      }
-    }
-    assert(fl == NULL, "Loop condition.");
-    return new PerRegionTable(hr);
-  }
-
-  PerRegionTable* next() const { return _next; }
-  void set_next(PerRegionTable* next) { _next = next; }
-  PerRegionTable* prev() const { return _prev; }
-  void set_prev(PerRegionTable* prev) { _prev = prev; }
-
-  // Accessor and Modification routines for the pointer for the
-  // singly linked collision list that links the PRTs within the
-  // OtherRegionsTable::_fine_grain_regions hash table.
-  //
-  // It might be useful to also make the collision list doubly linked
-  // to avoid iteration over the collisions list during scrubbing/deletion.
-  // OTOH there might not be many collisions.
-
-  PerRegionTable* collision_list_next() const {
-    return _collision_list_next;
-  }
-
-  void set_collision_list_next(PerRegionTable* next) {
-    _collision_list_next = next;
-  }
-
-  PerRegionTable** collision_list_next_addr() {
-    return &_collision_list_next;
-  }
-
-  static size_t fl_mem_size() {
-    PerRegionTable* cur = _free_list;
-    size_t res = 0;
-    while (cur != NULL) {
-      res += cur->mem_size();
-      cur = cur->next();
-    }
-    return res;
-  }
-
-  static void test_fl_mem_size();
-};
+  assert(fl == NULL, "Loop condition.");
+  return new PerRegionTable(hr);
+}
 
 PerRegionTable* volatile PerRegionTable::_free_list = NULL;
 
@@ -696,175 +522,3 @@
 size_t HeapRegionRemSet::strong_code_roots_mem_size() {
   return _code_roots.mem_size();
 }
-
-HeapRegionRemSetIterator:: HeapRegionRemSetIterator(HeapRegionRemSet* hrrs) :
-  _hrrs(hrrs),
-  _coarse_map(&hrrs->_other_regions._coarse_map),
-  _bot(hrrs->_bot),
-  _g1h(G1CollectedHeap::heap()),
-  _n_yielded_fine(0),
-  _n_yielded_coarse(0),
-  _n_yielded_sparse(0),
-  _is(Sparse),
-  _cur_region_card_offset(0),
-  // Set these values so that we increment to the first region.
-  _coarse_cur_region_index(-1),
-  _coarse_cur_region_cur_card(HeapRegion::CardsPerRegion-1),
-  _fine_cur_prt(NULL),
-  _cur_card_in_prt(HeapRegion::CardsPerRegion),
-  _sparse_iter(&hrrs->_other_regions._sparse_table) {}
-
-bool HeapRegionRemSetIterator::coarse_has_next(size_t& card_index) {
-  if (_hrrs->_other_regions._n_coarse_entries == 0) return false;
-  // Go to the next card.
-  _coarse_cur_region_cur_card++;
-  // Was the last the last card in the current region?
-  if (_coarse_cur_region_cur_card == HeapRegion::CardsPerRegion) {
-    // Yes: find the next region.  This may leave _coarse_cur_region_index
-    // Set to the last index, in which case there are no more coarse
-    // regions.
-    _coarse_cur_region_index =
-      (int) _coarse_map->get_next_one_offset(_coarse_cur_region_index + 1);
-    if ((size_t)_coarse_cur_region_index < _coarse_map->size()) {
-      _coarse_cur_region_cur_card = 0;
-      HeapWord* r_bot =
-        _g1h->region_at((uint) _coarse_cur_region_index)->bottom();
-      _cur_region_card_offset = _bot->index_for_raw(r_bot);
-    } else {
-      return false;
-    }
-  }
-  // If we didn't return false above, then we can yield a card.
-  card_index = _cur_region_card_offset + _coarse_cur_region_cur_card;
-  return true;
-}
-
-bool HeapRegionRemSetIterator::fine_has_next(size_t& card_index) {
-  if (fine_has_next()) {
-    _cur_card_in_prt =
-      _fine_cur_prt->_bm.get_next_one_offset(_cur_card_in_prt + 1);
-  }
-  if (_cur_card_in_prt == HeapRegion::CardsPerRegion) {
-    // _fine_cur_prt may still be NULL in case if there are not PRTs at all for
-    // the remembered set.
-    if (_fine_cur_prt == NULL || _fine_cur_prt->next() == NULL) {
-      return false;
-    }
-    PerRegionTable* next_prt = _fine_cur_prt->next();
-    switch_to_prt(next_prt);
-    _cur_card_in_prt = _fine_cur_prt->_bm.get_next_one_offset(_cur_card_in_prt + 1);
-  }
-
-  card_index = _cur_region_card_offset + _cur_card_in_prt;
-  guarantee(_cur_card_in_prt < HeapRegion::CardsPerRegion,
-            "Card index " SIZE_FORMAT " must be within the region", _cur_card_in_prt);
-  return true;
-}
-
-bool HeapRegionRemSetIterator::fine_has_next() {
-  return _cur_card_in_prt != HeapRegion::CardsPerRegion;
-}
-
-void HeapRegionRemSetIterator::switch_to_prt(PerRegionTable* prt) {
-  assert(prt != NULL, "Cannot switch to NULL prt");
-  _fine_cur_prt = prt;
-
-  HeapWord* r_bot = _fine_cur_prt->hr()->bottom();
-  _cur_region_card_offset = _bot->index_for_raw(r_bot);
-
-  // The bitmap scan for the PRT always scans from _cur_region_cur_card + 1.
-  // To avoid special-casing this start case, and not miss the first bitmap
-  // entry, initialize _cur_region_cur_card with -1 instead of 0.
-  _cur_card_in_prt = (size_t)-1;
-}
-
-bool HeapRegionRemSetIterator::has_next(size_t& card_index) {
-  switch (_is) {
-  case Sparse: {
-    if (_sparse_iter.has_next(card_index)) {
-      _n_yielded_sparse++;
-      return true;
-    }
-    // Otherwise, deliberate fall-through
-    _is = Fine;
-    PerRegionTable* initial_fine_prt = _hrrs->_other_regions._first_all_fine_prts;
-    if (initial_fine_prt != NULL) {
-      switch_to_prt(_hrrs->_other_regions._first_all_fine_prts);
-    }
-  }
-  case Fine:
-    if (fine_has_next(card_index)) {
-      _n_yielded_fine++;
-      return true;
-    }
-    // Otherwise, deliberate fall-through
-    _is = Coarse;
-  case Coarse:
-    if (coarse_has_next(card_index)) {
-      _n_yielded_coarse++;
-      return true;
-    }
-    // Otherwise...
-    break;
-  }
-  return false;
-}
-
-#ifndef PRODUCT
-void HeapRegionRemSet::test() {
-  os::sleep(Thread::current(), (jlong)5000, false);
-  G1CollectedHeap* g1h = G1CollectedHeap::heap();
-
-  // Run with "-XX:G1LogRSetRegionEntries=2", so that 1 and 5 end up in same
-  // hash bucket.
-  HeapRegion* hr0 = g1h->region_at(0);
-  HeapRegion* hr1 = g1h->region_at(1);
-  HeapRegion* hr2 = g1h->region_at(5);
-  HeapRegion* hr3 = g1h->region_at(6);
-  HeapRegion* hr4 = g1h->region_at(7);
-  HeapRegion* hr5 = g1h->region_at(8);
-
-  HeapWord* hr1_start = hr1->bottom();
-  HeapWord* hr1_mid = hr1_start + HeapRegion::GrainWords/2;
-  HeapWord* hr1_last = hr1->end() - 1;
-
-  HeapWord* hr2_start = hr2->bottom();
-  HeapWord* hr2_mid = hr2_start + HeapRegion::GrainWords/2;
-  HeapWord* hr2_last = hr2->end() - 1;
-
-  HeapWord* hr3_start = hr3->bottom();
-  HeapWord* hr3_mid = hr3_start + HeapRegion::GrainWords/2;
-  HeapWord* hr3_last = hr3->end() - 1;
-
-  HeapRegionRemSet* hrrs = hr0->rem_set();
-
-  // Make three references from region 0x101...
-  hrrs->add_reference((OopOrNarrowOopStar)hr1_start);
-  hrrs->add_reference((OopOrNarrowOopStar)hr1_mid);
-  hrrs->add_reference((OopOrNarrowOopStar)hr1_last);
-
-  hrrs->add_reference((OopOrNarrowOopStar)hr2_start);
-  hrrs->add_reference((OopOrNarrowOopStar)hr2_mid);
-  hrrs->add_reference((OopOrNarrowOopStar)hr2_last);
-
-  hrrs->add_reference((OopOrNarrowOopStar)hr3_start);
-  hrrs->add_reference((OopOrNarrowOopStar)hr3_mid);
-  hrrs->add_reference((OopOrNarrowOopStar)hr3_last);
-
-  // Now cause a coarsening.
-  hrrs->add_reference((OopOrNarrowOopStar)hr4->bottom());
-  hrrs->add_reference((OopOrNarrowOopStar)hr5->bottom());
-
-  // Now, does iteration yield these three?
-  HeapRegionRemSetIterator iter(hrrs);
-  size_t sum = 0;
-  size_t card_index;
-  while (iter.has_next(card_index)) {
-    HeapWord* card_start = g1h->bot()->address_for_index(card_index);
-    tty->print_cr("  Card " PTR_FORMAT ".", p2i(card_start));
-    sum++;
-  }
-  guarantee(sum == 11 - 3 + 2048, "Failure");
-  guarantee(sum == hrrs->occupied(), "Failure");
-}
-#endif
--- a/src/hotspot/share/gc/g1/heapRegionRemSet.hpp	Thu Jun 27 03:33:44 2019 +0200
+++ b/src/hotspot/share/gc/g1/heapRegionRemSet.hpp	Thu Jun 27 11:48:32 2019 +0200
@@ -28,6 +28,7 @@
 #include "gc/g1/g1CodeCacheRemSet.hpp"
 #include "gc/g1/g1FromCardCache.hpp"
 #include "gc/g1/sparsePRT.hpp"
+#include "utilities/bitMap.hpp"
 
 // Remembered set for a heap region.  Represent a set of "cards" that
 // contain pointers into the owner heap region.  Cards are defined somewhat
@@ -37,7 +38,6 @@
 class G1BlockOffsetTable;
 class G1CardLiveData;
 class HeapRegion;
-class HeapRegionRemSetIterator;
 class PerRegionTable;
 class SparsePRT;
 class nmethod;
@@ -67,8 +67,6 @@
 //      thinking the PRT is for a different region, does no harm.
 
 class OtherRegionsTable {
-  friend class HeapRegionRemSetIterator;
-
   G1CollectedHeap* _g1h;
   Mutex*           _m;
 
@@ -125,6 +123,9 @@
   // Create a new remembered set. The given mutex is used to ensure consistency.
   OtherRegionsTable(Mutex* m);
 
+  template <class Closure>
+  void iterate(Closure& v);
+
   // Returns the card index of the given within_region pointer relative to the bottom
   // of the given heap region.
   static CardIdx_t card_within_region(OopOrNarrowOopStar within_region, HeapRegion* hr);
@@ -157,9 +158,140 @@
   void clear();
 };
 
+class PerRegionTable: public CHeapObj<mtGC> {
+  friend class OtherRegionsTable;
+
+  HeapRegion*     _hr;
+  CHeapBitMap     _bm;
+  jint            _occupied;
+
+  // next pointer for free/allocated 'all' list
+  PerRegionTable* _next;
+
+  // prev pointer for the allocated 'all' list
+  PerRegionTable* _prev;
+
+  // next pointer in collision list
+  PerRegionTable * _collision_list_next;
+
+  // Global free list of PRTs
+  static PerRegionTable* volatile _free_list;
+
+protected:
+  PerRegionTable(HeapRegion* hr) :
+    _hr(hr),
+    _bm(HeapRegion::CardsPerRegion, mtGC),
+    _occupied(0),
+    _next(NULL), _prev(NULL),
+    _collision_list_next(NULL)
+  {}
+
+  inline void add_card_work(CardIdx_t from_card, bool par);
+
+  inline void add_reference_work(OopOrNarrowOopStar from, bool par);
+
+public:
+  // We need access in order to union things into the base table.
+  BitMap* bm() { return &_bm; }
+
+  HeapRegion* hr() const { return OrderAccess::load_acquire(&_hr); }
+
+  jint occupied() const {
+    // Overkill, but if we ever need it...
+    // guarantee(_occupied == _bm.count_one_bits(), "Check");
+    return _occupied;
+  }
+
+  void init(HeapRegion* hr, bool clear_links_to_all_list);
+
+  inline void add_reference(OopOrNarrowOopStar from);
+
+  inline void seq_add_reference(OopOrNarrowOopStar from);
+
+  inline void add_card(CardIdx_t from_card_index);
+
+  void seq_add_card(CardIdx_t from_card_index);
+
+  // (Destructively) union the bitmap of the current table into the given
+  // bitmap (which is assumed to be of the same size.)
+  void union_bitmap_into(BitMap* bm) {
+    bm->set_union(_bm);
+  }
+
+  // Mem size in bytes.
+  size_t mem_size() const {
+    return sizeof(PerRegionTable) + _bm.size_in_words() * HeapWordSize;
+  }
+
+  // Requires "from" to be in "hr()".
+  bool contains_reference(OopOrNarrowOopStar from) const {
+    assert(hr()->is_in_reserved(from), "Precondition.");
+    size_t card_ind = pointer_delta(from, hr()->bottom(),
+                                    G1CardTable::card_size);
+    return _bm.at(card_ind);
+  }
+
+  // Bulk-free the PRTs from prt to last, assumes that they are
+  // linked together using their _next field.
+  static void bulk_free(PerRegionTable* prt, PerRegionTable* last) {
+    while (true) {
+      PerRegionTable* fl = _free_list;
+      last->set_next(fl);
+      PerRegionTable* res = Atomic::cmpxchg(prt, &_free_list, fl);
+      if (res == fl) {
+        return;
+      }
+    }
+    ShouldNotReachHere();
+  }
+
+  static void free(PerRegionTable* prt) {
+    bulk_free(prt, prt);
+  }
+
+  // Returns an initialized PerRegionTable instance.
+  static PerRegionTable* alloc(HeapRegion* hr);
+
+  PerRegionTable* next() const { return _next; }
+  void set_next(PerRegionTable* next) { _next = next; }
+  PerRegionTable* prev() const { return _prev; }
+  void set_prev(PerRegionTable* prev) { _prev = prev; }
+
+  // Accessor and Modification routines for the pointer for the
+  // singly linked collision list that links the PRTs within the
+  // OtherRegionsTable::_fine_grain_regions hash table.
+  //
+  // It might be useful to also make the collision list doubly linked
+  // to avoid iteration over the collisions list during scrubbing/deletion.
+  // OTOH there might not be many collisions.
+
+  PerRegionTable* collision_list_next() const {
+    return _collision_list_next;
+  }
+
+  void set_collision_list_next(PerRegionTable* next) {
+    _collision_list_next = next;
+  }
+
+  PerRegionTable** collision_list_next_addr() {
+    return &_collision_list_next;
+  }
+
+  static size_t fl_mem_size() {
+    PerRegionTable* cur = _free_list;
+    size_t res = 0;
+    while (cur != NULL) {
+      res += cur->mem_size();
+      cur = cur->next();
+    }
+    return res;
+  }
+
+  static void test_fl_mem_size();
+};
+
 class HeapRegionRemSet : public CHeapObj<mtGC> {
   friend class VMStructs;
-  friend class HeapRegionRemSetIterator;
 
 private:
   G1BlockOffsetTable* _bot;
@@ -182,18 +314,23 @@
   // Setup sparse and fine-grain tables sizes.
   static void setup_remset_size();
 
-  bool cardset_is_empty() const {
-    return _other_regions.is_empty();
-  }
-
   bool is_empty() const {
-    return (strong_code_roots_list_length() == 0) && cardset_is_empty();
+    return (strong_code_roots_list_length() == 0) && _other_regions.is_empty();
   }
 
   bool occupancy_less_or_equal_than(size_t occ) const {
     return (strong_code_roots_list_length() == 0) && _other_regions.occupancy_less_or_equal_than(occ);
   }
 
+  // For each PRT in the card (remembered) set call one of the following methods
+  // of the given closure:
+  //
+  // next_coarse_prt(uint region_idx) - pass the region index for coarse PRTs
+  // next_fine_prt(uint region_idx, BitMap* bitmap) - pass the region index and bitmap for fine PRTs
+  // next_sparse_prt(uint region_idx, SparsePRTEntry::card_elem_t* cards, uint num_cards) - pass region index and cards for sparse PRTs
+  template <class Closure>
+  inline void iterate_prts(Closure& cl);
+
   size_t occupied() {
     MutexLocker x(&_m, Mutex::_no_safepoint_check_flag);
     return occupied_locked();
@@ -339,70 +476,4 @@
 #endif
 };
 
-class HeapRegionRemSetIterator : public StackObj {
-private:
-  // The region RSet over which we are iterating.
-  HeapRegionRemSet* _hrrs;
-
-  // Local caching of HRRS fields.
-  const BitMap*             _coarse_map;
-
-  G1BlockOffsetTable*       _bot;
-  G1CollectedHeap*          _g1h;
-
-  // The number of cards yielded since initialization.
-  size_t _n_yielded_fine;
-  size_t _n_yielded_coarse;
-  size_t _n_yielded_sparse;
-
-  // Indicates what granularity of table that we are currently iterating over.
-  // We start iterating over the sparse table, progress to the fine grain
-  // table, and then finish with the coarse table.
-  enum IterState {
-    Sparse,
-    Fine,
-    Coarse
-  };
-  IterState _is;
-
-  // For both Coarse and Fine remembered set iteration this contains the
-  // first card number of the heap region we currently iterate over.
-  size_t _cur_region_card_offset;
-
-  // Current region index for the Coarse remembered set iteration.
-  int    _coarse_cur_region_index;
-  size_t _coarse_cur_region_cur_card;
-
-  bool coarse_has_next(size_t& card_index);
-
-  // The PRT we are currently iterating over.
-  PerRegionTable* _fine_cur_prt;
-  // Card offset within the current PRT.
-  size_t _cur_card_in_prt;
-
-  // Update internal variables when switching to the given PRT.
-  void switch_to_prt(PerRegionTable* prt);
-  bool fine_has_next();
-  bool fine_has_next(size_t& card_index);
-
-  // The Sparse remembered set iterator.
-  SparsePRTIter _sparse_iter;
-
-public:
-  HeapRegionRemSetIterator(HeapRegionRemSet* hrrs);
-
-  // If there remains one or more cards to be yielded, returns true and
-  // sets "card_index" to one of those cards (which is then considered
-  // yielded.)   Otherwise, returns false (and leaves "card_index"
-  // undefined.)
-  bool has_next(size_t& card_index);
-
-  size_t n_yielded_fine() { return _n_yielded_fine; }
-  size_t n_yielded_coarse() { return _n_yielded_coarse; }
-  size_t n_yielded_sparse() { return _n_yielded_sparse; }
-  size_t n_yielded() {
-    return n_yielded_fine() + n_yielded_coarse() + n_yielded_sparse();
-  }
-};
-
 #endif // SHARE_GC_G1_HEAPREGIONREMSET_HPP
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/hotspot/share/gc/g1/heapRegionRemSet.inline.hpp	Thu Jun 27 11:48:32 2019 +0200
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2018, 2019, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef SHARE_GC_G1_HEAPREGIONREMSET_INLINE_HPP
+#define SHARE_GC_G1_HEAPREGIONREMSET_INLINE_HPP
+
+#include "gc/g1/heapRegion.inline.hpp"
+#include "gc/g1/heapRegionRemSet.hpp"
+#include "gc/g1/sparsePRT.hpp"
+#include "utilities/bitMap.inline.hpp"
+
+template <class Closure>
+inline void HeapRegionRemSet::iterate_prts(Closure& cl) {
+  _other_regions.iterate(cl);
+}
+
+inline void PerRegionTable::add_card_work(CardIdx_t from_card, bool par) {
+  if (!_bm.at(from_card)) {
+    if (par) {
+      if (_bm.par_set_bit(from_card)) {
+        Atomic::inc(&_occupied);
+      }
+    } else {
+      _bm.set_bit(from_card);
+      _occupied++;
+    }
+  }
+}
+
+inline void PerRegionTable::add_reference_work(OopOrNarrowOopStar from, bool par) {
+  // Must make this robust in case "from" is not in "_hr", because of
+  // concurrency.
+
+  HeapRegion* loc_hr = hr();
+  // If the test below fails, then this table was reused concurrently
+  // with this operation.  This is OK, since the old table was coarsened,
+  // and adding a bit to the new table is never incorrect.
+  if (loc_hr->is_in_reserved(from)) {
+    CardIdx_t from_card = OtherRegionsTable::card_within_region(from, loc_hr);
+    add_card_work(from_card, par);
+  }
+}
+
+inline void PerRegionTable::add_card(CardIdx_t from_card_index) {
+  add_card_work(from_card_index, /*parallel*/ true);
+}
+
+inline void PerRegionTable::seq_add_card(CardIdx_t from_card_index) {
+  add_card_work(from_card_index, /*parallel*/ false);
+}
+
+inline void PerRegionTable::add_reference(OopOrNarrowOopStar from) {
+  add_reference_work(from, /*parallel*/ true);
+}
+
+inline void PerRegionTable::seq_add_reference(OopOrNarrowOopStar from) {
+  add_reference_work(from, /*parallel*/ false);
+}
+
+inline void PerRegionTable::init(HeapRegion* hr, bool clear_links_to_all_list) {
+  if (clear_links_to_all_list) {
+    set_next(NULL);
+    set_prev(NULL);
+  }
+  _collision_list_next = NULL;
+  _occupied = 0;
+  _bm.clear();
+  // Make sure that the bitmap clearing above has been finished before publishing
+  // this PRT to concurrent threads.
+  OrderAccess::release_store(&_hr, hr);
+}
+
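+// Visit the coarse, fine and sparse parts of the remembered set in turn, calling
+// the corresponding next_*_prt method of the closure for every PRT found.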
+template <class Closure>
+void OtherRegionsTable::iterate(Closure& cl) {
+  if (_n_coarse_entries > 0) {
+    BitMap::idx_t cur = _coarse_map.get_next_one_offset(0);
+    while (cur != _coarse_map.size()) {
+      cl.next_coarse_prt((uint)cur);
+      cur = _coarse_map.get_next_one_offset(cur + 1);
+    }
+  }
+  {
+    PerRegionTable* cur = _first_all_fine_prts;
+    while (cur != NULL) {
+      cl.next_fine_prt(cur->hr()->hrm_index(), cur->bm());
+      cur = cur->next();
+    }
+  }
+  {
+    SparsePRTBucketIter iter(&_sparse_table);
+    SparsePRTEntry* cur;
+    while (iter.has_next(cur)) {
+      cl.next_sparse_prt(cur->r_ind(), cur->cards(), cur->num_valid_cards());
+    }
+  }
+}
+
+#endif // SHARE_GC_G1_HEAPREGIONREMSET_INLINE_HPP
--- a/src/hotspot/share/gc/g1/sparsePRT.cpp	Thu Jun 27 03:33:44 2019 +0200
+++ b/src/hotspot/share/gc/g1/sparsePRT.cpp	Thu Jun 27 11:48:32 2019 +0200
@@ -275,6 +275,19 @@
   return false;
 }
 
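+// Yields the next entry in the current bucket's chain, advancing to the next
+// non-empty bucket when the current chain is exhausted.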
+bool RSHashTableBucketIter::has_next(SparsePRTEntry*& entry) {
+  while (_bl_ind == RSHashTable::NullEntry)  {
+    if (_tbl_ind == (int)_rsht->capacity() - 1) {
+      return false;
+    }
+    _tbl_ind++;
+    _bl_ind = _rsht->_buckets[_tbl_ind];
+  }
+  entry = _rsht->entry(_bl_ind);
+  _bl_ind = entry->next_index();
+  return true;
+}
+
 bool RSHashTable::contains_card(RegionIdx_t region_index, CardIdx_t card_index) const {
   SparsePRTEntry* e = get_entry(region_index);
   return (e != NULL && e->contains_card(card_index));
--- a/src/hotspot/share/gc/g1/sparsePRT.hpp	Thu Jun 27 03:33:44 2019 +0200
+++ b/src/hotspot/share/gc/g1/sparsePRT.hpp	Thu Jun 27 11:48:32 2019 +0200
@@ -38,10 +38,11 @@
 // that might contain pointers into the owner region.
 
 class SparsePRTEntry: public CHeapObj<mtGC> {
-private:
+public:
   // The type of a card entry.
   typedef uint16_t card_elem_t;
 
+private:
   // We need to make sizeof(SparsePRTEntry) an even multiple of maximum member size,
   // in order to force correct alignment that could otherwise cause SIGBUS errors
   // when reading the member variables. This calculates the minimum number of card
@@ -96,6 +97,8 @@
   // Copy the current entry's cards into the "_card" array of "e."
   inline void copy_cards(SparsePRTEntry* e) const;
 
+  card_elem_t* cards() { return _cards; }
+
   inline CardIdx_t card(int i) const {
     assert(i >= 0, "must be nonnegative");
     assert(i < cards_num(), "range checking");
@@ -106,7 +109,7 @@
 class RSHashTable : public CHeapObj<mtGC> {
 
   friend class RSHashTableIter;
-
+  friend class RSHashTableBucketIter;
 
   // Inverse maximum hash table occupancy used.
   static float TableOccupancyFactor;
@@ -209,12 +212,29 @@
   bool has_next(size_t& card_index);
 };
 
+// Iterates over the SparsePRTEntries in an RSHashTable, yielding one whole entry
+// (rather than individual cards) at a time by walking the hash table buckets.
+class RSHashTableBucketIter {
+  int _tbl_ind;         // [0.._rsht->_capacity)
+  int _bl_ind;          // [-1, 0.._rsht->_capacity)
+
+  RSHashTable* _rsht;
+
+public:
+  RSHashTableBucketIter(RSHashTable* rsht) :
+    _tbl_ind(0),
+    _bl_ind(rsht->_buckets[_tbl_ind]),
+    _rsht(rsht) { }
+
+  bool has_next(SparsePRTEntry*& entry);
+};
+
 // Concurrent access to a SparsePRT must be serialized by some external mutex.
 
 class SparsePRTIter;
 
 class SparsePRT {
   friend class SparsePRTIter;
+  friend class SparsePRTBucketIter;
 
   RSHashTable* _table;
 
@@ -262,4 +282,14 @@
   }
 };
 
+class SparsePRTBucketIter: public RSHashTableBucketIter {
+public:
+  SparsePRTBucketIter(const SparsePRT* sprt) :
+    RSHashTableBucketIter(sprt->_table) {}
+
+  bool has_next(SparsePRTEntry*& entry) {
+    return RSHashTableBucketIter::has_next(entry);
+  }
+};
+
 #endif // SHARE_GC_G1_SPARSEPRT_HPP
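For illustration, a typical way to drain a sparse table through the new iterator; process_card() is a placeholder here, not an existing helper. Each returned SparsePRTEntry carries a whole card array, so the caller consumes entries rather than single cards as SparsePRTIter does:

    SparsePRTBucketIter iter(&sprt); // sprt: a SparsePRT from some remembered set
    SparsePRTEntry* entry;
    while (iter.has_next(entry)) {
      for (int i = 0; i < entry->num_valid_cards(); i++) {
        process_card(entry->r_ind(), entry->card(i)); // region index + card index
      }
    }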
--- a/src/hotspot/share/gc/shared/cardTable.hpp	Thu Jun 27 03:33:44 2019 +0200
+++ b/src/hotspot/share/gc/shared/cardTable.hpp	Thu Jun 27 11:48:32 2019 +0200
@@ -103,15 +103,11 @@
 
   enum CardValues {
     clean_card                  = (CardValue)-1,
-    // The mask contains zeros in places for all other values.
-    clean_card_mask             = clean_card - 31,
 
     dirty_card                  =  0,
     precleaned_card             =  1,
-    claimed_card                =  2,
-    deferred_card               =  4,
-    last_card                   =  8,
-    CT_MR_BS_last_reserved      = 16
+    last_card                   =  2,
+    CT_MR_BS_last_reserved      =  4
   };
 
   // a word's worth (row) of clean card values
@@ -242,11 +238,8 @@
   };
 
   static CardValue clean_card_val()          { return clean_card; }
-  static CardValue clean_card_mask_val()     { return clean_card_mask; }
   static CardValue dirty_card_val()          { return dirty_card; }
-  static CardValue claimed_card_val()        { return claimed_card; }
   static CardValue precleaned_card_val()     { return precleaned_card; }
-  static CardValue deferred_card_val()       { return deferred_card; }
   static intptr_t clean_card_row_val()   { return clean_card_row; }
 
   // Card marking array base (adjusted for heap low boundary)
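With claimed_card and deferred_card gone, classifying a card no longer needs the clean_card_mask trick; a plain comparison against the clean value suffices. A sketch of the resulting test (illustrative only; G1's actual scan additionally claims chunks of the card table per worker before touching any cards):

    // Scan a range of cards, clearing each non-clean card before processing it.
    void scan_card_range(CardTable::CardValue* start, CardTable::CardValue* end) {
      for (CardTable::CardValue* cur = start; cur < end; cur++) {
        if (*cur != CardTable::clean_card_val()) { // plain compare, no masking
          *cur = CardTable::clean_card_val();
          // ... scan the heap words covered by this card ...
        }
      }
    }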
--- a/src/hotspot/share/gc/shared/workerDataArray.hpp	Thu Jun 27 03:33:44 2019 +0200
+++ b/src/hotspot/share/gc/shared/workerDataArray.hpp	Thu Jun 27 11:48:32 2019 +0200
@@ -34,7 +34,7 @@
 class WorkerDataArray  : public CHeapObj<mtGC> {
   friend class WDAPrinter;
 public:
-  static const uint MaxThreadWorkItems = 5;
+  static const uint MaxThreadWorkItems = 6;
 private:
   T*          _data;
   uint        _length;
--- a/src/hotspot/share/gc/shared/workerDataArray.inline.hpp	Thu Jun 27 03:33:44 2019 +0200
+++ b/src/hotspot/share/gc/shared/workerDataArray.inline.hpp	Thu Jun 27 11:48:32 2019 +0200
@@ -101,7 +101,7 @@
 template <typename T>
 void WorkerDataArray<T>::add(uint worker_i, T value) {
   assert(worker_i < _length, "Worker %d is greater than max: %d", worker_i, _length);
-  assert(_data[worker_i] != uninitialized(), "No data to add to for worker %d", worker_i);
+  assert(_data[worker_i] != uninitialized(), "No data to add to %s for worker %d", _title, worker_i);
   _data[worker_i] += value;
 }
 
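For context, each parallel phase owns a WorkerDataArray<double> of per-worker times plus up to MaxThreadWorkItems linked WorkerDataArray<size_t> sub-counters, so the bump from 5 to 6 makes room for one more sub-counter per phase. A sketch of the wiring, with made-up titles, indexes and variables (the real setup lives in g1GCPhaseTimes.cpp):

    WorkerDataArray<double>* scan =
        new WorkerDataArray<double>(num_workers, "Scan Heap Roots (ms):");
    scan->link_thread_work_items(
        new WorkerDataArray<size_t>(num_workers, "Scanned Cards:"), 0);
    scan->link_thread_work_items(
        new WorkerDataArray<size_t>(num_workers, "Claimed Chunks:"), 1);

    // From worker w during the phase:
    scan->set(w, elapsed_time);
    scan->set_thread_work_item(w, cards_scanned, 0); // add_thread_work_item() for later increments

The extended assert message above names the offending array via _title, so a counter that is incremented before ever being set can immediately be attributed to its phase.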
--- a/test/hotspot/jtreg/gc/g1/TestGCLogMessages.java	Thu Jun 27 03:33:44 2019 +0200
+++ b/test/hotspot/jtreg/gc/g1/TestGCLogMessages.java	Thu Jun 27 11:48:32 2019 +0200
@@ -95,21 +95,28 @@
         new LogMessageWithLevel("Post Evacuate Collection Set", Level.INFO),
         new LogMessageWithLevel("Other", Level.INFO),
 
-        // Update RS
-        new LogMessageWithLevel("Update RS", Level.DEBUG),
+        // Merge Heap Roots
+        new LogMessageWithLevel("Merge Heap Roots", Level.INFO),
+        new LogMessageWithLevel("Remembered Sets", Level.DEBUG),
+        new LogMessageWithLevel("Merged Sparse", Level.DEBUG),
+        new LogMessageWithLevel("Merged Fine", Level.DEBUG),
+        new LogMessageWithLevel("Merged Coarse", Level.DEBUG),
+        new LogMessageWithLevel("Hot Card Cache", Level.DEBUG),
+        new LogMessageWithLevel("Log Buffers", Level.DEBUG),
         new LogMessageWithLevel("Processed Buffers", Level.DEBUG),
-        new LogMessageWithLevel("Scanned Cards", Level.DEBUG),
+        new LogMessageWithLevel("Dirty Cards", Level.DEBUG),
         new LogMessageWithLevel("Skipped Cards", Level.DEBUG),
-        new LogMessageWithLevel("Scan HCC", Level.DEBUG),
-        // Scan RS
-        new LogMessageWithLevel("Scan RS", Level.DEBUG),
+        // Scan Heap Roots
+        new LogMessageWithLevel("Scan Heap Roots", Level.DEBUG),
         new LogMessageWithLevel("Scanned Cards", Level.DEBUG),
-        new LogMessageWithLevel("Claimed Cards", Level.DEBUG),
-        new LogMessageWithLevel("Skipped Cards", Level.DEBUG),
+        new LogMessageWithLevel("Scanned Blocks", Level.DEBUG),
+        new LogMessageWithLevel("Claimed Chunks", Level.DEBUG),
+        // Code Root Scan
+        new LogMessageWithLevel("Code Root Scan", Level.DEBUG),
         // Object Copy
         new LogMessageWithLevel("Object Copy", Level.DEBUG),
-        new LogMessageWithLevel("Scanned Cards", Level.DEBUG),
-        new LogMessageWithLevel("Claimed Cards", Level.DEBUG),
+        new LogMessageWithLevel("LAB Waste", Level.DEBUG),
+        new LogMessageWithLevel("LAB Undo Waste", Level.DEBUG),
         // Ext Root Scan
         new LogMessageWithLevel("Thread Roots", Level.TRACE),
         new LogMessageWithLevel("Universe Roots", Level.TRACE),
@@ -133,6 +140,7 @@
         new LogMessageWithLevel("Table Fixup", Level.DEBUG),
         new LogMessageWithLevel("Expand Heap After Collection", Level.DEBUG),
         new LogMessageWithLevel("Region Register", Level.DEBUG),
+        new LogMessageWithLevel("Prepare Heap Roots", Level.DEBUG),
         // Free CSet
         new LogMessageWithLevel("Free Collection Set", Level.DEBUG),
         new LogMessageWithLevel("Free Collection Set Serial", Level.TRACE),
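The renamed messages surface under the gc+phases log tags during young collections (e.g. -Xlog:gc+phases=debug), which is how this test observes them at the levels listed above.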
--- a/test/jdk/jdk/jfr/event/gc/collection/TestG1ParallelPhases.java	Thu Jun 27 03:33:44 2019 +0200
+++ b/test/jdk/jdk/jfr/event/gc/collection/TestG1ParallelPhases.java	Thu Jun 27 11:48:32 2019 +0200
@@ -100,20 +100,30 @@
             "CMRefRoots",
             "WaitForStrongCLD",
             "WeakCLDRoots",
-            "UpdateRS",
-            "ScanHCC",
-            "ScanRS",
+            "MergeHCC",
+            "MergeRS",
+            "MergeLB",
+            "ScanHR",
             "CodeRoots",
             "ObjCopy",
             "Termination",
             "StringDedupQueueFixup",
             "StringDedupTableFixup",
             "RedirtyCards",
-       //     "PreserveCMReferents",
             "NonYoungFreeCSet",
             "YoungFreeCSet"
         );
 
+        // Some GC phases may or may not occur depending on the environment. Filter
+        // them out since we cannot reliably guarantee whether they occur.
+        Set<String> optPhases = of(
+            "OptScanHR",
+            "OptMergeRS",
+            "OptCodeRoots",
+            "OptObjCopy"
+        );
+        usedPhases.removeAll(optPhases);
+
         assertTrue(usedPhases.equals(allPhases), "Compare events expected and received"
             + ", Not found phases: " + allPhases.stream().filter(p -> !usedPhases.contains(p)).collect(joining(", "))
             + ", Not expected phases: " + usedPhases.stream().filter(p -> !allPhases.contains(p)).collect(joining(", ")));
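The Opt* variants cover work on the optional part of the collection set, which is evacuated only when pause time predictions leave room for it; since their occurrence cannot be guaranteed either way, the test filters them out above.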