6923991: G1: improve scalability of RSet scanning
authoriveresov
Thu, 11 Feb 2010 15:52:19 -0800
changeset 4902 991aaddb5165
parent 4901 304ce755c6ee
child 4903 1837a9533d89
6923991: G1: improve scalability of RSet scanning Summary: Implemented block-based work stealing. Moved copying during the rset scanning phase to the main copying phase. Made the size of rset table depend on the region size. Reviewed-by: apetrusenko, tonyp
hotspot/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp
hotspot/src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp
hotspot/src/share/vm/gc_implementation/g1/g1CollectorPolicy.cpp
hotspot/src/share/vm/gc_implementation/g1/g1OopClosures.hpp
hotspot/src/share/vm/gc_implementation/g1/g1OopClosures.inline.hpp
hotspot/src/share/vm/gc_implementation/g1/g1RemSet.cpp
hotspot/src/share/vm/gc_implementation/g1/g1_globals.hpp
hotspot/src/share/vm/gc_implementation/g1/g1_specialized_oop_closures.hpp
hotspot/src/share/vm/gc_implementation/g1/heapRegionRemSet.cpp
hotspot/src/share/vm/gc_implementation/g1/heapRegionRemSet.hpp
hotspot/src/share/vm/gc_implementation/g1/sparsePRT.cpp
hotspot/src/share/vm/gc_implementation/g1/sparsePRT.hpp
hotspot/src/share/vm/memory/cardTableModRefBS.hpp
hotspot/src/share/vm/utilities/globalDefinitions.hpp
--- a/hotspot/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp	Tue Feb 09 13:56:09 2010 -0800
+++ b/hotspot/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp	Thu Feb 11 15:52:19 2010 -0800
@@ -2646,6 +2646,13 @@
 
 // </NEW PREDICTION>
 
+struct PrepareForRSScanningClosure : public HeapRegionClosure {
+  bool doHeapRegion(HeapRegion *r) {
+    r->rem_set()->set_iter_claimed(0);
+    return false;
+  }
+};
+
 void
 G1CollectedHeap::do_collection_pause_at_safepoint() {
   if (PrintHeapAtGC) {
@@ -2784,6 +2791,8 @@
         gclog_or_tty->print_cr("\nAfter pause, heap:");
         print();
 #endif
+        PrepareForRSScanningClosure prepare_for_rs_scan;
+        collection_set_iterate(&prepare_for_rs_scan);
 
         setup_surviving_young_words();
 
@@ -3781,22 +3790,16 @@
   return obj;
 }
 
-template <bool do_gen_barrier, G1Barrier barrier, bool do_mark_forwardee, bool skip_cset_test>
+template <bool do_gen_barrier, G1Barrier barrier, bool do_mark_forwardee>
 template <class T>
-void G1ParCopyClosure <do_gen_barrier, barrier, do_mark_forwardee, skip_cset_test>
+void G1ParCopyClosure <do_gen_barrier, barrier, do_mark_forwardee>
 ::do_oop_work(T* p) {
   oop obj = oopDesc::load_decode_heap_oop(p);
   assert(barrier != G1BarrierRS || obj != NULL,
          "Precondition: G1BarrierRS implies obj is nonNull");
 
-  // The only time we skip the cset test is when we're scanning
-  // references popped from the queue. And we only push on the queue
-  // references that we know point into the cset, so no point in
-  // checking again. But we'll leave an assert here for peace of mind.
-  assert(!skip_cset_test || _g1->obj_in_cs(obj), "invariant");
-
   // here the null check is implicit in the cset_fast_test() test
-  if (skip_cset_test || _g1->in_cset_fast_test(obj)) {
+  if (_g1->in_cset_fast_test(obj)) {
 #if G1_REM_SET_LOGGING
     gclog_or_tty->print_cr("Loc "PTR_FORMAT" contains pointer "PTR_FORMAT" "
                            "into CS.", p, (void*) obj);
@@ -3813,7 +3816,6 @@
     }
   }
 
-  // When scanning moved objs, must look at all oops.
   if (barrier == G1BarrierEvac && obj != NULL) {
     _par_scan_state->update_rs(_from, p, _par_scan_state->queue_num());
   }
@@ -3823,8 +3825,8 @@
   }
 }
 
-template void G1ParCopyClosure<false, G1BarrierEvac, false, true>::do_oop_work(oop* p);
-template void G1ParCopyClosure<false, G1BarrierEvac, false, true>::do_oop_work(narrowOop* p);
+template void G1ParCopyClosure<false, G1BarrierEvac, false>::do_oop_work(oop* p);
+template void G1ParCopyClosure<false, G1BarrierEvac, false>::do_oop_work(narrowOop* p);
 
 template <class T> void G1ParScanPartialArrayClosure::do_oop_nv(T* p) {
   assert(has_partial_array_mask(p), "invariant");
@@ -3896,11 +3898,11 @@
           assert(UseCompressedOops, "Error");
           narrowOop* p = (narrowOop*) stolen_task;
           assert(has_partial_array_mask(p) ||
-                 _g1h->obj_in_cs(oopDesc::load_decode_heap_oop(p)), "Error");
+                 _g1h->is_in_g1_reserved(oopDesc::load_decode_heap_oop(p)), "Error");
           pss->push_on_queue(p);
         } else {
           oop* p = (oop*) stolen_task;
-          assert(has_partial_array_mask(p) || _g1h->obj_in_cs(*p), "Error");
+          assert(has_partial_array_mask(p) || _g1h->is_in_g1_reserved(*p), "Error");
           pss->push_on_queue(p);
         }
         continue;
@@ -3962,6 +3964,7 @@
     G1ParScanExtRootClosure         only_scan_root_cl(_g1h, &pss);
     G1ParScanPermClosure            only_scan_perm_cl(_g1h, &pss);
     G1ParScanHeapRSClosure          only_scan_heap_rs_cl(_g1h, &pss);
+    G1ParPushHeapRSClosure          push_heap_rs_cl(_g1h, &pss);
 
     G1ParScanAndMarkExtRootClosure  scan_mark_root_cl(_g1h, &pss);
     G1ParScanAndMarkPermClosure     scan_mark_perm_cl(_g1h, &pss);
@@ -3985,7 +3988,7 @@
     _g1h->g1_process_strong_roots(/* not collecting perm */ false,
                                   SharedHeap::SO_AllClasses,
                                   scan_root_cl,
-                                  &only_scan_heap_rs_cl,
+                                  &push_heap_rs_cl,
                                   scan_so_cl,
                                   scan_perm_cl,
                                   i);
--- a/hotspot/src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp	Tue Feb 09 13:56:09 2010 -0800
+++ b/hotspot/src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp	Thu Feb 11 15:52:19 2010 -0800
@@ -1623,7 +1623,7 @@
   template <class T> void push_on_queue(T* ref) {
     assert(ref != NULL, "invariant");
     assert(has_partial_array_mask(ref) ||
-           _g1h->obj_in_cs(oopDesc::load_decode_heap_oop(ref)), "invariant");
+           _g1h->is_in_g1_reserved(oopDesc::load_decode_heap_oop(ref)), "invariant");
 #ifdef ASSERT
     if (has_partial_array_mask(ref)) {
       oop p = clear_partial_array_mask(ref);
@@ -1644,9 +1644,9 @@
       assert((oop*)ref != NULL, "pop_local() returned true");
       assert(UseCompressedOops || !ref.is_narrow(), "Error");
       assert(has_partial_array_mask((oop*)ref) ||
-             _g1h->obj_in_cs(ref.is_narrow() ? oopDesc::load_decode_heap_oop((narrowOop*)ref)
-                                             : oopDesc::load_decode_heap_oop((oop*)ref)),
-             "invariant");
+             _g1h->is_in_g1_reserved(ref.is_narrow() ? oopDesc::load_decode_heap_oop((narrowOop*)ref)
+                                                     : oopDesc::load_decode_heap_oop((oop*)ref)),
+              "invariant");
       IF_G1_DETAILED_STATS(note_pop());
     } else {
       StarTask null_task;
@@ -1659,9 +1659,9 @@
     assert((oop*)new_ref != NULL, "pop() from a local non-empty stack");
     assert(UseCompressedOops || !new_ref.is_narrow(), "Error");
     assert(has_partial_array_mask((oop*)new_ref) ||
-           _g1h->obj_in_cs(new_ref.is_narrow() ? oopDesc::load_decode_heap_oop((narrowOop*)new_ref)
-                                               : oopDesc::load_decode_heap_oop((oop*)new_ref)),
-             "invariant");
+           _g1h->is_in_g1_reserved(new_ref.is_narrow() ? oopDesc::load_decode_heap_oop((narrowOop*)new_ref)
+                                                       : oopDesc::load_decode_heap_oop((oop*)new_ref)),
+           "invariant");
     ref = new_ref;
   }
 
@@ -1825,12 +1825,12 @@
           assert(UseCompressedOops, "Error");
           narrowOop* p = (narrowOop*)ref_to_scan;
           assert(!has_partial_array_mask(p) &&
-                 _g1h->obj_in_cs(oopDesc::load_decode_heap_oop(p)), "sanity");
+                 _g1h->is_in_g1_reserved(oopDesc::load_decode_heap_oop(p)), "sanity");
           deal_with_reference(p);
         } else {
           oop* p = (oop*)ref_to_scan;
-          assert((has_partial_array_mask(p) && _g1h->obj_in_cs(clear_partial_array_mask(p))) ||
-                 _g1h->obj_in_cs(oopDesc::load_decode_heap_oop(p)), "sanity");
+          assert((has_partial_array_mask(p) && _g1h->is_in_g1_reserved(clear_partial_array_mask(p))) ||
+                 _g1h->is_in_g1_reserved(oopDesc::load_decode_heap_oop(p)), "sanity");
           deal_with_reference(p);
         }
       }
@@ -1844,12 +1844,12 @@
             assert(UseCompressedOops, "Error");
             narrowOop* p = (narrowOop*)ref_to_scan;
             assert(!has_partial_array_mask(p) &&
-                   _g1h->obj_in_cs(oopDesc::load_decode_heap_oop(p)), "sanity");
+                    _g1h->is_in_g1_reserved(oopDesc::load_decode_heap_oop(p)), "sanity");
             deal_with_reference(p);
           } else {
             oop* p = (oop*)ref_to_scan;
             assert((has_partial_array_mask(p) && _g1h->obj_in_cs(clear_partial_array_mask(p))) ||
-                  _g1h->obj_in_cs(oopDesc::load_decode_heap_oop(p)), "sanity");
+                   _g1h->is_in_g1_reserved(oopDesc::load_decode_heap_oop(p)), "sanity");
             deal_with_reference(p);
           }
         }
--- a/hotspot/src/share/vm/gc_implementation/g1/g1CollectorPolicy.cpp	Tue Feb 09 13:56:09 2010 -0800
+++ b/hotspot/src/share/vm/gc_implementation/g1/g1CollectorPolicy.cpp	Thu Feb 11 15:52:19 2010 -0800
@@ -205,6 +205,7 @@
   // policy is created before the heap, we have to set this up here,
   // so it's done as soon as possible.
   HeapRegion::setup_heap_region_size(Arguments::min_heap_size());
+  HeapRegionRemSet::setup_remset_size();
 
   _recent_prev_end_times_for_all_gcs_sec->add(os::elapsedTime());
   _prev_collection_pause_end_ms = os::elapsedTime() * 1000.0;
--- a/hotspot/src/share/vm/gc_implementation/g1/g1OopClosures.hpp	Tue Feb 09 13:56:09 2010 -0800
+++ b/hotspot/src/share/vm/gc_implementation/g1/g1OopClosures.hpp	Thu Feb 11 15:52:19 2010 -0800
@@ -53,6 +53,15 @@
   bool apply_to_weak_ref_discovered_field() { return true; }
 };
 
+class G1ParPushHeapRSClosure : public G1ParClosureSuper {
+public:
+  G1ParPushHeapRSClosure(G1CollectedHeap* g1, G1ParScanThreadState* par_scan_state) :
+    G1ParClosureSuper(g1, par_scan_state) { }
+  template <class T> void do_oop_nv(T* p);
+  virtual void do_oop(oop* p)          { do_oop_nv(p); }
+  virtual void do_oop(narrowOop* p)    { do_oop_nv(p); }
+};
+
 class G1ParScanClosure : public G1ParClosureSuper {
 public:
   G1ParScanClosure(G1CollectedHeap* g1, G1ParScanThreadState* par_scan_state) :
@@ -100,7 +109,7 @@
 };
 
 template<bool do_gen_barrier, G1Barrier barrier,
-         bool do_mark_forwardee, bool skip_cset_test>
+         bool do_mark_forwardee>
 class G1ParCopyClosure : public G1ParCopyHelper {
   G1ParScanClosure _scanner;
   template <class T> void do_oop_work(T* p);
@@ -116,12 +125,13 @@
   virtual void do_oop(narrowOop* p) { do_oop_nv(p); }
 };
 
-typedef G1ParCopyClosure<false, G1BarrierNone, false, false> G1ParScanExtRootClosure;
-typedef G1ParCopyClosure<true,  G1BarrierNone, false, false> G1ParScanPermClosure;
-typedef G1ParCopyClosure<false, G1BarrierRS,   false, false> G1ParScanHeapRSClosure;
-typedef G1ParCopyClosure<false, G1BarrierNone, true,  false> G1ParScanAndMarkExtRootClosure;
-typedef G1ParCopyClosure<true,  G1BarrierNone, true,  false> G1ParScanAndMarkPermClosure;
-typedef G1ParCopyClosure<false, G1BarrierRS,   true,  false> G1ParScanAndMarkHeapRSClosure;
+typedef G1ParCopyClosure<false, G1BarrierNone, false> G1ParScanExtRootClosure;
+typedef G1ParCopyClosure<true,  G1BarrierNone, false> G1ParScanPermClosure;
+typedef G1ParCopyClosure<false, G1BarrierRS,   false> G1ParScanHeapRSClosure;
+typedef G1ParCopyClosure<false, G1BarrierNone, true> G1ParScanAndMarkExtRootClosure;
+typedef G1ParCopyClosure<true,  G1BarrierNone, true> G1ParScanAndMarkPermClosure;
+typedef G1ParCopyClosure<false, G1BarrierRS,   true> G1ParScanAndMarkHeapRSClosure;
+
 // This is the only case when we set skip_cset_test. Basically, this
 // closure is (should?) only be called directly while we're draining
 // the overflow and task queues. In that case we know that the
@@ -132,7 +142,7 @@
 // We need a separate closure to handle references during evacuation
 // failure processing, as we cannot asume that the reference already
 // points into the collection set (like G1ParScanHeapEvacClosure does).
-typedef G1ParCopyClosure<false, G1BarrierEvac, false, false> G1ParScanHeapEvacFailureClosure;
+typedef G1ParCopyClosure<false, G1BarrierEvac, false> G1ParScanHeapEvacFailureClosure;
 
 class FilterIntoCSClosure: public OopClosure {
   G1CollectedHeap* _g1;
--- a/hotspot/src/share/vm/gc_implementation/g1/g1OopClosures.inline.hpp	Tue Feb 09 13:56:09 2010 -0800
+++ b/hotspot/src/share/vm/gc_implementation/g1/g1OopClosures.inline.hpp	Thu Feb 11 15:52:19 2010 -0800
@@ -104,3 +104,16 @@
     }
   }
 }
+
+template <class T> inline void G1ParPushHeapRSClosure::do_oop_nv(T* p) {
+  T heap_oop = oopDesc::load_heap_oop(p);
+
+  if (!oopDesc::is_null(heap_oop)) {
+    oop obj = oopDesc::decode_heap_oop_not_null(heap_oop);
+    if (_g1->in_cset_fast_test(obj)) {
+      Prefetch::write(obj->mark_addr(), 0);
+      Prefetch::read(obj->mark_addr(), (HeapWordSize*2));
+      _par_scan_state->push_on_queue(p);
+    }
+  }
+}
--- a/hotspot/src/share/vm/gc_implementation/g1/g1RemSet.cpp	Tue Feb 09 13:56:09 2010 -0800
+++ b/hotspot/src/share/vm/gc_implementation/g1/g1RemSet.cpp	Thu Feb 11 15:52:19 2010 -0800
@@ -155,8 +155,8 @@
   G1BlockOffsetSharedArray* _bot_shared;
   CardTableModRefBS *_ct_bs;
   int _worker_i;
+  int _block_size;
   bool _try_claimed;
-  size_t _min_skip_distance, _max_skip_distance;
 public:
   ScanRSClosure(OopsInHeapRegionClosure* oc, int worker_i) :
     _oc(oc),
@@ -168,8 +168,7 @@
     _g1h = G1CollectedHeap::heap();
     _bot_shared = _g1h->bot_shared();
     _ct_bs = (CardTableModRefBS*) (_g1h->barrier_set());
-    _min_skip_distance = 16;
-    _max_skip_distance = 2 * _g1h->n_par_threads() * _min_skip_distance;
+    _block_size = MAX2<int>(G1RSetScanBlockSize, 1);
   }
 
   void set_try_claimed() { _try_claimed = true; }
@@ -225,12 +224,15 @@
     HeapRegionRemSetIterator* iter = _g1h->rem_set_iterator(_worker_i);
     hrrs->init_iterator(iter);
     size_t card_index;
-    size_t skip_distance = 0, current_card = 0, jump_to_card = 0;
-    while (iter->has_next(card_index)) {
-      if (current_card < jump_to_card) {
-        ++current_card;
-        continue;
+
+    // We claim cards in block so as to recude the contention. The block size is determined by
+    // the G1RSetScanBlockSize parameter.
+    size_t jump_to_card = hrrs->iter_claimed_next(_block_size);
+    for (size_t current_card = 0; iter->has_next(card_index); current_card++) {
+      if (current_card >= jump_to_card + _block_size) {
+        jump_to_card = hrrs->iter_claimed_next(_block_size);
       }
+      if (current_card < jump_to_card) continue;
       HeapWord* card_start = _g1h->bot_shared()->address_for_index(card_index);
 #if 0
       gclog_or_tty->print("Rem set iteration yielded card [" PTR_FORMAT ", " PTR_FORMAT ").\n",
@@ -247,22 +249,14 @@
 
        // If the card is dirty, then we will scan it during updateRS.
       if (!card_region->in_collection_set() && !_ct_bs->is_card_dirty(card_index)) {
-          if (!_ct_bs->is_card_claimed(card_index) && _ct_bs->claim_card(card_index)) {
-            scanCard(card_index, card_region);
-          } else if (_try_claimed) {
-            if (jump_to_card == 0 || jump_to_card != current_card) {
-              // We did some useful work in the previous iteration.
-              // Decrease the distance.
-              skip_distance = MAX2(skip_distance >> 1, _min_skip_distance);
-            } else {
-              // Previous iteration resulted in a claim failure.
-              // Increase the distance.
-              skip_distance = MIN2(skip_distance << 1, _max_skip_distance);
-            }
-            jump_to_card = current_card + skip_distance;
-          }
+        // We make the card as "claimed" lazily (so races are possible but they're benign),
+        // which reduces the number of duplicate scans (the rsets of the regions in the cset
+        // can intersect).
+        if (!_ct_bs->is_card_claimed(card_index)) {
+          _ct_bs->set_card_claimed(card_index);
+          scanCard(card_index, card_region);
+        }
       }
-      ++current_card;
     }
     if (!_try_claimed) {
       hrrs->set_iter_complete();
@@ -299,30 +293,18 @@
   double rs_time_start = os::elapsedTime();
   HeapRegion *startRegion = calculateStartRegion(worker_i);
 
-  BufferingOopsInHeapRegionClosure boc(oc);
-  ScanRSClosure scanRScl(&boc, worker_i);
+  ScanRSClosure scanRScl(oc, worker_i);
   _g1->collection_set_iterate_from(startRegion, &scanRScl);
   scanRScl.set_try_claimed();
   _g1->collection_set_iterate_from(startRegion, &scanRScl);
 
-  boc.done();
-  double closure_app_time_sec = boc.closure_app_seconds();
-  double scan_rs_time_sec = (os::elapsedTime() - rs_time_start) -
-    closure_app_time_sec;
-  double closure_app_time_ms = closure_app_time_sec * 1000.0;
+  double scan_rs_time_sec = os::elapsedTime() - rs_time_start;
 
   assert( _cards_scanned != NULL, "invariant" );
   _cards_scanned[worker_i] = scanRScl.cards_done();
 
   _g1p->record_scan_rs_start_time(worker_i, rs_time_start * 1000.0);
   _g1p->record_scan_rs_time(worker_i, scan_rs_time_sec * 1000.0);
-
-  double scan_new_refs_time_ms = _g1p->get_scan_new_refs_time(worker_i);
-  if (scan_new_refs_time_ms > 0.0) {
-    closure_app_time_ms += scan_new_refs_time_ms;
-  }
-
-  _g1p->record_obj_copy_time(worker_i, closure_app_time_ms);
 }
 
 void HRInto_G1RemSet::updateRS(int worker_i) {
@@ -449,9 +431,8 @@
       oc->do_oop(p);
     }
   }
-  _g1p->record_scan_new_refs_time(worker_i,
-                                  (os::elapsedTime() - scan_new_refs_start_sec)
-                                  * 1000.0);
+  double scan_new_refs_time_ms = (os::elapsedTime() - scan_new_refs_start_sec) * 1000.0;
+  _g1p->record_scan_new_refs_time(worker_i, scan_new_refs_time_ms);
 }
 
 void HRInto_G1RemSet::cleanupHRRS() {
--- a/hotspot/src/share/vm/gc_implementation/g1/g1_globals.hpp	Tue Feb 09 13:56:09 2010 -0800
+++ b/hotspot/src/share/vm/gc_implementation/g1/g1_globals.hpp	Thu Feb 11 15:52:19 2010 -0800
@@ -207,8 +207,20 @@
   develop(bool, G1PrintOopAppls, false,                                     \
           "When true, print applications of closures to external locs.")    \
                                                                             \
-  develop(intx, G1LogRSRegionEntries, 7,                                    \
-          "Log_2 of max number of regions for which we keep bitmaps.")      \
+  develop(intx, G1RSetRegionEntriesBase, 256,                               \
+          "Max number of regions in a fine-grain table per MB.")            \
+                                                                            \
+  product(intx, G1RSetRegionEntries, 0,                                     \
+          "Max number of regions for which we keep bitmaps."                \
+          "Will be set ergonomically by default")                           \
+                                                                            \
+  develop(intx, G1RSetSparseRegionEntriesBase, 4,                           \
+          "Max number of entries per region in a sparse table "             \
+          "per MB.")                                                        \
+                                                                            \
+  product(intx, G1RSetSparseRegionEntries, 0,                               \
+          "Max number of entries per region in a sparse table."             \
+          "Will be set ergonomically by default.")                          \
                                                                             \
   develop(bool, G1RecordHRRSOops, false,                                    \
           "When true, record recent calls to rem set operations.")          \
@@ -293,6 +305,10 @@
   develop(bool, G1VerifyCTCleanup, false,                                   \
           "Verify card table cleanup.")                                     \
                                                                             \
+  product(uintx, G1RSetScanBlockSize, 64,                                   \
+          "Size of a work unit of cards claimed by a worker thread"         \
+          "during RSet scanning.")                                          \
+                                                                            \
   develop(bool, ReduceInitialCardMarksForG1, false,                         \
           "When ReduceInitialCardMarks is true, this flag setting "         \
           " controls whether G1 allows the RICM optimization")
--- a/hotspot/src/share/vm/gc_implementation/g1/g1_specialized_oop_closures.hpp	Tue Feb 09 13:56:09 2010 -0800
+++ b/hotspot/src/share/vm/gc_implementation/g1/g1_specialized_oop_closures.hpp	Thu Feb 11 15:52:19 2010 -0800
@@ -33,11 +33,12 @@
 };
 
 template<bool do_gen_barrier, G1Barrier barrier,
-         bool do_mark_forwardee, bool skip_cset_test>
+         bool do_mark_forwardee>
 class G1ParCopyClosure;
 class G1ParScanClosure;
+class G1ParPushHeapRSClosure;
 
-typedef G1ParCopyClosure<false, G1BarrierEvac, false, true> G1ParScanHeapEvacClosure;
+typedef G1ParCopyClosure<false, G1BarrierEvac, false> G1ParScanHeapEvacClosure;
 
 class FilterIntoCSClosure;
 class FilterOutOfRegionClosure;
@@ -51,6 +52,7 @@
 #define FURTHER_SPECIALIZED_OOP_OOP_ITERATE_CLOSURES(f) \
       f(G1ParScanHeapEvacClosure,_nv)                   \
       f(G1ParScanClosure,_nv)                           \
+      f(G1ParPushHeapRSClosure,_nv)                     \
       f(FilterIntoCSClosure,_nv)                        \
       f(FilterOutOfRegionClosure,_nv)                   \
       f(FilterInHeapRegionAndIntoCSClosure,_nv)         \
--- a/hotspot/src/share/vm/gc_implementation/g1/heapRegionRemSet.cpp	Tue Feb 09 13:56:09 2010 -0800
+++ b/hotspot/src/share/vm/gc_implementation/g1/heapRegionRemSet.cpp	Thu Feb 11 15:52:19 2010 -0800
@@ -505,12 +505,13 @@
   typedef PosParPRT* PosParPRTPtr;
   if (_max_fine_entries == 0) {
     assert(_mod_max_fine_entries_mask == 0, "Both or none.");
-    _max_fine_entries = (size_t)(1 << G1LogRSRegionEntries);
+    size_t max_entries_log = (size_t)log2_long((jlong)G1RSetRegionEntries);
+    _max_fine_entries = (size_t)(1 << max_entries_log);
     _mod_max_fine_entries_mask = _max_fine_entries - 1;
 #if SAMPLE_FOR_EVICTION
     assert(_fine_eviction_sample_size == 0
            && _fine_eviction_stride == 0, "All init at same time.");
-    _fine_eviction_sample_size = MAX2((size_t)4, (size_t)G1LogRSRegionEntries);
+    _fine_eviction_sample_size = MAX2((size_t)4, max_entries_log);
     _fine_eviction_stride = _max_fine_entries / _fine_eviction_sample_size;
 #endif
   }
@@ -655,13 +656,6 @@
 #endif
       }
 
-      // Otherwise, transfer from sparse to fine-grain.
-      CardIdx_t cards[SparsePRTEntry::CardsPerEntry];
-      if (G1HRRSUseSparseTable) {
-        bool res = _sparse_table.get_cards(from_hrs_ind, &cards[0]);
-        assert(res, "There should have been an entry");
-      }
-
       if (_n_fine_entries == _max_fine_entries) {
         prt = delete_region_table();
       } else {
@@ -676,10 +670,12 @@
       _fine_grain_regions[ind] = prt;
       _n_fine_entries++;
 
-      // Add in the cards from the sparse table.
       if (G1HRRSUseSparseTable) {
-        for (int i = 0; i < SparsePRTEntry::CardsPerEntry; i++) {
-          CardIdx_t c = cards[i];
+        // Transfer from sparse to fine-grain.
+        SparsePRTEntry *sprt_entry = _sparse_table.get_entry(from_hrs_ind);
+        assert(sprt_entry != NULL, "There should have been an entry");
+        for (int i = 0; i < SparsePRTEntry::cards_num(); i++) {
+          CardIdx_t c = sprt_entry->card(i);
           if (c != SparsePRTEntry::NullEntry) {
             prt->add_card(c);
           }
@@ -1084,6 +1080,19 @@
 {}
 
 
+void HeapRegionRemSet::setup_remset_size() {
+  // Setup sparse and fine-grain tables sizes.
+  // table_size = base * (log(region_size / 1M) + 1)
+  int region_size_log_mb = MAX2((int)HeapRegion::LogOfHRGrainBytes - (int)LOG_M, 0);
+  if (FLAG_IS_DEFAULT(G1RSetSparseRegionEntries)) {
+    G1RSetSparseRegionEntries = G1RSetSparseRegionEntriesBase * (region_size_log_mb + 1);
+  }
+  if (FLAG_IS_DEFAULT(G1RSetRegionEntries)) {
+    G1RSetRegionEntries = G1RSetRegionEntriesBase * (region_size_log_mb + 1);
+  }
+  guarantee(G1RSetSparseRegionEntries > 0 && G1RSetRegionEntries > 0 , "Sanity");
+}
+
 void HeapRegionRemSet::init_for_par_iteration() {
   _iter_state = Unclaimed;
 }
@@ -1399,7 +1408,7 @@
   os::sleep(Thread::current(), (jlong)5000, false);
   G1CollectedHeap* g1h = G1CollectedHeap::heap();
 
-  // Run with "-XX:G1LogRSRegionEntries=2", so that 1 and 5 end up in same
+  // Run with "-XX:G1LogRSetRegionEntries=2", so that 1 and 5 end up in same
   // hash bucket.
   HeapRegion* hr0 = g1h->region_at(0);
   HeapRegion* hr1 = g1h->region_at(1);
--- a/hotspot/src/share/vm/gc_implementation/g1/heapRegionRemSet.hpp	Tue Feb 09 13:56:09 2010 -0800
+++ b/hotspot/src/share/vm/gc_implementation/g1/heapRegionRemSet.hpp	Thu Feb 11 15:52:19 2010 -0800
@@ -187,7 +187,8 @@
   void clear_outgoing_entries();
 
   enum ParIterState { Unclaimed, Claimed, Complete };
-  ParIterState _iter_state;
+  volatile ParIterState _iter_state;
+  volatile jlong _iter_claimed;
 
   // Unused unless G1RecordHRRSOops is true.
 
@@ -209,6 +210,7 @@
                    HeapRegion* hr);
 
   static int num_par_rem_sets();
+  static void setup_remset_size();
 
   HeapRegion* hr() const {
     return _other_regions.hr();
@@ -272,6 +274,19 @@
   // Returns "true" iff the region's iteration is complete.
   bool iter_is_complete();
 
+  // Support for claiming blocks of cards during iteration
+  void set_iter_claimed(size_t x) { _iter_claimed = (jlong)x; }
+  size_t iter_claimed() const { return (size_t)_iter_claimed; }
+  // Claim the next block of cards
+  size_t iter_claimed_next(size_t step) {
+    size_t current, next;
+    do {
+      current = iter_claimed();
+      next = current + step;
+    } while (Atomic::cmpxchg((jlong)next, &_iter_claimed, (jlong)current) != (jlong)current);
+    return current;
+  }
+
   // Initialize the given iterator to iterate over this rem set.
   void init_iterator(HeapRegionRemSetIterator* iter) const;
 
--- a/hotspot/src/share/vm/gc_implementation/g1/sparsePRT.cpp	Tue Feb 09 13:56:09 2010 -0800
+++ b/hotspot/src/share/vm/gc_implementation/g1/sparsePRT.cpp	Thu Feb 11 15:52:19 2010 -0800
@@ -27,7 +27,7 @@
 
 #define SPARSE_PRT_VERBOSE 0
 
-#define UNROLL_CARD_LOOPS 1
+#define UNROLL_CARD_LOOPS  1
 
 void SparsePRT::init_iterator(SparsePRTIter* sprt_iter) {
     sprt_iter->init(this);
@@ -36,27 +36,32 @@
 void SparsePRTEntry::init(RegionIdx_t region_ind) {
   _region_ind = region_ind;
   _next_index = NullEntry;
+
 #if UNROLL_CARD_LOOPS
-  assert(CardsPerEntry == 4, "Assumption.  If changes, un-unroll.");
-  _cards[0] = NullEntry;
-  _cards[1] = NullEntry;
-  _cards[2] = NullEntry;
-  _cards[3] = NullEntry;
+  assert((cards_num() & (UnrollFactor - 1)) == 0, "Invalid number of cards in the entry");
+  for (int i = 0; i < cards_num(); i += UnrollFactor) {
+    _cards[i] = NullEntry;
+    _cards[i + 1] = NullEntry;
+    _cards[i + 2] = NullEntry;
+    _cards[i + 3] = NullEntry;
+  }
 #else
-  for (int i = 0; i < CardsPerEntry; i++)
+  for (int i = 0; i < cards_num(); i++)
     _cards[i] = NullEntry;
 #endif
 }
 
 bool SparsePRTEntry::contains_card(CardIdx_t card_index) const {
 #if UNROLL_CARD_LOOPS
-  assert(CardsPerEntry == 4, "Assumption.  If changes, un-unroll.");
-  if (_cards[0] == card_index) return true;
-  if (_cards[1] == card_index) return true;
-  if (_cards[2] == card_index) return true;
-  if (_cards[3] == card_index) return true;
+  assert((cards_num() & (UnrollFactor - 1)) == 0, "Invalid number of cards in the entry");
+  for (int i = 0; i < cards_num(); i += UnrollFactor) {
+    if (_cards[i] == card_index ||
+        _cards[i + 1] == card_index ||
+        _cards[i + 2] == card_index ||
+        _cards[i + 3] == card_index) return true;
+  }
 #else
-  for (int i = 0; i < CardsPerEntry; i++) {
+  for (int i = 0; i < cards_num(); i++) {
     if (_cards[i] == card_index) return true;
   }
 #endif
@@ -67,14 +72,16 @@
 int SparsePRTEntry::num_valid_cards() const {
   int sum = 0;
 #if UNROLL_CARD_LOOPS
-  assert(CardsPerEntry == 4, "Assumption.  If changes, un-unroll.");
-  if (_cards[0] != NullEntry) sum++;
-  if (_cards[1] != NullEntry) sum++;
-  if (_cards[2] != NullEntry) sum++;
-  if (_cards[3] != NullEntry) sum++;
+  assert((cards_num() & (UnrollFactor - 1)) == 0, "Invalid number of cards in the entry");
+  for (int i = 0; i < cards_num(); i += UnrollFactor) {
+    sum += (_cards[i] != NullEntry);
+    sum += (_cards[i + 1] != NullEntry);
+    sum += (_cards[i + 2] != NullEntry);
+    sum += (_cards[i + 3] != NullEntry);
+  }
 #else
-  for (int i = 0; i < CardsPerEntry; i++) {
-    if (_cards[i] != NulLEntry) sum++;
+  for (int i = 0; i < cards_num(); i++) {
+    sum += (_cards[i] != NullEntry);
   }
 #endif
   // Otherwise, we're full.
@@ -83,27 +90,27 @@
 
 SparsePRTEntry::AddCardResult SparsePRTEntry::add_card(CardIdx_t card_index) {
 #if UNROLL_CARD_LOOPS
-  assert(CardsPerEntry == 4, "Assumption.  If changes, un-unroll.");
-  CardIdx_t c = _cards[0];
-  if (c == card_index) return found;
-  if (c == NullEntry) { _cards[0] = card_index; return added; }
-  c = _cards[1];
-  if (c == card_index) return found;
-  if (c == NullEntry) { _cards[1] = card_index; return added; }
-  c = _cards[2];
-  if (c == card_index) return found;
-  if (c == NullEntry) { _cards[2] = card_index; return added; }
-  c = _cards[3];
-  if (c == card_index) return found;
-  if (c == NullEntry) { _cards[3] = card_index; return added; }
+  assert((cards_num() & (UnrollFactor - 1)) == 0, "Invalid number of cards in the entry");
+  CardIdx_t c;
+  for (int i = 0; i < cards_num(); i += UnrollFactor) {
+    c = _cards[i];
+    if (c == card_index) return found;
+    if (c == NullEntry) { _cards[i] = card_index; return added; }
+    c = _cards[i + 1];
+    if (c == card_index) return found;
+    if (c == NullEntry) { _cards[i + 1] = card_index; return added; }
+    c = _cards[i + 2];
+    if (c == card_index) return found;
+    if (c == NullEntry) { _cards[i + 2] = card_index; return added; }
+    c = _cards[i + 3];
+    if (c == card_index) return found;
+    if (c == NullEntry) { _cards[i + 3] = card_index; return added; }
+  }
 #else
-  for (int i = 0; i < CardsPerEntry; i++) {
+  for (int i = 0; i < cards_num(); i++) {
     CardIdx_t c = _cards[i];
     if (c == card_index) return found;
-    if (c == NullEntry) {
-      _cards[i] = card_index;
-      return added;
-    }
+    if (c == NullEntry) { _cards[i] = card_index; return added; }
   }
 #endif
   // Otherwise, we're full.
@@ -112,13 +119,15 @@
 
 void SparsePRTEntry::copy_cards(CardIdx_t* cards) const {
 #if UNROLL_CARD_LOOPS
-  assert(CardsPerEntry == 4, "Assumption.  If changes, un-unroll.");
-  cards[0] = _cards[0];
-  cards[1] = _cards[1];
-  cards[2] = _cards[2];
-  cards[3] = _cards[3];
+  assert((cards_num() & (UnrollFactor - 1)) == 0, "Invalid number of cards in the entry");
+  for (int i = 0; i < cards_num(); i += UnrollFactor) {
+    cards[i] = _cards[i];
+    cards[i + 1] = _cards[i + 1];
+    cards[i + 2] = _cards[i + 2];
+    cards[i + 3] = _cards[i + 3];
+  }
 #else
-  for (int i = 0; i < CardsPerEntry; i++) {
+  for (int i = 0; i < cards_num(); i++) {
     cards[i] = _cards[i];
   }
 #endif
@@ -133,7 +142,7 @@
 RSHashTable::RSHashTable(size_t capacity) :
   _capacity(capacity), _capacity_mask(capacity-1),
   _occupied_entries(0), _occupied_cards(0),
-  _entries(NEW_C_HEAP_ARRAY(SparsePRTEntry, capacity)),
+  _entries((SparsePRTEntry*)NEW_C_HEAP_ARRAY(char, SparsePRTEntry::size() * capacity)),
   _buckets(NEW_C_HEAP_ARRAY(int, capacity)),
   _free_list(NullEntry), _free_region(0)
 {
@@ -161,8 +170,8 @@
                 "_capacity too large");
 
   // This will put -1 == NullEntry in the key field of all entries.
-  memset(_entries, -1, _capacity * sizeof(SparsePRTEntry));
-  memset(_buckets, -1, _capacity * sizeof(int));
+  memset(_entries, NullEntry, _capacity * SparsePRTEntry::size());
+  memset(_buckets, NullEntry, _capacity * sizeof(int));
   _free_list = NullEntry;
   _free_region = 0;
 }
@@ -175,8 +184,8 @@
   if (res == SparsePRTEntry::added) _occupied_cards++;
 #if SPARSE_PRT_VERBOSE
   gclog_or_tty->print_cr("       after add_card[%d]: valid-cards = %d.",
-                pointer_delta(e, _entries, sizeof(SparsePRTEntry)),
-                e->num_valid_cards());
+                         pointer_delta(e, _entries, SparsePRTEntry::size()),
+                         e->num_valid_cards());
 #endif
   assert(e->num_valid_cards() > 0, "Postcondition");
   return res != SparsePRTEntry::overflow;
@@ -199,6 +208,22 @@
   return true;
 }
 
+SparsePRTEntry* RSHashTable::get_entry(RegionIdx_t region_ind) {
+  int ind = (int) (region_ind & capacity_mask());
+  int cur_ind = _buckets[ind];
+  SparsePRTEntry* cur;
+  while (cur_ind != NullEntry &&
+         (cur = entry(cur_ind))->r_ind() != region_ind) {
+    cur_ind = cur->next_index();
+  }
+
+  if (cur_ind == NullEntry) return NULL;
+  // Otherwise...
+  assert(cur->r_ind() == region_ind, "Postcondition of loop + test above.");
+  assert(cur->num_valid_cards() > 0, "Inv");
+  return cur;
+}
+
 bool RSHashTable::delete_entry(RegionIdx_t region_ind) {
   int ind = (int) (region_ind & capacity_mask());
   int* prev_loc = &_buckets[ind];
@@ -225,20 +250,8 @@
   int ind = (int) (region_ind & capacity_mask());
   int cur_ind = _buckets[ind];
   SparsePRTEntry* cur;
-  // XXX
-  // int k = 0;
   while (cur_ind != NullEntry &&
          (cur = entry(cur_ind))->r_ind() != region_ind) {
-    /*
-    k++;
-    if (k > 10) {
-      gclog_or_tty->print_cr("RSHashTable::entry_for_region_ind(%d): "
-                    "k = %d, cur_ind = %d.", region_ind, k, cur_ind);
-      if (k >= 1000) {
-        while (1) ;
-      }
-    }
-    */
     cur_ind = cur->next_index();
   }
 
@@ -319,7 +332,7 @@
 bool /* RSHashTable:: */ RSHashTableIter::has_next(size_t& card_index) {
   _card_ind++;
   CardIdx_t ci;
-  if (_card_ind < SparsePRTEntry::CardsPerEntry &&
+  if (_card_ind < SparsePRTEntry::cards_num() &&
       ((ci = _rsht->entry(_bl_ind)->card(_card_ind)) !=
        SparsePRTEntry::NullEntry)) {
     card_index = compute_card_ind(ci);
@@ -359,7 +372,7 @@
 
 size_t RSHashTable::mem_size() const {
   return sizeof(this) +
-    capacity() * (sizeof(SparsePRTEntry) + sizeof(int));
+    capacity() * (SparsePRTEntry::size() + sizeof(int));
 }
 
 // ----------------------------------------------------------------------
@@ -446,6 +459,10 @@
   return _next->get_cards(region_id, cards);
 }
 
+SparsePRTEntry* SparsePRT::get_entry(RegionIdx_t region_id) {
+  return _next->get_entry(region_id);
+}
+
 bool SparsePRT::delete_entry(RegionIdx_t region_id) {
   return _next->delete_entry(region_id);
 }
--- a/hotspot/src/share/vm/gc_implementation/g1/sparsePRT.hpp	Tue Feb 09 13:56:09 2010 -0800
+++ b/hotspot/src/share/vm/gc_implementation/g1/sparsePRT.hpp	Thu Feb 11 15:52:19 2010 -0800
@@ -32,21 +32,28 @@
 // insertions only enqueue old versions for deletions, but do not delete
 // old versions synchronously.
 
-
 class SparsePRTEntry: public CHeapObj {
 public:
-
   enum SomePublicConstants {
-    CardsPerEntry =  4,
-    NullEntry     = -1
+    NullEntry     = -1,
+    UnrollFactor  =  4
   };
-
 private:
   RegionIdx_t _region_ind;
   int         _next_index;
-  CardIdx_t   _cards[CardsPerEntry];
-
+  CardIdx_t   _cards[1];
+  // WARNING: Don't put any data members beyond this line. Card array has, in fact, variable length.
+  // It should always be the last data member.
 public:
+  // Returns the size of the entry, used for entry allocation.
+  static size_t size() { return sizeof(SparsePRTEntry) + sizeof(CardIdx_t) * (cards_num() - 1); }
+  // Returns the size of the card array.
+  static int cards_num() {
+    // The number of cards should be a multiple of 4, because that's our current
+    // unrolling factor.
+    static const int s = MAX2<int>(G1RSetSparseRegionEntries & ~(UnrollFactor - 1), UnrollFactor);
+    return s;
+  }
 
   // Set the region_ind to the given value, and delete all cards.
   inline void init(RegionIdx_t region_ind);
@@ -134,12 +141,15 @@
   bool add_card(RegionIdx_t region_id, CardIdx_t card_index);
 
   bool get_cards(RegionIdx_t region_id, CardIdx_t* cards);
+
   bool delete_entry(RegionIdx_t region_id);
 
   bool contains_card(RegionIdx_t region_id, CardIdx_t card_index) const;
 
   void add_entry(SparsePRTEntry* e);
 
+  SparsePRTEntry* get_entry(RegionIdx_t region_id);
+
   void clear();
 
   size_t capacity() const      { return _capacity;       }
@@ -148,7 +158,7 @@
   size_t occupied_cards() const   { return _occupied_cards;   }
   size_t mem_size() const;
 
-  SparsePRTEntry* entry(int i) const { return &_entries[i]; }
+  SparsePRTEntry* entry(int i) const { return (SparsePRTEntry*)((char*)_entries + SparsePRTEntry::size() * i); }
 
   void print();
 };
@@ -157,7 +167,7 @@
 class RSHashTableIter VALUE_OBJ_CLASS_SPEC {
   int _tbl_ind;         // [-1, 0.._rsht->_capacity)
   int _bl_ind;          // [-1, 0.._rsht->_capacity)
-  short _card_ind;      // [0..CardsPerEntry)
+  short _card_ind;      // [0..SparsePRTEntry::cards_num())
   RSHashTable* _rsht;
   size_t _heap_bot_card_ind;
 
@@ -176,7 +186,7 @@
   RSHashTableIter(size_t heap_bot_card_ind) :
     _tbl_ind(RSHashTable::NullEntry),
     _bl_ind(RSHashTable::NullEntry),
-    _card_ind((SparsePRTEntry::CardsPerEntry-1)),
+    _card_ind((SparsePRTEntry::cards_num() - 1)),
     _rsht(NULL),
     _heap_bot_card_ind(heap_bot_card_ind)
   {}
@@ -185,7 +195,7 @@
     _rsht = rsht;
     _tbl_ind = -1; // So that first increment gets to 0.
     _bl_ind = RSHashTable::NullEntry;
-    _card_ind = (SparsePRTEntry::CardsPerEntry-1);
+    _card_ind = (SparsePRTEntry::cards_num() - 1);
   }
 
   bool has_next(size_t& card_index);
@@ -241,9 +251,13 @@
 
   // If the table hold an entry for "region_ind",  Copies its
   // cards into "cards", which must be an array of length at least
-  // "CardsPerEntry", and returns "true"; otherwise, returns "false".
+  // "SparePRTEntry::cards_num()", and returns "true"; otherwise,
+  // returns "false".
   bool get_cards(RegionIdx_t region_ind, CardIdx_t* cards);
 
+  // Return the pointer to the entry associated with the given region.
+  SparsePRTEntry* get_entry(RegionIdx_t region_ind);
+
   // If there is an entry for "region_ind", removes it and return "true";
   // otherwise returns "false."
   bool delete_entry(RegionIdx_t region_ind);
--- a/hotspot/src/share/vm/memory/cardTableModRefBS.hpp	Tue Feb 09 13:56:09 2010 -0800
+++ b/hotspot/src/share/vm/memory/cardTableModRefBS.hpp	Thu Feb 11 15:52:19 2010 -0800
@@ -339,6 +339,16 @@
     return (val & (clean_card_mask_val() | claimed_card_val())) == claimed_card_val();
   }
 
+  void set_card_claimed(size_t card_index) {
+      jbyte val = _byte_map[card_index];
+      if (val == clean_card_val()) {
+        val = (jbyte)claimed_card_val();
+      } else {
+        val |= (jbyte)claimed_card_val();
+      }
+      _byte_map[card_index] = val;
+  }
+
   bool claim_card(size_t card_index);
 
   bool is_card_clean(size_t card_index) {
--- a/hotspot/src/share/vm/utilities/globalDefinitions.hpp	Tue Feb 09 13:56:09 2010 -0800
+++ b/hotspot/src/share/vm/utilities/globalDefinitions.hpp	Thu Feb 11 15:52:19 2010 -0800
@@ -139,6 +139,10 @@
 const size_t G                  = M*K;
 const size_t HWperKB            = K / sizeof(HeapWord);
 
+const size_t LOG_K              = 10;
+const size_t LOG_M              = 2 * LOG_K;
+const size_t LOG_G              = 2 * LOG_M;
+
 const jint min_jint = (jint)1 << (sizeof(jint)*BitsPerByte-1); // 0x80000000 == smallest jint
 const jint max_jint = (juint)min_jint - 1;                     // 0x7FFFFFFF == largest jint