6608385: G1: need to support parallel reference processing
authorjohnc
Tue, 25 Jan 2011 10:56:22 -0800
changeset 8073 a14dc27842dc
parent 8071 195789ab14f9
child 8074 b5905b50b907
6608385: G1: need to support parallel reference processing Summary: Implement support for ParallelRefProcEnabled in the reference processing that takes place at the end of G1 concurrent marking. Reviewed-by: tonyp, ysr
hotspot/src/share/vm/gc_implementation/g1/concurrentMark.cpp
hotspot/src/share/vm/gc_implementation/g1/concurrentMark.hpp
hotspot/src/share/vm/gc_implementation/g1/g1_globals.hpp
hotspot/src/share/vm/runtime/arguments.cpp
--- a/hotspot/src/share/vm/gc_implementation/g1/concurrentMark.cpp	Fri Jan 21 11:30:22 2011 -0500
+++ b/hotspot/src/share/vm/gc_implementation/g1/concurrentMark.cpp	Tue Jan 25 10:56:22 2011 -0800
@@ -1055,7 +1055,12 @@
       do {
         double start_vtime_sec = os::elapsedVTime();
         double start_time_sec = os::elapsedTime();
-        the_task->do_marking_step(10.0);
+        double mark_step_duration_ms = G1ConcMarkStepDurationMillis;
+
+        the_task->do_marking_step(mark_step_duration_ms,
+                                  true /* do_stealing    */,
+                                  true /* do_termination */);
+
         double end_time_sec = os::elapsedTime();
         double end_vtime_sec = os::elapsedVTime();
         double elapsed_vtime_sec = end_vtime_sec - start_vtime_sec;
@@ -1111,7 +1116,8 @@
 
   _restart_for_overflow = false;
 
-  set_phase(MAX2((size_t) 1, parallel_marking_threads()), true);
+  size_t active_workers = MAX2((size_t) 1, parallel_marking_threads());
+  set_phase(active_workers, true /* concurrent */);
 
   CMConcurrentMarkingTask markingTask(this, cmThread());
   if (parallel_marking_threads() > 0)
@@ -1176,6 +1182,12 @@
                                /* silent */           false,
                                /* use_prev_marking */ false);
     }
+    assert(!restart_for_overflow(), "sanity");
+  }
+
+  // Reset the marking state if marking completed
+  if (!restart_for_overflow()) {
+    set_non_marking_state();
   }
 
 #if VERIFY_OBJS_PROCESSED
@@ -1853,6 +1865,8 @@
   assert(local_free_list.is_empty(), "post-condition");
 }
 
+// Support closures for reference procssing in G1
+
 bool G1CMIsAliveClosure::do_object_b(oop obj) {
   HeapWord* addr = (HeapWord*)obj;
   return addr != NULL &&
@@ -1873,11 +1887,17 @@
   virtual void do_oop(      oop* p) { do_oop_work(p); }
 
   template <class T> void do_oop_work(T* p) {
-    oop thisOop = oopDesc::load_decode_heap_oop(p);
-    HeapWord* addr = (HeapWord*)thisOop;
-    if (_g1->is_in_g1_reserved(addr) && _g1->is_obj_ill(thisOop)) {
+    oop obj = oopDesc::load_decode_heap_oop(p);
+    HeapWord* addr = (HeapWord*)obj;
+
+    if (_cm->verbose_high())
+      gclog_or_tty->print_cr("\t[0] we're looking at location "
+                               "*"PTR_FORMAT" = "PTR_FORMAT,
+                               p, (void*) obj);
+
+    if (_g1->is_in_g1_reserved(addr) && _g1->is_obj_ill(obj)) {
       _bitMap->mark(addr);
-      _cm->mark_stack_push(thisOop);
+      _cm->mark_stack_push(obj);
     }
   }
 };
@@ -1899,6 +1919,199 @@
   }
 };
 
+// 'Keep Alive' closure used by parallel reference processing.
+// An instance of this closure is used in the parallel reference processing
+// code rather than an instance of G1CMKeepAliveClosure. We could have used
+// the G1CMKeepAliveClosure as it is MT-safe. Also reference objects are
+// placed on to discovered ref lists once so we can mark and push with no
+// need to check whether the object has already been marked. Using the
+// G1CMKeepAliveClosure would mean, however, having all the worker threads
+// operating on the global mark stack. This means that an individual
+// worker would be doing lock-free pushes while it processes its own
+// discovered ref list followed by drain call. If the discovered ref lists
+// are unbalanced then this could cause interference with the other
+// workers. Using a CMTask (and its embedded local data structures)
+// avoids that potential interference.
+class G1CMParKeepAliveAndDrainClosure: public OopClosure {
+  ConcurrentMark*  _cm;
+  CMTask*          _task;
+  CMBitMap*        _bitMap;
+  int              _ref_counter_limit;
+  int              _ref_counter;
+ public:
+  G1CMParKeepAliveAndDrainClosure(ConcurrentMark* cm,
+                                  CMTask* task,
+                                  CMBitMap* bitMap) :
+    _cm(cm), _task(task), _bitMap(bitMap),
+    _ref_counter_limit(G1RefProcDrainInterval)
+  {
+    assert(_ref_counter_limit > 0, "sanity");
+    _ref_counter = _ref_counter_limit;
+  }
+
+  virtual void do_oop(narrowOop* p) { do_oop_work(p); }
+  virtual void do_oop(      oop* p) { do_oop_work(p); }
+
+  template <class T> void do_oop_work(T* p) {
+    if (!_cm->has_overflown()) {
+      oop obj = oopDesc::load_decode_heap_oop(p);
+      if (_cm->verbose_high())
+        gclog_or_tty->print_cr("\t[%d] we're looking at location "
+                               "*"PTR_FORMAT" = "PTR_FORMAT,
+                               _task->task_id(), p, (void*) obj);
+
+      _task->deal_with_reference(obj);
+      _ref_counter--;
+
+      if (_ref_counter == 0) {
+        // We have dealt with _ref_counter_limit references, pushing them and objects
+        // reachable from them on to the local stack (and possibly the global stack).
+        // Call do_marking_step() to process these entries. We call the routine in a
+        // loop, which we'll exit if there's nothing more to do (i.e. we're done
+        // with the entries that we've pushed as a result of the deal_with_reference
+        // calls above) or we overflow.
+        // Note: CMTask::do_marking_step() can set the CMTask::has_aborted() flag
+        // while there may still be some work to do. (See the comment at the
+        // beginning of CMTask::do_marking_step() for those conditions - one of which
+        // is reaching the specified time target.) It is only when
+        // CMTask::do_marking_step() returns without setting the has_aborted() flag
+        // that the marking has completed.
+        do {
+          double mark_step_duration_ms = G1ConcMarkStepDurationMillis;
+          _task->do_marking_step(mark_step_duration_ms,
+                                 false /* do_stealing    */,
+                                 false /* do_termination */);
+        } while (_task->has_aborted() && !_cm->has_overflown());
+        _ref_counter = _ref_counter_limit;
+      }
+    } else {
+       if (_cm->verbose_high())
+         gclog_or_tty->print_cr("\t[%d] CM Overflow", _task->task_id());
+    }
+  }
+};
+
+class G1CMParDrainMarkingStackClosure: public VoidClosure {
+  ConcurrentMark* _cm;
+  CMTask* _task;
+ public:
+  G1CMParDrainMarkingStackClosure(ConcurrentMark* cm, CMTask* task) :
+    _cm(cm), _task(task)
+  {}
+
+  void do_void() {
+    do {
+      if (_cm->verbose_high())
+        gclog_or_tty->print_cr("\t[%d] Drain: Calling do marking_step", _task->task_id());
+
+      // We call CMTask::do_marking_step() to completely drain the local and
+      // global marking stacks. The routine is called in a loop, which we'll
+      // exit if there's nothing more to do (i.e. we'completely drained the
+      // entries that were pushed as a result of applying the
+      // G1CMParKeepAliveAndDrainClosure to the entries on the discovered ref
+      // lists above) or we overflow the global marking stack.
+      // Note: CMTask::do_marking_step() can set the CMTask::has_aborted() flag
+      // while there may still be some work to do. (See the comment at the
+      // beginning of CMTask::do_marking_step() for those conditions - one of which
+      // is reaching the specified time target.) It is only when
+      // CMTask::do_marking_step() returns without setting the has_aborted() flag
+      // that the marking has completed.
+
+      _task->do_marking_step(1000000000.0 /* something very large */,
+                             true /* do_stealing    */,
+                             true /* do_termination */);
+    } while (_task->has_aborted() && !_cm->has_overflown());
+  }
+};
+
+// Implementation of AbstractRefProcTaskExecutor for G1
+class G1RefProcTaskExecutor: public AbstractRefProcTaskExecutor {
+private:
+  G1CollectedHeap* _g1h;
+  ConcurrentMark*  _cm;
+  CMBitMap*        _bitmap;
+  WorkGang*        _workers;
+  int              _active_workers;
+
+public:
+  G1RefProcTaskExecutor(G1CollectedHeap* g1h,
+                        ConcurrentMark* cm,
+                        CMBitMap* bitmap,
+                        WorkGang* workers,
+                        int n_workers) :
+    _g1h(g1h), _cm(cm), _bitmap(bitmap),
+    _workers(workers), _active_workers(n_workers)
+  { }
+
+  // Executes the given task using concurrent marking worker threads.
+  virtual void execute(ProcessTask& task);
+  virtual void execute(EnqueueTask& task);
+};
+
+class G1RefProcTaskProxy: public AbstractGangTask {
+  typedef AbstractRefProcTaskExecutor::ProcessTask ProcessTask;
+  ProcessTask&     _proc_task;
+  G1CollectedHeap* _g1h;
+  ConcurrentMark*  _cm;
+  CMBitMap*        _bitmap;
+
+public:
+  G1RefProcTaskProxy(ProcessTask& proc_task,
+                     G1CollectedHeap* g1h,
+                     ConcurrentMark* cm,
+                     CMBitMap* bitmap) :
+    AbstractGangTask("Process reference objects in parallel"),
+    _proc_task(proc_task), _g1h(g1h), _cm(cm), _bitmap(bitmap)
+  {}
+
+  virtual void work(int i) {
+    CMTask* marking_task = _cm->task(i);
+    G1CMIsAliveClosure g1_is_alive(_g1h);
+    G1CMParKeepAliveAndDrainClosure g1_par_keep_alive(_cm, marking_task, _bitmap);
+    G1CMParDrainMarkingStackClosure g1_par_drain(_cm, marking_task);
+
+    _proc_task.work(i, g1_is_alive, g1_par_keep_alive, g1_par_drain);
+  }
+};
+
+void G1RefProcTaskExecutor::execute(ProcessTask& proc_task) {
+  assert(_workers != NULL, "Need parallel worker threads.");
+
+  G1RefProcTaskProxy proc_task_proxy(proc_task, _g1h, _cm, _bitmap);
+
+  // We need to reset the phase for each task execution so that
+  // the termination protocol of CMTask::do_marking_step works.
+  _cm->set_phase(_active_workers, false /* concurrent */);
+  _g1h->set_par_threads(_active_workers);
+  _workers->run_task(&proc_task_proxy);
+  _g1h->set_par_threads(0);
+}
+
+class G1RefEnqueueTaskProxy: public AbstractGangTask {
+  typedef AbstractRefProcTaskExecutor::EnqueueTask EnqueueTask;
+  EnqueueTask& _enq_task;
+
+public:
+  G1RefEnqueueTaskProxy(EnqueueTask& enq_task) :
+    AbstractGangTask("Enqueue reference objects in parallel"),
+    _enq_task(enq_task)
+  { }
+
+  virtual void work(int i) {
+    _enq_task.work(i);
+  }
+};
+
+void G1RefProcTaskExecutor::execute(EnqueueTask& enq_task) {
+  assert(_workers != NULL, "Need parallel worker threads.");
+
+  G1RefEnqueueTaskProxy enq_task_proxy(enq_task);
+
+  _g1h->set_par_threads(_active_workers);
+  _workers->run_task(&enq_task_proxy);
+  _g1h->set_par_threads(0);
+}
+
 void ConcurrentMark::weakRefsWork(bool clear_all_soft_refs) {
   ResourceMark rm;
   HandleMark   hm;
@@ -1917,18 +2130,52 @@
   G1CMDrainMarkingStackClosure
     g1_drain_mark_stack(nextMarkBitMap(), &_markStack, &g1_keep_alive);
 
-  // XXXYYY  Also: copy the parallel ref processing code from CMS.
-  rp->process_discovered_references(&g1_is_alive,
-                                    &g1_keep_alive,
-                                    &g1_drain_mark_stack,
-                                    NULL);
+  // We use the work gang from the G1CollectedHeap and we utilize all
+  // the worker threads.
+  int active_workers = MAX2(MIN2(g1h->workers()->total_workers(), (int)_max_task_num), 1);
+
+  G1RefProcTaskExecutor par_task_executor(g1h, this, nextMarkBitMap(),
+                                          g1h->workers(), active_workers);
+
+  if (rp->processing_is_mt()) {
+    // Set the degree of MT here.  If the discovery is done MT, there
+    // may have been a different number of threads doing the discovery
+    // and a different number of discovered lists may have Ref objects.
+    // That is OK as long as the Reference lists are balanced (see
+    // balance_all_queues() and balance_queues()).
+    rp->set_mt_degree(active_workers);
+
+    rp->process_discovered_references(&g1_is_alive,
+                                      &g1_keep_alive,
+                                      &g1_drain_mark_stack,
+                                      &par_task_executor);
+
+    // The work routines of the parallel keep_alive and drain_marking_stack
+    // will set the has_overflown flag if we overflow the global marking
+    // stack.
+  } else {
+    rp->process_discovered_references(&g1_is_alive,
+                                      &g1_keep_alive,
+                                      &g1_drain_mark_stack,
+                                      NULL);
+
+  }
+
   assert(_markStack.overflow() || _markStack.isEmpty(),
-         "mark stack should be empty (unless it overflowed)");
+      "mark stack should be empty (unless it overflowed)");
   if (_markStack.overflow()) {
+    // Should have been done already when we tried to push an
+    // entry on to the global mark stack. But let's do it again.
     set_has_overflown();
   }
 
-  rp->enqueue_discovered_references();
+  if (rp->processing_is_mt()) {
+    assert(rp->num_q() == active_workers, "why not");
+    rp->enqueue_discovered_references(&par_task_executor);
+  } else {
+    rp->enqueue_discovered_references();
+  }
+
   rp->verify_no_references_recorded();
   assert(!rp->discovery_enabled(), "should have been disabled");
 
@@ -1955,7 +2202,9 @@
       CMTask* task = _cm->task(worker_i);
       task->record_start_time();
       do {
-        task->do_marking_step(1000000000.0 /* something very large */);
+        task->do_marking_step(1000000000.0 /* something very large */,
+                              true /* do_stealing    */,
+                              true /* do_termination */);
       } while (task->has_aborted() && !_cm->has_overflown());
       // If we overflow, then we do not want to restart. We instead
       // want to abort remark and do concurrent marking again.
@@ -1978,7 +2227,7 @@
     G1CollectedHeap::StrongRootsScope srs(g1h);
     // this is remark, so we'll use up all available threads
     int active_workers = ParallelGCThreads;
-    set_phase(active_workers, false);
+    set_phase(active_workers, false /* concurrent */);
 
     CMRemarkTask remarkTask(this);
     // We will start all available threads, even if we decide that the
@@ -1992,7 +2241,7 @@
     G1CollectedHeap::StrongRootsScope srs(g1h);
     // this is remark, so we'll use up all available threads
     int active_workers = 1;
-    set_phase(active_workers, false);
+    set_phase(active_workers, false /* concurrent */);
 
     CMRemarkTask remarkTask(this);
     // We will start all available threads, even if we decide that the
@@ -2005,9 +2254,6 @@
 
   print_stats();
 
-  if (!restart_for_overflow())
-    set_non_marking_state();
-
 #if VERIFY_OBJS_PROCESSED
   if (_scan_obj_cl.objs_processed != ThreadLocalObjQueue::objs_enqueued) {
     gclog_or_tty->print_cr("Processed = %d, enqueued = %d.",
@@ -3124,7 +3370,7 @@
           // do nothing
         }
 #else // _CHECK_BOTH_FINGERS_
-      // we will only check the global finger
+        // we will only check the global finger
 
         if (objAddr < global_finger) {
           // see long comment above
@@ -3249,7 +3495,7 @@
   double elapsed_time_ms = curr_time_ms - _start_time_ms;
   if (elapsed_time_ms > _time_target_ms) {
     set_has_aborted();
-    _has_aborted_timed_out = true;
+    _has_timed_out = true;
     statsOnly( ++_aborted_timed_out );
     return;
   }
@@ -3754,7 +4000,9 @@
 
  *****************************************************************************/
 
-void CMTask::do_marking_step(double time_target_ms) {
+void CMTask::do_marking_step(double time_target_ms,
+                             bool do_stealing,
+                             bool do_termination) {
   assert(time_target_ms >= 1.0, "minimum granularity is 1ms");
   assert(concurrent() == _cm->concurrent(), "they should be the same");
 
@@ -3794,7 +4042,7 @@
 
   // clear all flags
   clear_has_aborted();
-  _has_aborted_timed_out = false;
+  _has_timed_out = false;
   _draining_satb_buffers = false;
 
   ++_calls;
@@ -3970,7 +4218,7 @@
   drain_global_stack(false);
 
   // Attempt at work stealing from other task's queues.
-  if (!has_aborted()) {
+  if (do_stealing && !has_aborted()) {
     // We have not aborted. This means that we have finished all that
     // we could. Let's try to do some stealing...
 
@@ -4011,7 +4259,7 @@
 
   // We still haven't aborted. Now, let's try to get into the
   // termination protocol.
-  if (!has_aborted()) {
+  if (do_termination && !has_aborted()) {
     // We cannot check whether the global stack is empty, since other
     // tasks might be concurrently pushing objects on it. We also cannot
     // check if the region stack is empty because if a thread is aborting
@@ -4087,7 +4335,7 @@
 
     statsOnly( ++_aborted );
 
-    if (_has_aborted_timed_out) {
+    if (_has_timed_out) {
       double diff_ms = elapsed_time_ms - _time_target_ms;
       // Keep statistics of how well we did with respect to hitting
       // our target only if we actually timed out (if we aborted for
--- a/hotspot/src/share/vm/gc_implementation/g1/concurrentMark.hpp	Fri Jan 21 11:30:22 2011 -0500
+++ b/hotspot/src/share/vm/gc_implementation/g1/concurrentMark.hpp	Tue Jan 25 10:56:22 2011 -0800
@@ -353,6 +353,10 @@
   friend class CMConcurrentMarkingTask;
   friend class G1ParNoteEndTask;
   friend class CalcLiveObjectsClosure;
+  friend class G1RefProcTaskProxy;
+  friend class G1RefProcTaskExecutor;
+  friend class G1CMParKeepAliveAndDrainClosure;
+  friend class G1CMParDrainMarkingStackClosure;
 
 protected:
   ConcurrentMarkThread* _cmThread;   // the thread doing the work
@@ -936,7 +940,7 @@
   // if this is true, then the task has aborted for some reason
   bool                        _has_aborted;
   // set when the task aborts because it has met its time quota
-  bool                        _has_aborted_timed_out;
+  bool                        _has_timed_out;
   // true when we're draining SATB buffers; this avoids the task
   // aborting due to SATB buffers being available (as we're already
   // dealing with them)
@@ -1041,7 +1045,7 @@
   // trying not to exceed the given duration. However, it might exit
   // prematurely, according to some conditions (i.e. SATB buffers are
   // available for processing).
-  void do_marking_step(double target_ms);
+  void do_marking_step(double target_ms, bool do_stealing, bool do_termination);
 
   // These two calls start and stop the timer
   void record_start_time() {
@@ -1063,7 +1067,8 @@
   bool has_aborted()            { return _has_aborted; }
   void set_has_aborted()        { _has_aborted = true; }
   void clear_has_aborted()      { _has_aborted = false; }
-  bool claimed() { return _claimed; }
+  bool has_timed_out()          { return _has_timed_out; }
+  bool claimed()                { return _claimed; }
 
   // Support routines for the partially scanned region that may be
   // recorded as a result of aborting while draining the CMRegionStack
--- a/hotspot/src/share/vm/gc_implementation/g1/g1_globals.hpp	Fri Jan 21 11:30:22 2011 -0500
+++ b/hotspot/src/share/vm/gc_implementation/g1/g1_globals.hpp	Tue Jan 25 10:56:22 2011 -0800
@@ -81,6 +81,14 @@
   product(intx, G1MarkRegionStackSize, 1024 * 1024,                         \
           "Size of the region stack for concurrent marking.")               \
                                                                             \
+  product(double, G1ConcMarkStepDurationMillis, 10.0,                       \
+          "Target duration of individual concurrent marking steps "         \
+          "in milliseconds.")                                               \
+                                                                            \
+  product(intx, G1RefProcDrainInterval, 10,                                 \
+          "The number of discovered reference objects to process before "   \
+          "draining concurrent marking work queues.")                       \
+                                                                            \
   develop(bool, G1SATBBarrierPrintNullPreVals, false,                       \
           "If true, count frac of ptr writes with null pre-vals.")          \
                                                                             \
--- a/hotspot/src/share/vm/runtime/arguments.cpp	Fri Jan 21 11:30:22 2011 -0500
+++ b/hotspot/src/share/vm/runtime/arguments.cpp	Tue Jan 25 10:56:22 2011 -0800
@@ -1941,10 +1941,16 @@
     status = false;
   }
 
+#ifndef SERIALGC
   if (UseG1GC) {
     status = status && verify_percentage(InitiatingHeapOccupancyPercent,
                                          "InitiatingHeapOccupancyPercent");
+    status = status && verify_min_value(G1RefProcDrainInterval, 1,
+                                        "G1RefProcDrainInterval");
+    status = status && verify_min_value((intx)G1ConcMarkStepDurationMillis, 1,
+                                        "G1ConcMarkStepDurationMillis");
   }
+#endif
 
   status = status && verify_interval(RefDiscoveryPolicy,
                                      ReferenceProcessor::DiscoveryPolicyMin,