6672778: G1 should trim task queues more aggressively during evacuation pauses
authortschatzl
Fri, 27 Apr 2018 12:06:46 +0200
changeset 49911 358be4680d12
parent 49910 c822dd1a3b66
child 49912 64d4f6fa21e5
child 56497 598e0c5b6f8c
6672778: G1 should trim task queues more aggressively during evacuation pauses Summary: Regularly try to drain task queues. This helps memory usage and performance during garbage collection. Reviewed-by: sjohanss, sangheki
src/hotspot/share/gc/g1/bufferingOopClosure.hpp
src/hotspot/share/gc/g1/g1Arguments.cpp
src/hotspot/share/gc/g1/g1CollectedHeap.cpp
src/hotspot/share/gc/g1/g1GCPhaseTimes.cpp
src/hotspot/share/gc/g1/g1GCPhaseTimes.hpp
src/hotspot/share/gc/g1/g1OopClosures.cpp
src/hotspot/share/gc/g1/g1OopClosures.hpp
src/hotspot/share/gc/g1/g1OopClosures.inline.hpp
src/hotspot/share/gc/g1/g1ParScanThreadState.cpp
src/hotspot/share/gc/g1/g1ParScanThreadState.hpp
src/hotspot/share/gc/g1/g1ParScanThreadState.inline.hpp
src/hotspot/share/gc/g1/g1Policy.cpp
src/hotspot/share/gc/g1/g1RemSet.cpp
src/hotspot/share/gc/g1/g1RemSet.hpp
src/hotspot/share/gc/g1/g1RootClosures.cpp
src/hotspot/share/gc/g1/g1RootClosures.hpp
src/hotspot/share/gc/g1/g1RootProcessor.cpp
src/hotspot/share/gc/g1/g1RootProcessor.hpp
src/hotspot/share/gc/g1/g1SharedClosures.hpp
src/hotspot/share/gc/shared/taskqueue.hpp
src/hotspot/share/gc/shared/taskqueue.inline.hpp
src/hotspot/share/utilities/ticks.hpp
src/hotspot/share/utilities/ticks.inline.hpp
test/hotspot/gtest/gc/g1/test_bufferingOopClosure.cpp
--- a/src/hotspot/share/gc/g1/bufferingOopClosure.hpp	Fri Apr 27 10:59:46 2018 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,144 +0,0 @@
-/*
- * Copyright (c) 2001, 2015, Oracle and/or its affiliates. All rights reserved.
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This code is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 only, as
- * published by the Free Software Foundation.
- *
- * This code is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
- * version 2 for more details (a copy is included in the LICENSE file that
- * accompanied this code).
- *
- * You should have received a copy of the GNU General Public License version
- * 2 along with this work; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
- * or visit www.oracle.com if you need additional information or have any
- * questions.
- *
- */
-
-#ifndef SHARE_VM_GC_G1_BUFFERINGOOPCLOSURE_HPP
-#define SHARE_VM_GC_G1_BUFFERINGOOPCLOSURE_HPP
-
-#include "memory/iterator.hpp"
-#include "oops/oopsHierarchy.hpp"
-#include "runtime/os.hpp"
-#include "utilities/debug.hpp"
-
-// A BufferingOops closure tries to separate out the cost of finding roots
-// from the cost of applying closures to them.  It maintains an array of
-// ref-containing locations.  Until the array is full, applying the closure
-// to an oop* merely records that location in the array.  Since this
-// closure app cost is small, an elapsed timer can approximately attribute
-// all of this cost to the cost of finding the roots.  When the array fills
-// up, the wrapped closure is applied to all elements, keeping track of
-// this elapsed time of this process, and leaving the array empty.
-// The caller must be sure to call "done" to process any unprocessed
-// buffered entries.
-
-class BufferingOopClosure: public OopClosure {
-  friend class BufferingOopClosureTest;
-protected:
-  static const size_t BufferLength = 1024;
-
-  // We need to know if the buffered addresses contain oops or narrowOops.
-  // We can't tag the addresses the way StarTask does, because we need to
-  // be able to handle unaligned addresses coming from oops embedded in code.
-  //
-  // The addresses for the full-sized oops are filled in from the bottom,
-  // while the addresses for the narrowOops are filled in from the top.
-  OopOrNarrowOopStar  _buffer[BufferLength];
-  OopOrNarrowOopStar* _oop_top;
-  OopOrNarrowOopStar* _narrowOop_bottom;
-
-  OopClosure* _oc;
-  double      _closure_app_seconds;
-
-
-  bool is_buffer_empty() {
-    return _oop_top == _buffer && _narrowOop_bottom == (_buffer + BufferLength - 1);
-  }
-
-  bool is_buffer_full() {
-    return _narrowOop_bottom < _oop_top;
-  }
-
-  // Process addresses containing full-sized oops.
-  void process_oops() {
-    for (OopOrNarrowOopStar* curr = _buffer; curr < _oop_top; ++curr) {
-      _oc->do_oop((oop*)(*curr));
-    }
-    _oop_top = _buffer;
-  }
-
-  // Process addresses containing narrow oops.
-  void process_narrowOops() {
-    for (OopOrNarrowOopStar* curr = _buffer + BufferLength - 1; curr > _narrowOop_bottom; --curr) {
-      _oc->do_oop((narrowOop*)(*curr));
-    }
-    _narrowOop_bottom = _buffer + BufferLength - 1;
-  }
-
-  // Apply the closure to all oops and clear the buffer.
-  // Accumulate the time it took.
-  void process_buffer() {
-    double start = os::elapsedTime();
-
-    process_oops();
-    process_narrowOops();
-
-    _closure_app_seconds += (os::elapsedTime() - start);
-  }
-
-  void process_buffer_if_full() {
-    if (is_buffer_full()) {
-      process_buffer();
-    }
-  }
-
-  void add_narrowOop(narrowOop* p) {
-    assert(!is_buffer_full(), "Buffer should not be full");
-    *_narrowOop_bottom = (OopOrNarrowOopStar)p;
-    _narrowOop_bottom--;
-  }
-
-  void add_oop(oop* p) {
-    assert(!is_buffer_full(), "Buffer should not be full");
-    *_oop_top = (OopOrNarrowOopStar)p;
-    _oop_top++;
-  }
-
-public:
-  virtual void do_oop(narrowOop* p) {
-    process_buffer_if_full();
-    add_narrowOop(p);
-  }
-
-  virtual void do_oop(oop* p)       {
-    process_buffer_if_full();
-    add_oop(p);
-  }
-
-  void done() {
-    if (!is_buffer_empty()) {
-      process_buffer();
-    }
-  }
-
-  double closure_app_seconds() {
-    return _closure_app_seconds;
-  }
-
-  BufferingOopClosure(OopClosure *oc) :
-    _oc(oc),
-    _oop_top(_buffer),
-    _narrowOop_bottom(_buffer + BufferLength - 1),
-    _closure_app_seconds(0.0) { }
-};
-
-#endif // SHARE_VM_GC_G1_BUFFERINGOOPCLOSURE_HPP
--- a/src/hotspot/share/gc/g1/g1Arguments.cpp	Fri Apr 27 10:59:46 2018 +0200
+++ b/src/hotspot/share/gc/g1/g1Arguments.cpp	Fri Apr 27 12:06:46 2018 +0200
@@ -126,6 +126,11 @@
 
   log_trace(gc)("MarkStackSize: %uk  MarkStackSizeMax: %uk", (unsigned int) (MarkStackSize / K), (uint) (MarkStackSizeMax / K));
 
+  // By default do not let the target stack size to be more than 1/4 of the entries
+  if (FLAG_IS_DEFAULT(GCDrainStackTargetSize)) {
+    FLAG_SET_ERGO(uintx, GCDrainStackTargetSize, MIN2(GCDrainStackTargetSize, (uintx)TASKQUEUE_SIZE / 4));
+  }
+
 #ifdef COMPILER2
   // Enable loop strip mining to offer better pause time guarantees
   if (FLAG_IS_DEFAULT(UseCountedLoopSafepoints)) {
--- a/src/hotspot/share/gc/g1/g1CollectedHeap.cpp	Fri Apr 27 10:59:46 2018 +0200
+++ b/src/hotspot/share/gc/g1/g1CollectedHeap.cpp	Fri Apr 27 12:06:46 2018 +0200
@@ -28,7 +28,6 @@
 #include "classfile/symbolTable.hpp"
 #include "code/codeCache.hpp"
 #include "code/icBuffer.hpp"
-#include "gc/g1/bufferingOopClosure.hpp"
 #include "gc/g1/g1Allocator.inline.hpp"
 #include "gc/g1/g1BarrierSet.hpp"
 #include "gc/g1/g1CollectedHeap.inline.hpp"
@@ -1841,7 +1840,7 @@
   while (dcqs.apply_closure_during_gc(cl, worker_i)) {
     n_completed_buffers++;
   }
-  g1_policy()->phase_times()->record_thread_work_item(G1GCPhaseTimes::UpdateRS, worker_i, n_completed_buffers);
+  g1_policy()->phase_times()->record_thread_work_item(G1GCPhaseTimes::UpdateRS, worker_i, n_completed_buffers, G1GCPhaseTimes::UpdateRSProcessedBuffers);
   dcqs.clear_n_completed_buffers();
   assert(!dcqs.completed_buffers_exist_dirty(), "Completed buffers exist!");
 }
@@ -3130,15 +3129,13 @@
 
       double start_strong_roots_sec = os::elapsedTime();
 
-      _root_processor->evacuate_roots(pss->closures(), worker_id);
+      _root_processor->evacuate_roots(pss, worker_id);
 
       // We pass a weak code blobs closure to the remembered set scanning because we want to avoid
       // treating the nmethods visited to act as roots for concurrent marking.
       // We only want to make sure that the oops in the nmethods are adjusted with regard to the
       // objects copied by the current evacuation.
-      _g1h->g1_rem_set()->oops_into_collection_set_do(pss,
-                                                      pss->closures()->weak_codeblobs(),
-                                                      worker_id);
+      _g1h->g1_rem_set()->oops_into_collection_set_do(pss, worker_id);
 
       double strong_roots_sec = os::elapsedTime() - start_strong_roots_sec;
 
@@ -3152,9 +3149,11 @@
         evac_term_attempts = evac.term_attempts();
         term_sec = evac.term_time();
         double elapsed_sec = os::elapsedTime() - start;
-        _g1h->g1_policy()->phase_times()->add_time_secs(G1GCPhaseTimes::ObjCopy, worker_id, elapsed_sec - term_sec);
-        _g1h->g1_policy()->phase_times()->record_time_secs(G1GCPhaseTimes::Termination, worker_id, term_sec);
-        _g1h->g1_policy()->phase_times()->record_thread_work_item(G1GCPhaseTimes::Termination, worker_id, evac_term_attempts);
+
+        G1GCPhaseTimes* p = _g1h->g1_policy()->phase_times();
+        p->add_time_secs(G1GCPhaseTimes::ObjCopy, worker_id, elapsed_sec - term_sec);
+        p->record_time_secs(G1GCPhaseTimes::Termination, worker_id, term_sec);
+        p->record_thread_work_item(G1GCPhaseTimes::Termination, worker_id, evac_term_attempts);
       }
 
       assert(pss->queue_is_empty(), "should be empty");
--- a/src/hotspot/share/gc/g1/g1GCPhaseTimes.cpp	Fri Apr 27 10:59:46 2018 +0200
+++ b/src/hotspot/share/gc/g1/g1GCPhaseTimes.cpp	Fri Apr 27 12:06:46 2018 +0200
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013, 2017, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2018, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -26,6 +26,7 @@
 #include "gc/g1/g1CollectedHeap.inline.hpp"
 #include "gc/g1/g1GCPhaseTimes.hpp"
 #include "gc/g1/g1HotCardCache.hpp"
+#include "gc/g1/g1ParScanThreadState.inline.hpp"
 #include "gc/g1/g1StringDedup.hpp"
 #include "gc/shared/workerDataArray.inline.hpp"
 #include "memory/resourceArea.hpp"
@@ -167,9 +168,12 @@
 }
 
 #define ASSERT_PHASE_UNINITIALIZED(phase) \
-    assert(_gc_par_phases[phase]->get(i) == uninitialized, "Phase " #phase " reported for thread that was not started");
+    assert(_gc_par_phases[phase] == NULL || _gc_par_phases[phase]->get(i) == uninitialized, "Phase " #phase " reported for thread that was not started");
 
 double G1GCPhaseTimes::worker_time(GCParPhases phase, uint worker) {
+  if (_gc_par_phases[phase] == NULL) {
+    return 0.0;
+  }
   double value = _gc_par_phases[phase]->get(worker);
   if (value != WorkerDataArray<double>::uninitialized()) {
     return value;
@@ -189,21 +193,20 @@
       double total_worker_time = _gc_par_phases[GCWorkerEnd]->get(i) - _gc_par_phases[GCWorkerStart]->get(i);
       record_time_secs(GCWorkerTotal, i , total_worker_time);
 
-      double worker_known_time =
-          worker_time(ExtRootScan, i)
-          + worker_time(SATBFiltering, i)
-          + worker_time(UpdateRS, i)
-          + worker_time(ScanRS, i)
-          + worker_time(CodeRoots, i)
-          + worker_time(ObjCopy, i)
-          + worker_time(Termination, i);
+      double worker_known_time = worker_time(ExtRootScan, i) +
+                                 worker_time(ScanHCC, i) +
+                                 worker_time(UpdateRS, i) +
+                                 worker_time(ScanRS, i) +
+                                 worker_time(CodeRoots, i) +
+                                 worker_time(ObjCopy, i) +
+                                 worker_time(Termination, i);
 
       record_time_secs(Other, i, total_worker_time - worker_known_time);
     } else {
       // Make sure all slots are uninitialized since this thread did not seem to have been started
       ASSERT_PHASE_UNINITIALIZED(GCWorkerEnd);
       ASSERT_PHASE_UNINITIALIZED(ExtRootScan);
-      ASSERT_PHASE_UNINITIALIZED(SATBFiltering);
+      ASSERT_PHASE_UNINITIALIZED(ScanHCC);
       ASSERT_PHASE_UNINITIALIZED(UpdateRS);
       ASSERT_PHASE_UNINITIALIZED(ScanRS);
       ASSERT_PHASE_UNINITIALIZED(CodeRoots);
@@ -225,6 +228,14 @@
   _gc_par_phases[phase]->add(worker_i, secs);
 }
 
+void G1GCPhaseTimes::record_or_add_objcopy_time_secs(uint worker_i, double secs) {
+  if (_gc_par_phases[ObjCopy]->get(worker_i) == _gc_par_phases[ObjCopy]->uninitialized()) {
+    record_time_secs(ObjCopy, worker_i, secs);
+  } else {
+    add_time_secs(ObjCopy, worker_i, secs);
+  }
+}
+
 void G1GCPhaseTimes::record_thread_work_item(GCParPhases phase, uint worker_i, size_t count, uint index) {
   _gc_par_phases[phase]->set_thread_work_item(worker_i, count, index);
 }
@@ -463,16 +474,49 @@
   }
 }
 
+G1EvacPhaseWithTrimTimeTracker::G1EvacPhaseWithTrimTimeTracker(G1ParScanThreadState* pss, Tickspan& total_time, Tickspan& trim_time) :
+  _pss(pss),
+  _start(Ticks::now()),
+  _total_time(total_time),
+  _trim_time(trim_time) {
+
+  assert(_pss->trim_ticks().value() == 0, "Possibly remaining trim ticks left over from previous use");
+}
+
+G1EvacPhaseWithTrimTimeTracker::~G1EvacPhaseWithTrimTimeTracker() {
+  _total_time += (Ticks::now() - _start) - _pss->trim_ticks();
+  _trim_time += _pss->trim_ticks();
+  _pss->reset_trim_ticks();
+}
+
 G1GCParPhaseTimesTracker::G1GCParPhaseTimesTracker(G1GCPhaseTimes* phase_times, G1GCPhaseTimes::GCParPhases phase, uint worker_id) :
     _phase_times(phase_times), _phase(phase), _worker_id(worker_id) {
   if (_phase_times != NULL) {
-    _start_time = os::elapsedTime();
+    _start_time = Ticks::now();
   }
 }
 
 G1GCParPhaseTimesTracker::~G1GCParPhaseTimesTracker() {
   if (_phase_times != NULL) {
-    _phase_times->record_time_secs(_phase, _worker_id, os::elapsedTime() - _start_time);
+    _phase_times->record_time_secs(_phase, _worker_id, TicksToTimeHelper::seconds(Ticks::now() - _start_time));
   }
 }
 
+G1EvacPhaseTimesTracker::G1EvacPhaseTimesTracker(G1GCPhaseTimes* phase_times,
+                                                 G1ParScanThreadState* pss,
+                                                 G1GCPhaseTimes::GCParPhases phase,
+                                                 uint worker_id) :
+  G1GCParPhaseTimesTracker(phase_times, phase, worker_id),
+  _total_time(),
+  _trim_time(),
+  _trim_tracker(pss, _total_time, _trim_time) {
+}
+
+G1EvacPhaseTimesTracker::~G1EvacPhaseTimesTracker() {
+  if (_phase_times != NULL) {
+    // Exclude trim time by increasing the start time.
+    _start_time += _trim_time;
+    _phase_times->record_or_add_objcopy_time_secs(_worker_id, TicksToTimeHelper::seconds(_trim_time));
+  }
+}
+
--- a/src/hotspot/share/gc/g1/g1GCPhaseTimes.hpp	Fri Apr 27 10:59:46 2018 +0200
+++ b/src/hotspot/share/gc/g1/g1GCPhaseTimes.hpp	Fri Apr 27 12:06:46 2018 +0200
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013, 2017, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2018, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -31,6 +31,7 @@
 #include "utilities/macros.hpp"
 
 class LineBuffer;
+class G1ParScanThreadState;
 class STWGCTimer;
 
 template <class T> class WorkerDataArray;
@@ -198,6 +199,8 @@
   // add a number of seconds to a phase
   void add_time_secs(GCParPhases phase, uint worker_i, double secs);
 
+  void record_or_add_objcopy_time_secs(uint worker_i, double secs);
+
   void record_thread_work_item(GCParPhases phase, uint worker_i, size_t count, uint index = 0);
 
   // return the average time for a phase in milliseconds
@@ -369,14 +372,36 @@
   ReferenceProcessorPhaseTimes* ref_phase_times() { return &_ref_phase_times; }
 };
 
-class G1GCParPhaseTimesTracker : public StackObj {
-  double _start_time;
+class G1EvacPhaseWithTrimTimeTracker : public StackObj {
+  G1ParScanThreadState* _pss;
+  Ticks _start;
+
+  Tickspan& _total_time;
+  Tickspan& _trim_time;
+public:
+  G1EvacPhaseWithTrimTimeTracker(G1ParScanThreadState* pss, Tickspan& total_time, Tickspan& trim_time);
+  ~G1EvacPhaseWithTrimTimeTracker();
+};
+
+class G1GCParPhaseTimesTracker : public CHeapObj<mtGC> {
+protected:
+  Ticks _start_time;
   G1GCPhaseTimes::GCParPhases _phase;
   G1GCPhaseTimes* _phase_times;
   uint _worker_id;
 public:
   G1GCParPhaseTimesTracker(G1GCPhaseTimes* phase_times, G1GCPhaseTimes::GCParPhases phase, uint worker_id);
-  ~G1GCParPhaseTimesTracker();
+  virtual ~G1GCParPhaseTimesTracker();
+};
+
+class G1EvacPhaseTimesTracker : public G1GCParPhaseTimesTracker {
+  Tickspan _total_time;
+  Tickspan _trim_time;
+
+  G1EvacPhaseWithTrimTimeTracker _trim_tracker;
+public:
+  G1EvacPhaseTimesTracker(G1GCPhaseTimes* phase_times, G1ParScanThreadState* pss, G1GCPhaseTimes::GCParPhases phase, uint worker_id);
+  virtual ~G1EvacPhaseTimesTracker();
 };
 
 #endif // SHARE_VM_GC_G1_G1GCPHASETIMES_HPP
--- a/src/hotspot/share/gc/g1/g1OopClosures.cpp	Fri Apr 27 10:59:46 2018 +0200
+++ b/src/hotspot/share/gc/g1/g1OopClosures.cpp	Fri Apr 27 12:06:46 2018 +0200
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2014, 2017, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, 2018, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -56,6 +56,8 @@
     cld->oops_do(_closure, _must_claim, /*clear_modified_oops*/true);
 
     _closure->set_scanned_cld(NULL);
+
+    _closure->trim_queue_partially();
   }
   _count++;
 }
--- a/src/hotspot/share/gc/g1/g1OopClosures.hpp	Fri Apr 27 10:59:46 2018 +0200
+++ b/src/hotspot/share/gc/g1/g1OopClosures.hpp	Fri Apr 27 12:06:46 2018 +0200
@@ -57,6 +57,8 @@
   // This closure needs special handling for InstanceRefKlass.
   virtual ReferenceIterationMode reference_iteration_mode() { return DO_DISCOVERED_AND_DISCOVERY; }
   void set_region(HeapRegion* from) { _from = from; }
+
+  inline void trim_queue_partially();
 };
 
 // Used during the Update RS phase to refine remaining cards in the DCQ during garbage collection.
@@ -126,6 +128,8 @@
  public:
   void set_scanned_cld(ClassLoaderData* cld) { _scanned_cld = cld; }
   inline void do_cld_barrier(oop new_obj);
+
+  inline void trim_queue_partially();
 };
 
 enum G1Barrier {
--- a/src/hotspot/share/gc/g1/g1OopClosures.inline.hpp	Fri Apr 27 10:59:46 2018 +0200
+++ b/src/hotspot/share/gc/g1/g1OopClosures.inline.hpp	Fri Apr 27 12:06:46 2018 +0200
@@ -67,6 +67,10 @@
   }
 }
 
+inline void G1ScanClosureBase::trim_queue_partially() {
+  _par_scan_state->trim_queue_partially();
+}
+
 template <class T>
 inline void G1ScanEvacuatedObjClosure::do_oop_nv(T* p) {
   T heap_oop = RawAccess<>::oop_load(p);
@@ -225,6 +229,10 @@
   _cm->mark_in_next_bitmap(_worker_id, to_obj, from_obj->size());
 }
 
+void G1ParCopyHelper::trim_queue_partially() {
+  _par_scan_state->trim_queue_partially();
+}
+
 template <G1Barrier barrier, G1Mark do_mark_object>
 template <class T>
 void G1ParCopyClosure<barrier, do_mark_object>::do_oop_work(T* p) {
@@ -269,6 +277,7 @@
       mark_object(obj);
     }
   }
+  trim_queue_partially();
 }
 
 template <class T> void G1RebuildRemSetClosure::do_oop_nv(T* p) {
--- a/src/hotspot/share/gc/g1/g1ParScanThreadState.cpp	Fri Apr 27 10:59:46 2018 +0200
+++ b/src/hotspot/share/gc/g1/g1ParScanThreadState.cpp	Fri Apr 27 12:06:46 2018 +0200
@@ -43,11 +43,15 @@
     _dcq(&g1h->dirty_card_queue_set()),
     _ct(g1h->card_table()),
     _closures(NULL),
+    _plab_allocator(NULL),
+    _age_table(false),
+    _tenuring_threshold(g1h->g1_policy()->tenuring_threshold()),
+    _scanner(g1h, this),
     _hash_seed(17),
     _worker_id(worker_id),
-    _tenuring_threshold(g1h->g1_policy()->tenuring_threshold()),
-    _age_table(false),
-    _scanner(g1h, this),
+    _stack_trim_upper_threshold(GCDrainStackTargetSize * 2 + 1),
+    _stack_trim_lower_threshold(GCDrainStackTargetSize),
+    _trim_ticks(),
     _old_gen_is_full(false)
 {
   // we allocate G1YoungSurvRateNumRegions plus one entries, since
@@ -138,16 +142,8 @@
 void G1ParScanThreadState::trim_queue() {
   StarTask ref;
   do {
-    // Drain the overflow stack first, so other threads can steal.
-    while (_refs->pop_overflow(ref)) {
-      if (!_refs->try_push_to_taskqueue(ref)) {
-        dispatch_reference(ref);
-      }
-    }
-
-    while (_refs->pop_local(ref)) {
-      dispatch_reference(ref);
-    }
+    // Fully drain the queue.
+    trim_queue_to_threshold(0);
   } while (!_refs->is_empty());
 }
 
@@ -314,7 +310,7 @@
       // length field of the from-space object.
       arrayOop(obj)->set_length(0);
       oop* old_p = set_partial_array_mask(old);
-      push_on_queue(old_p);
+      do_oop_partial_array(old_p);
     } else {
       HeapRegion* const to_region = _g1h->heap_region_containing(obj_ptr);
       _scanner.set_region(to_region);
--- a/src/hotspot/share/gc/g1/g1ParScanThreadState.hpp	Fri Apr 27 10:59:46 2018 +0200
+++ b/src/hotspot/share/gc/g1/g1ParScanThreadState.hpp	Fri Apr 27 12:06:46 2018 +0200
@@ -35,6 +35,7 @@
 #include "gc/shared/ageTable.hpp"
 #include "memory/allocation.hpp"
 #include "oops/oop.hpp"
+#include "utilities/ticks.hpp"
 
 class G1PLABAllocator;
 class G1EvacuationRootClosures;
@@ -42,7 +43,6 @@
 class outputStream;
 
 class G1ParScanThreadState : public CHeapObj<mtGC> {
- private:
   G1CollectedHeap* _g1h;
   RefToScanQueue*  _refs;
   DirtyCardQueue   _dcq;
@@ -60,6 +60,11 @@
   int  _hash_seed;
   uint _worker_id;
 
+  // Upper and lower threshold to start and end work queue draining.
+  uint const _stack_trim_upper_threshold;
+  uint const _stack_trim_lower_threshold;
+
+  Tickspan _trim_ticks;
   // Map from young-age-index (0 == not young, 1 is youngest) to
   // surviving words. base is what we get back from the malloc call
   size_t* _surviving_young_words_base;
@@ -83,7 +88,7 @@
     return _dest[original.value()];
   }
 
- public:
+public:
   G1ParScanThreadState(G1CollectedHeap* g1h, uint worker_id, size_t young_cset_length);
   virtual ~G1ParScanThreadState();
 
@@ -129,7 +134,7 @@
 
   void flush(size_t* surviving_young_words);
 
- private:
+private:
   #define G1_PARTIAL_ARRAY_MASK 0x2
 
   inline bool has_partial_array_mask(oop* ref) const {
@@ -185,11 +190,19 @@
   void report_promotion_event(InCSetState const dest_state,
                               oop const old, size_t word_sz, uint age,
                               HeapWord * const obj_ptr) const;
- public:
 
+  inline bool needs_partial_trimming() const;
+  inline bool is_partially_trimmed() const;
+
+  inline void trim_queue_to_threshold(uint threshold);
+public:
   oop copy_to_survivor_space(InCSetState const state, oop const obj, markOop const old_mark);
 
   void trim_queue();
+  void trim_queue_partially();
+
+  Tickspan trim_ticks() const;
+  void reset_trim_ticks();
 
   inline void steal_and_trim_queue(RefToScanQueueSet *task_queues);
 
--- a/src/hotspot/share/gc/g1/g1ParScanThreadState.inline.hpp	Fri Apr 27 10:59:46 2018 +0200
+++ b/src/hotspot/share/gc/g1/g1ParScanThreadState.inline.hpp	Fri Apr 27 12:06:46 2018 +0200
@@ -29,6 +29,7 @@
 #include "gc/g1/g1RemSet.hpp"
 #include "oops/access.inline.hpp"
 #include "oops/oop.inline.hpp"
+#include "utilities/ticks.inline.hpp"
 
 template <class T> void G1ParScanThreadState::do_oop_evac(T* p) {
   // Reference should not be NULL here as such are never pushed to the task queue.
@@ -151,4 +152,46 @@
   }
 }
 
+inline bool G1ParScanThreadState::needs_partial_trimming() const {
+  return !_refs->overflow_empty() || _refs->size() > _stack_trim_upper_threshold;
+}
+
+inline bool G1ParScanThreadState::is_partially_trimmed() const {
+  return _refs->overflow_empty() && _refs->size() <= _stack_trim_lower_threshold;
+}
+
+inline void G1ParScanThreadState::trim_queue_to_threshold(uint threshold) {
+  StarTask ref;
+  // Drain the overflow stack first, so other threads can potentially steal.
+  while (_refs->pop_overflow(ref)) {
+    if (!_refs->try_push_to_taskqueue(ref)) {
+      dispatch_reference(ref);
+    }
+  }
+
+  while (_refs->pop_local(ref, threshold)) {
+    dispatch_reference(ref);
+  }
+}
+
+inline void G1ParScanThreadState::trim_queue_partially() {
+  if (!needs_partial_trimming()) {
+    return;
+  }
+
+  const Ticks start = Ticks::now();
+  do {
+    trim_queue_to_threshold(_stack_trim_lower_threshold);
+  } while (!is_partially_trimmed());
+  _trim_ticks += Ticks::now() - start;
+}
+
+inline Tickspan G1ParScanThreadState::trim_ticks() const {
+  return _trim_ticks;
+}
+
+inline void G1ParScanThreadState::reset_trim_ticks() {
+  _trim_ticks = Tickspan();
+}
+
 #endif // SHARE_VM_GC_G1_G1PARSCANTHREADSTATE_INLINE_HPP
--- a/src/hotspot/share/gc/g1/g1Policy.cpp	Fri Apr 27 10:59:46 2018 +0200
+++ b/src/hotspot/share/gc/g1/g1Policy.cpp	Fri Apr 27 12:06:46 2018 +0200
@@ -628,7 +628,7 @@
   if (update_stats) {
     double cost_per_card_ms = 0.0;
     if (_pending_cards > 0) {
-      cost_per_card_ms = (average_time_ms(G1GCPhaseTimes::UpdateRS) - scan_hcc_time_ms) / (double) _pending_cards;
+      cost_per_card_ms = (average_time_ms(G1GCPhaseTimes::UpdateRS)) / (double) _pending_cards;
       _analytics->report_cost_per_card_ms(cost_per_card_ms);
     }
     _analytics->report_cost_scan_hcc(scan_hcc_time_ms);
@@ -730,9 +730,9 @@
   } else {
     update_rs_time_goal_ms -= scan_hcc_time_ms;
   }
-  _g1h->concurrent_refine()->adjust(average_time_ms(G1GCPhaseTimes::UpdateRS) - scan_hcc_time_ms,
-                                   phase_times()->sum_thread_work_items(G1GCPhaseTimes::UpdateRS),
-                                   update_rs_time_goal_ms);
+  _g1h->concurrent_refine()->adjust(average_time_ms(G1GCPhaseTimes::UpdateRS),
+                                    phase_times()->sum_thread_work_items(G1GCPhaseTimes::UpdateRS),
+                                    update_rs_time_goal_ms);
 
   cset_chooser()->verify();
 }
--- a/src/hotspot/share/gc/g1/g1RemSet.cpp	Fri Apr 27 10:59:46 2018 +0200
+++ b/src/hotspot/share/gc/g1/g1RemSet.cpp	Fri Apr 27 12:06:46 2018 +0200
@@ -33,6 +33,7 @@
 #include "gc/g1/g1GCPhaseTimes.hpp"
 #include "gc/g1/g1HotCardCache.hpp"
 #include "gc/g1/g1OopClosures.inline.hpp"
+#include "gc/g1/g1RootClosures.hpp"
 #include "gc/g1/g1RemSet.hpp"
 #include "gc/g1/heapRegion.inline.hpp"
 #include "gc/g1/heapRegionManager.inline.hpp"
@@ -306,32 +307,21 @@
 
 G1ScanRSForRegionClosure::G1ScanRSForRegionClosure(G1RemSetScanState* scan_state,
                                                    G1ScanObjsDuringScanRSClosure* scan_obj_on_card,
-                                                   CodeBlobClosure* code_root_cl,
+                                                   G1ParScanThreadState* pss,
                                                    uint worker_i) :
+  _g1h(G1CollectedHeap::heap()),
+  _ct(_g1h->card_table()),
+  _pss(pss),
+  _scan_objs_on_card_cl(scan_obj_on_card),
   _scan_state(scan_state),
-  _scan_objs_on_card_cl(scan_obj_on_card),
-  _code_root_cl(code_root_cl),
-  _strong_code_root_scan_time_sec(0.0),
+  _worker_i(worker_i),
   _cards_claimed(0),
   _cards_scanned(0),
   _cards_skipped(0),
-  _worker_i(worker_i) {
-  _g1h = G1CollectedHeap::heap();
-  _bot = _g1h->bot();
-  _ct = _g1h->card_table();
-}
-
-void G1ScanRSForRegionClosure::scan_card(MemRegion mr, uint region_idx_for_card) {
-  HeapRegion* const card_region = _g1h->region_at(region_idx_for_card);
-  _scan_objs_on_card_cl->set_region(card_region);
-  card_region->oops_on_card_seq_iterate_careful<true>(mr, _scan_objs_on_card_cl);
-  _cards_scanned++;
-}
-
-void G1ScanRSForRegionClosure::scan_strong_code_roots(HeapRegion* r) {
-  double scan_start = os::elapsedTime();
-  r->strong_code_roots_do(_code_root_cl);
-  _strong_code_root_scan_time_sec += (os::elapsedTime() - scan_start);
+  _rem_set_root_scan_time(),
+  _rem_set_trim_partially_time(),
+  _strong_code_root_scan_time(),
+  _strong_code_trim_partially_time() {
 }
 
 void G1ScanRSForRegionClosure::claim_card(size_t card_index, const uint region_idx_for_card){
@@ -339,13 +329,17 @@
   _scan_state->add_dirty_region(region_idx_for_card);
 }
 
-bool G1ScanRSForRegionClosure::do_heap_region(HeapRegion* r) {
-  assert(r->in_collection_set(), "should only be called on elements of CS.");
-  uint region_idx = r->hrm_index();
+void G1ScanRSForRegionClosure::scan_card(MemRegion mr, uint region_idx_for_card) {
+  HeapRegion* const card_region = _g1h->region_at(region_idx_for_card);
+  _scan_objs_on_card_cl->set_region(card_region);
+  card_region->oops_on_card_seq_iterate_careful<true>(mr, _scan_objs_on_card_cl);
+  _scan_objs_on_card_cl->trim_queue_partially();
+  _cards_scanned++;
+}
 
-  if (_scan_state->iter_is_complete(region_idx)) {
-    return false;
-  }
+void G1ScanRSForRegionClosure::scan_rem_set_roots(HeapRegion* r) {
+  uint const region_idx = r->hrm_index();
+
   if (_scan_state->claim_iter(region_idx)) {
     // If we ever free the collection set concurrently, we should also
     // clear the card table concurrently therefore we won't need to
@@ -397,33 +391,52 @@
 
     scan_card(mr, region_idx_for_card);
   }
+}
+
+void G1ScanRSForRegionClosure::scan_strong_code_roots(HeapRegion* r) {
+  r->strong_code_roots_do(_pss->closures()->weak_codeblobs());
+}
+
+bool G1ScanRSForRegionClosure::do_heap_region(HeapRegion* r) {
+  assert(r->in_collection_set(),
+         "Should only be called on elements of the collection set but region %u is not.",
+         r->hrm_index());
+  uint const region_idx = r->hrm_index();
+
+  // Do an early out if we know we are complete.
+  if (_scan_state->iter_is_complete(region_idx)) {
+    return false;
+  }
+
+  {
+    G1EvacPhaseWithTrimTimeTracker timer(_pss, _rem_set_root_scan_time, _rem_set_trim_partially_time);
+    scan_rem_set_roots(r);
+  }
+
   if (_scan_state->set_iter_complete(region_idx)) {
+    G1EvacPhaseWithTrimTimeTracker timer(_pss, _strong_code_root_scan_time, _strong_code_trim_partially_time);
     // Scan the strong code root list attached to the current region
     scan_strong_code_roots(r);
   }
   return false;
 }
 
-void G1RemSet::scan_rem_set(G1ParScanThreadState* pss,
-                            CodeBlobClosure* heap_region_codeblobs,
-                            uint worker_i) {
-  double rs_time_start = os::elapsedTime();
-
+void G1RemSet::scan_rem_set(G1ParScanThreadState* pss, uint worker_i) {
   G1ScanObjsDuringScanRSClosure scan_cl(_g1h, pss);
-  G1ScanRSForRegionClosure cl(_scan_state, &scan_cl, heap_region_codeblobs, worker_i);
+  G1ScanRSForRegionClosure cl(_scan_state, &scan_cl, pss, worker_i);
   _g1h->collection_set_iterate_from(&cl, worker_i);
 
-  double scan_rs_time_sec = (os::elapsedTime() - rs_time_start) -
-                             cl.strong_code_root_scan_time_sec();
-
   G1GCPhaseTimes* p = _g1p->phase_times();
 
-  p->record_time_secs(G1GCPhaseTimes::ScanRS, worker_i, scan_rs_time_sec);
+  p->record_time_secs(G1GCPhaseTimes::ScanRS, worker_i, TicksToTimeHelper::seconds(cl.rem_set_root_scan_time()));
+  p->add_time_secs(G1GCPhaseTimes::ObjCopy, worker_i, TicksToTimeHelper::seconds(cl.rem_set_trim_partially_time()));
+
   p->record_thread_work_item(G1GCPhaseTimes::ScanRS, worker_i, cl.cards_scanned(), G1GCPhaseTimes::ScanRSScannedCards);
   p->record_thread_work_item(G1GCPhaseTimes::ScanRS, worker_i, cl.cards_claimed(), G1GCPhaseTimes::ScanRSClaimedCards);
   p->record_thread_work_item(G1GCPhaseTimes::ScanRS, worker_i, cl.cards_skipped(), G1GCPhaseTimes::ScanRSSkippedCards);
 
-  p->record_time_secs(G1GCPhaseTimes::CodeRoots, worker_i, cl.strong_code_root_scan_time_sec());
+  p->record_time_secs(G1GCPhaseTimes::CodeRoots, worker_i, TicksToTimeHelper::seconds(cl.strong_code_root_scan_time()));
+  p->add_time_secs(G1GCPhaseTimes::ObjCopy, worker_i, TicksToTimeHelper::seconds(cl.strong_code_root_trim_partially_time()));
 }
 
 // Closure used for updating rem sets. Only called during an evacuation pause.
@@ -448,6 +461,7 @@
     bool card_scanned = _g1rs->refine_card_during_gc(card_ptr, _update_rs_cl);
 
     if (card_scanned) {
+      _update_rs_cl->trim_queue_partially();
       _cards_scanned++;
     } else {
       _cards_skipped++;
@@ -460,32 +474,37 @@
 };
 
 void G1RemSet::update_rem_set(G1ParScanThreadState* pss, uint worker_i) {
-  G1ScanObjsDuringUpdateRSClosure update_rs_cl(_g1h, pss, worker_i);
-  G1RefineCardClosure refine_card_cl(_g1h, &update_rs_cl);
+  G1GCPhaseTimes* p = _g1p->phase_times();
 
-  G1GCParPhaseTimesTracker x(_g1p->phase_times(), G1GCPhaseTimes::UpdateRS, worker_i);
+  // Apply closure to log entries in the HCC.
   if (G1HotCardCache::default_use_cache()) {
-    // Apply the closure to the entries of the hot card cache.
-    G1GCParPhaseTimesTracker y(_g1p->phase_times(), G1GCPhaseTimes::ScanHCC, worker_i);
+    G1EvacPhaseTimesTracker x(p, pss, G1GCPhaseTimes::ScanHCC, worker_i);
+
+    G1ScanObjsDuringUpdateRSClosure scan_hcc_cl(_g1h, pss, worker_i);
+    G1RefineCardClosure refine_card_cl(_g1h, &scan_hcc_cl);
     _g1h->iterate_hcc_closure(&refine_card_cl, worker_i);
   }
-  // Apply the closure to all remaining log entries.
-  _g1h->iterate_dirty_card_closure(&refine_card_cl, worker_i);
+
+  // Now apply the closure to all remaining log entries.
+  {
+    G1EvacPhaseTimesTracker x(p, pss, G1GCPhaseTimes::UpdateRS, worker_i);
 
-  G1GCPhaseTimes* p = _g1p->phase_times();
-  p->record_thread_work_item(G1GCPhaseTimes::UpdateRS, worker_i, refine_card_cl.cards_scanned(), G1GCPhaseTimes::UpdateRSScannedCards);
-  p->record_thread_work_item(G1GCPhaseTimes::UpdateRS, worker_i, refine_card_cl.cards_skipped(), G1GCPhaseTimes::UpdateRSSkippedCards);
+    G1ScanObjsDuringUpdateRSClosure update_rs_cl(_g1h, pss, worker_i);
+    G1RefineCardClosure refine_card_cl(_g1h, &update_rs_cl);
+    _g1h->iterate_dirty_card_closure(&refine_card_cl, worker_i);
+
+    p->record_thread_work_item(G1GCPhaseTimes::UpdateRS, worker_i, refine_card_cl.cards_scanned(), G1GCPhaseTimes::UpdateRSScannedCards);
+    p->record_thread_work_item(G1GCPhaseTimes::UpdateRS, worker_i, refine_card_cl.cards_skipped(), G1GCPhaseTimes::UpdateRSSkippedCards);
+  }
 }
 
 void G1RemSet::cleanupHRRS() {
   HeapRegionRemSet::cleanup();
 }
 
-void G1RemSet::oops_into_collection_set_do(G1ParScanThreadState* pss,
-                                           CodeBlobClosure* heap_region_codeblobs,
-                                           uint worker_i) {
+void G1RemSet::oops_into_collection_set_do(G1ParScanThreadState* pss, uint worker_i) {
   update_rem_set(pss, worker_i);
-  scan_rem_set(pss, heap_region_codeblobs, worker_i);;
+  scan_rem_set(pss, worker_i);;
 }
 
 void G1RemSet::prepare_for_oops_into_collection_set_do() {
--- a/src/hotspot/share/gc/g1/g1RemSet.hpp	Fri Apr 27 10:59:46 2018 +0200
+++ b/src/hotspot/share/gc/g1/g1RemSet.hpp	Fri Apr 27 12:06:46 2018 +0200
@@ -32,6 +32,7 @@
 #include "gc/g1/heapRegion.hpp"
 #include "memory/allocation.hpp"
 #include "memory/iterator.hpp"
+#include "utilities/ticks.hpp"
 
 // A G1RemSet provides ways of iterating over pointers into a selected
 // collection set.
@@ -61,9 +62,7 @@
 
   // Scan all remembered sets of the collection set for references into the collection
   // set.
-  void scan_rem_set(G1ParScanThreadState* pss,
-                    CodeBlobClosure* heap_region_codeblobs,
-                    uint worker_i);
+  void scan_rem_set(G1ParScanThreadState* pss, uint worker_i);
 
   // Flush remaining refinement buffers for cross-region references to either evacuate references
   // into the collection set or update the remembered set.
@@ -102,9 +101,7 @@
   //
   // Further applies heap_region_codeblobs on the oops of the unmarked nmethods on the strong code
   // roots list for each region in the collection set.
-  void oops_into_collection_set_do(G1ParScanThreadState* pss,
-                                   CodeBlobClosure* heap_region_codeblobs,
-                                   uint worker_i);
+  void oops_into_collection_set_do(G1ParScanThreadState* pss, uint worker_i);
 
   // Prepare for and cleanup after an oops_into_collection_set_do
   // call.  Must call each of these once before and after (in sequential
@@ -138,37 +135,44 @@
 };
 
 class G1ScanRSForRegionClosure : public HeapRegionClosure {
+  G1CollectedHeap* _g1h;
+  G1CardTable *_ct;
+
+  G1ParScanThreadState* _pss;
+  G1ScanObjsDuringScanRSClosure* _scan_objs_on_card_cl;
+
   G1RemSetScanState* _scan_state;
 
+  uint   _worker_i;
+
   size_t _cards_scanned;
   size_t _cards_claimed;
   size_t _cards_skipped;
 
-  G1CollectedHeap* _g1h;
-
-  G1ScanObjsDuringScanRSClosure* _scan_objs_on_card_cl;
-  CodeBlobClosure* _code_root_cl;
+  Tickspan _rem_set_root_scan_time;
+  Tickspan _rem_set_trim_partially_time;
 
-  G1BlockOffsetTable* _bot;
-  G1CardTable *_ct;
-
-  double _strong_code_root_scan_time_sec;
-  uint   _worker_i;
+  Tickspan _strong_code_root_scan_time;
+  Tickspan _strong_code_trim_partially_time;
 
   void claim_card(size_t card_index, const uint region_idx_for_card);
   void scan_card(MemRegion mr, uint region_idx_for_card);
+
+  void scan_rem_set_roots(HeapRegion* r);
   void scan_strong_code_roots(HeapRegion* r);
 public:
   G1ScanRSForRegionClosure(G1RemSetScanState* scan_state,
                            G1ScanObjsDuringScanRSClosure* scan_obj_on_card,
-                           CodeBlobClosure* code_root_cl,
+                           G1ParScanThreadState* pss,
                            uint worker_i);
 
   bool do_heap_region(HeapRegion* r);
 
-  double strong_code_root_scan_time_sec() {
-    return _strong_code_root_scan_time_sec;
-  }
+  Tickspan rem_set_root_scan_time() const { return _rem_set_root_scan_time; }
+  Tickspan rem_set_trim_partially_time() const { return _rem_set_trim_partially_time; }
+
+  Tickspan strong_code_root_scan_time() const { return _strong_code_root_scan_time;  }
+  Tickspan strong_code_root_trim_partially_time() const { return _strong_code_trim_partially_time; }
 
   size_t cards_scanned() const { return _cards_scanned; }
   size_t cards_claimed() const { return _cards_claimed; }
--- a/src/hotspot/share/gc/g1/g1RootClosures.cpp	Fri Apr 27 10:59:46 2018 +0200
+++ b/src/hotspot/share/gc/g1/g1RootClosures.cpp	Fri Apr 27 12:06:46 2018 +0200
@@ -37,8 +37,8 @@
                        bool in_young_gc) :
       _closures(g1h, pss, in_young_gc, /* must_claim_cld */ false) {}
 
-  OopClosure* weak_oops()   { return &_closures._buffered_oops; }
-  OopClosure* strong_oops() { return &_closures._buffered_oops; }
+  OopClosure* weak_oops()   { return &_closures._oops; }
+  OopClosure* strong_oops() { return &_closures._oops; }
 
   CLDClosure* weak_clds()             { return &_closures._clds; }
   CLDClosure* strong_clds()           { return &_closures._clds; }
@@ -47,9 +47,6 @@
   CodeBlobClosure* strong_codeblobs()      { return &_closures._codeblobs; }
   CodeBlobClosure* weak_codeblobs()        { return &_closures._codeblobs; }
 
-  void flush()                 { _closures._buffered_oops.done(); }
-  double closure_app_seconds() { return _closures._buffered_oops.closure_app_seconds(); }
-
   OopClosure* raw_strong_oops() { return &_closures._oops; }
 
   bool trace_metadata()         { return false; }
@@ -79,8 +76,8 @@
       _strong(g1h, pss, /* process_only_dirty_klasses */ false, /* must_claim_cld */ true),
       _weak(g1h, pss,   /* process_only_dirty_klasses */ false, /* must_claim_cld */ true) {}
 
-  OopClosure* weak_oops()   { return &_weak._buffered_oops; }
-  OopClosure* strong_oops() { return &_strong._buffered_oops; }
+  OopClosure* weak_oops()   { return &_weak._oops; }
+  OopClosure* strong_oops() { return &_strong._oops; }
 
   // If MarkWeak is G1MarkPromotedFromRoot then the weak CLDs must be processed in a second pass.
   CLDClosure* weak_clds()             { return null_if<G1MarkPromotedFromRoot>(&_weak._clds); }
@@ -93,16 +90,6 @@
   CodeBlobClosure* strong_codeblobs()      { return &_strong._codeblobs; }
   CodeBlobClosure* weak_codeblobs()        { return &_weak._codeblobs; }
 
-  void flush() {
-    _strong._buffered_oops.done();
-    _weak._buffered_oops.done();
-  }
-
-  double closure_app_seconds() {
-    return _strong._buffered_oops.closure_app_seconds() +
-           _weak._buffered_oops.closure_app_seconds();
-  }
-
   OopClosure* raw_strong_oops() { return &_strong._oops; }
 
   // If we are not marking all weak roots then we are tracing
--- a/src/hotspot/share/gc/g1/g1RootClosures.hpp	Fri Apr 27 10:59:46 2018 +0200
+++ b/src/hotspot/share/gc/g1/g1RootClosures.hpp	Fri Apr 27 12:06:46 2018 +0200
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, 2017, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, 2018, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -49,10 +49,6 @@
 
 class G1EvacuationRootClosures : public G1RootClosures {
 public:
-  // Flush any buffered state and deferred processing
-  virtual void flush() = 0;
-  virtual double closure_app_seconds() = 0;
-
   // Applied to the weakly reachable CLDs when all strongly reachable
   // CLDs are guaranteed to have been processed.
   virtual CLDClosure* second_pass_weak_clds() = 0;
--- a/src/hotspot/share/gc/g1/g1RootProcessor.cpp	Fri Apr 27 10:59:46 2018 +0200
+++ b/src/hotspot/share/gc/g1/g1RootProcessor.cpp	Fri Apr 27 12:06:46 2018 +0200
@@ -28,12 +28,12 @@
 #include "classfile/stringTable.hpp"
 #include "classfile/systemDictionary.hpp"
 #include "code/codeCache.hpp"
-#include "gc/g1/bufferingOopClosure.hpp"
 #include "gc/g1/g1BarrierSet.hpp"
 #include "gc/g1/g1CodeBlobClosure.hpp"
 #include "gc/g1/g1CollectedHeap.inline.hpp"
 #include "gc/g1/g1CollectorState.hpp"
 #include "gc/g1/g1GCPhaseTimes.hpp"
+#include "gc/g1/g1ParScanThreadState.inline.hpp"
 #include "gc/g1/g1Policy.hpp"
 #include "gc/g1/g1RootClosures.hpp"
 #include "gc/g1/g1RootProcessor.hpp"
@@ -73,10 +73,12 @@
     _lock(Mutex::leaf, "G1 Root Scanning barrier lock", false, Monitor::_safepoint_check_never),
     _n_workers_discovered_strong_classes(0) {}
 
-void G1RootProcessor::evacuate_roots(G1EvacuationRootClosures* closures, uint worker_i) {
-  double ext_roots_start = os::elapsedTime();
+void G1RootProcessor::evacuate_roots(G1ParScanThreadState* pss, uint worker_i) {
   G1GCPhaseTimes* phase_times = _g1h->g1_policy()->phase_times();
 
+  G1EvacPhaseTimesTracker timer(phase_times, pss, G1GCPhaseTimes::ExtRootScan, worker_i);
+
+  G1EvacuationRootClosures* closures = pss->closures();
   process_java_roots(closures, phase_times, worker_i);
 
   // This is the point where this worker thread will not find more strong CLDs/nmethods.
@@ -118,17 +120,6 @@
     assert(closures->second_pass_weak_clds() == NULL, "Should be null if not tracing metadata.");
   }
 
-  // Finish up any enqueued closure apps (attributed as object copy time).
-  closures->flush();
-
-  double obj_copy_time_sec = closures->closure_app_seconds();
-
-  phase_times->record_time_secs(G1GCPhaseTimes::ObjCopy, worker_i, obj_copy_time_sec);
-
-  double ext_root_time_sec = os::elapsedTime() - ext_roots_start - obj_copy_time_sec;
-
-  phase_times->record_time_secs(G1GCPhaseTimes::ExtRootScan, worker_i, ext_root_time_sec);
-
   // During conc marking we have to filter the per-thread SATB buffers
   // to make sure we remove any oops into the CSet (which will show up
   // as implicitly live).
--- a/src/hotspot/share/gc/g1/g1RootProcessor.hpp	Fri Apr 27 10:59:46 2018 +0200
+++ b/src/hotspot/share/gc/g1/g1RootProcessor.hpp	Fri Apr 27 12:06:46 2018 +0200
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, 2017, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, 2018, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -34,6 +34,7 @@
 class G1CollectedHeap;
 class G1EvacuationRootClosures;
 class G1GCPhaseTimes;
+class G1ParScanThreadState;
 class G1RootClosures;
 class Monitor;
 class OopClosure;
@@ -97,10 +98,10 @@
 public:
   G1RootProcessor(G1CollectedHeap* g1h, uint n_workers);
 
-  // Apply closures to the strongly and weakly reachable roots in the system
+  // Apply correct closures from pss to the strongly and weakly reachable roots in the system
   // in a single pass.
   // Record and report timing measurements for sub phases using the worker_i
-  void evacuate_roots(G1EvacuationRootClosures* closures, uint worker_i);
+  void evacuate_roots(G1ParScanThreadState* pss, uint worker_id);
 
   // Apply oops, clds and blobs to all strongly reachable roots in the system
   void process_strong_roots(OopClosure* oops,
--- a/src/hotspot/share/gc/g1/g1SharedClosures.hpp	Fri Apr 27 10:59:46 2018 +0200
+++ b/src/hotspot/share/gc/g1/g1SharedClosures.hpp	Fri Apr 27 12:06:46 2018 +0200
@@ -22,7 +22,6 @@
  *
  */
 
-#include "gc/g1/bufferingOopClosure.hpp"
 #include "gc/g1/g1CodeBlobClosure.hpp"
 #include "gc/g1/g1OopClosures.hpp"
 #include "memory/iterator.hpp"
@@ -39,12 +38,10 @@
 
   G1CLDScanClosure                _clds;
   G1CodeBlobClosure               _codeblobs;
-  BufferingOopClosure             _buffered_oops;
 
   G1SharedClosures(G1CollectedHeap* g1h, G1ParScanThreadState* pss, bool process_only_dirty, bool must_claim_cld) :
     _oops(g1h, pss),
     _oops_in_cld(g1h, pss),
     _clds(&_oops_in_cld, process_only_dirty, must_claim_cld),
-    _codeblobs(&_oops),
-    _buffered_oops(&_oops) {}
+    _codeblobs(&_oops) {}
 };
--- a/src/hotspot/share/gc/shared/taskqueue.hpp	Fri Apr 27 10:59:46 2018 +0200
+++ b/src/hotspot/share/gc/shared/taskqueue.hpp	Fri Apr 27 12:06:46 2018 +0200
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2017, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2018, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -285,9 +285,10 @@
   inline bool push(E t);
 
   // Attempts to claim a task from the "local" end of the queue (the most
-  // recently pushed).  If successful, returns true and sets t to the task;
-  // otherwise, returns false (the queue is empty).
-  inline bool pop_local(volatile E& t);
+  // recently pushed) as long as the number of entries exceeds the threshold.
+  // If successful, returns true and sets t to the task; otherwise, returns false
+  // (the queue is empty or the number of elements below the threshold).
+  inline bool pop_local(volatile E& t, uint threshold = 0);
 
   // Like pop_local(), but uses the "global" end of the queue (the least
   // recently pushed).
--- a/src/hotspot/share/gc/shared/taskqueue.inline.hpp	Fri Apr 27 10:59:46 2018 +0200
+++ b/src/hotspot/share/gc/shared/taskqueue.inline.hpp	Fri Apr 27 12:06:46 2018 +0200
@@ -152,7 +152,7 @@
 }
 
 template<class E, MEMFLAGS F, unsigned int N> inline bool
-GenericTaskQueue<E, F, N>::pop_local(volatile E& t) {
+GenericTaskQueue<E, F, N>::pop_local(volatile E& t, uint threshold) {
   uint localBot = _bottom;
   // This value cannot be N-1.  That can only occur as a result of
   // the assignment to bottom in this method.  If it does, this method
@@ -160,7 +160,7 @@
   // since this is pop_local.)
   uint dirty_n_elems = dirty_size(localBot, _age.top());
   assert(dirty_n_elems != N - 1, "Shouldn't be possible...");
-  if (dirty_n_elems == 0) return false;
+  if (dirty_n_elems <= threshold) return false;
   localBot = decrement_index(localBot);
   _bottom = localBot;
   // This is necessary to prevent any read below from being reordered
--- a/src/hotspot/share/utilities/ticks.hpp	Fri Apr 27 10:59:46 2018 +0200
+++ b/src/hotspot/share/utilities/ticks.hpp	Fri Apr 27 12:06:46 2018 +0200
@@ -47,6 +47,11 @@
     return *this;
   }
 
+  Tickspan& operator-=(const Tickspan& rhs) {
+    _span_ticks -= rhs._span_ticks;
+    return *this;
+  }
+
   jlong value() const {
     return _span_ticks;
   }
--- a/src/hotspot/share/utilities/ticks.inline.hpp	Fri Apr 27 10:59:46 2018 +0200
+++ b/src/hotspot/share/utilities/ticks.inline.hpp	Fri Apr 27 12:06:46 2018 +0200
@@ -32,6 +32,11 @@
   return lhs;
 }
 
+inline Tickspan operator-(Tickspan lhs, const Tickspan& rhs) {
+  lhs -= rhs;
+  return lhs;
+}
+
 inline bool operator==(const Tickspan& lhs, const Tickspan& rhs) {
   return lhs.value() == rhs.value();
 }
--- a/test/hotspot/gtest/gc/g1/test_bufferingOopClosure.cpp	Fri Apr 27 10:59:46 2018 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,257 +0,0 @@
-/*
- * Copyright (c) 2014, 2016, Oracle and/or its affiliates. All rights reserved.
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This code is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 only, as
- * published by the Free Software Foundation.
- *
- * This code is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
- * version 2 for more details (a copy is included in the LICENSE file that
- * accompanied this code).
- *
- * You should have received a copy of the GNU General Public License version
- * 2 along with this work; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
- * or visit www.oracle.com if you need additional information or have any
- * questions.
- */
-
-#include "precompiled.hpp"
-#include "gc/g1/bufferingOopClosure.hpp"
-#include "memory/iterator.hpp"
-#include "unittest.hpp"
-
-class BufferingOopClosureTest : public ::testing::Test {
- public:
-  // Helper class to fake a set of oop*s and narrowOop*s.
-  class FakeRoots {
-   public:
-    // Used for sanity checking of the values passed to the do_oops functions in the test.
-    static const uintptr_t NarrowOopMarker = uintptr_t(1) << (BitsPerWord -1);
-
-    int    _num_narrow;
-    int    _num_full;
-    void** _narrow;
-    void** _full;
-
-    FakeRoots(int num_narrow, int num_full) :
-        _num_narrow(num_narrow),
-        _num_full(num_full),
-        _narrow((void**)::malloc(sizeof(void*) * num_narrow)),
-        _full((void**)::malloc(sizeof(void*) * num_full)) {
-
-      for (int i = 0; i < num_narrow; i++) {
-        _narrow[i] = (void*)(NarrowOopMarker + (uintptr_t)i);
-      }
-      for (int i = 0; i < num_full; i++) {
-        _full[i] = (void*)(uintptr_t)i;
-      }
-    }
-
-    ~FakeRoots() {
-      ::free(_narrow);
-      ::free(_full);
-    }
-
-    void oops_do_narrow_then_full(OopClosure* cl) {
-      for (int i = 0; i < _num_narrow; i++) {
-        cl->do_oop((narrowOop*)_narrow[i]);
-      }
-      for (int i = 0; i < _num_full; i++) {
-        cl->do_oop((oop*)_full[i]);
-      }
-    }
-
-    void oops_do_full_then_narrow(OopClosure* cl) {
-      for (int i = 0; i < _num_full; i++) {
-        cl->do_oop((oop*)_full[i]);
-      }
-      for (int i = 0; i < _num_narrow; i++) {
-        cl->do_oop((narrowOop*)_narrow[i]);
-      }
-    }
-
-    void oops_do_mixed(OopClosure* cl) {
-      int i;
-      for (i = 0; i < _num_full && i < _num_narrow; i++) {
-        cl->do_oop((oop*)_full[i]);
-        cl->do_oop((narrowOop*)_narrow[i]);
-      }
-      for (int j = i; j < _num_full; j++) {
-        cl->do_oop((oop*)_full[i]);
-      }
-      for (int j = i; j < _num_narrow; j++) {
-        cl->do_oop((narrowOop*)_narrow[i]);
-      }
-    }
-
-    static const int MaxOrder = 2;
-
-    void oops_do(OopClosure* cl, int do_oop_order) {
-      switch(do_oop_order) {
-        case 0:
-          oops_do_narrow_then_full(cl);
-          break;
-        case 1:
-          oops_do_full_then_narrow(cl);
-          break;
-        case 2:
-          oops_do_mixed(cl);
-          break;
-        default:
-          oops_do_narrow_then_full(cl);
-          break;
-      }
-    }
-  };
-
-  class CountOopClosure : public OopClosure {
-    int _narrow_oop_count;
-    int _full_oop_count;
-   public:
-    CountOopClosure() : _narrow_oop_count(0), _full_oop_count(0) {}
-    void do_oop(narrowOop* p) {
-      EXPECT_NE(uintptr_t(0), (uintptr_t(p) & FakeRoots::NarrowOopMarker))
-              << "The narrowOop was unexpectedly not marked with the NarrowOopMarker";
-      _narrow_oop_count++;
-    }
-
-    void do_oop(oop* p){
-      EXPECT_EQ(uintptr_t(0), (uintptr_t(p) & FakeRoots::NarrowOopMarker))
-              << "The oop was unexpectedly marked with the NarrowOopMarker";
-      _full_oop_count++;
-    }
-
-    int narrow_oop_count() { return _narrow_oop_count; }
-    int full_oop_count()   { return _full_oop_count; }
-    int all_oop_count()    { return _narrow_oop_count + _full_oop_count; }
-  };
-
-  class DoNothingOopClosure : public OopClosure {
-   public:
-    void do_oop(narrowOop* p) {}
-    void do_oop(oop* p)       {}
-  };
-
-  static void testCount(int num_narrow, int num_full, int do_oop_order) {
-    FakeRoots fr(num_narrow, num_full);
-
-    CountOopClosure coc;
-    BufferingOopClosure boc(&coc);
-
-    fr.oops_do(&boc, do_oop_order);
-
-    boc.done();
-
-    EXPECT_EQ(num_narrow, coc.narrow_oop_count()) << "when running testCount("
-            << num_narrow << ", " << num_full << ", " << do_oop_order << ")";
-
-    EXPECT_EQ(num_full, coc.full_oop_count()) << "when running testCount("
-            << num_narrow << ", " << num_full << ", " << do_oop_order << ")";
-
-    EXPECT_EQ(num_narrow + num_full, coc.all_oop_count()) << "when running testCount("
-            << num_narrow << ", " << num_full << ", " << do_oop_order << ")";
-  }
-
-  static void testIsBufferEmptyOrFull(int num_narrow, int num_full, bool expect_empty, bool expect_full) {
-    FakeRoots fr(num_narrow, num_full);
-
-    DoNothingOopClosure cl;
-    BufferingOopClosure boc(&cl);
-
-    fr.oops_do(&boc, 0);
-
-    EXPECT_EQ(expect_empty, boc.is_buffer_empty())
-            << "when running testIsBufferEmptyOrFull("
-            << num_narrow << ", " << num_full << ", "
-            << expect_empty << ", " << expect_full << ")";
-
-    EXPECT_EQ(expect_full, boc.is_buffer_full())
-            << "when running testIsBufferEmptyOrFull("
-            << num_narrow << ", " << num_full << ", "
-            << expect_empty << ", " << expect_full << ")";
-  }
-
-  static void testEmptyAfterDone(int num_narrow, int num_full) {
-    FakeRoots fr(num_narrow, num_full);
-
-    DoNothingOopClosure cl;
-    BufferingOopClosure boc(&cl);
-
-    fr.oops_do(&boc, 0);
-
-    // Make sure all get processed.
-    boc.done();
-
-    EXPECT_TRUE(boc.is_buffer_empty()) << "Should be empty after call to done()."
-            << " testEmptyAfterDone(" << num_narrow << ", " << num_full << ")";
-  }
-
-  static int get_buffer_length() {
-    return BufferingOopClosure::BufferLength;
-  }
-};
-
-TEST_VM_F(BufferingOopClosureTest, count_test) {
-  int bl = BufferingOopClosureTest::get_buffer_length();
-
-  for (int order = 0; order < FakeRoots::MaxOrder; order++) {
-    testCount(0,      0,      order);
-    testCount(10,     0,      order);
-    testCount(0,      10,     order);
-    testCount(10,     10,     order);
-    testCount(bl,     10,     order);
-    testCount(10,     bl,     order);
-    testCount(bl,     bl,     order);
-    testCount(bl + 1, 10,     order);
-    testCount(10,     bl + 1, order);
-    testCount(bl + 1, bl,     order);
-    testCount(bl,     bl + 1, order);
-    testCount(bl + 1, bl + 1, order);
-  }
-}
-
-TEST_VM_F(BufferingOopClosureTest, buffer_empty_or_full) {
-  int bl = BufferingOopClosureTest::get_buffer_length();
-
-  testIsBufferEmptyOrFull(0,      0,      true,  false);
-  testIsBufferEmptyOrFull(1,      0,      false, false);
-  testIsBufferEmptyOrFull(0,      1,      false, false);
-  testIsBufferEmptyOrFull(1,      1,      false, false);
-  testIsBufferEmptyOrFull(10,     0,      false, false);
-  testIsBufferEmptyOrFull(0,      10,     false, false);
-  testIsBufferEmptyOrFull(10,     10,     false, false);
-  testIsBufferEmptyOrFull(0,      bl,     false, true);
-  testIsBufferEmptyOrFull(bl,     0,      false, true);
-  testIsBufferEmptyOrFull(bl / 2, bl / 2, false, true);
-  testIsBufferEmptyOrFull(bl - 1, 1,      false, true);
-  testIsBufferEmptyOrFull(1,      bl - 1, false, true);
-  // Processed
-  testIsBufferEmptyOrFull(bl + 1, 0,      false, false);
-  testIsBufferEmptyOrFull(bl * 2, 0,      false, true);
-}
-
-TEST_VM_F(BufferingOopClosureTest, empty_after_done) {
-  int bl = BufferingOopClosureTest::get_buffer_length();
-
-  testEmptyAfterDone(0,      0);
-  testEmptyAfterDone(1,      0);
-  testEmptyAfterDone(0,      1);
-  testEmptyAfterDone(1,      1);
-  testEmptyAfterDone(10,     0);
-  testEmptyAfterDone(0,      10);
-  testEmptyAfterDone(10,     10);
-  testEmptyAfterDone(0,      bl);
-  testEmptyAfterDone(bl,     0);
-  testEmptyAfterDone(bl / 2, bl / 2);
-  testEmptyAfterDone(bl - 1, 1);
-  testEmptyAfterDone(1,      bl - 1);
-  // Processed
-  testEmptyAfterDone(bl + 1, 0);
-  testEmptyAfterDone(bl * 2, 0);
-}