6984287: Regularize how GC parallel workers are specified.
authorjmasa
Mon, 20 Sep 2010 14:38:38 -0700
changeset 6759 67b1a69ef5aa
parent 6451 516540f1f076
child 6760 532b60549a90
6984287: Regularize how GC parallel workers are specified. Summary: Associate number of GC workers with the workgang as opposed to the task. Reviewed-by: johnc, ysr
hotspot/src/share/vm/gc_implementation/concurrentMarkSweep/cmsCollectorPolicy.cpp
hotspot/src/share/vm/gc_implementation/concurrentMarkSweep/compactibleFreeListSpace.cpp
hotspot/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp
hotspot/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.hpp
hotspot/src/share/vm/gc_implementation/g1/concurrentMark.cpp
hotspot/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp
hotspot/src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp
hotspot/src/share/vm/gc_implementation/g1/g1CollectorPolicy.cpp
hotspot/src/share/vm/gc_implementation/g1/g1RemSet.cpp
hotspot/src/share/vm/gc_implementation/parNew/parCardTableModRefBS.cpp
hotspot/src/share/vm/gc_implementation/parNew/parNewGeneration.cpp
hotspot/src/share/vm/gc_implementation/parNew/parNewGeneration.hpp
hotspot/src/share/vm/gc_implementation/parallelScavenge/pcTasks.hpp
hotspot/src/share/vm/gc_implementation/parallelScavenge/psParallelCompact.cpp
hotspot/src/share/vm/gc_interface/collectedHeap.cpp
hotspot/src/share/vm/gc_interface/collectedHeap.hpp
hotspot/src/share/vm/includeDB_core
hotspot/src/share/vm/memory/genCollectedHeap.cpp
hotspot/src/share/vm/memory/genCollectedHeap.hpp
hotspot/src/share/vm/memory/referenceProcessor.cpp
hotspot/src/share/vm/memory/referenceProcessor.hpp
hotspot/src/share/vm/memory/sharedHeap.cpp
hotspot/src/share/vm/memory/sharedHeap.hpp
hotspot/src/share/vm/utilities/taskqueue.cpp
hotspot/src/share/vm/utilities/taskqueue.hpp
hotspot/src/share/vm/utilities/workgroup.cpp
hotspot/src/share/vm/utilities/workgroup.hpp
hotspot/src/share/vm/utilities/yieldingWorkgroup.cpp
hotspot/src/share/vm/utilities/yieldingWorkgroup.hpp
--- a/hotspot/src/share/vm/gc_implementation/concurrentMarkSweep/cmsCollectorPolicy.cpp	Thu Sep 16 13:45:55 2010 -0700
+++ b/hotspot/src/share/vm/gc_implementation/concurrentMarkSweep/cmsCollectorPolicy.cpp	Mon Sep 20 14:38:38 2010 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -39,7 +39,7 @@
   if (_generations == NULL)
     vm_exit_during_initialization("Unable to allocate gen spec");
 
-  if (UseParNewGC && ParallelGCThreads > 0) {
+  if (ParNewGeneration::in_use()) {
     if (UseAdaptiveSizePolicy) {
       _generations[0] = new GenerationSpec(Generation::ASParNew,
                                            _initial_gen0_size, _max_gen0_size);
@@ -79,7 +79,7 @@
 
 void ConcurrentMarkSweepPolicy::initialize_gc_policy_counters() {
   // initialize the policy counters - 2 collectors, 3 generations
-  if (UseParNewGC && ParallelGCThreads > 0) {
+  if (ParNewGeneration::in_use()) {
     _gc_policy_counters = new GCPolicyCounters("ParNew:CMS", 2, 3);
   }
   else {
@@ -102,7 +102,7 @@
 
   assert(size_policy() != NULL, "A size policy is required");
   // initialize the policy counters - 2 collectors, 3 generations
-  if (UseParNewGC && ParallelGCThreads > 0) {
+  if (ParNewGeneration::in_use()) {
     _gc_policy_counters = new CMSGCAdaptivePolicyCounters("ParNew:CMS", 2, 3,
       size_policy());
   }
--- a/hotspot/src/share/vm/gc_implementation/concurrentMarkSweep/compactibleFreeListSpace.cpp	Thu Sep 16 13:45:55 2010 -0700
+++ b/hotspot/src/share/vm/gc_implementation/concurrentMarkSweep/compactibleFreeListSpace.cpp	Mon Sep 20 14:38:38 2010 -0700
@@ -124,7 +124,8 @@
   checkFreeListConsistency();
 
   // Initialize locks for parallel case.
-  if (ParallelGCThreads > 0) {
+
+  if (CollectedHeap::use_parallel_gc_threads()) {
     for (size_t i = IndexSetStart; i < IndexSetSize; i += IndexSetStride) {
       _indexedFreeListParLocks[i] = new Mutex(Mutex::leaf - 1, // == ExpandHeap_lock - 1
                                               "a freelist par lock",
@@ -1071,7 +1072,8 @@
   // at address below "p" in finding the object that contains "p"
   // and those objects (if garbage) may have been modified to hold
   // live range information.
-  // assert(ParallelGCThreads > 0 || _bt.block_start(p) == p, "Should be a block boundary");
+  // assert(CollectedHeap::use_parallel_gc_threads() || _bt.block_start(p) == p,
+  //        "Should be a block boundary");
   if (FreeChunk::indicatesFreeChunk(p)) return false;
   klassOop k = oop(p)->klass_or_null();
   if (k != NULL) {
@@ -2932,7 +2934,9 @@
          "n_tasks calculation incorrect");
   SequentialSubTasksDone* pst = conc_par_seq_tasks();
   assert(!pst->valid(), "Clobbering existing data?");
-  pst->set_par_threads(n_threads);
+  // Sets the condition for completion of the subtask (how many threads
+  // need to finish in order to be done).
+  pst->set_n_threads(n_threads);
   pst->set_n_tasks((int)n_tasks);
 }
 
@@ -2972,6 +2976,8 @@
          "n_tasks calculation incorrect");
   SequentialSubTasksDone* pst = conc_par_seq_tasks();
   assert(!pst->valid(), "Clobbering existing data?");
-  pst->set_par_threads(n_threads);
+  // Sets the condition for completion of the subtask (how many threads
+  // need to finish in order to be done).
+  pst->set_n_threads(n_threads);
   pst->set_n_tasks((int)n_tasks);
 }
--- a/hotspot/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp	Thu Sep 16 13:45:55 2010 -0700
+++ b/hotspot/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp	Mon Sep 20 14:38:38 2010 -0700
@@ -195,7 +195,7 @@
            "Offset of FreeChunk::_prev within FreeChunk must match"
            "  that of OopDesc::_klass within OopDesc");
   )
-  if (ParallelGCThreads > 0) {
+  if (CollectedHeap::use_parallel_gc_threads()) {
     typedef CMSParGCThreadState* CMSParGCThreadStatePtr;
     _par_gc_thread_states =
       NEW_C_HEAP_ARRAY(CMSParGCThreadStatePtr, ParallelGCThreads);
@@ -616,7 +616,7 @@
   }
 
   // Support for multi-threaded concurrent phases
-  if (ParallelGCThreads > 0 && CMSConcurrentMTEnabled) {
+  if (CollectedHeap::use_parallel_gc_threads() && CMSConcurrentMTEnabled) {
     if (FLAG_IS_DEFAULT(ConcGCThreads)) {
       // just for now
       FLAG_SET_DEFAULT(ConcGCThreads, (ParallelGCThreads + 3)/4);
@@ -628,6 +628,8 @@
         warning("GC/CMS: _conc_workers allocation failure: "
               "forcing -CMSConcurrentMTEnabled");
         CMSConcurrentMTEnabled = false;
+      } else {
+        _conc_workers->initialize_workers();
       }
     } else {
       CMSConcurrentMTEnabled = false;
@@ -936,7 +938,7 @@
   // along with all the other pointers into the heap but
   // compaction is expected to be a rare event with
   // a heap using cms so don't do it without seeing the need.
-  if (ParallelGCThreads > 0) {
+  if (CollectedHeap::use_parallel_gc_threads()) {
     for (uint i = 0; i < ParallelGCThreads; i++) {
       _par_gc_thread_states[i]->promo.reset();
     }
@@ -2630,7 +2632,8 @@
   // Should call gc_prologue_work() for all cms gens we are responsible for
   bool registerClosure =    _collectorState >= Marking
                          && _collectorState < Sweeping;
-  ModUnionClosure* muc = ParallelGCThreads > 0 ? &_modUnionClosurePar
+  ModUnionClosure* muc = CollectedHeap::use_parallel_gc_threads() ?
+                                               &_modUnionClosurePar
                                                : &_modUnionClosure;
   _cmsGen->gc_prologue_work(full, registerClosure, muc);
   _permGen->gc_prologue_work(full, registerClosure, muc);
@@ -2731,7 +2734,7 @@
   collector()->gc_epilogue(full);
 
   // Also reset promotion tracking in par gc thread states.
-  if (ParallelGCThreads > 0) {
+  if (CollectedHeap::use_parallel_gc_threads()) {
     for (uint i = 0; i < ParallelGCThreads; i++) {
       _par_gc_thread_states[i]->promo.stopTrackingPromotions(i);
     }
@@ -3731,7 +3734,6 @@
 // MT Concurrent Marking Task
 class CMSConcMarkingTask: public YieldingFlexibleGangTask {
   CMSCollector* _collector;
-  YieldingFlexibleWorkGang* _workers;        // the whole gang
   int           _n_workers;                  // requested/desired # workers
   bool          _asynch;
   bool          _result;
@@ -3751,21 +3753,19 @@
   CMSConcMarkingTask(CMSCollector* collector,
                  CompactibleFreeListSpace* cms_space,
                  CompactibleFreeListSpace* perm_space,
-                 bool asynch, int n_workers,
+                 bool asynch,
                  YieldingFlexibleWorkGang* workers,
                  OopTaskQueueSet* task_queues):
     YieldingFlexibleGangTask("Concurrent marking done multi-threaded"),
     _collector(collector),
     _cms_space(cms_space),
     _perm_space(perm_space),
-    _asynch(asynch), _n_workers(n_workers), _result(true),
-    _workers(workers), _task_queues(task_queues),
-    _term(n_workers, task_queues, _collector, asynch),
+    _asynch(asynch), _n_workers(0), _result(true),
+    _task_queues(task_queues),
+    _term(_n_workers, task_queues, _collector, asynch),
     _bit_map_lock(collector->bitMapLock())
   {
-    assert(n_workers <= workers->total_workers(),
-           "Else termination won't work correctly today"); // XXX FIX ME!
-    _requested_size = n_workers;
+    _requested_size = _n_workers;
     _term.set_task(this);
     assert(_cms_space->bottom() < _perm_space->bottom(),
            "Finger incorrectly initialized below");
@@ -3781,6 +3781,10 @@
 
   CMSConcMarkingTerminator* terminator() { return &_term; }
 
+  virtual void set_for_termination(int active_workers) {
+    terminator()->reset_for_reuse(active_workers);
+  }
+
   void work(int i);
 
   virtual void coordinator_yield();  // stuff done by coordinator
@@ -4220,9 +4224,12 @@
   CompactibleFreeListSpace* cms_space  = _cmsGen->cmsSpace();
   CompactibleFreeListSpace* perm_space = _permGen->cmsSpace();
 
-  CMSConcMarkingTask tsk(this, cms_space, perm_space,
-                         asynch, num_workers /* number requested XXX */,
-                         conc_workers(), task_queues());
+  CMSConcMarkingTask tsk(this,
+                         cms_space,
+                         perm_space,
+                         asynch,
+                         conc_workers(),
+                         task_queues());
 
   // Since the actual number of workers we get may be different
   // from the number we requested above, do we need to do anything different
@@ -4326,6 +4333,10 @@
   verify_overflow_empty();
   _abort_preclean = false;
   if (CMSPrecleaningEnabled) {
+    // Precleaning is currently not MT but the reference processor
+    // may be set for MT.  Disable it temporarily here.
+    ReferenceProcessor* rp = ref_processor();
+    ReferenceProcessorMTProcMutator z(rp, false);
     _eden_chunk_index = 0;
     size_t used = get_eden_used();
     size_t capacity = get_eden_capacity();
@@ -4918,7 +4929,7 @@
       // dirtied since the first checkpoint in this GC cycle and prior to
       // the most recent young generation GC, minus those cleaned up by the
       // concurrent precleaning.
-      if (CMSParallelRemarkEnabled && ParallelGCThreads > 0) {
+      if (CMSParallelRemarkEnabled && CollectedHeap::use_parallel_gc_threads()) {
         TraceTime t("Rescan (parallel) ", PrintGCDetails, false, gclog_or_tty);
         do_remark_parallel();
       } else {
@@ -5012,7 +5023,6 @@
 // Parallel remark task
 class CMSParRemarkTask: public AbstractGangTask {
   CMSCollector* _collector;
-  WorkGang*     _workers;
   int           _n_workers;
   CompactibleFreeListSpace* _cms_space;
   CompactibleFreeListSpace* _perm_space;
@@ -5025,21 +5035,21 @@
   CMSParRemarkTask(CMSCollector* collector,
                    CompactibleFreeListSpace* cms_space,
                    CompactibleFreeListSpace* perm_space,
-                   int n_workers, WorkGang* workers,
+                   int n_workers, FlexibleWorkGang* workers,
                    OopTaskQueueSet* task_queues):
     AbstractGangTask("Rescan roots and grey objects in parallel"),
     _collector(collector),
     _cms_space(cms_space), _perm_space(perm_space),
     _n_workers(n_workers),
-    _workers(workers),
     _task_queues(task_queues),
-    _term(workers->total_workers(), task_queues) { }
+    _term(n_workers, task_queues) { }
 
   OopTaskQueueSet* task_queues() { return _task_queues; }
 
   OopTaskQueue* work_queue(int i) { return task_queues()->queue(i); }
 
   ParallelTaskTerminator* terminator() { return &_term; }
+  int n_workers() { return _n_workers; }
 
   void work(int i);
 
@@ -5057,6 +5067,11 @@
   void do_work_steal(int i, Par_MarkRefsIntoAndScanClosure* cl, int* seed);
 };
 
+// work_queue(i) is passed to the closure
+// Par_MarkRefsIntoAndScanClosure.  The "i" parameter
+// also is passed to do_dirty_card_rescan_tasks() and to
+// do_work_steal() to select the i-th task_queue.
+
 void CMSParRemarkTask::work(int i) {
   elapsedTimer _timer;
   ResourceMark rm;
@@ -5128,6 +5143,7 @@
 
   // Do the rescan tasks for each of the two spaces
   // (cms_space and perm_space) in turn.
+  // "i" is passed to select the "i-th" task_queue
   do_dirty_card_rescan_tasks(_cms_space, i, &par_mrias_cl);
   do_dirty_card_rescan_tasks(_perm_space, i, &par_mrias_cl);
   _timer.stop();
@@ -5150,6 +5166,7 @@
   }
 }
 
+// Note that parameter "i" is not used.
 void
 CMSParRemarkTask::do_young_space_rescan(int i,
   Par_MarkRefsIntoAndScanClosure* cl, ContiguousSpace* space,
@@ -5309,8 +5326,13 @@
     size_t num_from_overflow_list = MIN2((size_t)(work_q->max_elems() - work_q->size())/4,
                                          (size_t)ParGCDesiredObjsFromOverflowList);
     // Now check if there's any work in the overflow list
+    // Passing ParallelGCThreads as the third parameter, no_of_gc_threads,
+    // only affects the number of attempts made to get work from the
+    // overflow list and does not affect the number of workers.  Just
+    // pass ParallelGCThreads so this behavior is unchanged.
     if (_collector->par_take_from_overflow_list(num_from_overflow_list,
-                                                work_q)) {
+                                                work_q,
+                                                ParallelGCThreads)) {
       // found something in global overflow list;
       // not yet ready to go stealing work from others.
       // We'd like to assert(work_q->size() != 0, ...)
@@ -5367,11 +5389,12 @@
 // Merge the per-thread plab arrays into the global survivor chunk
 // array which will provide the partitioning of the survivor space
 // for CMS rescan.
-void CMSCollector::merge_survivor_plab_arrays(ContiguousSpace* surv) {
+void CMSCollector::merge_survivor_plab_arrays(ContiguousSpace* surv,
+                                              int no_of_gc_threads) {
   assert(_survivor_plab_array  != NULL, "Error");
   assert(_survivor_chunk_array != NULL, "Error");
   assert(_collectorState == FinalMarking, "Error");
-  for (uint j = 0; j < ParallelGCThreads; j++) {
+  for (int j = 0; j < no_of_gc_threads; j++) {
     _cursor[j] = 0;
   }
   HeapWord* top = surv->top();
@@ -5379,7 +5402,7 @@
   for (i = 0; i < _survivor_chunk_capacity; i++) {  // all sca entries
     HeapWord* min_val = top;          // Higher than any PLAB address
     uint      min_tid = 0;            // position of min_val this round
-    for (uint j = 0; j < ParallelGCThreads; j++) {
+    for (int j = 0; j < no_of_gc_threads; j++) {
       ChunkArray* cur_sca = &_survivor_plab_array[j];
       if (_cursor[j] == cur_sca->end()) {
         continue;
@@ -5413,7 +5436,7 @@
   // Verify that we used up all the recorded entries
   #ifdef ASSERT
     size_t total = 0;
-    for (uint j = 0; j < ParallelGCThreads; j++) {
+    for (int j = 0; j < no_of_gc_threads; j++) {
       assert(_cursor[j] == _survivor_plab_array[j].end(), "Ctl pt invariant");
       total += _cursor[j];
     }
@@ -5448,13 +5471,15 @@
     // Each valid entry in [0, _eden_chunk_index) represents a task.
     size_t n_tasks = _eden_chunk_index + 1;
     assert(n_tasks == 1 || _eden_chunk_array != NULL, "Error");
-    pst->set_par_threads(n_threads);
+    // Sets the condition for completion of the subtask (how many threads
+    // need to finish in order to be done).
+    pst->set_n_threads(n_threads);
     pst->set_n_tasks((int)n_tasks);
   }
 
   // Merge the survivor plab arrays into _survivor_chunk_array
   if (_survivor_plab_array != NULL) {
-    merge_survivor_plab_arrays(dng->from());
+    merge_survivor_plab_arrays(dng->from(), n_threads);
   } else {
     assert(_survivor_chunk_index == 0, "Error");
   }
@@ -5463,7 +5488,9 @@
   {
     SequentialSubTasksDone* pst = dng->to()->par_seq_tasks();
     assert(!pst->valid(), "Clobbering existing data?");
-    pst->set_par_threads(n_threads);
+    // Sets the condition for completion of the subtask (how many threads
+    // need to finish in order to be done).
+    pst->set_n_threads(n_threads);
     pst->set_n_tasks(1);
     assert(pst->valid(), "Error");
   }
@@ -5474,7 +5501,9 @@
     assert(!pst->valid(), "Clobbering existing data?");
     size_t n_tasks = _survivor_chunk_index + 1;
     assert(n_tasks == 1 || _survivor_chunk_array != NULL, "Error");
-    pst->set_par_threads(n_threads);
+    // Sets the condition for completion of the subtask (how many threads
+    // need to finish in order to be done).
+    pst->set_n_threads(n_threads);
     pst->set_n_tasks((int)n_tasks);
     assert(pst->valid(), "Error");
   }
@@ -5483,7 +5512,7 @@
 // Parallel version of remark
 void CMSCollector::do_remark_parallel() {
   GenCollectedHeap* gch = GenCollectedHeap::heap();
-  WorkGang* workers = gch->workers();
+  FlexibleWorkGang* workers = gch->workers();
   assert(workers != NULL, "Need parallel worker threads.");
   int n_workers = workers->total_workers();
   CompactibleFreeListSpace* cms_space  = _cmsGen->cmsSpace();
@@ -5636,13 +5665,11 @@
 ////////////////////////////////////////////////////////
 // Parallel Reference Processing Task Proxy Class
 ////////////////////////////////////////////////////////
-class CMSRefProcTaskProxy: public AbstractGangTask {
+class CMSRefProcTaskProxy: public AbstractGangTaskWOopQueues {
   typedef AbstractRefProcTaskExecutor::ProcessTask ProcessTask;
   CMSCollector*          _collector;
   CMSBitMap*             _mark_bit_map;
   const MemRegion        _span;
-  OopTaskQueueSet*       _task_queues;
-  ParallelTaskTerminator _term;
   ProcessTask&           _task;
 
 public:
@@ -5650,24 +5677,21 @@
                       CMSCollector*    collector,
                       const MemRegion& span,
                       CMSBitMap*       mark_bit_map,
-                      int              total_workers,
+                      AbstractWorkGang* workers,
                       OopTaskQueueSet* task_queues):
-    AbstractGangTask("Process referents by policy in parallel"),
+    AbstractGangTaskWOopQueues("Process referents by policy in parallel",
+      task_queues),
     _task(task),
-    _collector(collector), _span(span), _mark_bit_map(mark_bit_map),
-    _task_queues(task_queues),
-    _term(total_workers, task_queues)
+    _collector(collector), _span(span), _mark_bit_map(mark_bit_map)
     {
       assert(_collector->_span.equals(_span) && !_span.is_empty(),
              "Inconsistency in _span");
     }
 
-  OopTaskQueueSet* task_queues() { return _task_queues; }
+  OopTaskQueueSet* task_queues() { return queues(); }
 
   OopTaskQueue* work_queue(int i) { return task_queues()->queue(i); }
 
-  ParallelTaskTerminator* terminator() { return &_term; }
-
   void do_work_steal(int i,
                      CMSParDrainMarkingStackClosure* drain,
                      CMSParKeepAliveClosure* keep_alive,
@@ -5739,8 +5763,13 @@
     size_t num_from_overflow_list = MIN2((size_t)(work_q->max_elems() - work_q->size())/4,
                                          (size_t)ParGCDesiredObjsFromOverflowList);
     // Now check if there's any work in the overflow list
+    // Passing ParallelGCThreads as the third parameter, no_of_gc_threads,
+    // only affects the number of attempts made to get work from the
+    // overflow list and does not affect the number of workers.  Just
+    // pass ParallelGCThreads so this behavior is unchanged.
     if (_collector->par_take_from_overflow_list(num_from_overflow_list,
-                                                work_q)) {
+                                                work_q,
+                                                ParallelGCThreads)) {
       // Found something in global overflow list;
       // not yet ready to go stealing work from others.
       // We'd like to assert(work_q->size() != 0, ...)
@@ -5773,13 +5802,12 @@
 void CMSRefProcTaskExecutor::execute(ProcessTask& task)
 {
   GenCollectedHeap* gch = GenCollectedHeap::heap();
-  WorkGang* workers = gch->workers();
+  FlexibleWorkGang* workers = gch->workers();
   assert(workers != NULL, "Need parallel worker threads.");
-  int n_workers = workers->total_workers();
   CMSRefProcTaskProxy rp_task(task, &_collector,
                               _collector.ref_processor()->span(),
                               _collector.markBitMap(),
-                              n_workers, _collector.task_queues());
+                              workers, _collector.task_queues());
   workers->run_task(&rp_task);
 }
 
@@ -5787,7 +5815,7 @@
 {
 
   GenCollectedHeap* gch = GenCollectedHeap::heap();
-  WorkGang* workers = gch->workers();
+  FlexibleWorkGang* workers = gch->workers();
   assert(workers != NULL, "Need parallel worker threads.");
   CMSRefEnqueueTaskProxy enq_task(task);
   workers->run_task(&enq_task);
@@ -5814,6 +5842,14 @@
   {
     TraceTime t("weak refs processing", PrintGCDetails, false, gclog_or_tty);
     if (rp->processing_is_mt()) {
+      // Set the degree of MT here.  If the discovery is done MT, there
+      // may have been a different number of threads doing the discovery
+      // and a different number of discovered lists may have Ref objects.
+      // That is OK as long as the Reference lists are balanced (see
+      // balance_all_queues() and balance_queues()).
+
+
+      rp->set_mt_degree(ParallelGCThreads);
       CMSRefProcTaskExecutor task_executor(*this);
       rp->process_discovered_references(&_is_alive_closure,
                                         &cmsKeepAliveClosure,
@@ -5874,6 +5910,7 @@
 
   rp->set_enqueuing_is_done(true);
   if (rp->processing_is_mt()) {
+    rp->balance_all_queues();
     CMSRefProcTaskExecutor task_executor(*this);
     rp->enqueue_discovered_references(&task_executor);
   } else {
@@ -8708,7 +8745,8 @@
 // similar changes might be needed.
 // CR 6797058 has been filed to consolidate the common code.
 bool CMSCollector::par_take_from_overflow_list(size_t num,
-                                               OopTaskQueue* work_q) {
+                                               OopTaskQueue* work_q,
+                                               int no_of_gc_threads) {
   assert(work_q->size() == 0, "First empty local work queue");
   assert(num < work_q->max_elems(), "Can't bite more than we can chew");
   if (_overflow_list == NULL) {
@@ -8717,7 +8755,9 @@
   // Grab the entire list; we'll put back a suffix
   oop prefix = (oop)Atomic::xchg_ptr(BUSY, &_overflow_list);
   Thread* tid = Thread::current();
-  size_t CMSOverflowSpinCount = (size_t)ParallelGCThreads;
+  // Before "no_of_gc_threads" was introduced CMSOverflowSpinCount was
+  // set to ParallelGCThreads.
+  size_t CMSOverflowSpinCount = (size_t) no_of_gc_threads; // was ParallelGCThreads;
   size_t sleep_time_millis = MAX2((size_t)1, num/100);
   // If the list is busy, we spin for a short while,
   // sleeping between attempts to get the list.
@@ -9256,4 +9296,3 @@
              true /* recordGCEndTime */,
              true /* countCollection */ );
 }
-
--- a/hotspot/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.hpp	Thu Sep 16 13:45:55 2010 -0700
+++ b/hotspot/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.hpp	Mon Sep 20 14:38:38 2010 -0700
@@ -729,7 +729,9 @@
 
   // Support for marking stack overflow handling
   bool take_from_overflow_list(size_t num, CMSMarkStack* to_stack);
-  bool par_take_from_overflow_list(size_t num, OopTaskQueue* to_work_q);
+  bool par_take_from_overflow_list(size_t num,
+                                   OopTaskQueue* to_work_q,
+                                   int no_of_gc_threads);
   void push_on_overflow_list(oop p);
   void par_push_on_overflow_list(oop p);
   // the following is, obviously, not, in general, "MT-stable"
@@ -768,7 +770,7 @@
   void abortable_preclean(); // Preclean while looking for possible abort
   void initialize_sequential_subtasks_for_young_gen_rescan(int i);
   // Helper function for above; merge-sorts the per-thread plab samples
-  void merge_survivor_plab_arrays(ContiguousSpace* surv);
+  void merge_survivor_plab_arrays(ContiguousSpace* surv, int no_of_gc_threads);
   // Resets (i.e. clears) the per-thread plab sample vectors
   void reset_survivor_plab_arrays();
 
--- a/hotspot/src/share/vm/gc_implementation/g1/concurrentMark.cpp	Thu Sep 16 13:45:55 2010 -0700
+++ b/hotspot/src/share/vm/gc_implementation/g1/concurrentMark.cpp	Mon Sep 20 14:38:38 2010 -0700
@@ -583,10 +583,13 @@
 #endif
 
     guarantee(parallel_marking_threads() > 0, "peace of mind");
-    _parallel_workers = new WorkGang("G1 Parallel Marking Threads",
-                                     (int) parallel_marking_threads(), false, true);
-    if (_parallel_workers == NULL)
+    _parallel_workers = new FlexibleWorkGang("G1 Parallel Marking Threads",
+         (int) _parallel_marking_threads, false, true);
+    if (_parallel_workers == NULL) {
       vm_exit_during_initialization("Failed necessary allocation.");
+    } else {
+      _parallel_workers->initialize_workers();
+    }
   }
 
   // so that the call below can read a sensible value
@@ -1451,7 +1454,7 @@
                                   _bm, _g1h->concurrent_mark(),
                                   _region_bm, _card_bm);
     calccl.no_yield();
-    if (ParallelGCThreads > 0) {
+    if (G1CollectedHeap::use_parallel_gc_threads()) {
       _g1h->heap_region_par_iterate_chunked(&calccl, i,
                                             HeapRegion::FinalCountClaimValue);
     } else {
@@ -1531,7 +1534,7 @@
     G1NoteEndOfConcMarkClosure g1_note_end(_g1h,
                                            &_par_cleanup_thread_state[i]->list,
                                            i);
-    if (ParallelGCThreads > 0) {
+    if (G1CollectedHeap::use_parallel_gc_threads()) {
       _g1h->heap_region_par_iterate_chunked(&g1_note_end, i,
                                             HeapRegion::NoteEndClaimValue);
     } else {
@@ -1575,7 +1578,7 @@
   {}
 
   void work(int i) {
-    if (ParallelGCThreads > 0) {
+    if (G1CollectedHeap::use_parallel_gc_threads()) {
       _g1rs->scrub_par(_region_bm, _card_bm, i,
                        HeapRegion::ScrubRemSetClaimValue);
     } else {
@@ -1647,7 +1650,7 @@
   // Do counting once more with the world stopped for good measure.
   G1ParFinalCountTask g1_par_count_task(g1h, nextMarkBitMap(),
                                         &_region_bm, &_card_bm);
-  if (ParallelGCThreads > 0) {
+  if (G1CollectedHeap::use_parallel_gc_threads()) {
     assert(g1h->check_heap_region_claim_values(
                                                HeapRegion::InitialClaimValue),
            "sanity check");
@@ -1695,7 +1698,7 @@
   // Note end of marking in all heap regions.
   double note_end_start = os::elapsedTime();
   G1ParNoteEndTask g1_par_note_end_task(g1h, _par_cleanup_thread_state);
-  if (ParallelGCThreads > 0) {
+  if (G1CollectedHeap::use_parallel_gc_threads()) {
     int n_workers = g1h->workers()->total_workers();
     g1h->set_par_threads(n_workers);
     g1h->workers()->run_task(&g1_par_note_end_task);
@@ -1720,7 +1723,7 @@
   if (G1ScrubRemSets) {
     double rs_scrub_start = os::elapsedTime();
     G1ParScrubRemSetTask g1_par_scrub_rs_task(g1h, &_region_bm, &_card_bm);
-    if (ParallelGCThreads > 0) {
+    if (G1CollectedHeap::use_parallel_gc_threads()) {
       int n_workers = g1h->workers()->total_workers();
       g1h->set_par_threads(n_workers);
       g1h->workers()->run_task(&g1_par_scrub_rs_task);
@@ -1934,7 +1937,7 @@
 
   g1h->ensure_parsability(false);
 
-  if (ParallelGCThreads > 0) {
+  if (G1CollectedHeap::use_parallel_gc_threads()) {
     G1CollectedHeap::StrongRootsScope srs(g1h);
     // this is remark, so we'll use up all available threads
     int active_workers = ParallelGCThreads;
@@ -3369,14 +3372,14 @@
 
   CMObjectClosure oc(this);
   SATBMarkQueueSet& satb_mq_set = JavaThread::satb_mark_queue_set();
-  if (ParallelGCThreads > 0)
+  if (G1CollectedHeap::use_parallel_gc_threads())
     satb_mq_set.set_par_closure(_task_id, &oc);
   else
     satb_mq_set.set_closure(&oc);
 
   // This keeps claiming and applying the closure to completed buffers
   // until we run out of buffers or we need to abort.
-  if (ParallelGCThreads > 0) {
+  if (G1CollectedHeap::use_parallel_gc_threads()) {
     while (!has_aborted() &&
            satb_mq_set.par_apply_closure_to_completed_buffer(_task_id)) {
       if (_cm->verbose_medium())
@@ -3396,7 +3399,7 @@
 
   if (!concurrent() && !has_aborted()) {
     // We should only do this during remark.
-    if (ParallelGCThreads > 0)
+    if (G1CollectedHeap::use_parallel_gc_threads())
       satb_mq_set.par_iterate_closure_all_threads(_task_id);
     else
       satb_mq_set.iterate_closure_all_threads();
@@ -3408,7 +3411,7 @@
          concurrent() ||
          satb_mq_set.completed_buffers_num() == 0, "invariant");
 
-  if (ParallelGCThreads > 0)
+  if (G1CollectedHeap::use_parallel_gc_threads())
     satb_mq_set.set_par_closure(_task_id, NULL);
   else
     satb_mq_set.set_closure(NULL);
--- a/hotspot/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp	Thu Sep 16 13:45:55 2010 -0700
+++ b/hotspot/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp	Mon Sep 20 14:38:38 2010 -0700
@@ -961,7 +961,8 @@
     }
 
     // Rebuild remembered sets of all regions.
-    if (ParallelGCThreads > 0) {
+
+    if (G1CollectedHeap::use_parallel_gc_threads()) {
       ParRebuildRSTask rebuild_rs_task(this);
       assert(check_heap_region_claim_values(
              HeapRegion::InitialClaimValue), "sanity check");
@@ -1960,7 +1961,7 @@
                                                  int worker,
                                                  jint claim_value) {
   const size_t regions = n_regions();
-  const size_t worker_num = (ParallelGCThreads > 0 ? ParallelGCThreads : 1);
+  const size_t worker_num = (G1CollectedHeap::use_parallel_gc_threads() ? ParallelGCThreads : 1);
   // try to spread out the starting points of the workers
   const size_t start_index = regions / worker_num * (size_t) worker;
 
@@ -2527,7 +2528,7 @@
 }
 
 void G1CollectedHeap::print_gc_threads_on(outputStream* st) const {
-  if (ParallelGCThreads > 0) {
+  if (G1CollectedHeap::use_parallel_gc_threads()) {
     workers()->print_worker_threads_on(st);
   }
 
@@ -2543,7 +2544,7 @@
 }
 
 void G1CollectedHeap::gc_threads_do(ThreadClosure* tc) const {
-  if (ParallelGCThreads > 0) {
+  if (G1CollectedHeap::use_parallel_gc_threads()) {
     workers()->threads_do(tc);
   }
   tc->do_thread(_cmThread);
@@ -3083,7 +3084,7 @@
   if (r != NULL) {
     r_used = r->used();
 
-    if (ParallelGCThreads > 0) {
+    if (G1CollectedHeap::use_parallel_gc_threads()) {
       // need to take the lock to guard against two threads calling
       // get_gc_alloc_region concurrently (very unlikely but...)
       MutexLockerEx x(ParGCRareEvent_lock, Mutex::_no_safepoint_check_flag);
@@ -4182,6 +4183,8 @@
 
 // *** Common G1 Evacuation Stuff
 
+// This method is run in a GC worker.
+
 void
 G1CollectedHeap::
 g1_process_strong_roots(bool collecting_perm_gen,
@@ -4259,7 +4262,7 @@
 };
 
 void G1CollectedHeap::save_marks() {
-  if (ParallelGCThreads == 0) {
+  if (!CollectedHeap::use_parallel_gc_threads()) {
     SaveMarksClosure sm;
     heap_region_iterate(&sm);
   }
@@ -4284,7 +4287,7 @@
 
   assert(dirty_card_queue_set().completed_buffers_num() == 0, "Should be empty");
   double start_par = os::elapsedTime();
-  if (ParallelGCThreads > 0) {
+  if (G1CollectedHeap::use_parallel_gc_threads()) {
     // The individual threads will set their evac-failure closures.
     StrongRootsScope srs(this);
     if (ParallelGCVerbose) G1ParScanThreadState::print_termination_stats_hdr();
--- a/hotspot/src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp	Thu Sep 16 13:45:55 2010 -0700
+++ b/hotspot/src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp	Mon Sep 20 14:38:38 2010 -0700
@@ -656,6 +656,9 @@
   bool _unclean_regions_coming;
 
 public:
+
+  SubTasksDone* process_strong_tasks() { return _process_strong_tasks; }
+
   void set_refine_cte_cl_concurrency(bool concurrent);
 
   RefToScanQueue *task_queue(int i) const;
@@ -684,7 +687,7 @@
 
   void set_par_threads(int t) {
     SharedHeap::set_par_threads(t);
-    _process_strong_tasks->set_par_threads(t);
+    _process_strong_tasks->set_n_threads(t);
   }
 
   virtual CollectedHeap::Name kind() const {
--- a/hotspot/src/share/vm/gc_implementation/g1/g1CollectorPolicy.cpp	Thu Sep 16 13:45:55 2010 -0700
+++ b/hotspot/src/share/vm/gc_implementation/g1/g1CollectorPolicy.cpp	Mon Sep 20 14:38:38 2010 -0700
@@ -72,7 +72,10 @@
 // </NEW PREDICTION>
 
 G1CollectorPolicy::G1CollectorPolicy() :
-  _parallel_gc_threads((ParallelGCThreads > 0) ? ParallelGCThreads : 1),
+  _parallel_gc_threads(G1CollectedHeap::use_parallel_gc_threads()
+    ? ParallelGCThreads : 1),
+
+
   _n_pauses(0),
   _recent_CH_strong_roots_times_ms(new TruncatedSeq(NumPrevPausesForHeuristics)),
   _recent_G1_strong_roots_times_ms(new TruncatedSeq(NumPrevPausesForHeuristics)),
@@ -1073,7 +1076,7 @@
 }
 
 double G1CollectorPolicy::avg_value (double* data) {
-  if (ParallelGCThreads > 0) {
+  if (G1CollectedHeap::use_parallel_gc_threads()) {
     double ret = 0.0;
     for (uint i = 0; i < ParallelGCThreads; ++i)
       ret += data[i];
@@ -1084,7 +1087,7 @@
 }
 
 double G1CollectorPolicy::max_value (double* data) {
-  if (ParallelGCThreads > 0) {
+  if (G1CollectedHeap::use_parallel_gc_threads()) {
     double ret = data[0];
     for (uint i = 1; i < ParallelGCThreads; ++i)
       if (data[i] > ret)
@@ -1096,7 +1099,7 @@
 }
 
 double G1CollectorPolicy::sum_of_values (double* data) {
-  if (ParallelGCThreads > 0) {
+  if (G1CollectedHeap::use_parallel_gc_threads()) {
     double sum = 0.0;
     for (uint i = 0; i < ParallelGCThreads; i++)
       sum += data[i];
@@ -1110,7 +1113,7 @@
                                    double* data2) {
   double ret = data1[0] + data2[0];
 
-  if (ParallelGCThreads > 0) {
+  if (G1CollectedHeap::use_parallel_gc_threads()) {
     for (uint i = 1; i < ParallelGCThreads; ++i) {
       double data = data1[i] + data2[i];
       if (data > ret)
@@ -1126,7 +1129,7 @@
 void G1CollectorPolicy::record_collection_pause_end() {
   double end_time_sec = os::elapsedTime();
   double elapsed_ms = _last_pause_time_ms;
-  bool parallel = ParallelGCThreads > 0;
+  bool parallel = G1CollectedHeap::use_parallel_gc_threads();
   double evac_ms = (end_time_sec - _cur_G1_strong_roots_end_sec) * 1000.0;
   size_t rs_size =
     _cur_collection_pause_used_regions_at_start - collection_set_size();
@@ -1941,7 +1944,7 @@
       // Further, we're now always doing parallel collection.  But I'm still
       // leaving this here as a placeholder for a more precise assertion later.
       // (DLD, 10/05.)
-      assert((true || ParallelGCThreads > 0) ||
+      assert((true || G1CollectedHeap::use_parallel_gc_threads()) ||
              _g1->evacuation_failed() ||
              recent_survival_rate <= 1.0, "Or bad frac");
       return recent_survival_rate;
@@ -1961,7 +1964,7 @@
     // Further, we're now always doing parallel collection.  But I'm still
     // leaving this here as a placeholder for a more precise assertion later.
     // (DLD, 10/05.)
-    assert((true || ParallelGCThreads > 0) ||
+    assert((true || G1CollectedHeap::use_parallel_gc_threads()) ||
            last_survival_rate <= 1.0, "Or bad frac");
     return last_survival_rate;
   } else {
@@ -2121,7 +2124,7 @@
 }
 
 void G1CollectorPolicy::print_summary(PauseSummary* summary) const {
-  bool parallel = ParallelGCThreads > 0;
+  bool parallel = G1CollectedHeap::use_parallel_gc_threads();
   MainBodySummary*    body_summary = summary->main_body_summary();
   if (summary->get_total_seq()->num() > 0) {
     print_summary_sd(0, "Evacuation Pauses", summary->get_total_seq());
@@ -2559,7 +2562,7 @@
     gclog_or_tty->print_cr("  clear marked regions + work1: %8.3f ms.",
                   (clear_marked_end - start)*1000.0);
   }
-  if (ParallelGCThreads > 0) {
+  if (G1CollectedHeap::use_parallel_gc_threads()) {
     const size_t OverpartitionFactor = 4;
     const size_t MinWorkUnit = 8;
     const size_t WorkUnit =
--- a/hotspot/src/share/vm/gc_implementation/g1/g1RemSet.cpp	Thu Sep 16 13:45:55 2010 -0700
+++ b/hotspot/src/share/vm/gc_implementation/g1/g1RemSet.cpp	Mon Sep 20 14:38:38 2010 -0700
@@ -523,7 +523,7 @@
   assert(!_traversal_in_progress, "Invariant between iterations.");
   set_traversal(true);
   if (ParallelGCThreads > 0) {
-    _seq_task->set_par_threads((int)n_workers());
+    _seq_task->set_n_threads((int)n_workers());
   }
   guarantee( _cards_scanned == NULL, "invariant" );
   _cards_scanned = NEW_C_HEAP_ARRAY(size_t, n_workers());
--- a/hotspot/src/share/vm/gc_implementation/parNew/parCardTableModRefBS.cpp	Thu Sep 16 13:45:55 2010 -0700
+++ b/hotspot/src/share/vm/gc_implementation/parNew/parCardTableModRefBS.cpp	Mon Sep 20 14:38:38 2010 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2007, 2010 Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -44,7 +44,7 @@
 
     int n_strides = n_threads * StridesPerThread;
     SequentialSubTasksDone* pst = sp->par_seq_tasks();
-    pst->set_par_threads(n_threads);
+    pst->set_n_threads(n_threads);
     pst->set_n_tasks(n_strides);
 
     int stride = 0;
--- a/hotspot/src/share/vm/gc_implementation/parNew/parNewGeneration.cpp	Thu Sep 16 13:45:55 2010 -0700
+++ b/hotspot/src/share/vm/gc_implementation/parNew/parNewGeneration.cpp	Mon Sep 20 14:38:38 2010 -0700
@@ -1533,3 +1533,7 @@
 const char* ParNewGeneration::name() const {
   return "par new generation";
 }
+
+bool ParNewGeneration::in_use() {
+  return UseParNewGC && ParallelGCThreads > 0;
+}
--- a/hotspot/src/share/vm/gc_implementation/parNew/parNewGeneration.hpp	Thu Sep 16 13:45:55 2010 -0700
+++ b/hotspot/src/share/vm/gc_implementation/parNew/parNewGeneration.hpp	Mon Sep 20 14:38:38 2010 -0700
@@ -350,6 +350,8 @@
     delete _task_queues;
   }
 
+  static bool in_use();
+
   virtual void ref_processor_init();
   virtual Generation::Name kind()        { return Generation::ParNew; }
   virtual const char* name() const;
--- a/hotspot/src/share/vm/gc_implementation/parallelScavenge/pcTasks.hpp	Thu Sep 16 13:45:55 2010 -0700
+++ b/hotspot/src/share/vm/gc_implementation/parallelScavenge/pcTasks.hpp	Mon Sep 20 14:38:38 2010 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2005, 2008, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -242,7 +242,11 @@
 //
 
 class DrainStacksCompactionTask : public GCTask {
+ uint _stack_index;
+ uint stack_index() { return _stack_index; }
  public:
+  DrainStacksCompactionTask(uint stack_index) : GCTask(),
+                                                _stack_index(stack_index) {};
   char* name() { return (char *)"drain-region-task"; }
   virtual void do_it(GCTaskManager* manager, uint which);
 };
--- a/hotspot/src/share/vm/gc_implementation/parallelScavenge/psParallelCompact.cpp	Thu Sep 16 13:45:55 2010 -0700
+++ b/hotspot/src/share/vm/gc_implementation/parallelScavenge/psParallelCompact.cpp	Mon Sep 20 14:38:38 2010 -0700
@@ -2449,7 +2449,7 @@
 
   const unsigned int task_count = MAX2(parallel_gc_threads, 1U);
   for (unsigned int j = 0; j < task_count; j++) {
-    q->enqueue(new DrainStacksCompactionTask());
+    q->enqueue(new DrainStacksCompactionTask(j));
   }
 
   // Find all regions that are available (can be filled immediately) and
--- a/hotspot/src/share/vm/gc_interface/collectedHeap.cpp	Thu Sep 16 13:45:55 2010 -0700
+++ b/hotspot/src/share/vm/gc_interface/collectedHeap.cpp	Mon Sep 20 14:38:38 2010 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2009, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -34,7 +34,9 @@
 
 // Memory state functions.
 
-CollectedHeap::CollectedHeap()
+
+CollectedHeap::CollectedHeap() : _n_par_threads(0)
+
 {
   const size_t max_len = size_t(arrayOopDesc::max_array_length(T_INT));
   const size_t elements_per_word = HeapWordSize / sizeof(jint);
--- a/hotspot/src/share/vm/gc_interface/collectedHeap.hpp	Thu Sep 16 13:45:55 2010 -0700
+++ b/hotspot/src/share/vm/gc_interface/collectedHeap.hpp	Mon Sep 20 14:38:38 2010 -0700
@@ -59,6 +59,8 @@
   MemRegion _reserved;
   BarrierSet* _barrier_set;
   bool _is_gc_active;
+  int _n_par_threads;
+
   unsigned int _total_collections;          // ... started
   unsigned int _total_full_collections;     // ... started
   NOT_PRODUCT(volatile size_t _promotion_failure_alot_count;)
@@ -293,6 +295,12 @@
   }
   GCCause::Cause gc_cause() { return _gc_cause; }
 
+  // Number of threads currently working on GC tasks.
+  int n_par_threads() { return _n_par_threads; }
+
+  // May be overridden to set additional parallelism.
+  virtual void set_par_threads(int t) { _n_par_threads = t; };
+
   // Preload classes into the shared portion of the heap, and then dump
   // that data to a file so that it can be loaded directly by another
   // VM (then terminate).
@@ -606,6 +614,14 @@
     return (CIFireOOMAt > 1 && _fire_out_of_memory_count >= CIFireOOMAt);
   }
 #endif
+
+ public:
+  // This is a convenience method that is used in cases where
+  // the actual number of GC worker threads is not pertinent but
+  // only whether there more than 0.  Use of this method helps
+  // reduce the occurrence of ParallelGCThreads to uses where the
+  // actual number may be germane.
+  static bool use_parallel_gc_threads() { return ParallelGCThreads > 0; }
 };
 
 // Class to set and reset the GC cause for a CollectedHeap.
--- a/hotspot/src/share/vm/includeDB_core	Thu Sep 16 13:45:55 2010 -0700
+++ b/hotspot/src/share/vm/includeDB_core	Mon Sep 20 14:38:38 2010 -0700
@@ -4721,6 +4721,7 @@
 workgroup.cpp                           os.hpp
 workgroup.cpp                           workgroup.hpp
 
+workgroup.hpp                           taskqueue.hpp
 workgroup.hpp                           thread_<os_family>.inline.hpp
 
 xmlstream.cpp                           allocation.hpp
--- a/hotspot/src/share/vm/memory/genCollectedHeap.cpp	Thu Sep 16 13:45:55 2010 -0700
+++ b/hotspot/src/share/vm/memory/genCollectedHeap.cpp	Mon Sep 20 14:38:38 2010 -0700
@@ -676,7 +676,7 @@
 
 void GenCollectedHeap::set_par_threads(int t) {
   SharedHeap::set_par_threads(t);
-  _gen_process_strong_tasks->set_par_threads(t);
+  _gen_process_strong_tasks->set_n_threads(t);
 }
 
 class AssertIsPermClosure: public OopClosure {
--- a/hotspot/src/share/vm/memory/genCollectedHeap.hpp	Thu Sep 16 13:45:55 2010 -0700
+++ b/hotspot/src/share/vm/memory/genCollectedHeap.hpp	Mon Sep 20 14:38:38 2010 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000, 2009, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -74,6 +74,7 @@
   // Data structure for claiming the (potentially) parallel tasks in
   // (gen-specific) strong roots processing.
   SubTasksDone* _gen_process_strong_tasks;
+  SubTasksDone* gen_process_strong_tasks() { return _gen_process_strong_tasks; }
 
   // In block contents verification, the number of header words to skip
   NOT_PRODUCT(static size_t _skip_header_HeapWords;)
--- a/hotspot/src/share/vm/memory/referenceProcessor.cpp	Thu Sep 16 13:45:55 2010 -0700
+++ b/hotspot/src/share/vm/memory/referenceProcessor.cpp	Mon Sep 20 14:38:38 2010 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2009, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -137,16 +137,17 @@
   _discovery_is_atomic = atomic_discovery;
   _discovery_is_mt     = mt_discovery;
   _num_q               = mt_degree;
-  _discoveredSoftRefs  = NEW_C_HEAP_ARRAY(DiscoveredList, _num_q * subclasses_of_ref);
+  _max_num_q           = mt_degree;
+  _discoveredSoftRefs  = NEW_C_HEAP_ARRAY(DiscoveredList, _max_num_q * subclasses_of_ref);
   if (_discoveredSoftRefs == NULL) {
     vm_exit_during_initialization("Could not allocated RefProc Array");
   }
-  _discoveredWeakRefs    = &_discoveredSoftRefs[_num_q];
-  _discoveredFinalRefs   = &_discoveredWeakRefs[_num_q];
-  _discoveredPhantomRefs = &_discoveredFinalRefs[_num_q];
+  _discoveredWeakRefs    = &_discoveredSoftRefs[_max_num_q];
+  _discoveredFinalRefs   = &_discoveredWeakRefs[_max_num_q];
+  _discoveredPhantomRefs = &_discoveredFinalRefs[_max_num_q];
   assert(sentinel_ref() != NULL, "_sentinelRef is NULL");
   // Initialized all entries to _sentinelRef
-  for (int i = 0; i < _num_q * subclasses_of_ref; i++) {
+  for (int i = 0; i < _max_num_q * subclasses_of_ref; i++) {
         _discoveredSoftRefs[i].set_head(sentinel_ref());
     _discoveredSoftRefs[i].set_length(0);
   }
@@ -159,7 +160,7 @@
 #ifndef PRODUCT
 void ReferenceProcessor::verify_no_references_recorded() {
   guarantee(!_discovering_refs, "Discovering refs?");
-  for (int i = 0; i < _num_q * subclasses_of_ref; i++) {
+  for (int i = 0; i < _max_num_q * subclasses_of_ref; i++) {
     guarantee(_discoveredSoftRefs[i].empty(),
               "Found non-empty discovered list");
   }
@@ -167,7 +168,11 @@
 #endif
 
 void ReferenceProcessor::weak_oops_do(OopClosure* f) {
-  for (int i = 0; i < _num_q * subclasses_of_ref; i++) {
+  // Should this instead be
+  // for (int i = 0; i < subclasses_of_ref; i++_ {
+  //   for (int j = 0; j < _num_q; j++) {
+  //     int index = i * _max_num_q + j;
+  for (int i = 0; i < _max_num_q * subclasses_of_ref; i++) {
     if (UseCompressedOops) {
       f->do_oop((narrowOop*)_discoveredSoftRefs[i].adr_head());
     } else {
@@ -395,7 +400,15 @@
     assert(work_id < (unsigned int)_ref_processor.num_q(), "Index out-of-bounds");
     // Simplest first cut: static partitioning.
     int index = work_id;
-    for (int j = 0; j < subclasses_of_ref; j++, index += _n_queues) {
+    // The increment on "index" must correspond to the maximum number of queues
+    // (n_queues) with which that ReferenceProcessor was created.  That
+    // is because of the "clever" way the discovered references lists were
+    // allocated and are indexed into.  That number is ParallelGCThreads
+    // currently.  Assert that.
+    assert(_n_queues == (int) ParallelGCThreads, "Different number not expected");
+    for (int j = 0;
+         j < subclasses_of_ref;
+         j++, index += _n_queues) {
       _ref_processor.enqueue_discovered_reflist(
         _refs_lists[index], _pending_list_addr);
       _refs_lists[index].set_head(_sentinel_ref);
@@ -410,11 +423,11 @@
   if (_processing_is_mt && task_executor != NULL) {
     // Parallel code
     RefProcEnqueueTask tsk(*this, _discoveredSoftRefs,
-                           pending_list_addr, sentinel_ref(), _num_q);
+                           pending_list_addr, sentinel_ref(), _max_num_q);
     task_executor->execute(tsk);
   } else {
     // Serial code: call the parent class's implementation
-    for (int i = 0; i < _num_q * subclasses_of_ref; i++) {
+    for (int i = 0; i < _max_num_q * subclasses_of_ref; i++) {
       enqueue_discovered_reflist(_discoveredSoftRefs[i], pending_list_addr);
       _discoveredSoftRefs[i].set_head(sentinel_ref());
       _discoveredSoftRefs[i].set_length(0);
@@ -614,8 +627,9 @@
   complete_gc->do_void();
   NOT_PRODUCT(
     if (PrintGCDetails && TraceReferenceGC) {
-      gclog_or_tty->print(" Dropped %d dead Refs out of %d "
-        "discovered Refs by policy ", iter.removed(), iter.processed());
+      gclog_or_tty->print_cr(" Dropped %d dead Refs out of %d "
+        "discovered Refs by policy  list " INTPTR_FORMAT,
+        iter.removed(), iter.processed(), (address)refs_list.head());
     }
   )
 }
@@ -651,8 +665,9 @@
   }
   NOT_PRODUCT(
     if (PrintGCDetails && TraceReferenceGC) {
-      gclog_or_tty->print(" Dropped %d active Refs out of %d "
-        "Refs in discovered list ", iter.removed(), iter.processed());
+      gclog_or_tty->print_cr(" Dropped %d active Refs out of %d "
+        "Refs in discovered list " INTPTR_FORMAT,
+        iter.removed(), iter.processed(), (address)refs_list.head());
     }
   )
 }
@@ -689,8 +704,9 @@
   complete_gc->do_void();
   NOT_PRODUCT(
     if (PrintGCDetails && TraceReferenceGC) {
-      gclog_or_tty->print(" Dropped %d active Refs out of %d "
-        "Refs in discovered list ", iter.removed(), iter.processed());
+      gclog_or_tty->print_cr(" Dropped %d active Refs out of %d "
+        "Refs in discovered list " INTPTR_FORMAT,
+        iter.removed(), iter.processed(), (address)refs_list.head());
     }
   )
 }
@@ -704,6 +720,7 @@
                                    BoolObjectClosure* is_alive,
                                    OopClosure*        keep_alive,
                                    VoidClosure*       complete_gc) {
+  ResourceMark rm;
   DiscoveredListIterator iter(refs_list, keep_alive, is_alive);
   while (iter.has_next()) {
     iter.update_discovered();
@@ -743,8 +760,8 @@
 
 void ReferenceProcessor::abandon_partial_discovery() {
   // loop over the lists
-  for (int i = 0; i < _num_q * subclasses_of_ref; i++) {
-    if (TraceReferenceGC && PrintGCDetails && ((i % _num_q) == 0)) {
+  for (int i = 0; i < _max_num_q * subclasses_of_ref; i++) {
+    if (TraceReferenceGC && PrintGCDetails && ((i % _max_num_q) == 0)) {
       gclog_or_tty->print_cr(
         "\nAbandoning %s discovered list",
         list_name(i));
@@ -766,7 +783,9 @@
                     OopClosure& keep_alive,
                     VoidClosure& complete_gc)
   {
-    _ref_processor.process_phase1(_refs_lists[i], _policy,
+    Thread* thr = Thread::current();
+    int refs_list_index = ((WorkerThread*)thr)->id();
+    _ref_processor.process_phase1(_refs_lists[refs_list_index], _policy,
                                   &is_alive, &keep_alive, &complete_gc);
   }
 private:
@@ -802,6 +821,11 @@
                     OopClosure& keep_alive,
                     VoidClosure& complete_gc)
   {
+    // Don't use "refs_list_index" calculated in this way because
+    // balance_queues() has moved the Ref's into the first n queues.
+    // Thread* thr = Thread::current();
+    // int refs_list_index = ((WorkerThread*)thr)->id();
+    // _ref_processor.process_phase3(_refs_lists[refs_list_index], _clear_referent,
     _ref_processor.process_phase3(_refs_lists[i], _clear_referent,
                                   &is_alive, &keep_alive, &complete_gc);
   }
@@ -810,23 +834,47 @@
 };
 
 // Balances reference queues.
+// Move entries from all queues[0, 1, ..., _max_num_q-1] to
+// queues[0, 1, ..., _num_q-1] because only the first _num_q
+// corresponding to the active workers will be processed.
 void ReferenceProcessor::balance_queues(DiscoveredList ref_lists[])
 {
   // calculate total length
   size_t total_refs = 0;
-  for (int i = 0; i < _num_q; ++i) {
+  if (TraceReferenceGC && PrintGCDetails) {
+    gclog_or_tty->print_cr("\nBalance ref_lists ");
+  }
+
+  for (int i = 0; i < _max_num_q; ++i) {
     total_refs += ref_lists[i].length();
+    if (TraceReferenceGC && PrintGCDetails) {
+      gclog_or_tty->print("%d ", ref_lists[i].length());
+    }
+  }
+  if (TraceReferenceGC && PrintGCDetails) {
+    gclog_or_tty->print_cr(" = %d", total_refs);
   }
   size_t avg_refs = total_refs / _num_q + 1;
   int to_idx = 0;
-  for (int from_idx = 0; from_idx < _num_q; from_idx++) {
-    while (ref_lists[from_idx].length() > avg_refs) {
+  for (int from_idx = 0; from_idx < _max_num_q; from_idx++) {
+    bool move_all = false;
+    if (from_idx >= _num_q) {
+      move_all = ref_lists[from_idx].length() > 0;
+    }
+    while ((ref_lists[from_idx].length() > avg_refs) ||
+           move_all) {
       assert(to_idx < _num_q, "Sanity Check!");
       if (ref_lists[to_idx].length() < avg_refs) {
         // move superfluous refs
-        size_t refs_to_move =
-          MIN2(ref_lists[from_idx].length() - avg_refs,
-               avg_refs - ref_lists[to_idx].length());
+        size_t refs_to_move;
+        // Move all the Ref's if the from queue will not be processed.
+        if (move_all) {
+          refs_to_move = MIN2(ref_lists[from_idx].length(),
+                              avg_refs - ref_lists[to_idx].length());
+        } else {
+          refs_to_move = MIN2(ref_lists[from_idx].length() - avg_refs,
+                              avg_refs - ref_lists[to_idx].length());
+        }
         oop move_head = ref_lists[from_idx].head();
         oop move_tail = move_head;
         oop new_head  = move_head;
@@ -840,11 +888,35 @@
         ref_lists[to_idx].inc_length(refs_to_move);
         ref_lists[from_idx].set_head(new_head);
         ref_lists[from_idx].dec_length(refs_to_move);
+        if (ref_lists[from_idx].length() == 0) {
+          break;
+        }
       } else {
-        ++to_idx;
+        to_idx = (to_idx + 1) % _num_q;
       }
     }
   }
+#ifdef ASSERT
+  size_t balanced_total_refs = 0;
+  for (int i = 0; i < _max_num_q; ++i) {
+    balanced_total_refs += ref_lists[i].length();
+    if (TraceReferenceGC && PrintGCDetails) {
+      gclog_or_tty->print("%d ", ref_lists[i].length());
+    }
+  }
+  if (TraceReferenceGC && PrintGCDetails) {
+    gclog_or_tty->print_cr(" = %d", balanced_total_refs);
+    gclog_or_tty->flush();
+  }
+  assert(total_refs == balanced_total_refs, "Balancing was incomplete");
+#endif
+}
+
+void ReferenceProcessor::balance_all_queues() {
+  balance_queues(_discoveredSoftRefs);
+  balance_queues(_discoveredWeakRefs);
+  balance_queues(_discoveredFinalRefs);
+  balance_queues(_discoveredPhantomRefs);
 }
 
 void
@@ -857,8 +929,17 @@
   VoidClosure*                 complete_gc,
   AbstractRefProcTaskExecutor* task_executor)
 {
-  bool mt = task_executor != NULL && _processing_is_mt;
-  if (mt && ParallelRefProcBalancingEnabled) {
+  bool mt_processing = task_executor != NULL && _processing_is_mt;
+  // If discovery used MT and a dynamic number of GC threads, then
+  // the queues must be balanced for correctness if fewer than the
+  // maximum number of queues were used.  The number of queue used
+  // during discovery may be different than the number to be used
+  // for processing so don't depend of _num_q < _max_num_q as part
+  // of the test.
+  bool must_balance = _discovery_is_mt;
+
+  if ((mt_processing && ParallelRefProcBalancingEnabled) ||
+      must_balance) {
     balance_queues(refs_lists);
   }
   if (PrintReferenceGC && PrintGCDetails) {
@@ -875,7 +956,7 @@
   //   policy reasons. Keep alive the transitive closure of all
   //   such referents.
   if (policy != NULL) {
-    if (mt) {
+    if (mt_processing) {
       RefProcPhase1Task phase1(*this, refs_lists, policy, true /*marks_oops_alive*/);
       task_executor->execute(phase1);
     } else {
@@ -891,7 +972,7 @@
 
   // Phase 2:
   // . Traverse the list and remove any refs whose referents are alive.
-  if (mt) {
+  if (mt_processing) {
     RefProcPhase2Task phase2(*this, refs_lists, !discovery_is_atomic() /*marks_oops_alive*/);
     task_executor->execute(phase2);
   } else {
@@ -902,7 +983,7 @@
 
   // Phase 3:
   // . Traverse the list and process referents as appropriate.
-  if (mt) {
+  if (mt_processing) {
     RefProcPhase3Task phase3(*this, refs_lists, clear_referent, true /*marks_oops_alive*/);
     task_executor->execute(phase3);
   } else {
@@ -915,7 +996,11 @@
 
 void ReferenceProcessor::clean_up_discovered_references() {
   // loop over the lists
-  for (int i = 0; i < _num_q * subclasses_of_ref; i++) {
+  // Should this instead be
+  // for (int i = 0; i < subclasses_of_ref; i++_ {
+  //   for (int j = 0; j < _num_q; j++) {
+  //     int index = i * _max_num_q + j;
+  for (int i = 0; i < _max_num_q * subclasses_of_ref; i++) {
     if (TraceReferenceGC && PrintGCDetails && ((i % _num_q) == 0)) {
       gclog_or_tty->print_cr(
         "\nScrubbing %s discovered list of Null referents",
@@ -976,7 +1061,7 @@
       id = next_id();
     }
   }
-  assert(0 <= id && id < _num_q, "Id is out-of-bounds (call Freud?)");
+  assert(0 <= id && id < _max_num_q, "Id is out-of-bounds (call Freud?)");
 
   // Get the discovered queue to which we will add
   DiscoveredList* list = NULL;
@@ -1001,6 +1086,10 @@
     default:
       ShouldNotReachHere();
   }
+  if (TraceReferenceGC && PrintGCDetails) {
+    gclog_or_tty->print_cr("Thread %d gets list " INTPTR_FORMAT,
+      id, list);
+  }
   return list;
 }
 
@@ -1243,7 +1332,7 @@
   {
     TraceTime tt("Preclean SoftReferences", PrintGCDetails && PrintReferenceGC,
               false, gclog_or_tty);
-    for (int i = 0; i < _num_q; i++) {
+    for (int i = 0; i < _max_num_q; i++) {
       if (yield->should_return()) {
         return;
       }
@@ -1340,15 +1429,16 @@
 
   NOT_PRODUCT(
     if (PrintGCDetails && PrintReferenceGC) {
-      gclog_or_tty->print(" Dropped %d Refs out of %d "
-        "Refs in discovered list ", iter.removed(), iter.processed());
+      gclog_or_tty->print_cr(" Dropped %d Refs out of %d "
+        "Refs in discovered list " INTPTR_FORMAT,
+        iter.removed(), iter.processed(), (address)refs_list.head());
     }
   )
 }
 
 const char* ReferenceProcessor::list_name(int i) {
-   assert(i >= 0 && i <= _num_q * subclasses_of_ref, "Out of bounds index");
-   int j = i / _num_q;
+   assert(i >= 0 && i <= _max_num_q * subclasses_of_ref, "Out of bounds index");
+   int j = i / _max_num_q;
    switch (j) {
      case 0: return "SoftRef";
      case 1: return "WeakRef";
@@ -1372,7 +1462,7 @@
 #ifndef PRODUCT
 void ReferenceProcessor::clear_discovered_references() {
   guarantee(!_discovering_refs, "Discovering refs?");
-  for (int i = 0; i < _num_q * subclasses_of_ref; i++) {
+  for (int i = 0; i < _max_num_q * subclasses_of_ref; i++) {
     oop obj = _discoveredSoftRefs[i].head();
     while (obj != sentinel_ref()) {
       oop next = java_lang_ref_Reference::discovered(obj);
--- a/hotspot/src/share/vm/memory/referenceProcessor.hpp	Thu Sep 16 13:45:55 2010 -0700
+++ b/hotspot/src/share/vm/memory/referenceProcessor.hpp	Mon Sep 20 14:38:38 2010 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2008, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -85,8 +85,10 @@
 
   // The discovered ref lists themselves
 
-  // The MT'ness degree of the queues below
+  // The active MT'ness degree of the queues below
   int             _num_q;
+  // The maximum MT'ness degree of the queues below
+  int             _max_num_q;
   // Arrays of lists of oops, one per thread
   DiscoveredList* _discoveredSoftRefs;
   DiscoveredList* _discoveredWeakRefs;
@@ -95,6 +97,7 @@
 
  public:
   int num_q()                            { return _num_q; }
+  void set_mt_degree(int v)              { _num_q = v; }
   DiscoveredList* discovered_soft_refs() { return _discoveredSoftRefs; }
   static oop  sentinel_ref()             { return _sentinelRef; }
   static oop* adr_sentinel_ref()         { return &_sentinelRef; }
@@ -244,6 +247,7 @@
     _bs(NULL),
     _is_alive_non_header(NULL),
     _num_q(0),
+    _max_num_q(0),
     _processing_is_mt(false),
     _next_id(0)
   {}
@@ -312,6 +316,9 @@
   void weak_oops_do(OopClosure* f);       // weak roots
   static void oops_do(OopClosure* f);     // strong root(s)
 
+  // Balance each of the discovered lists.
+  void balance_all_queues();
+
   // Discover a Reference object, using appropriate discovery criteria
   bool discover_reference(oop obj, ReferenceType rt);
 
--- a/hotspot/src/share/vm/memory/sharedHeap.cpp	Thu Sep 16 13:45:55 2010 -0700
+++ b/hotspot/src/share/vm/memory/sharedHeap.cpp	Mon Sep 20 14:38:38 2010 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000, 2009, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -50,7 +50,8 @@
   _perm_gen(NULL), _rem_set(NULL),
   _strong_roots_parity(0),
   _process_strong_tasks(new SubTasksDone(SH_PS_NumElements)),
-  _workers(NULL), _n_par_threads(0)
+  _n_par_threads(0),
+  _workers(NULL)
 {
   if (_process_strong_tasks == NULL || !_process_strong_tasks->valid()) {
     vm_exit_during_initialization("Failed necessary allocation.");
@@ -60,11 +61,13 @@
       (UseConcMarkSweepGC && CMSParallelRemarkEnabled) ||
        UseG1GC) &&
       ParallelGCThreads > 0) {
-    _workers = new WorkGang("Parallel GC Threads", ParallelGCThreads,
+    _workers = new FlexibleWorkGang("Parallel GC Threads", ParallelGCThreads,
                             /* are_GC_task_threads */true,
                             /* are_ConcurrentGC_threads */false);
     if (_workers == NULL) {
       vm_exit_during_initialization("Failed necessary allocation.");
+    } else {
+      _workers->initialize_workers();
     }
   }
 }
@@ -77,8 +80,9 @@
 }
 
 void SharedHeap::set_par_threads(int t) {
+  assert(t == 0 || !UseSerialGC, "Cannot have parallel threads");
   _n_par_threads = t;
-  _process_strong_tasks->set_par_threads(t);
+  _process_strong_tasks->set_n_threads(t);
 }
 
 class AssertIsPermClosure: public OopClosure {
--- a/hotspot/src/share/vm/memory/sharedHeap.hpp	Thu Sep 16 13:45:55 2010 -0700
+++ b/hotspot/src/share/vm/memory/sharedHeap.hpp	Mon Sep 20 14:38:38 2010 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000, 2009, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -38,6 +38,7 @@
 class ObjectClosure;
 class SubTasksDone;
 class WorkGang;
+class FlexibleWorkGang;
 class CollectorPolicy;
 class KlassHandle;
 
@@ -74,7 +75,7 @@
   int _strong_roots_parity;
 
   // If we're doing parallel GC, use this gang of threads.
-  WorkGang* _workers;
+  FlexibleWorkGang* _workers;
 
   // Number of parallel threads currently working on GC tasks.
   // O indicates use sequential code; 1 means use parallel code even with
@@ -189,7 +190,7 @@
     SO_CodeCache           = 0x10
   };
 
-  WorkGang* workers() const { return _workers; }
+  FlexibleWorkGang* workers() const { return _workers; }
 
   // Sets the number of parallel threads that will be doing tasks
   // (such as process strong roots) subsequently.
--- a/hotspot/src/share/vm/utilities/taskqueue.cpp	Thu Sep 16 13:45:55 2010 -0700
+++ b/hotspot/src/share/vm/utilities/taskqueue.cpp	Mon Sep 20 14:38:38 2010 -0700
@@ -144,6 +144,7 @@
 
 bool
 ParallelTaskTerminator::offer_termination(TerminatorTerminator* terminator) {
+  assert(_n_threads > 0, "Initialization is incorrect");
   assert(_offered_termination < _n_threads, "Invariant");
   Atomic::inc(&_offered_termination);
 
@@ -255,3 +256,9 @@
     _index < objArrayOop(_obj)->length();
 }
 #endif // ASSERT
+
+void ParallelTaskTerminator::reset_for_reuse(int n_threads) {
+  reset_for_reuse();
+  _n_threads = n_threads;
+}
+
--- a/hotspot/src/share/vm/utilities/taskqueue.hpp	Thu Sep 16 13:45:55 2010 -0700
+++ b/hotspot/src/share/vm/utilities/taskqueue.hpp	Mon Sep 20 14:38:38 2010 -0700
@@ -305,6 +305,12 @@
   return false;
 }
 
+// pop_local_slow() is done by the owning thread and is trying to
+// get the last task in the queue.  It will compete with pop_global()
+// that will be used by other threads.  The tag age is incremented
+// whenever the queue goes empty which it will do here if this thread
+// gets the last task or in pop_global() if the queue wraps (top == 0
+// and pop_global() succeeds, see pop_global()).
 template<class E, unsigned int N>
 bool GenericTaskQueue<E, N>::pop_local_slow(uint localBot, Age oldAge) {
   // This queue was observed to contain exactly one element; either this
@@ -637,6 +643,9 @@
   // in an MT-safe manner, once the previous round of use of
   // the terminator is finished.
   void reset_for_reuse();
+  // Same as above but the number of parallel threads is set to the
+  // given number.
+  void reset_for_reuse(int n_threads);
 
 #ifdef TRACESPINNING
   static uint total_yields() { return _total_yields; }
@@ -782,3 +791,4 @@
 
 typedef OverflowTaskQueue<size_t>             RegionTaskQueue;
 typedef GenericTaskQueueSet<RegionTaskQueue>  RegionTaskQueueSet;
+
--- a/hotspot/src/share/vm/utilities/workgroup.cpp	Thu Sep 16 13:45:55 2010 -0700
+++ b/hotspot/src/share/vm/utilities/workgroup.cpp	Mon Sep 20 14:38:38 2010 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2007, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -53,28 +53,52 @@
                    int         workers,
                    bool        are_GC_task_threads,
                    bool        are_ConcurrentGC_threads) :
-  AbstractWorkGang(name, are_GC_task_threads, are_ConcurrentGC_threads)
-{
+  AbstractWorkGang(name, are_GC_task_threads, are_ConcurrentGC_threads) {
   // Save arguments.
   _total_workers = workers;
+}
+
+GangWorker* WorkGang::allocate_worker(int which) {
+  GangWorker* new_worker = new GangWorker(this, which);
+  return new_worker;
+}
+
+// The current implementation will exit if the allocation
+// of any worker fails.  Still, return a boolean so that
+// a future implementation can possibly do a partial
+// initialization of the workers and report such to the
+// caller.
+bool WorkGang::initialize_workers() {
 
   if (TraceWorkGang) {
-    tty->print_cr("Constructing work gang %s with %d threads", name, workers);
+    tty->print_cr("Constructing work gang %s with %d threads",
+                  name(),
+                  total_workers());
   }
-  _gang_workers = NEW_C_HEAP_ARRAY(GangWorker*, workers);
+  _gang_workers = NEW_C_HEAP_ARRAY(GangWorker*, total_workers());
   if (gang_workers() == NULL) {
     vm_exit_out_of_memory(0, "Cannot create GangWorker array.");
+    return false;
+  }
+  os::ThreadType worker_type;
+  if (are_ConcurrentGC_threads()) {
+    worker_type = os::cgc_thread;
+  } else {
+    worker_type = os::pgc_thread;
   }
   for (int worker = 0; worker < total_workers(); worker += 1) {
-    GangWorker* new_worker = new GangWorker(this, worker);
+    GangWorker* new_worker = allocate_worker(worker);
     assert(new_worker != NULL, "Failed to allocate GangWorker");
     _gang_workers[worker] = new_worker;
-    if (new_worker == NULL || !os::create_thread(new_worker, os::pgc_thread))
+    if (new_worker == NULL || !os::create_thread(new_worker, worker_type)) {
       vm_exit_out_of_memory(0, "Cannot create worker GC thread. Out of system resources.");
+      return false;
+    }
     if (!DisableStartThread) {
       os::start_thread(new_worker);
     }
   }
+  return true;
 }
 
 AbstractWorkGang::~AbstractWorkGang() {
@@ -383,7 +407,7 @@
   return _tasks != NULL;
 }
 
-void SubTasksDone::set_par_threads(int t) {
+void SubTasksDone::set_n_threads(int t) {
 #ifdef ASSERT
   assert(_claimed == 0 || _threads_completed == _n_threads,
          "should not be called while tasks are being processed!");
--- a/hotspot/src/share/vm/utilities/workgroup.hpp	Thu Sep 16 13:45:55 2010 -0700
+++ b/hotspot/src/share/vm/utilities/workgroup.hpp	Mon Sep 20 14:38:38 2010 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2002, 2009, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -29,6 +29,7 @@
 class YieldingFlexibleGangWorker;
 class YieldingFlexibleGangTask;
 class WorkData;
+class AbstractWorkGang;
 
 // An abstract task to be worked on by a gang.
 // You subclass this to supply your own work() method
@@ -38,6 +39,13 @@
   // The argument tells you which member of the gang you are.
   virtual void work(int i) = 0;
 
+  // This method configures the task for proper termination.
+  // Some tasks do not have any requirements on termination
+  // and may inherit this method that does nothing.  Some
+  // tasks do some coordination on termination and override
+  // this method to implement that coordination.
+  virtual void set_for_termination(int active_workers) {};
+
   // Debugging accessor for the name.
   const char* name() const PRODUCT_RETURN_(return NULL;);
   int counter() { return _counter; }
@@ -64,6 +72,18 @@
   virtual ~AbstractGangTask() { }
 };
 
+class AbstractGangTaskWOopQueues : public AbstractGangTask {
+  OopTaskQueueSet*       _queues;
+  ParallelTaskTerminator _terminator;
+ public:
+  AbstractGangTaskWOopQueues(const char* name, OopTaskQueueSet* queues) :
+    AbstractGangTask(name), _queues(queues), _terminator(0, _queues) {}
+  ParallelTaskTerminator* terminator() { return &_terminator; }
+  virtual void set_for_termination(int active_workers) {
+    terminator()->reset_for_reuse(active_workers);
+  }
+  OopTaskQueueSet* queues() { return _queues; }
+};
 
 // Class AbstractWorkGang:
 // An abstract class representing a gang of workers.
@@ -114,6 +134,9 @@
   int total_workers() const {
     return _total_workers;
   }
+  virtual int active_workers() const {
+    return _total_workers;
+  }
   bool terminate() const {
     return _terminate;
   }
@@ -199,6 +222,13 @@
            bool are_GC_task_threads, bool are_ConcurrentGC_threads);
   // Run a task, returns when the task is done (or terminated).
   virtual void run_task(AbstractGangTask* task);
+  void run_task(AbstractGangTask* task, uint no_of_parallel_workers);
+  // Allocate a worker and return a pointer to it.
+  virtual GangWorker* allocate_worker(int which);
+  // Initialize workers in the gang.  Return true if initialization
+  // succeeded. The type of the worker can be overridden in a derived
+  // class with the appropriate implementation of allocate_worker().
+  bool initialize_workers();
 };
 
 // Class GangWorker:
@@ -226,6 +256,34 @@
   AbstractWorkGang* gang() const { return _gang; }
 };
 
+class FlexibleWorkGang: public WorkGang {
+ protected:
+  int _active_workers;
+ public:
+  // Constructor and destructor.
+  FlexibleWorkGang(const char* name, int workers,
+                   bool are_GC_task_threads,
+                   bool  are_ConcurrentGC_threads) :
+    WorkGang(name, workers, are_GC_task_threads, are_ConcurrentGC_threads) {
+    _active_workers = ParallelGCThreads;
+  };
+  // Accessors for fields
+  virtual int active_workers() const { return _active_workers; }
+  void set_active_workers(int v) { _active_workers = v; }
+};
+
+// Work gangs in garbage collectors: 2009-06-10
+//
+// SharedHeap - work gang for stop-the-world parallel collection.
+//   Used by
+//     ParNewGeneration
+//     CMSParRemarkTask
+//     CMSRefProcTaskExecutor
+//     G1CollectedHeap
+//     G1ParFinalCountTask
+// ConcurrentMark
+// CMSCollector
+
 // A class that acts as a synchronisation barrier. Workers enter
 // the barrier and must wait until all other workers have entered
 // before any of them may leave.
@@ -271,7 +329,7 @@
   int _n_threads;
   jint _threads_completed;
 #ifdef ASSERT
-  jint _claimed;
+  volatile jint _claimed;
 #endif
 
   // Set all tasks to unclaimed.
@@ -286,9 +344,10 @@
   // True iff the object is in a valid state.
   bool valid();
 
-  // Set the number of parallel threads doing the tasks to "t".  Can only
+  // Get/set the number of parallel threads doing the tasks to "t".  Can only
   // be called before tasks start or after they are complete.
-  void set_par_threads(int t);
+  int n_threads() { return _n_threads; }
+  void set_n_threads(int t);
 
   // Returns "false" if the task "t" is unclaimed, and ensures that task is
   // claimed.  The task "t" is required to be within the range of "this".
@@ -315,13 +374,17 @@
 protected:
   jint _n_tasks;     // Total number of tasks available.
   jint _n_claimed;   // Number of tasks claimed.
+  // _n_threads is used to determine when a sub task is done.
+  // See comments on SubTasksDone::_n_threads
   jint _n_threads;   // Total number of parallel threads.
   jint _n_completed; // Number of completed threads.
 
   void clear();
 
 public:
-  SequentialSubTasksDone() { clear(); }
+  SequentialSubTasksDone() {
+    clear();
+  }
   ~SequentialSubTasksDone() {}
 
   // True iff the object is in a valid state.
@@ -330,11 +393,12 @@
   // number of tasks
   jint n_tasks() const { return _n_tasks; }
 
-  // Set the number of parallel threads doing the tasks to t.
+  // Get/set the number of parallel threads doing the tasks to t.
   // Should be called before the task starts but it is safe
   // to call this once a task is running provided that all
   // threads agree on the number of threads.
-  void set_par_threads(int t) { _n_threads = t; }
+  int n_threads() { return _n_threads; }
+  void set_n_threads(int t) { _n_threads = t; }
 
   // Set the number of tasks to be claimed to t. As above,
   // should be called before the tasks start but it is safe
--- a/hotspot/src/share/vm/utilities/yieldingWorkgroup.cpp	Thu Sep 16 13:45:55 2010 -0700
+++ b/hotspot/src/share/vm/utilities/yieldingWorkgroup.cpp	Mon Sep 20 14:38:38 2010 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2005, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2010 Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -32,29 +32,13 @@
 
 YieldingFlexibleWorkGang::YieldingFlexibleWorkGang(
   const char* name, int workers, bool are_GC_task_threads) :
-  AbstractWorkGang(name, are_GC_task_threads, false) {
-  // Save arguments.
-  _total_workers = workers;
-  assert(_total_workers > 0, "Must have more than 1 worker");
-
-  _yielded_workers = 0;
+  FlexibleWorkGang(name, workers, are_GC_task_threads, false),
+    _yielded_workers(0) {}
 
-  if (TraceWorkGang) {
-    tty->print_cr("Constructing work gang %s with %d threads", name, workers);
-  }
-  _gang_workers = NEW_C_HEAP_ARRAY(GangWorker*, workers);
-  assert(gang_workers() != NULL, "Failed to allocate gang workers");
-  for (int worker = 0; worker < total_workers(); worker += 1) {
-    YieldingFlexibleGangWorker* new_worker =
-      new YieldingFlexibleGangWorker(this, worker);
-    assert(new_worker != NULL, "Failed to allocate YieldingFlexibleGangWorker");
-    _gang_workers[worker] = new_worker;
-    if (new_worker == NULL || !os::create_thread(new_worker, os::pgc_thread))
-      vm_exit_out_of_memory(0, "Cannot create worker GC thread. Out of system resources.");
-    if (!DisableStartThread) {
-      os::start_thread(new_worker);
-    }
-  }
+GangWorker* YieldingFlexibleWorkGang::allocate_worker(int which) {
+  YieldingFlexibleGangWorker* new_member =
+      new YieldingFlexibleGangWorker(this, which);
+  return (YieldingFlexibleGangWorker*) new_member;
 }
 
 // Run a task; returns when the task is done, or the workers yield,
@@ -142,6 +126,7 @@
     _active_workers = total_workers();
   }
   new_task->set_actual_size(_active_workers);
+  new_task->set_for_termination(_active_workers);
 
   assert(_started_workers == 0, "Tabula rasa non");
   assert(_finished_workers == 0, "Tabula rasa non");
@@ -161,22 +146,22 @@
   for (Status status = yielding_task()->status();
        status != COMPLETED && status != YIELDED && status != ABORTED;
        status = yielding_task()->status()) {
-    assert(started_workers() <= active_workers(), "invariant");
-    assert(finished_workers() <= active_workers(), "invariant");
-    assert(yielded_workers() <= active_workers(), "invariant");
+    assert(started_workers() <= total_workers(), "invariant");
+    assert(finished_workers() <= total_workers(), "invariant");
+    assert(yielded_workers() <= total_workers(), "invariant");
     monitor()->wait(Mutex::_no_safepoint_check_flag);
   }
   switch (yielding_task()->status()) {
     case COMPLETED:
     case ABORTED: {
-      assert(finished_workers() == active_workers(), "Inconsistent status");
+      assert(finished_workers() == total_workers(), "Inconsistent status");
       assert(yielded_workers() == 0, "Invariant");
       reset();   // for next task; gang<->task binding released
       break;
     }
     case YIELDED: {
       assert(yielded_workers() > 0, "Invariant");
-      assert(yielded_workers() + finished_workers() == active_workers(),
+      assert(yielded_workers() + finished_workers() == total_workers(),
              "Inconsistent counts");
       break;
     }
@@ -208,7 +193,6 @@
 void YieldingFlexibleWorkGang::reset() {
   _started_workers  = 0;
   _finished_workers = 0;
-  _active_workers   = 0;
   yielding_task()->set_gang(NULL);
   _task = NULL;    // unbind gang from task
 }
@@ -216,7 +200,7 @@
 void YieldingFlexibleWorkGang::yield() {
   assert(task() != NULL, "Inconsistency; should have task binding");
   MutexLockerEx ml(monitor(), Mutex::_no_safepoint_check_flag);
-  assert(yielded_workers() < active_workers(), "Consistency check");
+  assert(yielded_workers() < total_workers(), "Consistency check");
   if (yielding_task()->status() == ABORTING) {
     // Do not yield; we need to abort as soon as possible
     // XXX NOTE: This can cause a performance pathology in the
@@ -227,7 +211,7 @@
     // us to return at each potential yield point.
     return;
   }
-  if (++_yielded_workers + finished_workers() == active_workers()) {
+  if (++_yielded_workers + finished_workers() == total_workers()) {
     yielding_task()->set_status(YIELDED);
     monitor()->notify_all();
   } else {
--- a/hotspot/src/share/vm/utilities/yieldingWorkgroup.hpp	Thu Sep 16 13:45:55 2010 -0700
+++ b/hotspot/src/share/vm/utilities/yieldingWorkgroup.hpp	Mon Sep 20 14:38:38 2010 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2005, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2010 Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -54,6 +54,25 @@
   virtual void loop();
 };
 
+class FlexibleGangTask: public AbstractGangTask {
+  int _actual_size;                      // size of gang obtained
+protected:
+  int _requested_size;                   // size of gang requested
+public:
+ FlexibleGangTask(const char* name): AbstractGangTask(name),
+    _requested_size(0) {}
+
+  // The abstract work method.
+  // The argument tells you which member of the gang you are.
+  virtual void work(int i) = 0;
+
+  int requested_size() const { return _requested_size; }
+  int actual_size()    const { return _actual_size; }
+
+  void set_requested_size(int sz) { _requested_size = sz; }
+  void set_actual_size(int sz)    { _actual_size    = sz; }
+};
+
 // An abstract task to be worked on by a flexible work gang,
 // and where the workers will periodically yield, usually
 // in response to some condition that is signalled by means
@@ -70,19 +89,15 @@
 // maximum) in response to task requests at certain points.
 // The last part (the flexible part) has not yet been fully
 // fleshed out and is a work in progress.
-class YieldingFlexibleGangTask: public AbstractGangTask {
+class YieldingFlexibleGangTask: public FlexibleGangTask {
   Status _status;
   YieldingFlexibleWorkGang* _gang;
-  int _actual_size;                      // size of gang obtained
 
 protected:
-  int _requested_size;                   // size of gang requested
-
   // Constructor and desctructor: only construct subclasses.
-  YieldingFlexibleGangTask(const char* name): AbstractGangTask(name),
+  YieldingFlexibleGangTask(const char* name): FlexibleGangTask(name),
     _status(INACTIVE),
-    _gang(NULL),
-    _requested_size(0) { }
+    _gang(NULL) { }
 
   virtual ~YieldingFlexibleGangTask() { }
 
@@ -126,20 +141,13 @@
   bool completed() const { return _status == COMPLETED; }
   bool aborted()   const { return _status == ABORTED; }
   bool active()    const { return _status == ACTIVE; }
-
-  int requested_size() const { return _requested_size; }
-  int actual_size()    const { return _actual_size; }
-
-  void set_requested_size(int sz) { _requested_size = sz; }
-  void set_actual_size(int sz)    { _actual_size    = sz; }
 };
-
 // Class YieldingWorkGang: A subclass of WorkGang.
 // In particular, a YieldingWorkGang is made up of
 // YieldingGangWorkers, and provides infrastructure
 // supporting yielding to the "GangOverseer",
 // being the thread that orchestrates the WorkGang via run_task().
-class YieldingFlexibleWorkGang: public AbstractWorkGang {
+class YieldingFlexibleWorkGang: public FlexibleWorkGang {
   // Here's the public interface to this class.
 public:
   // Constructor and destructor.
@@ -151,6 +159,9 @@
            "Incorrect cast");
     return (YieldingFlexibleGangTask*)task();
   }
+  // Allocate a worker and return a pointer to it.
+  GangWorker* allocate_worker(int which);
+
   // Run a task; returns when the task is done, or the workers yield,
   // or the task is aborted, or the work gang is terminated via stop().
   // A task that has been yielded can be continued via this same interface
@@ -180,10 +191,6 @@
   void abort();
 
 private:
-  // The currently active workers in this gang.
-  // This is a number that is dynamically adjusted by
-  // the run_task() method at each subsequent invocation,
-  // using data in the YieldingFlexibleGangTask.
   int _active_workers;
   int _yielded_workers;
   void wait_for_gang();
@@ -194,6 +201,7 @@
     return _active_workers;
   }
 
+  // Accessors for fields
   int yielded_workers() const {
     return _yielded_workers;
   }