6787254: Work queue capacity can be increased substantially on some platforms
authorysr
Fri, 30 Jan 2009 14:17:52 -0800
changeset 2005 42075507972b
parent 1912 13cc94b84c47
child 2006 f2d2f0f20063
6787254: Work queue capacity can be increased substantially on some platforms Summary: Increased the default and maximum size of the CMS marking stack and the size of the parallel workers' work queues in 64-bit mode. The latter was accomplished by an increase in the width of the Taskqueue's Age struct and its Tag field in 64-bit mode. Reviewed-by: jmasa, tonyp
hotspot/src/share/vm/runtime/globals.hpp
hotspot/src/share/vm/utilities/taskqueue.cpp
hotspot/src/share/vm/utilities/taskqueue.hpp
--- a/hotspot/src/share/vm/runtime/globals.hpp	Thu Jan 29 21:25:42 2009 -0800
+++ b/hotspot/src/share/vm/runtime/globals.hpp	Fri Jan 30 14:17:52 2009 -0800
@@ -1426,10 +1426,10 @@
   develop(bool, CMSOverflowEarlyRestoration, false,                         \
           "Whether preserved marks should be restored early")               \
                                                                             \
-  product(uintx, CMSMarkStackSize, 32*K,                                    \
+  product(uintx, CMSMarkStackSize, NOT_LP64(32*K) LP64_ONLY(4*M),           \
           "Size of CMS marking stack")                                      \
                                                                             \
-  product(uintx, CMSMarkStackSizeMax, 4*M,                                  \
+  product(uintx, CMSMarkStackSizeMax, NOT_LP64(4*M) LP64_ONLY(512*M),       \
           "Max size of CMS marking stack")                                  \
                                                                             \
   notproduct(bool, CMSMarkStackOverflowALot, false,                         \
--- a/hotspot/src/share/vm/utilities/taskqueue.cpp	Thu Jan 29 21:25:42 2009 -0800
+++ b/hotspot/src/share/vm/utilities/taskqueue.cpp	Fri Jan 30 14:17:52 2009 -0800
@@ -69,7 +69,7 @@
 ParallelTaskTerminator::offer_termination(TerminatorTerminator* terminator) {
   Atomic::inc(&_offered_termination);
 
-  juint yield_count = 0;
+  uint yield_count = 0;
   while (true) {
     if (_offered_termination == _n_threads) {
       //inner_termination_loop();
--- a/hotspot/src/share/vm/utilities/taskqueue.hpp	Thu Jan 29 21:25:42 2009 -0800
+++ b/hotspot/src/share/vm/utilities/taskqueue.hpp	Fri Jan 30 14:17:52 2009 -0800
@@ -22,67 +22,76 @@
  *
  */
 
+#ifdef LP64
+typedef juint TAG_TYPE;
+// for a taskqueue size of 4M
+#define LOG_TASKQ_SIZE 22
+#else
+typedef jushort TAG_TYPE;
+// for a taskqueue size of 16K
+#define LOG_TASKQ_SIZE 14
+#endif
+
 class TaskQueueSuper: public CHeapObj {
 protected:
   // The first free element after the last one pushed (mod _n).
-  // (For now we'll assume only 32-bit CAS).
-  volatile juint _bottom;
+  volatile uint _bottom;
 
   // log2 of the size of the queue.
   enum SomeProtectedConstants {
-    Log_n = 14
+    Log_n = LOG_TASKQ_SIZE
   };
+#undef LOG_TASKQ_SIZE
 
   // Size of the queue.
-  juint n() { return (1 << Log_n); }
+  uint n() { return (1 << Log_n); }
   // For computing "x mod n" efficiently.
-  juint n_mod_mask() { return n() - 1; }
+  uint n_mod_mask() { return n() - 1; }
 
   struct Age {
-    jushort _top;
-    jushort _tag;
+    TAG_TYPE _top;
+    TAG_TYPE _tag;
 
-    jushort tag() const { return _tag; }
-    jushort top() const { return _top; }
+    TAG_TYPE tag() const { return _tag; }
+    TAG_TYPE top() const { return _top; }
 
     Age() { _tag = 0; _top = 0; }
 
     friend bool operator ==(const Age& a1, const Age& a2) {
       return a1.tag() == a2.tag() && a1.top() == a2.top();
     }
-
   };
   Age _age;
   // These make sure we do single atomic reads and writes.
   Age get_age() {
-    jint res = *(volatile jint*)(&_age);
+    uint res = *(volatile uint*)(&_age);
     return *(Age*)(&res);
   }
   void set_age(Age a) {
-    *(volatile jint*)(&_age) = *(int*)(&a);
+    *(volatile uint*)(&_age) = *(uint*)(&a);
   }
 
-  jushort get_top() {
+  TAG_TYPE get_top() {
     return get_age().top();
   }
 
   // These both operate mod _n.
-  juint increment_index(juint ind) {
+  uint increment_index(uint ind) {
     return (ind + 1) & n_mod_mask();
   }
-  juint decrement_index(juint ind) {
+  uint decrement_index(uint ind) {
     return (ind - 1) & n_mod_mask();
   }
 
   // Returns a number in the range [0.._n).  If the result is "n-1", it
   // should be interpreted as 0.
-  juint dirty_size(juint bot, juint top) {
-    return ((jint)bot - (jint)top) & n_mod_mask();
+  uint dirty_size(uint bot, uint top) {
+    return ((int)bot - (int)top) & n_mod_mask();
   }
 
   // Returns the size corresponding to the given "bot" and "top".
-  juint size(juint bot, juint top) {
-    juint sz = dirty_size(bot, top);
+  uint size(uint bot, uint top) {
+    uint sz = dirty_size(bot, top);
     // Has the queue "wrapped", so that bottom is less than top?
     // There's a complicated special case here.  A pair of threads could
     // perform pop_local and pop_global operations concurrently, starting
@@ -94,7 +103,7 @@
     // owner performs pop_local's, and several concurrent threads
     // attempting to perform the pop_global will all perform the same CAS,
     // and only one can succeed.  Any stealing thread that reads after
-    // either the increment or decrement will seen an empty queue, and will
+    // either the increment or decrement will see an empty queue, and will
     // not join the competitors.  The "sz == -1 || sz == _n-1" state will
     // not be modified  by concurrent queues, so the owner thread can reset
     // the state to _bottom == top so subsequent pushes will be performed
@@ -112,11 +121,11 @@
   // Return an estimate of the number of elements in the queue.
   // The "careful" version admits the possibility of pop_local/pop_global
   // races.
-  juint size() {
+  uint size() {
     return size(_bottom, get_top());
   }
 
-  juint dirty_size() {
+  uint dirty_size() {
     return dirty_size(_bottom, get_top());
   }
 
@@ -127,15 +136,15 @@
 
   // Maximum number of elements allowed in the queue.  This is two less
   // than the actual queue size, for somewhat complicated reasons.
-  juint max_elems() { return n() - 2; }
+  uint max_elems() { return n() - 2; }
 
 };
 
 template<class E> class GenericTaskQueue: public TaskQueueSuper {
 private:
   // Slow paths for push, pop_local.  (pop_global has no fast path.)
-  bool push_slow(E t, juint dirty_n_elems);
-  bool pop_local_slow(juint localBot, Age oldAge);
+  bool push_slow(E t, uint dirty_n_elems);
+  bool pop_local_slow(uint localBot, Age oldAge);
 
 public:
   // Initializes the queue to empty.
@@ -170,7 +179,7 @@
 
 template<class E>
 GenericTaskQueue<E>::GenericTaskQueue():TaskQueueSuper() {
-  assert(sizeof(Age) == sizeof(jint), "Depends on this.");
+  assert(sizeof(Age) == sizeof(int), "Depends on this.");
 }
 
 template<class E>
@@ -182,9 +191,9 @@
 template<class E>
 void GenericTaskQueue<E>::oops_do(OopClosure* f) {
   // tty->print_cr("START OopTaskQueue::oops_do");
-  int iters = size();
-  juint index = _bottom;
-  for (int i = 0; i < iters; ++i) {
+  uint iters = size();
+  uint index = _bottom;
+  for (uint i = 0; i < iters; ++i) {
     index = decrement_index(index);
     // tty->print_cr("  doing entry %d," INTPTR_T " -> " INTPTR_T,
     //            index, &_elems[index], _elems[index]);
@@ -198,10 +207,10 @@
 
 
 template<class E>
-bool GenericTaskQueue<E>::push_slow(E t, juint dirty_n_elems) {
+bool GenericTaskQueue<E>::push_slow(E t, uint dirty_n_elems) {
   if (dirty_n_elems == n() - 1) {
     // Actually means 0, so do the push.
-    juint localBot = _bottom;
+    uint localBot = _bottom;
     _elems[localBot] = t;
     _bottom = increment_index(localBot);
     return true;
@@ -211,7 +220,7 @@
 
 template<class E>
 bool GenericTaskQueue<E>::
-pop_local_slow(juint localBot, Age oldAge) {
+pop_local_slow(uint localBot, Age oldAge) {
   // This queue was observed to contain exactly one element; either this
   // thread will claim it, or a competing "pop_global".  In either case,
   // the queue will be logically empty afterwards.  Create a new Age value
@@ -230,9 +239,8 @@
     Age tempAge;
     // No competing pop_global has yet incremented "top"; we'll try to
     // install new_age, thus claiming the element.
-    assert(sizeof(Age) == sizeof(jint) && sizeof(jint) == sizeof(juint),
-           "Assumption about CAS unit.");
-    *(jint*)&tempAge = Atomic::cmpxchg(*(jint*)&newAge, (volatile jint*)&_age, *(jint*)&oldAge);
+    assert(sizeof(Age) == sizeof(int), "Assumption about CAS unit.");
+    *(uint*)&tempAge = Atomic::cmpxchg(*(uint*)&newAge, (volatile uint*)&_age, *(uint*)&oldAge);
     if (tempAge == oldAge) {
       // We win.
       assert(dirty_size(localBot, get_top()) != n() - 1,
@@ -253,8 +261,8 @@
 bool GenericTaskQueue<E>::pop_global(E& t) {
   Age newAge;
   Age oldAge = get_age();
-  juint localBot = _bottom;
-  juint n_elems = size(localBot, oldAge.top());
+  uint localBot = _bottom;
+  uint n_elems = size(localBot, oldAge.top());
   if (n_elems == 0) {
     return false;
   }
@@ -263,7 +271,7 @@
   newAge._top = increment_index(newAge.top());
   if ( newAge._top == 0 ) newAge._tag++;
   Age resAge;
-  *(jint*)&resAge = Atomic::cmpxchg(*(jint*)&newAge, (volatile jint*)&_age, *(jint*)&oldAge);
+  *(uint*)&resAge = Atomic::cmpxchg(*(uint*)&newAge, (volatile uint*)&_age, *(uint*)&oldAge);
   // Note that using "_bottom" here might fail, since a pop_local might
   // have decremented it.
   assert(dirty_size(localBot, newAge._top) != n() - 1,
@@ -287,7 +295,7 @@
 
 template<class E> class GenericTaskQueueSet: public TaskQueueSetSuper {
 private:
-  int _n;
+  uint _n;
   GenericTaskQueue<E>** _queues;
 
 public:
@@ -300,51 +308,51 @@
     }
   }
 
-  bool steal_1_random(int queue_num, int* seed, E& t);
-  bool steal_best_of_2(int queue_num, int* seed, E& t);
-  bool steal_best_of_all(int queue_num, int* seed, E& t);
+  bool steal_1_random(uint queue_num, int* seed, E& t);
+  bool steal_best_of_2(uint queue_num, int* seed, E& t);
+  bool steal_best_of_all(uint queue_num, int* seed, E& t);
 
-  void register_queue(int i, GenericTaskQueue<E>* q);
+  void register_queue(uint i, GenericTaskQueue<E>* q);
 
-  GenericTaskQueue<E>* queue(int n);
+  GenericTaskQueue<E>* queue(uint n);
 
   // The thread with queue number "queue_num" (and whose random number seed
   // is at "seed") is trying to steal a task from some other queue.  (It
   // may try several queues, according to some configuration parameter.)
   // If some steal succeeds, returns "true" and sets "t" the stolen task,
   // otherwise returns false.
-  bool steal(int queue_num, int* seed, E& t);
+  bool steal(uint queue_num, int* seed, E& t);
 
   bool peek();
 };
 
 template<class E>
-void GenericTaskQueueSet<E>::register_queue(int i, GenericTaskQueue<E>* q) {
-  assert(0 <= i && i < _n, "index out of range.");
+void GenericTaskQueueSet<E>::register_queue(uint i, GenericTaskQueue<E>* q) {
+  assert(i < _n, "index out of range.");
   _queues[i] = q;
 }
 
 template<class E>
-GenericTaskQueue<E>* GenericTaskQueueSet<E>::queue(int i) {
+GenericTaskQueue<E>* GenericTaskQueueSet<E>::queue(uint i) {
   return _queues[i];
 }
 
 template<class E>
-bool GenericTaskQueueSet<E>::steal(int queue_num, int* seed, E& t) {
-  for (int i = 0; i < 2 * _n; i++)
+bool GenericTaskQueueSet<E>::steal(uint queue_num, int* seed, E& t) {
+  for (uint i = 0; i < 2 * _n; i++)
     if (steal_best_of_2(queue_num, seed, t))
       return true;
   return false;
 }
 
 template<class E>
-bool GenericTaskQueueSet<E>::steal_best_of_all(int queue_num, int* seed, E& t) {
+bool GenericTaskQueueSet<E>::steal_best_of_all(uint queue_num, int* seed, E& t) {
   if (_n > 2) {
     int best_k;
-    jint best_sz = 0;
-    for (int k = 0; k < _n; k++) {
+    uint best_sz = 0;
+    for (uint k = 0; k < _n; k++) {
       if (k == queue_num) continue;
-      jint sz = _queues[k]->size();
+      uint sz = _queues[k]->size();
       if (sz > best_sz) {
         best_sz = sz;
         best_k = k;
@@ -362,9 +370,9 @@
 }
 
 template<class E>
-bool GenericTaskQueueSet<E>::steal_1_random(int queue_num, int* seed, E& t) {
+bool GenericTaskQueueSet<E>::steal_1_random(uint queue_num, int* seed, E& t) {
   if (_n > 2) {
-    int k = queue_num;
+    uint k = queue_num;
     while (k == queue_num) k = randomParkAndMiller(seed) % _n;
     return _queues[2]->pop_global(t);
   } else if (_n == 2) {
@@ -378,20 +386,20 @@
 }
 
 template<class E>
-bool GenericTaskQueueSet<E>::steal_best_of_2(int queue_num, int* seed, E& t) {
+bool GenericTaskQueueSet<E>::steal_best_of_2(uint queue_num, int* seed, E& t) {
   if (_n > 2) {
-    int k1 = queue_num;
+    uint k1 = queue_num;
     while (k1 == queue_num) k1 = randomParkAndMiller(seed) % _n;
-    int k2 = queue_num;
+    uint k2 = queue_num;
     while (k2 == queue_num || k2 == k1) k2 = randomParkAndMiller(seed) % _n;
     // Sample both and try the larger.
-    juint sz1 = _queues[k1]->size();
-    juint sz2 = _queues[k2]->size();
+    uint sz1 = _queues[k1]->size();
+    uint sz2 = _queues[k2]->size();
     if (sz2 > sz1) return _queues[k2]->pop_global(t);
     else return _queues[k1]->pop_global(t);
   } else if (_n == 2) {
     // Just try the other one.
-    int k = (queue_num + 1) % 2;
+    uint k = (queue_num + 1) % 2;
     return _queues[k]->pop_global(t);
   } else {
     assert(_n == 1, "can't be zero.");
@@ -402,7 +410,7 @@
 template<class E>
 bool GenericTaskQueueSet<E>::peek() {
   // Try all the queues.
-  for (int j = 0; j < _n; j++) {
+  for (uint j = 0; j < _n; j++) {
     if (_queues[j]->peek())
       return true;
   }
@@ -422,7 +430,7 @@
 private:
   int _n_threads;
   TaskQueueSetSuper* _queue_set;
-  jint _offered_termination;
+  int _offered_termination;
 
   bool peek_in_queue_set();
 protected:
@@ -460,7 +468,7 @@
 
 template<class E> inline bool GenericTaskQueue<E>::push(E t) {
 #if SIMPLE_STACK
-  juint localBot = _bottom;
+  uint localBot = _bottom;
   if (_bottom < max_elems()) {
     _elems[localBot] = t;
     _bottom = localBot + 1;
@@ -469,10 +477,10 @@
     return false;
   }
 #else
-  juint localBot = _bottom;
+  uint localBot = _bottom;
   assert((localBot >= 0) && (localBot < n()), "_bottom out of range.");
-  jushort top = get_top();
-  juint dirty_n_elems = dirty_size(localBot, top);
+  TAG_TYPE top = get_top();
+  uint dirty_n_elems = dirty_size(localBot, top);
   assert((dirty_n_elems >= 0) && (dirty_n_elems < n()),
          "n_elems out of range.");
   if (dirty_n_elems < max_elems()) {
@@ -487,19 +495,19 @@
 
 template<class E> inline bool GenericTaskQueue<E>::pop_local(E& t) {
 #if SIMPLE_STACK
-  juint localBot = _bottom;
+  uint localBot = _bottom;
   assert(localBot > 0, "precondition.");
   localBot--;
   t = _elems[localBot];
   _bottom = localBot;
   return true;
 #else
-  juint localBot = _bottom;
+  uint localBot = _bottom;
   // This value cannot be n-1.  That can only occur as a result of
   // the assignment to bottom in this method.  If it does, this method
   // resets the size( to 0 before the next call (which is sequential,
   // since this is pop_local.)
-  juint dirty_n_elems = dirty_size(localBot, get_top());
+  uint dirty_n_elems = dirty_size(localBot, get_top());
   assert(dirty_n_elems != n() - 1, "Shouldn't be possible...");
   if (dirty_n_elems == 0) return false;
   localBot = decrement_index(localBot);
@@ -512,7 +520,7 @@
   // If there's still at least one element in the queue, based on the
   // "_bottom" and "age" we've read, then there can be no interference with
   // a "pop_global" operation, and we're done.
-  juint tp = get_top();
+  TAG_TYPE tp = get_top();    // XXX
   if (size(localBot, tp) > 0) {
     assert(dirty_size(localBot, tp) != n() - 1,
            "Shouldn't be possible...");
@@ -581,7 +589,7 @@
   bool is_empty();
   bool stealable_is_empty();
   bool overflow_is_empty();
-  juint stealable_size() { return _region_queue.size(); }
+  uint stealable_size() { return _region_queue.size(); }
   RegionTaskQueue* task_queue() { return &_region_queue; }
 };