# HG changeset patch # User lana # Date 1441322042 25200 # Node ID 6675700073c14bab9c25a306d5ed73f995dc1500 # Parent eb1661ea942c998ea214ce0f727622ebaec94f0c# Parent ab79437fbfaad3acd3ba87e85c23afbcda28c71e Merge diff -r eb1661ea942c -r 6675700073c1 hotspot/agent/src/share/classes/sun/jvm/hotspot/debugger/Address.java --- a/hotspot/agent/src/share/classes/sun/jvm/hotspot/debugger/Address.java Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/agent/src/share/classes/sun/jvm/hotspot/debugger/Address.java Thu Sep 03 16:14:02 2015 -0700 @@ -209,4 +209,7 @@ returns the result as an Address. Returns null if the result was zero. */ public Address xorWithMask(long mask) throws UnsupportedOperationException; + + // return address as long integer. + public long asLongValue(); } diff -r eb1661ea942c -r 6675700073c1 hotspot/agent/src/share/classes/sun/jvm/hotspot/debugger/bsd/BsdAddress.java --- a/hotspot/agent/src/share/classes/sun/jvm/hotspot/debugger/bsd/BsdAddress.java Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/agent/src/share/classes/sun/jvm/hotspot/debugger/bsd/BsdAddress.java Thu Sep 03 16:14:02 2015 -0700 @@ -288,7 +288,7 @@ return new BsdAddress(debugger, value); } - + public long asLongValue() { return addr; } //-------------------------------------------------------------------------------- // Internals only below this point // diff -r eb1661ea942c -r 6675700073c1 hotspot/agent/src/share/classes/sun/jvm/hotspot/debugger/dummy/DummyAddress.java --- a/hotspot/agent/src/share/classes/sun/jvm/hotspot/debugger/dummy/DummyAddress.java Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/agent/src/share/classes/sun/jvm/hotspot/debugger/dummy/DummyAddress.java Thu Sep 03 16:14:02 2015 -0700 @@ -275,6 +275,7 @@ return new DummyAddress(debugger, value); } + public long asLongValue() { return addr; } //-------------------------------------------------------------------------------- // Internals only below this point // diff -r eb1661ea942c -r 6675700073c1 hotspot/agent/src/share/classes/sun/jvm/hotspot/debugger/linux/LinuxAddress.java --- a/hotspot/agent/src/share/classes/sun/jvm/hotspot/debugger/linux/LinuxAddress.java Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/agent/src/share/classes/sun/jvm/hotspot/debugger/linux/LinuxAddress.java Thu Sep 03 16:14:02 2015 -0700 @@ -288,6 +288,7 @@ return new LinuxAddress(debugger, value); } + public long asLongValue() { return addr; } //-------------------------------------------------------------------------------- // Internals only below this point diff -r eb1661ea942c -r 6675700073c1 hotspot/agent/src/share/classes/sun/jvm/hotspot/debugger/proc/ProcAddress.java --- a/hotspot/agent/src/share/classes/sun/jvm/hotspot/debugger/proc/ProcAddress.java Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/agent/src/share/classes/sun/jvm/hotspot/debugger/proc/ProcAddress.java Thu Sep 03 16:14:02 2015 -0700 @@ -283,7 +283,7 @@ return new ProcAddress(debugger, value); } - + public long asLongValue() { return addr; } //-------------------------------------------------------------------------------- // Internals only below this point // diff -r eb1661ea942c -r 6675700073c1 hotspot/agent/src/share/classes/sun/jvm/hotspot/debugger/remote/RemoteAddress.java --- a/hotspot/agent/src/share/classes/sun/jvm/hotspot/debugger/remote/RemoteAddress.java Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/agent/src/share/classes/sun/jvm/hotspot/debugger/remote/RemoteAddress.java Thu Sep 03 16:14:02 2015 -0700 @@ -281,7 +281,7 @@ return new RemoteAddress(debugger, value); } - + public long asLongValue() { return 
addr; } //-------------------------------------------------------------------------------- // Internals only below this point // diff -r eb1661ea942c -r 6675700073c1 hotspot/agent/src/share/classes/sun/jvm/hotspot/debugger/windbg/WindbgAddress.java --- a/hotspot/agent/src/share/classes/sun/jvm/hotspot/debugger/windbg/WindbgAddress.java Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/agent/src/share/classes/sun/jvm/hotspot/debugger/windbg/WindbgAddress.java Thu Sep 03 16:14:02 2015 -0700 @@ -292,6 +292,7 @@ return new WindbgAddress(debugger, value); } + public long asLongValue() { return addr; } //-------------------------------------------------------------------------------- // Internals only below this point diff -r eb1661ea942c -r 6675700073c1 hotspot/agent/src/share/classes/sun/jvm/hotspot/oops/Symbol.java --- a/hotspot/agent/src/share/classes/sun/jvm/hotspot/oops/Symbol.java Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/agent/src/share/classes/sun/jvm/hotspot/oops/Symbol.java Thu Sep 03 16:14:02 2015 -0700 @@ -80,10 +80,19 @@ public byte getByteAt(long index) { return addr.getJByteAt(baseOffset + index); } - + // _identity_hash is a short private static CIntegerField idHash; - public int identityHash() { return (int)idHash.getValue(this.addr); } + public int identityHash() { + long addr_value = getAddress().asLongValue(); + int addr_bits = (int)(addr_value >> (VM.getVM().getLogMinObjAlignmentInBytes() + 3)); + int length = (int)getLength(); + int byte0 = getByteAt(0); + int byte1 = getByteAt(1); + int id_hash = (int)(0xffff & idHash.getValue(this.addr)); + return id_hash | + ((addr_bits ^ (length << 8) ^ ((byte0 << 8) | byte1)) << 16); + } public boolean equals(byte[] modUTF8Chars) { int l = (int) getLength(); diff -r eb1661ea942c -r 6675700073c1 hotspot/src/cpu/aarch64/vm/aarch64.ad --- a/hotspot/src/cpu/aarch64/vm/aarch64.ad Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/cpu/aarch64/vm/aarch64.ad Thu Sep 03 16:14:02 2015 -0700 @@ -1033,27 +1033,39 @@ }; // graph traversal helpers - MemBarNode *has_parent_membar(const Node *n, - ProjNode *&ctl, ProjNode *&mem); - MemBarNode *has_child_membar(const MemBarNode *n, - ProjNode *&ctl, ProjNode *&mem); + + MemBarNode *parent_membar(const Node *n); + MemBarNode *child_membar(const MemBarNode *n); + bool leading_membar(const MemBarNode *barrier); + + bool is_card_mark_membar(const MemBarNode *barrier); + + MemBarNode *leading_to_normal(MemBarNode *leading); + MemBarNode *normal_to_leading(const MemBarNode *barrier); + MemBarNode *card_mark_to_trailing(const MemBarNode *barrier); + MemBarNode *trailing_to_card_mark(const MemBarNode *trailing); + MemBarNode *trailing_to_leading(const MemBarNode *trailing); // predicates controlling emit of ldr/ldar and associated dmb + bool unnecessary_acquire(const Node *barrier); bool needs_acquiring_load(const Node *load); // predicates controlling emit of str/stlr and associated dmbs + bool unnecessary_release(const Node *barrier); bool unnecessary_volatile(const Node *barrier); bool needs_releasing_store(const Node *store); - // Use barrier instructions for unsafe volatile gets rather than - // trying to identify an exact signature for them - const bool UseBarriersForUnsafeVolatileGet = false; + // predicate controlling translation of StoreCM + bool unnecessary_storestore(const Node *storecm); %} source %{ + // Optimizaton of volatile gets and puts + // ------------------------------------- + // // AArch64 has ldar and stlr instructions which we can safely // use to implement volatile reads and writes. 
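As an illustrative aside (not part of this changeset): the two translation strategies contrasted in this comment block can be sketched with a couple of MacroAssembler calls. The helper names and register parameters below are hypothetical; only the ldr/ldar/membar calls themselves are taken from code that appears elsewhere in this patch.

#include "asm/macroAssembler.hpp"

// Fallback translation, used when UseBarriersForVolatile is set or when the
// graph does not match the volatile signatures described below: a plain load
// followed by an acquiring dmb.
static void volatile_load_with_dmb(MacroAssembler* masm, Register dst, Register base) {
  masm->ldr(dst, Address(base));                            // ldr dst, [base]
  masm->membar(Assembler::LoadLoad|Assembler::LoadStore);   // dmb ish (acquire)
}

// Optimised translation, used when the acquire signature is matched: a single
// acquiring load and no dmb at all.
static void volatile_load_with_ldar(MacroAssembler* masm, Register dst, Register base) {
  masm->ldar(dst, base);                                    // ldar dst, [base]
}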
For a volatile read // we simply need @@ -1102,15 +1114,19 @@ // A volatile write is translated to the node sequence // // MemBarRelease - // StoreX[mo_release] + // StoreX[mo_release] {CardMark}-optional // MemBarVolatile // // n.b. the above node patterns are generated with a strict // 'signature' configuration of input and output dependencies (see - // the predicates below for exact details). The two signatures are - // unique to translated volatile reads/stores -- they will not - // appear as a result of any other bytecode translation or inlining - // nor as a consequence of optimizing transforms. + // the predicates below for exact details). The card mark may be as + // simple as a few extra nodes or, in a few GC configurations, may + // include more complex control flow between the leading and + // trailing memory barriers. However, whatever the card mark + // configuration these signatures are unique to translated volatile + // reads/stores -- they will not appear as a result of any other + // bytecode translation or inlining nor as a consequence of + // optimizing transforms. // // We also want to catch inlined unsafe volatile gets and puts and // be able to implement them using either ldar/stlr or some @@ -1122,7 +1138,7 @@ // // MemBarRelease // MemBarCPUOrder - // StoreX[mo_release] + // StoreX[mo_release] {CardMark}-optional // MemBarVolatile // // n.b. as an aside, the cpuorder membar is not itself subject to @@ -1130,7 +1146,7 @@ // predicates need to detect its presence in order to correctly // select the desired adlc rules. // - // Inlined unsafe volatiles gets manifest as a somewhat different + // Inlined unsafe volatile gets manifest as a somewhat different // node sequence to a normal volatile get // // MemBarCPUOrder @@ -1173,33 +1189,22 @@ // n.b. the translation rules below which rely on detection of the // volatile signatures and insert ldar or stlr are failsafe. // If we see anything other than the signature configurations we - // always just translate the loads and stors to ldr and str + // always just translate the loads and stores to ldr and str // and translate acquire, release and volatile membars to the // relevant dmb instructions. // - // n.b.b as a case in point for the above comment, the current - // predicates don't detect the precise signature for certain types - // of volatile object stores (where the heap_base input type is not - // known at compile-time to be non-NULL). In those cases the - // MemBarRelease and MemBarVolatile bracket an if-then-else sequence - // with a store in each branch (we need a different store depending - // on whether heap_base is actually NULL). In such a case we will - // just plant a dmb both before and after the branch/merge. The - // predicate could (and probably should) be fixed later to also - // detect this case. - - // graph traversal helpers + + // graph traversal helpers used for volatile put/get optimization + + // 1) general purpose helpers // if node n is linked to a parent MemBarNode by an intervening - // Control or Memory ProjNode return the MemBarNode otherwise return + // Control and Memory ProjNode return the MemBarNode otherwise return // NULL. // // n may only be a Load or a MemBar. - // - // The ProjNode* references c and m are used to return the relevant - // nodes. 
- - MemBarNode *has_parent_membar(const Node *n, ProjNode *&c, ProjNode *&m) + + MemBarNode *parent_membar(const Node *n) { Node *ctl = NULL; Node *mem = NULL; @@ -1218,15 +1223,11 @@ if (!ctl || !mem || !ctl->is_Proj() || !mem->is_Proj()) return NULL; - c = ctl->as_Proj(); - membar = ctl->lookup(0); if (!membar || !membar->is_MemBar()) return NULL; - m = mem->as_Proj(); - if (mem->lookup(0) != membar) return NULL; @@ -1235,12 +1236,8 @@ // if n is linked to a child MemBarNode by intervening Control and // Memory ProjNodes return the MemBarNode otherwise return NULL. - // - // The ProjNode** arguments c and m are used to return pointers to - // the relevant nodes. A null argument means don't don't return a - // value. - - MemBarNode *has_child_membar(const MemBarNode *n, ProjNode *&c, ProjNode *&m) + + MemBarNode *child_membar(const MemBarNode *n) { ProjNode *ctl = n->proj_out(TypeFunc::Control); ProjNode *mem = n->proj_out(TypeFunc::Memory); @@ -1249,9 +1246,6 @@ if (! ctl || ! mem) return NULL; - c = ctl; - m = mem; - MemBarNode *child = NULL; Node *x; @@ -1279,9 +1273,838 @@ return NULL; } + // helper predicate use to filter candidates for a leading memory + // barrier + // + // returns true if barrier is a MemBarRelease or a MemBarCPUOrder + // whose Ctl and Mem feeds come from a MemBarRelease otherwise false + + bool leading_membar(const MemBarNode *barrier) + { + int opcode = barrier->Opcode(); + // if this is a release membar we are ok + if (opcode == Op_MemBarRelease) + return true; + // if its a cpuorder membar . . . + if (opcode != Op_MemBarCPUOrder) + return false; + // then the parent has to be a release membar + MemBarNode *parent = parent_membar(barrier); + if (!parent) + return false; + opcode = parent->Opcode(); + return opcode == Op_MemBarRelease; + } + + // 2) card mark detection helper + + // helper predicate which can be used to detect a volatile membar + // introduced as part of a conditional card mark sequence either by + // G1 or by CMS when UseCondCardMark is true. + // + // membar can be definitively determined to be part of a card mark + // sequence if and only if all the following hold + // + // i) it is a MemBarVolatile + // + // ii) either UseG1GC or (UseConcMarkSweepGC && UseCondCardMark) is + // true + // + // iii) the node's Mem projection feeds a StoreCM node. + + bool is_card_mark_membar(const MemBarNode *barrier) + { + if (!UseG1GC && !(UseConcMarkSweepGC && UseCondCardMark)) + return false; + + if (barrier->Opcode() != Op_MemBarVolatile) + return false; + + ProjNode *mem = barrier->proj_out(TypeFunc::Memory); + + for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax ; i++) { + Node *y = mem->fast_out(i); + if (y->Opcode() == Op_StoreCM) { + return true; + } + } + + return false; + } + + + // 3) helper predicates to traverse volatile put graphs which may + // contain GC barrier subgraphs + + // Preamble + // -------- + // + // for volatile writes we can omit generating barriers and employ a + // releasing store when we see a node sequence sequence with a + // leading MemBarRelease and a trailing MemBarVolatile as follows + // + // MemBarRelease + // { || } -- optional + // {MemBarCPUOrder} + // || \\ + // || StoreX[mo_release] + // | \ / + // | MergeMem + // | / + // MemBarVolatile + // + // where + // || and \\ represent Ctl and Mem feeds via Proj nodes + // | \ and / indicate further routing of the Ctl and Mem feeds + // + // this is the graph we see for non-object stores. 
however, for a + // volatile Object store (StoreN/P) we may see other nodes below the + // leading membar because of the need for a GC pre- or post-write + // barrier. + // + // with most GC configurations we with see this simple variant which + // includes a post-write barrier card mark. + // + // MemBarRelease______________________________ + // || \\ Ctl \ \\ + // || StoreN/P[mo_release] CastP2X StoreB/CM + // | \ / . . . / + // | MergeMem + // | / + // || / + // MemBarVolatile + // + // i.e. the leading membar feeds Ctl to a CastP2X (which converts + // the object address to an int used to compute the card offset) and + // Ctl+Mem to a StoreB node (which does the actual card mark). + // + // n.b. a StoreCM node will only appear in this configuration when + // using CMS. StoreCM differs from a normal card mark write (StoreB) + // because it implies a requirement to order visibility of the card + // mark (StoreCM) relative to the object put (StoreP/N) using a + // StoreStore memory barrier (arguably this ought to be represented + // explicitly in the ideal graph but that is not how it works). This + // ordering is required for both non-volatile and volatile + // puts. Normally that means we need to translate a StoreCM using + // the sequence + // + // dmb ishst + // stlrb + // + // However, in the case of a volatile put if we can recognise this + // configuration and plant an stlr for the object write then we can + // omit the dmb and just plant an strb since visibility of the stlr + // is ordered before visibility of subsequent stores. StoreCM nodes + // also arise when using G1 or using CMS with conditional card + // marking. In these cases (as we shall see) we don't need to insert + // the dmb when translating StoreCM because there is already an + // intervening StoreLoad barrier between it and the StoreP/N. + // + // It is also possible to perform the card mark conditionally on it + // currently being unmarked in which case the volatile put graph + // will look slightly different + // + // MemBarRelease + // MemBarCPUOrder___________________________________________ + // || \\ Ctl \ Ctl \ \\ Mem \ + // || StoreN/P[mo_release] CastP2X If LoadB | + // | \ / \ | + // | MergeMem . . . StoreB + // | / / + // || / + // MemBarVolatile + // + // It is worth noting at this stage that both the above + // configurations can be uniquely identified by checking that the + // memory flow includes the following subgraph: + // + // MemBarRelease + // MemBarCPUOrder + // | \ . . . + // | StoreX[mo_release] . . . + // | / + // MergeMem + // | + // MemBarVolatile + // + // This is referred to as a *normal* subgraph. It can easily be + // detected starting from any candidate MemBarRelease, + // StoreX[mo_release] or MemBarVolatile. + // + // the code below uses two helper predicates, leading_to_normal and + // normal_to_leading to identify this configuration, one validating + // the layout starting from the top membar and searching down and + // the other validating the layout starting from the lower membar + // and searching up. + // + // There are two special case GC configurations when a normal graph + // may not be generated: when using G1 (which always employs a + // conditional card mark); and when using CMS with conditional card + // marking configured. These GCs are both concurrent rather than + // stop-the world GCs. 
So they introduce extra Ctl+Mem flow into the + // graph between the leading and trailing membar nodes, in + // particular enforcing stronger memory serialisation beween the + // object put and the corresponding conditional card mark. CMS + // employs a post-write GC barrier while G1 employs both a pre- and + // post-write GC barrier. Of course the extra nodes may be absent -- + // they are only inserted for object puts. This significantly + // complicates the task of identifying whether a MemBarRelease, + // StoreX[mo_release] or MemBarVolatile forms part of a volatile put + // when using these GC configurations (see below). + // + // In both cases the post-write subtree includes an auxiliary + // MemBarVolatile (StoreLoad barrier) separating the object put and + // the read of the corresponding card. This poses two additional + // problems. + // + // Firstly, a card mark MemBarVolatile needs to be distinguished + // from a normal trailing MemBarVolatile. Resolving this first + // problem is straightforward: a card mark MemBarVolatile always + // projects a Mem feed to a StoreCM node and that is a unique marker + // + // MemBarVolatile (card mark) + // C | \ . . . + // | StoreCM . . . + // . . . + // + // The second problem is how the code generator is to translate the + // card mark barrier? It always needs to be translated to a "dmb + // ish" instruction whether or not it occurs as part of a volatile + // put. A StoreLoad barrier is needed after the object put to ensure + // i) visibility to GC threads of the object put and ii) visibility + // to the mutator thread of any card clearing write by a GC + // thread. Clearly a normal store (str) will not guarantee this + // ordering but neither will a releasing store (stlr). The latter + // guarantees that the object put is visible but does not guarantee + // that writes by other threads have also been observed. + // + // So, returning to the task of translating the object put and the + // leading/trailing membar nodes: what do the non-normal node graph + // look like for these 2 special cases? and how can we determine the + // status of a MemBarRelease, StoreX[mo_release] or MemBarVolatile + // in both normal and non-normal cases? + // + // A CMS GC post-barrier wraps its card write (StoreCM) inside an If + // which selects conditonal execution based on the value loaded + // (LoadB) from the card. Ctl and Mem are fed to the If via an + // intervening StoreLoad barrier (MemBarVolatile). + // + // So, with CMS we may see a node graph which looks like this + // + // MemBarRelease + // MemBarCPUOrder_(leading)__________________ + // C | M \ \\ C \ + // | \ StoreN/P[mo_release] CastP2X + // | Bot \ / + // | MergeMem + // | / + // MemBarVolatile (card mark) + // C | || M | + // | LoadB | + // | | | + // | Cmp |\ + // | / | \ + // If | \ + // | \ | \ + // IfFalse IfTrue | \ + // \ / \ | \ + // \ / StoreCM | + // \ / | | + // Region . . . | + // | \ / + // | . . . \ / Bot + // | MergeMem + // | | + // MemBarVolatile (trailing) + // + // The first MergeMem merges the AliasIdxBot Mem slice from the + // leading membar and the oopptr Mem slice from the Store into the + // card mark membar. The trailing MergeMem merges the AliasIdxBot + // Mem slice from the card mark membar and the AliasIdxRaw slice + // from the StoreCM into the trailing membar (n.b. the latter + // proceeds via a Phi associated with the If region). + // + // G1 is quite a lot more complicated. 
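Before looking at the G1 shapes in detail, here is a rough source-level rendering of the conditional card mark that the CMS graph above encodes and which also forms the kernel of the G1 post-write barrier. This is an editorial sketch only: jbyte_t, dirty_card, card_for and storeload_barrier are local placeholders, not HotSpot types or functions, and the comments map each line to the ideal-graph nodes discussed here.

#include <atomic>

typedef signed char jbyte_t;                  // local stand-in for jbyte
static const jbyte_t dirty_card = 0;          // placeholder dirty value
static jbyte_t fake_card_table[1] = { 1 };    // placeholder card table

static jbyte_t* card_for(void* obj) {         // stands in for the CastP2X +
  (void)obj;                                  // shift card-offset arithmetic
  return &fake_card_table[0];
}

static void storeload_barrier() {             // the card mark MemBarVolatile,
  std::atomic_thread_fence(std::memory_order_seq_cst);  // emitted as dmb ish
}

static void volatile_oop_store_with_card_mark(void* volatile* field, void* val, void* obj) {
  *field = val;                    // StoreN/P[mo_release], fed by the leading membar
  storeload_barrier();             // StoreLoad barrier between the put and the card read
  jbyte_t* card = card_for(obj);
  if (*card != dirty_card) {       // LoadB + Cmp + If
    *card = dirty_card;            // the conditional StoreCM/StoreB
  }
  // a trailing MemBarVolatile then closes the volatile put subgraph
}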
The nodes inserted on behalf + // of G1 may comprise: a pre-write graph which adds the old value to + // the SATB queue; the releasing store itself; and, finally, a + // post-write graph which performs a card mark. + // + // The pre-write graph may be omitted, but only when the put is + // writing to a newly allocated (young gen) object and then only if + // there is a direct memory chain to the Initialize node for the + // object allocation. This will not happen for a volatile put since + // any memory chain passes through the leading membar. + // + // The pre-write graph includes a series of 3 If tests. The outermost + // If tests whether SATB is enabled (no else case). The next If tests + // whether the old value is non-NULL (no else case). The third tests + // whether the SATB queue index is > 0, if so updating the queue. The + // else case for this third If calls out to the runtime to allocate a + // new queue buffer. + // + // So with G1 the pre-write and releasing store subgraph looks like + // this (the nested Ifs are omitted). + // + // MemBarRelease (leading)____________ + // C | || M \ M \ M \ M \ . . . + // | LoadB \ LoadL LoadN \ + // | / \ \ + // If |\ \ + // | \ | \ \ + // IfFalse IfTrue | \ \ + // | | | \ | + // | If | /\ | + // | | \ | + // | \ | + // | . . . \ | + // | / | / | | + // Region Phi[M] | | + // | \ | | | + // | \_____ | ___ | | + // C | C \ | C \ M | | + // | CastP2X | StoreN/P[mo_release] | + // | | | | + // C | M | M | M | + // \ | | / + // . . . + // (post write subtree elided) + // . . . + // C \ M / + // MemBarVolatile (trailing) + // + // n.b. the LoadB in this subgraph is not the card read -- it's a + // read of the SATB queue active flag. + // + // The G1 post-write subtree is also optional, this time when the + // new value being written is either null or can be identified as a + // newly allocated (young gen) object with no intervening control + // flow. The latter cannot happen but the former may, in which case + // the card mark membar is omitted and the memory feeds from the + // leading membar and the StoreN/P are merged direct into the + // trailing membar as per the normal subgraph. So, the only special + // case which arises is when the post-write subgraph is generated. + // + // The kernel of the post-write G1 subgraph is the card mark itself + // which includes a card mark memory barrier (MemBarVolatile), a + // card test (LoadB), and a conditional update (If feeding a + // StoreCM). These nodes are surrounded by a series of nested Ifs + // which try to avoid doing the card mark. The top level If skips if + // the object reference does not cross regions (i.e. it tests if + // (adr ^ val) >> log2(regsize) != 0) -- intra-region references + // need not be recorded. The next If, which skips on a NULL value, + // may be absent (it is not generated if the type of value is >= + // OopPtr::NotNull). The 3rd If skips writes to young regions (by + // checking if card_val != young). n.b. although this test requires + // a pre-read of the card it can safely be done before the StoreLoad + // barrier. However that does not bypass the need to reread the card + // after the barrier. + // + // (pre-write subtree elided) + // . . . . . . . . . . . . + // C | M | M | M | + // Region Phi[M] StoreN | + // | / \ | | + // / \_______ / \ | | + // C / C \ . . . \ | | + // If CastP2X . . . | | | + // / \ | | | + // / \ | | | + // IfFalse IfTrue | | | + // | | | | /| + // | If | | / | + // | / \ | | / | + // | / \ \ | / | + // | IfFalse IfTrue MergeMem | + // | . . . 
/ \ / | + // | / \ / | + // | IfFalse IfTrue / | + // | . . . | / | + // | If / | + // | / \ / | + // | / \ / | + // | IfFalse IfTrue / | + // | . . . | / | + // | \ / | + // | \ / | + // | MemBarVolatile__(card mark) | + // | || C | M \ M \ | + // | LoadB If | | | + // | / \ | | | + // | . . . | | | + // | \ | | / + // | StoreCM | / + // | . . . | / + // | _________/ / + // | / _____________/ + // | . . . . . . | / / + // | | | / _________/ + // | | Phi[M] / / + // | | | / / + // | | | / / + // | Region . . . Phi[M] _____/ + // | / | / + // | | / + // | . . . . . . | / + // | / | / + // Region | | Phi[M] + // | | | / Bot + // \ MergeMem + // \ / + // MemBarVolatile + // + // As with CMS the initial MergeMem merges the AliasIdxBot Mem slice + // from the leading membar and the oopptr Mem slice from the Store + // into the card mark membar i.e. the memory flow to the card mark + // membar still looks like a normal graph. + // + // The trailing MergeMem merges an AliasIdxBot Mem slice with other + // Mem slices (from the StoreCM and other card mark queue stores). + // However in this case the AliasIdxBot Mem slice does not come + // direct from the card mark membar. It is merged through a series + // of Phi nodes. These are needed to merge the AliasIdxBot Mem flow + // from the leading membar with the Mem feed from the card mark + // membar. Each Phi corresponds to one of the Ifs which may skip + // around the card mark membar. So when the If implementing the NULL + // value check has been elided the total number of Phis is 2 + // otherwise it is 3. + // + // So, the upshot is that in all cases the volatile put graph will + // include a *normal* memory subgraph betwen the leading membar and + // its child membar. When that child is not a card mark membar then + // it marks the end of a volatile put subgraph. If the child is a + // card mark membar then the normal subgraph will form part of a + // volatile put subgraph if and only if the child feeds an + // AliasIdxBot Mem feed to a trailing barrier via a MergeMem. That + // feed is either direct (for CMS) or via 2 or 3 Phi nodes merging + // the leading barrier memory flow (for G1). + // + // The predicates controlling generation of instructions for store + // and barrier nodes employ a few simple helper functions (described + // below) which identify the presence or absence of these subgraph + // configurations and provide a means of traversing from one node in + // the subgraph to another. + + // leading_to_normal + // + //graph traversal helper which detects the normal case Mem feed + // from a release membar (or, optionally, its cpuorder child) to a + // dependent volatile membar i.e. it ensures that the following Mem + // flow subgraph is present. + // + // MemBarRelease + // MemBarCPUOrder + // | \ . . . + // | StoreN/P[mo_release] . . . + // | / + // MergeMem + // | + // MemBarVolatile + // + // if the correct configuration is present returns the volatile + // membar otherwise NULL. + // + // the input membar is expected to be either a cpuorder membar or a + // release membar. in the latter case it should not have a cpu membar + // child. + // + // the returned membar may be a card mark membar rather than a + // trailing membar. 
+ + MemBarNode *leading_to_normal(MemBarNode *leading) + { + assert((leading->Opcode() == Op_MemBarRelease || + leading->Opcode() == Op_MemBarCPUOrder), + "expecting a volatile or cpuroder membar!"); + + // check the mem flow + ProjNode *mem = leading->proj_out(TypeFunc::Memory); + + if (!mem) + return NULL; + + Node *x = NULL; + StoreNode * st = NULL; + MergeMemNode *mm = NULL; + + for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) { + x = mem->fast_out(i); + if (x->is_MergeMem()) { + if (mm != NULL) + return NULL; + // two merge mems is one too many + mm = x->as_MergeMem(); + } else if (x->is_Store() && x->as_Store()->is_release() && x->Opcode() != Op_StoreCM) { + // two releasing stores is one too many + if (st != NULL) + return NULL; + st = x->as_Store(); + } + } + + if (!mm || !st) + return NULL; + + bool found = false; + // ensure the store feeds the merge + for (DUIterator_Fast imax, i = st->fast_outs(imax); i < imax; i++) { + if (st->fast_out(i) == mm) { + found = true; + break; + } + } + + if (!found) + return NULL; + + MemBarNode *mbvol = NULL; + // ensure the merge feeds a volatile membar + for (DUIterator_Fast imax, i = mm->fast_outs(imax); i < imax; i++) { + x = mm->fast_out(i); + if (x->is_MemBar() && x->Opcode() == Op_MemBarVolatile) { + mbvol = x->as_MemBar(); + break; + } + } + + return mbvol; + } + + // normal_to_leading + // + // graph traversal helper which detects the normal case Mem feed + // from either a card mark or a trailing membar to a preceding + // release membar (optionally its cpuorder child) i.e. it ensures + // that the following Mem flow subgraph is present. + // + // MemBarRelease + // MemBarCPUOrder {leading} + // | \ . . . + // | StoreN/P[mo_release] . . . + // | / + // MergeMem + // | + // MemBarVolatile + // + // this predicate checks for the same flow as the previous predicate + // but starting from the bottom rather than the top. + // + // if the configuration is present returns the cpuorder member for + // preference or when absent the release membar otherwise NULL. + // + // n.b. the input membar is expected to be a MemBarVolatile but + // need not be a card mark membar. 
+ + MemBarNode *normal_to_leading(const MemBarNode *barrier) + { + // input must be a volatile membar + assert(barrier->Opcode() == Op_MemBarVolatile, "expecting a volatile membar"); + Node *x; + + // the Mem feed to the membar should be a merge + x = barrier->in(TypeFunc::Memory); + if (!x->is_MergeMem()) + return NULL; + + MergeMemNode *mm = x->as_MergeMem(); + + // the AliasIdxBot slice should be another MemBar projection + x = mm->in(Compile::AliasIdxBot); + // ensure this is a non control projection + if (!x->is_Proj() || x->is_CFG()) + return NULL; + // if it is fed by a membar that's the one we want + x = x->in(0); + + if (!x->is_MemBar()) + return NULL; + + MemBarNode *leading = x->as_MemBar(); + // reject invalid candidates + if (!leading_membar(leading)) + return NULL; + + // ok, we have a leading ReleaseMembar, now for the sanity clauses + + // the leading membar must feed Mem to a releasing store + ProjNode *mem = leading->proj_out(TypeFunc::Memory); + StoreNode *st = NULL; + for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) { + x = mem->fast_out(i); + if (x->is_Store() && x->as_Store()->is_release() && x->Opcode() != Op_StoreCM) { + st = x->as_Store(); + break; + } + } + if (st == NULL) + return NULL; + + // the releasing store has to feed the same merge + for (DUIterator_Fast imax, i = st->fast_outs(imax); i < imax; i++) { + if (st->fast_out(i) == mm) + return leading; + } + + return NULL; + } + + // card_mark_to_trailing + // + // graph traversal helper which detects extra, non-normal Mem feed + // from a card mark volatile membar to a trailing membar i.e. it + // ensures that one of the following three GC post-write Mem flow + // subgraphs is present. + // + // 1) + // . . . + // | + // MemBarVolatile (card mark) + // | | + // | StoreCM + // | | + // | . . . + // Bot | / + // MergeMem + // | + // MemBarVolatile (trailing) + // + // + // 2) + // MemBarRelease/CPUOrder (leading) + // | + // | + // |\ . . . + // | \ | + // | \ MemBarVolatile (card mark) + // | \ | | + // \ \ | StoreCM . . . + // \ \ | + // \ Phi + // \ / + // Phi . . . + // Bot | / + // MergeMem + // | + // MemBarVolatile (trailing) + // + // 3) + // MemBarRelease/CPUOrder (leading) + // | + // |\ + // | \ + // | \ . . . + // | \ | + // |\ \ MemBarVolatile (card mark) + // | \ \ | | + // | \ \ | StoreCM . . . + // | \ \ | + // \ \ Phi + // \ \ / + // \ Phi + // \ / + // Phi . . . + // Bot | / + // MergeMem + // | + // MemBarVolatile (trailing) + // + // configuration 1 is only valid if UseConcMarkSweepGC && + // UseCondCardMark + // + // configurations 2 and 3 are only valid if UseG1GC. + // + // if a valid configuration is present returns the trailing membar + // otherwise NULL. + // + // n.b. the supplied membar is expected to be a card mark + // MemBarVolatile i.e. 
the caller must ensure the input node has the + // correct operand and feeds Mem to a StoreCM node + + MemBarNode *card_mark_to_trailing(const MemBarNode *barrier) + { + // input must be a card mark volatile membar + assert(is_card_mark_membar(barrier), "expecting a card mark membar"); + + Node *feed = barrier->proj_out(TypeFunc::Memory); + Node *x; + MergeMemNode *mm = NULL; + + const int MAX_PHIS = 3; // max phis we will search through + int phicount = 0; // current search count + + bool retry_feed = true; + while (retry_feed) { + // see if we have a direct MergeMem feed + for (DUIterator_Fast imax, i = feed->fast_outs(imax); i < imax; i++) { + x = feed->fast_out(i); + // the correct Phi will be merging a Bot memory slice + if (x->is_MergeMem()) { + mm = x->as_MergeMem(); + break; + } + } + if (mm) { + retry_feed = false; + } else if (UseG1GC & phicount++ < MAX_PHIS) { + // the barrier may feed indirectly via one or two Phi nodes + PhiNode *phi = NULL; + for (DUIterator_Fast imax, i = feed->fast_outs(imax); i < imax; i++) { + x = feed->fast_out(i); + // the correct Phi will be merging a Bot memory slice + if (x->is_Phi() && x->adr_type() == TypePtr::BOTTOM) { + phi = x->as_Phi(); + break; + } + } + if (!phi) + return NULL; + // look for another merge below this phi + feed = phi; + } else { + // couldn't find a merge + return NULL; + } + } + + // sanity check this feed turns up as the expected slice + assert(mm->as_MergeMem()->in(Compile::AliasIdxBot) == feed, "expecting membar to feed AliasIdxBot slice to Merge"); + + MemBarNode *trailing = NULL; + // be sure we have a volatile membar below the merge + for (DUIterator_Fast imax, i = mm->fast_outs(imax); i < imax; i++) { + x = mm->fast_out(i); + if (x->is_MemBar() && x->Opcode() == Op_MemBarVolatile) { + trailing = x->as_MemBar(); + break; + } + } + + return trailing; + } + + // trailing_to_card_mark + // + // graph traversal helper which detects extra, non-normal Mem feed + // from a trailing membar to a preceding card mark volatile membar + // i.e. it identifies whether one of the three possible extra GC + // post-write Mem flow subgraphs is present + // + // this predicate checks for the same flow as the previous predicate + // but starting from the bottom rather than the top. 
+ // + // if the configuration is present returns the card mark membar + // otherwise NULL + + MemBarNode *trailing_to_card_mark(const MemBarNode *trailing) + { + assert(!is_card_mark_membar(trailing), "not expecting a card mark membar"); + + Node *x = trailing->in(TypeFunc::Memory); + // the Mem feed to the membar should be a merge + if (!x->is_MergeMem()) + return NULL; + + MergeMemNode *mm = x->as_MergeMem(); + + x = mm->in(Compile::AliasIdxBot); + // with G1 we may possibly see a Phi or two before we see a Memory + // Proj from the card mark membar + + const int MAX_PHIS = 3; // max phis we will search through + int phicount = 0; // current search count + + bool retry_feed = !x->is_Proj(); + + while (retry_feed) { + if (UseG1GC && x->is_Phi() && phicount++ < MAX_PHIS) { + PhiNode *phi = x->as_Phi(); + ProjNode *proj = NULL; + PhiNode *nextphi = NULL; + bool found_leading = false; + for (uint i = 1; i < phi->req(); i++) { + x = phi->in(i); + if (x->is_Phi()) { + nextphi = x->as_Phi(); + } else if (x->is_Proj()) { + int opcode = x->in(0)->Opcode(); + if (opcode == Op_MemBarVolatile) { + proj = x->as_Proj(); + } else if (opcode == Op_MemBarRelease || + opcode == Op_MemBarCPUOrder) { + // probably a leading membar + found_leading = true; + } + } + } + // if we found a correct looking proj then retry from there + // otherwise we must see a leading and a phi or this is the + // wrong config + if (proj != NULL) { + x = proj; + retry_feed = false; + } else if (found_leading && nextphi != NULL) { + // retry from this phi to check phi2 + x = nextphi; + } else { + // not what we were looking for + return NULL; + } + } else { + return NULL; + } + } + // the proj has to come from the card mark membar + x = x->in(0); + if (!x->is_MemBar()) + return NULL; + + MemBarNode *card_mark_membar = x->as_MemBar(); + + if (!is_card_mark_membar(card_mark_membar)) + return NULL; + + return card_mark_membar; + } + + // trailing_to_leading + // + // graph traversal helper which checks the Mem flow up the graph + // from a (non-card mark) volatile membar attempting to locate and + // return an associated leading membar. it first looks for a + // subgraph in the normal configuration (relying on helper + // normal_to_leading). failing that it then looks for one of the + // possible post-write card mark subgraphs linking the trailing node + // to the card mark membar (relying on helper + // trailing_to_card_mark), and then checks that the card mark membar + // is fed by a leading membar (once again relying on auxiliary + // predicate normal_to_leading). + // + // if the configuration is valid returns the cpuorder membar for + // preference or when absent the release membar otherwise NULL. + // + // n.b. the input membar is expected to be a volatile membar but + // must *not* be a card mark membar.
+ + MemBarNode *trailing_to_leading(const MemBarNode *trailing) + { + assert(!is_card_mark_membar(trailing), "not expecting a card mark membar"); + + MemBarNode *leading = normal_to_leading(trailing); + + if (leading) + return leading; + + MemBarNode *card_mark_membar = trailing_to_card_mark(trailing); + + if (!card_mark_membar) + return NULL; + + return normal_to_leading(card_mark_membar); + } + // predicates controlling emit of ldr/ldar and associated dmb -bool unnecessary_acquire(const Node *barrier) { +bool unnecessary_acquire(const Node *barrier) +{ // assert barrier->is_MemBar(); if (UseBarriersForVolatile) // we need to plant a dmb @@ -1323,13 +2146,11 @@ return (x->is_Load() && x->as_Load()->is_acquire()); } - // only continue if we want to try to match unsafe volatile gets - if (UseBarriersForUnsafeVolatileGet) - return false; + // now check for an unsafe volatile get // need to check for // - // MemBarCPUOrder + // MemBarCPUOrder // || \\ // MemBarAcquire* LoadX[mo_acquire] // || @@ -1341,9 +2162,13 @@ // check for a parent MemBarCPUOrder ProjNode *ctl; ProjNode *mem; - MemBarNode *parent = has_parent_membar(barrier, ctl, mem); + MemBarNode *parent = parent_membar(barrier); if (!parent || parent->Opcode() != Op_MemBarCPUOrder) return false; + ctl = parent->proj_out(TypeFunc::Control); + mem = parent->proj_out(TypeFunc::Memory); + if (!ctl || !mem) + return false; // ensure the proj nodes both feed a LoadX[mo_acquire] LoadNode *ld = NULL; for (DUIterator_Fast imax, i = ctl->fast_outs(imax); i < imax; i++) { @@ -1369,7 +2194,7 @@ if (ld) return false; // check for a child cpuorder membar - MemBarNode *child = has_child_membar(barrier->as_MemBar(), ctl, mem); + MemBarNode *child = child_membar(barrier->as_MemBar()); if (!child || child->Opcode() != Op_MemBarCPUOrder) return false; @@ -1422,9 +2247,7 @@ return true; } - // only continue if we want to try to match unsafe volatile gets - if (UseBarriersForUnsafeVolatileGet) - return false; + // now check for an unsafe volatile get // check if Ctl and Proj feed comes from a MemBarCPUOrder // @@ -1435,22 +2258,20 @@ // MemBarCPUOrder MemBarNode *membar; - ProjNode *ctl; - ProjNode *mem; - - membar = has_parent_membar(ld, ctl, mem); + + membar = parent_membar(ld); if (!membar || !membar->Opcode() == Op_MemBarCPUOrder) return false; // ensure that there is a CPUOrder->Acquire->CPUOrder membar chain - membar = has_child_membar(membar, ctl, mem); + membar = child_membar(membar); if (!membar || !membar->Opcode() == Op_MemBarAcquire) return false; - membar = has_child_membar(membar, ctl, mem); + membar = child_membar(membar); if (!membar || !membar->Opcode() == Op_MemBarCPUOrder) return false; @@ -1458,194 +2279,81 @@ return true; } -bool unnecessary_release(const Node *n) { +bool unnecessary_release(const Node *n) +{ + assert((n->is_MemBar() && + n->Opcode() == Op_MemBarRelease), + "expecting a release membar"); + + if (UseBarriersForVolatile) + // we need to plant a dmb + return false; + + // if there is a dependent CPUOrder barrier then use that as the + // leading + + MemBarNode *barrier = n->as_MemBar(); + // check for an intervening cpuorder membar + MemBarNode *b = child_membar(barrier); + if (b && b->Opcode() == Op_MemBarCPUOrder) { + // ok, so start the check from the dependent cpuorder barrier + barrier = b; + } + + // must start with a normal feed + MemBarNode *child_barrier = leading_to_normal(barrier); + + if (!child_barrier) + return false; + + if (!is_card_mark_membar(child_barrier)) + // this is the trailing membar and we are 
done + return true; + + // must be sure this card mark feeds a trailing membar + MemBarNode *trailing = card_mark_to_trailing(child_barrier); + return (trailing != NULL); +} + +bool unnecessary_volatile(const Node *n) +{ // assert n->is_MemBar(); if (UseBarriersForVolatile) // we need to plant a dmb return false; - // ok, so we can omit this release barrier if it has been inserted - // as part of a volatile store sequence - // - // MemBarRelease - // { || } - // {MemBarCPUOrder} -- optional - // || \\ - // || StoreX[mo_release] - // | \ / - // | MergeMem - // | / - // MemBarVolatile - // - // where - // || and \\ represent Ctl and Mem feeds via Proj nodes - // | \ and / indicate further routing of the Ctl and Mem feeds - // - // so we need to check that - // - // ia) the release membar (or its dependent cpuorder membar) feeds - // control to a store node (via a Control project node) - // - // ii) the store is ordered release - // - // iii) the release membar (or its dependent cpuorder membar) feeds - // control to a volatile membar (via the same Control project node) - // - // iv) the release membar feeds memory to a merge mem and to the - // same store (both via a single Memory proj node) - // - // v) the store outputs to the merge mem - // - // vi) the merge mem outputs to the same volatile membar - // - // n.b. if this is an inlined unsafe node then the release membar - // may feed its control and memory links via an intervening cpuorder - // membar. this case can be dealt with when we check the release - // membar projections. if they both feed a single cpuorder membar - // node continue to make the same checks as above but with the - // cpuorder membar substituted for the release membar. if they don't - // both feed a cpuorder membar then the check fails. - // - // n.b.b. for an inlined unsafe store of an object in the case where - // !TypePtr::NULL_PTR->higher_equal(type(heap_base_oop)) we may see - // an embedded if then else where we expect the store. this is - // needed to do the right type of store depending on whether - // heap_base is NULL. We could check for that but for now we can - // just take the hit of on inserting a redundant dmb for this - // redundant volatile membar - - MemBarNode *barrier = n->as_MemBar(); - ProjNode *ctl; - ProjNode *mem; - // check for an intervening cpuorder membar - MemBarNode *b = has_child_membar(barrier, ctl, mem); - if (b && b->Opcode() == Op_MemBarCPUOrder) { - // ok, so start form the dependent cpuorder barrier - barrier = b; - } - // check the ctl and mem flow - ctl = barrier->proj_out(TypeFunc::Control); - mem = barrier->proj_out(TypeFunc::Memory); - - // the barrier needs to have both a Ctl and Mem projection - if (! ctl || ! mem) + MemBarNode *mbvol = n->as_MemBar(); + + // first we check if this is part of a card mark. if so then we have + // to generate a StoreLoad barrier + + if (is_card_mark_membar(mbvol)) + return false; + + // ok, if it's not a card mark then we still need to check if it is + // a trailing membar of a volatile put hgraph. + + return (trailing_to_leading(mbvol) != NULL); +} + +// predicates controlling emit of str/stlr and associated dmbs + +bool needs_releasing_store(const Node *n) +{ + // assert n->is_Store(); + if (UseBarriersForVolatile) + // we use a normal store and dmb combination return false; - Node *x = NULL; - Node *mbvol = NULL; - StoreNode * st = NULL; - - // For a normal volatile write the Ctl ProjNode should have output - // to a MemBarVolatile and a Store marked as releasing - // - // n.b. 
for an inlined unsafe store of an object in the case where - // !TypePtr::NULL_PTR->higher_equal(type(heap_base_oop)) we may see - // an embedded if then else where we expect the store. this is - // needed to do the right type of store depending on whether - // heap_base is NULL. We could check for that case too but for now - // we can just take the hit of inserting a dmb and a non-volatile - // store to implement the volatile store - - for (DUIterator_Fast imax, i = ctl->fast_outs(imax); i < imax; i++) { - x = ctl->fast_out(i); - if (x->is_MemBar() && x->Opcode() == Op_MemBarVolatile) { - if (mbvol) { - return false; - } - mbvol = x; - } else if (x->is_Store()) { - st = x->as_Store(); - if (! st->is_release()) { - return false; - } - } else if (!x->is_Mach()) { - // we may see mach nodes added during matching but nothing else - return false; - } - } - - if (!mbvol || !st) - return false; - - // the Mem ProjNode should output to a MergeMem and the same Store - Node *mm = NULL; - for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) { - x = mem->fast_out(i); - if (!mm && x->is_MergeMem()) { - mm = x; - } else if (x != st && !x->is_Mach()) { - // we may see mach nodes added during matching but nothing else - return false; - } - } - - if (!mm) + StoreNode *st = n->as_Store(); + + // the store must be marked as releasing + if (!st->is_release()) return false; - // the MergeMem should output to the MemBarVolatile - for (DUIterator_Fast imax, i = mm->fast_outs(imax); i < imax; i++) { - x = mm->fast_out(i); - if (x != mbvol && !x->is_Mach()) { - // we may see mach nodes added during matching but nothing else - return false; - } - } - - return true; -} - -bool unnecessary_volatile(const Node *n) { - // assert n->is_MemBar(); - if (UseBarriersForVolatile) - // we need to plant a dmb - return false; - - // ok, so we can omit this volatile barrier if it has been inserted - // as part of a volatile store sequence - // - // MemBarRelease - // { || } - // {MemBarCPUOrder} -- optional - // || \\ - // || StoreX[mo_release] - // | \ / - // | MergeMem - // | / - // MemBarVolatile - // - // where - // || and \\ represent Ctl and Mem feeds via Proj nodes - // | \ and / indicate further routing of the Ctl and Mem feeds - // - // we need to check that - // - // i) the volatile membar gets its control feed from a release - // membar (or its dependent cpuorder membar) via a Control project - // node - // - // ii) the release membar (or its dependent cpuorder membar) also - // feeds control to a store node via the same proj node - // - // iii) the store is ordered release - // - // iv) the release membar (or its dependent cpuorder membar) feeds - // memory to a merge mem and to the same store (both via a single - // Memory proj node) - // - // v) the store outputs to the merge mem - // - // vi) the merge mem outputs to the volatile membar - // - // n.b. for an inlined unsafe store of an object in the case where - // !TypePtr::NULL_PTR->higher_equal(type(heap_base_oop)) we may see - // an embedded if then else where we expect the store. this is - // needed to do the right type of store depending on whether - // heap_base is NULL. We could check for that but for now we can - // just take the hit of on inserting a redundant dmb for this - // redundant volatile membar - - MemBarNode *mbvol = n->as_MemBar(); - Node *x = n->lookup(TypeFunc::Control); + // the store must be fed by a membar + + Node *x = st->lookup(StoreNode::Memory); if (! 
x || !x->is_Proj()) return false; @@ -1659,200 +2367,78 @@ MemBarNode *barrier = x->as_MemBar(); - // if the barrier is a release membar we have what we want. if it is - // a cpuorder membar then we need to ensure that it is fed by a - // release membar in which case we proceed to check the graph below - // this cpuorder membar as the feed - - if (x->Opcode() != Op_MemBarRelease) { - if (x->Opcode() != Op_MemBarCPUOrder) - return false; - ProjNode *ctl; - ProjNode *mem; - MemBarNode *b = has_parent_membar(x, ctl, mem); - if (!b || !b->Opcode() == Op_MemBarRelease) - return false; - } - - ProjNode *ctl = barrier->proj_out(TypeFunc::Control); - ProjNode *mem = barrier->proj_out(TypeFunc::Memory); - - // barrier needs to have both a Ctl and Mem projection - // and we need to have reached it via the Ctl projection - if (! ctl || ! mem || ctl != proj) - return false; - - StoreNode * st = NULL; - - // The Ctl ProjNode should have output to a MemBarVolatile and - // a Store marked as releasing - for (DUIterator_Fast imax, i = ctl->fast_outs(imax); i < imax; i++) { - x = ctl->fast_out(i); - if (x->is_MemBar() && x->Opcode() == Op_MemBarVolatile) { - if (x != mbvol) { - return false; - } - } else if (x->is_Store()) { - st = x->as_Store(); - if (! st->is_release()) { - return false; - } - } else if (!x->is_Mach()){ - // we may see mach nodes added during matching but nothing else - return false; - } - } - - if (!st) - return false; - - // the Mem ProjNode should output to a MergeMem and the same Store - Node *mm = NULL; - for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) { - x = mem->fast_out(i); - if (!mm && x->is_MergeMem()) { - mm = x; - } else if (x != st && !x->is_Mach()) { - // we may see mach nodes added during matching but nothing else - return false; - } - } - - if (!mm) - return false; - - // the MergeMem should output to the MemBarVolatile - for (DUIterator_Fast imax, i = mm->fast_outs(imax); i < imax; i++) { - x = mm->fast_out(i); - if (x != mbvol && !x->is_Mach()) { - // we may see mach nodes added during matching but nothing else - return false; - } - } - - return true; -} - - - -bool needs_releasing_store(const Node *n) -{ - // assert n->is_Store(); - if (UseBarriersForVolatile) - // we use a normal store and dmb combination + // if the barrier is a release membar or a cpuorder mmebar fed by a + // release membar then we need to check whether that forms part of a + // volatile put graph. + + // reject invalid candidates + if (!leading_membar(barrier)) return false; - StoreNode *st = n->as_Store(); - - if (!st->is_release()) - return false; - - // check if this store is bracketed by a release (or its dependent - // cpuorder membar) and a volatile membar - // - // MemBarRelease - // { || } - // {MemBarCPUOrder} -- optional - // || \\ - // || StoreX[mo_release] - // | \ / - // | MergeMem - // | / - // MemBarVolatile - // - // where - // || and \\ represent Ctl and Mem feeds via Proj nodes - // | \ and / indicate further routing of the Ctl and Mem feeds - // - - - Node *x = st->lookup(TypeFunc::Control); - - if (! x || !x->is_Proj()) - return false; - - ProjNode *proj = x->as_Proj(); - - x = proj->lookup(0); - - if (!x || !x->is_MemBar()) - return false; - - MemBarNode *barrier = x->as_MemBar(); - - // if the barrier is a release membar we have what we want. 
if it is - // a cpuorder membar then we need to ensure that it is fed by a - // release membar in which case we proceed to check the graph below - // this cpuorder membar as the feed - - if (x->Opcode() != Op_MemBarRelease) { - if (x->Opcode() != Op_MemBarCPUOrder) - return false; - Node *ctl = x->lookup(TypeFunc::Control); - Node *mem = x->lookup(TypeFunc::Memory); - if (!ctl || !ctl->is_Proj() || !mem || !mem->is_Proj()) - return false; - x = ctl->lookup(0); - if (!x || !x->is_MemBar() || !x->Opcode() == Op_MemBarRelease) - return false; - Node *y = mem->lookup(0); - if (!y || y != x) - return false; - } - - ProjNode *ctl = barrier->proj_out(TypeFunc::Control); - ProjNode *mem = barrier->proj_out(TypeFunc::Memory); - - // MemBarRelease needs to have both a Ctl and Mem projection - // and we need to have reached it via the Ctl projection - if (! ctl || ! mem || ctl != proj) - return false; - - MemBarNode *mbvol = NULL; - - // The Ctl ProjNode should have output to a MemBarVolatile and - // a Store marked as releasing - for (DUIterator_Fast imax, i = ctl->fast_outs(imax); i < imax; i++) { - x = ctl->fast_out(i); - if (x->is_MemBar() && x->Opcode() == Op_MemBarVolatile) { - mbvol = x->as_MemBar(); - } else if (x->is_Store()) { - if (x != st) { - return false; - } - } else if (!x->is_Mach()){ - return false; - } - } + // does this lead a normal subgraph? + MemBarNode *mbvol = leading_to_normal(barrier); if (!mbvol) return false; - // the Mem ProjNode should output to a MergeMem and the same Store - Node *mm = NULL; - for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) { - x = mem->fast_out(i); - if (!mm && x->is_MergeMem()) { - mm = x; - } else if (x != st && !x->is_Mach()) { - return false; - } - } - - if (!mm) + // all done unless this is a card mark + if (!is_card_mark_membar(mbvol)) + return true; + + // we found a card mark -- just make sure we have a trailing barrier + + return (card_mark_to_trailing(mbvol) != NULL); +} + +// predicate controlling translation of StoreCM +// +// returns true if a StoreStore must precede the card write otherwise +// false + +bool unnecessary_storestore(const Node *storecm) +{ + assert(storecm->Opcode() == Op_StoreCM, "expecting a StoreCM"); + + // we only ever need to generate a dmb ishst between an object put + // and the associated card mark when we are using CMS without + // conditional card marking + + if (!UseConcMarkSweepGC || UseCondCardMark) + return true; + + // if we are implementing volatile puts using barriers then the + // object put as an str so we must insert the dmb ishst + + if (UseBarriersForVolatile) return false; - // the MergeMem should output to the MemBarVolatile - for (DUIterator_Fast imax, i = mm->fast_outs(imax); i < imax; i++) { - x = mm->fast_out(i); - if (x != mbvol && !x->is_Mach()) { - return false; - } - } - - return true; -} - + // we can omit the dmb ishst if this StoreCM is part of a volatile + // put because in thta case the put will be implemented by stlr + // + // we need to check for a normal subgraph feeding this StoreCM. 
+ // that means the StoreCM must be fed Memory from a leading membar, + // either a MemBarRelease or its dependent MemBarCPUOrder, and the + // leading membar must be part of a normal subgraph + + Node *x = storecm->in(StoreNode::Memory); + + if (!x->is_Proj()) + return false; + + x = x->in(0); + + if (!x->is_MemBar()) + return false; + + MemBarNode *leading = x->as_MemBar(); + + // reject invalid candidates + if (!leading_membar(leading)) + return false; + + // we can omit the StoreStore if it is the head of a normal subgraph + return (leading_to_normal(leading) != NULL); +} #define __ _masm. @@ -2944,6 +3530,13 @@ as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp); %} + enc_class aarch64_enc_strb0_ordered(memory mem) %{ + MacroAssembler _masm(&cbuf); + __ membar(Assembler::StoreStore); + loadStore(_masm, &MacroAssembler::strb, zr, $mem->opcode(), + as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp); + %} + enc_class aarch64_enc_strh(iRegI src, memory mem) %{ Register src_reg = as_Register($src$$reg); loadStore(MacroAssembler(&cbuf), &MacroAssembler::strh, src_reg, $mem->opcode(), @@ -6613,6 +7206,7 @@ instruct storeimmCM0(immI0 zero, memory mem) %{ match(Set mem (StoreCM mem zero)); + predicate(unnecessary_storestore(n)); ins_cost(INSN_COST); format %{ "strb zr, $mem\t# byte" %} @@ -6622,6 +7216,21 @@ ins_pipe(istore_mem); %} +// Store CMS card-mark Immediate with intervening StoreStore +// needed when using CMS with no conditional card marking +instruct storeimmCM0_ordered(immI0 zero, memory mem) +%{ + match(Set mem (StoreCM mem zero)); + + ins_cost(INSN_COST * 2); + format %{ "dmb ishst" + "\n\tstrb zr, $mem\t# byte" %} + + ins_encode(aarch64_enc_strb0_ordered(mem)); + + ins_pipe(istore_mem); +%} + // Store Byte instruct storeB(iRegIorL2I src, memory mem) %{ @@ -6643,7 +7252,7 @@ predicate(!needs_releasing_store(n)); ins_cost(INSN_COST); - format %{ "strb zr, $mem\t# byte" %} + format %{ "strb rscractch2, $mem\t# byte" %} ins_encode(aarch64_enc_strb0(mem)); @@ -7396,6 +8005,7 @@ format %{ "membar_acquire" %} ins_encode %{ + __ block_comment("membar_acquire"); __ membar(Assembler::LoadLoad|Assembler::LoadStore); %} @@ -7448,6 +8058,7 @@ format %{ "membar_release" %} ins_encode %{ + __ block_comment("membar_release"); __ membar(Assembler::LoadStore|Assembler::StoreStore); %} ins_pipe(pipe_serial); @@ -7499,6 +8110,7 @@ format %{ "membar_volatile" %} ins_encode %{ + __ block_comment("membar_volatile"); __ membar(Assembler::StoreLoad); %} @@ -9429,7 +10041,7 @@ as_Register($src1$$reg), as_Register($src2$$reg), Assembler::LSR, - $src3$$constant & 0x3f); + $src3$$constant & 0x1f); %} ins_pipe(ialu_reg_reg_shift); @@ -9465,7 +10077,7 @@ as_Register($src1$$reg), as_Register($src2$$reg), Assembler::ASR, - $src3$$constant & 0x3f); + $src3$$constant & 0x1f); %} ins_pipe(ialu_reg_reg_shift); @@ -9501,7 +10113,7 @@ as_Register($src1$$reg), as_Register($src2$$reg), Assembler::LSL, - $src3$$constant & 0x3f); + $src3$$constant & 0x1f); %} ins_pipe(ialu_reg_reg_shift); @@ -9537,7 +10149,7 @@ as_Register($src1$$reg), as_Register($src2$$reg), Assembler::LSR, - $src3$$constant & 0x3f); + $src3$$constant & 0x1f); %} ins_pipe(ialu_reg_reg_shift); @@ -9573,7 +10185,7 @@ as_Register($src1$$reg), as_Register($src2$$reg), Assembler::ASR, - $src3$$constant & 0x3f); + $src3$$constant & 0x1f); %} ins_pipe(ialu_reg_reg_shift); @@ -9609,7 +10221,7 @@ as_Register($src1$$reg), as_Register($src2$$reg), Assembler::LSL, - $src3$$constant & 0x3f); + $src3$$constant & 0x1f); %} 
ins_pipe(ialu_reg_reg_shift); @@ -9645,7 +10257,7 @@ as_Register($src1$$reg), as_Register($src2$$reg), Assembler::LSR, - $src3$$constant & 0x3f); + $src3$$constant & 0x1f); %} ins_pipe(ialu_reg_reg_shift); @@ -9681,7 +10293,7 @@ as_Register($src1$$reg), as_Register($src2$$reg), Assembler::ASR, - $src3$$constant & 0x3f); + $src3$$constant & 0x1f); %} ins_pipe(ialu_reg_reg_shift); @@ -9717,7 +10329,7 @@ as_Register($src1$$reg), as_Register($src2$$reg), Assembler::LSL, - $src3$$constant & 0x3f); + $src3$$constant & 0x1f); %} ins_pipe(ialu_reg_reg_shift); @@ -9754,7 +10366,7 @@ as_Register($src1$$reg), as_Register($src2$$reg), Assembler::LSR, - $src3$$constant & 0x3f); + $src3$$constant & 0x1f); %} ins_pipe(ialu_reg_reg_shift); @@ -9792,7 +10404,7 @@ as_Register($src1$$reg), as_Register($src2$$reg), Assembler::ASR, - $src3$$constant & 0x3f); + $src3$$constant & 0x1f); %} ins_pipe(ialu_reg_reg_shift); @@ -9830,7 +10442,7 @@ as_Register($src1$$reg), as_Register($src2$$reg), Assembler::LSL, - $src3$$constant & 0x3f); + $src3$$constant & 0x1f); %} ins_pipe(ialu_reg_reg_shift); @@ -9868,7 +10480,7 @@ as_Register($src1$$reg), as_Register($src2$$reg), Assembler::LSR, - $src3$$constant & 0x3f); + $src3$$constant & 0x1f); %} ins_pipe(ialu_reg_reg_shift); @@ -9906,7 +10518,7 @@ as_Register($src1$$reg), as_Register($src2$$reg), Assembler::ASR, - $src3$$constant & 0x3f); + $src3$$constant & 0x1f); %} ins_pipe(ialu_reg_reg_shift); @@ -9944,7 +10556,7 @@ as_Register($src1$$reg), as_Register($src2$$reg), Assembler::LSL, - $src3$$constant & 0x3f); + $src3$$constant & 0x1f); %} ins_pipe(ialu_reg_reg_shift); @@ -9982,7 +10594,7 @@ as_Register($src1$$reg), as_Register($src2$$reg), Assembler::LSR, - $src3$$constant & 0x3f); + $src3$$constant & 0x1f); %} ins_pipe(ialu_reg_reg_shift); @@ -10020,7 +10632,7 @@ as_Register($src1$$reg), as_Register($src2$$reg), Assembler::ASR, - $src3$$constant & 0x3f); + $src3$$constant & 0x1f); %} ins_pipe(ialu_reg_reg_shift); @@ -10058,7 +10670,7 @@ as_Register($src1$$reg), as_Register($src2$$reg), Assembler::LSL, - $src3$$constant & 0x3f); + $src3$$constant & 0x1f); %} ins_pipe(ialu_reg_reg_shift); @@ -10096,7 +10708,7 @@ as_Register($src1$$reg), as_Register($src2$$reg), Assembler::LSR, - $src3$$constant & 0x3f); + $src3$$constant & 0x1f); %} ins_pipe(ialu_reg_reg_shift); @@ -10134,7 +10746,7 @@ as_Register($src1$$reg), as_Register($src2$$reg), Assembler::ASR, - $src3$$constant & 0x3f); + $src3$$constant & 0x1f); %} ins_pipe(ialu_reg_reg_shift); @@ -10172,7 +10784,7 @@ as_Register($src1$$reg), as_Register($src2$$reg), Assembler::LSL, - $src3$$constant & 0x3f); + $src3$$constant & 0x1f); %} ins_pipe(ialu_reg_reg_shift); @@ -10210,7 +10822,7 @@ as_Register($src1$$reg), as_Register($src2$$reg), Assembler::LSR, - $src3$$constant & 0x3f); + $src3$$constant & 0x1f); %} ins_pipe(ialu_reg_reg_shift); @@ -10248,7 +10860,7 @@ as_Register($src1$$reg), as_Register($src2$$reg), Assembler::ASR, - $src3$$constant & 0x3f); + $src3$$constant & 0x1f); %} ins_pipe(ialu_reg_reg_shift); @@ -10286,7 +10898,7 @@ as_Register($src1$$reg), as_Register($src2$$reg), Assembler::LSL, - $src3$$constant & 0x3f); + $src3$$constant & 0x1f); %} ins_pipe(ialu_reg_reg_shift); diff -r eb1661ea942c -r 6675700073c1 hotspot/src/cpu/aarch64/vm/aarch64_ad.m4 --- a/hotspot/src/cpu/aarch64/vm/aarch64_ad.m4 Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/cpu/aarch64/vm/aarch64_ad.m4 Thu Sep 03 16:14:02 2015 -0700 @@ -42,7 +42,7 @@ as_Register($src1$$reg), as_Register($src2$$reg), Assembler::$5, - $src3$$constant & 0x3f); + 
$src3$$constant & ifelse($1,I,0x1f,0x3f)); %} ins_pipe(ialu_reg_reg_shift); @@ -87,7 +87,7 @@ as_Register($src1$$reg), as_Register($src2$$reg), Assembler::$5, - $src3$$constant & 0x3f); + $src3$$constant & ifelse($1,I,0x1f,0x3f)); %} ins_pipe(ialu_reg_reg_shift); diff -r eb1661ea942c -r 6675700073c1 hotspot/src/cpu/aarch64/vm/assembler_aarch64.cpp --- a/hotspot/src/cpu/aarch64/vm/assembler_aarch64.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/cpu/aarch64/vm/assembler_aarch64.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -268,7 +268,7 @@ __ ldar(r21, r28); // ldar x21, [x28] // LoadStoreExclusiveOp - __ stxrw(r24, r24, r7); // stxr w24, w24, [x7] + __ stxrw(r21, r24, r7); // stxr w21, w24, [x7] __ stlxrw(r21, r26, r28); // stlxr w21, w26, [x28] __ ldxrw(r21, r6); // ldxr w21, [x6] __ ldaxrw(r15, r30); // ldaxr w15, [x30] @@ -299,7 +299,7 @@ // LoadStoreExclusiveOp __ ldxpw(r25, r4, r22); // ldxp w25, w4, [x22] - __ ldaxpw(r14, r14, r15); // ldaxp w14, w14, [x15] + __ ldaxpw(r13, r14, r15); // ldaxp w13, w14, [x15] __ stxpw(r20, r26, r8, r10); // stxp w20, w26, w8, [x10] __ stlxpw(r23, r18, r18, r18); // stlxp w23, w18, w18, [x18] @@ -773,7 +773,7 @@ 260: c85fffbb ldaxr x27, [x29] 264: c89fffa0 stlr x0, [x29] 268: c8dfff95 ldar x21, [x28] - 26c: 88187cf8 stxr w24, w24, [x7] + 26c: 88157cf8 stxr w21, w24, [x7] 270: 8815ff9a stlxr w21, w26, [x28] 274: 885f7cd5 ldxr w21, [x6] 278: 885fffcf ldaxr w15, [x30] @@ -796,7 +796,7 @@ 2bc: c82870bb stxp w8, x27, x28, [x5] 2c0: c825b8c8 stlxp w5, x8, x14, [x6] 2c4: 887f12d9 ldxp w25, w4, [x22] - 2c8: 887fb9ee ldaxp w14, w14, [x15] + 2c8: 887fb9ed ldaxp w13, w14, [x15] 2cc: 8834215a stxp w20, w26, w8, [x10] 2d0: 8837ca52 stlxp w23, w18, w18, [x18] 2d4: f806317e str x30, [x11,#99] @@ -1085,13 +1085,13 @@ 0xd444c320, 0xd503201f, 0xd69f03e0, 0xd6bf03e0, 0xd5033fdf, 0xd5033f9f, 0xd5033abf, 0xd61f0040, 0xd63f00a0, 0xc8147c55, 0xc805fcfd, 0xc85f7e05, - 0xc85fffbb, 0xc89fffa0, 0xc8dfff95, 0x88187cf8, + 0xc85fffbb, 0xc89fffa0, 0xc8dfff95, 0x88157cf8, 0x8815ff9a, 0x885f7cd5, 0x885fffcf, 0x889ffc73, 0x88dffc56, 0x48127c0f, 0x480bff85, 0x485f7cdd, 0x485ffcf2, 0x489fff99, 0x48dffe62, 0x080a7c3e, 0x0814fed5, 0x085f7c59, 0x085ffcb8, 0x089ffc70, 0x08dfffb6, 0xc87f0a68, 0xc87fcdc7, 0xc82870bb, - 0xc825b8c8, 0x887f12d9, 0x887fb9ee, 0x8834215a, + 0xc825b8c8, 0x887f12d9, 0x887fb9ed, 0x8834215a, 0x8837ca52, 0xf806317e, 0xb81b3337, 0x39000dc2, 0x78005149, 0xf84391f4, 0xb85b220c, 0x385fd356, 0x785d127e, 0x389f4149, 0x79801e3c, 0x79c014a3, diff -r eb1661ea942c -r 6675700073c1 hotspot/src/cpu/aarch64/vm/assembler_aarch64.hpp --- a/hotspot/src/cpu/aarch64/vm/assembler_aarch64.hpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/cpu/aarch64/vm/assembler_aarch64.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -1106,13 +1106,13 @@ #define INSN4(NAME, sz, op, o0) /* Four registers */ \ void NAME(Register Rs, Register Rt1, Register Rt2, Register Rn) { \ - assert(Rs != Rn, "unpredictable instruction"); \ + guarantee(Rs != Rn && Rs != Rt1 && Rs != Rt2, "unpredictable instruction"); \ load_store_exclusive(Rs, Rt1, Rt2, Rn, sz, op, o0); \ } #define INSN3(NAME, sz, op, o0) /* Three registers */ \ void NAME(Register Rs, Register Rt, Register Rn) { \ - assert(Rs != Rn, "unpredictable instruction"); \ + guarantee(Rs != Rn && Rs != Rt, "unpredictable instruction"); \ load_store_exclusive(Rs, Rt, (Register)0b11111, Rn, sz, op, o0); \ } @@ -1124,6 +1124,7 @@ #define INSN_FOO(NAME, sz, op, o0) /* Three registers, encoded differently */ \ void NAME(Register Rt1, Register Rt2, Register Rn) { \ + guarantee(Rt1 
!= Rt2, "unpredictable instruction"); \ load_store_exclusive((Register)0b11111, Rt1, Rt2, Rn, sz, op, o0); \ } diff -r eb1661ea942c -r 6675700073c1 hotspot/src/cpu/aarch64/vm/interp_masm_aarch64.cpp --- a/hotspot/src/cpu/aarch64/vm/interp_masm_aarch64.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/cpu/aarch64/vm/interp_masm_aarch64.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -611,6 +611,7 @@ Label done; const Register swap_reg = r0; + const Register tmp = c_rarg2; const Register obj_reg = c_rarg3; // Will contain the oop const int obj_offset = BasicObjectLock::obj_offset_in_bytes(); @@ -624,7 +625,7 @@ ldr(obj_reg, Address(lock_reg, obj_offset)); if (UseBiasedLocking) { - biased_locking_enter(lock_reg, obj_reg, swap_reg, rscratch2, false, done, &slow_case); + biased_locking_enter(lock_reg, obj_reg, swap_reg, tmp, false, done, &slow_case); } // Load (object->mark() | 1) into swap_reg @@ -643,7 +644,7 @@ cmpxchgptr(swap_reg, lock_reg, obj_reg, rscratch1, fast, &fail); bind(fast); atomic_incw(Address((address)BiasedLocking::fast_path_entry_count_addr()), - rscratch2, rscratch1); + rscratch2, rscratch1, tmp); b(done); bind(fail); } else { @@ -671,7 +672,7 @@ if (PrintBiasedLockingStatistics) { br(Assembler::NE, slow_case); atomic_incw(Address((address)BiasedLocking::fast_path_entry_count_addr()), - rscratch2, rscratch1); + rscratch2, rscratch1, tmp); } br(Assembler::EQ, done); diff -r eb1661ea942c -r 6675700073c1 hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp --- a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -34,6 +34,7 @@ #include "memory/resourceArea.hpp" #include "nativeInst_aarch64.hpp" #include "oops/klass.inline.hpp" +#include "oops/oop.inline.hpp" #include "opto/compile.hpp" #include "opto/node.hpp" #include "runtime/biasedLocking.hpp" @@ -398,11 +399,7 @@ if (PrintBiasedLockingStatistics && counters == NULL) counters = BiasedLocking::counters(); - bool need_tmp_reg = false; - if (tmp_reg == noreg) { - tmp_reg = rscratch2; - } - assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1); + assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg); assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout"); Address mark_addr (obj_reg, oopDesc::mark_offset_in_bytes()); Address klass_addr (obj_reg, oopDesc::klass_offset_in_bytes()); @@ -432,7 +429,7 @@ if (counters != NULL) { Label around; cbnz(tmp_reg, around); - atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1); + atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2); b(done); bind(around); } else { @@ -485,7 +482,7 @@ bind(here); if (counters != NULL) { atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()), - tmp_reg, rscratch1); + tmp_reg, rscratch1, rscratch2); } } b(done); @@ -511,7 +508,7 @@ bind(here); if (counters != NULL) { atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()), - tmp_reg, rscratch1); + tmp_reg, rscratch1, rscratch2); } } b(done); @@ -539,7 +536,7 @@ // removing the bias bit from the object's header. 
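The strengthened guarantee()s on the exclusive-access encoders and the extra tmp register threaded through atomic_incw() in the surrounding hunks enforce the same architectural rule: the status register written by stxr must be distinct from the data and the address registers, otherwise the instruction is unpredictable (the old code reused one register for both data and status). A standalone sketch of the corrected retry loop, using GCC inline assembly rather than HotSpot's MacroAssembler:

    // Sketch only: a 32-bit LL/SC increment in which the stxr status register
    // is kept separate from both the data register and the address register.
    #include <cstdint>
    #include <cstdio>

    uint32_t atomic_incw(volatile uint32_t* counter) {
    #if defined(__aarch64__)
      uint32_t value, status;
      asm volatile(
          "1: ldxr  %w0, [%2]      \n"   // load-exclusive the counter
          "   add   %w0, %w0, #1   \n"   // bump it
          "   stxr  %w1, %w0, [%2] \n"   // status lands in a different register
          "   cbnz  %w1, 1b        \n"   // retry if another writer got in first
          : "=&r"(value), "=&r"(status)
          : "r"(counter)
          : "memory");
      return value;
    #else
      return __atomic_add_fetch(counter, 1, __ATOMIC_RELAXED);  // portable fallback
    #endif
    }

    int main() {
      volatile uint32_t c = 41;
      std::printf("%u\n", atomic_incw(&c));  // prints 42
    }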
if (counters != NULL) { atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg, - rscratch1); + rscratch1, rscratch2); } bind(nope); } @@ -1640,15 +1637,15 @@ return Address(Rd); } -void MacroAssembler::atomic_incw(Register counter_addr, Register tmp) { +void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) { Label retry_load; bind(retry_load); // flush and load exclusive from the memory location ldxrw(tmp, counter_addr); addw(tmp, tmp, 1); // if we store+flush with no intervening write tmp wil be zero - stxrw(tmp, tmp, counter_addr); - cbnzw(tmp, retry_load); + stxrw(tmp2, tmp, counter_addr); + cbnzw(tmp2, retry_load); } @@ -2021,6 +2018,14 @@ } } +void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) { + if (decrement.is_register()) { + subw(Rd, Rn, decrement.as_register()); + } else { + subw(Rd, Rn, decrement.as_constant()); + } +} + void MacroAssembler::reinit_heapbase() { if (UseCompressedOops) { @@ -2110,7 +2115,7 @@ return a != b.as_register() && a != c && b.as_register() != c; } -#define ATOMIC_OP(LDXR, OP, STXR) \ +#define ATOMIC_OP(LDXR, OP, IOP, STXR) \ void MacroAssembler::atomic_##OP(Register prev, RegisterOrConstant incr, Register addr) { \ Register result = rscratch2; \ if (prev->is_valid()) \ @@ -2120,14 +2125,15 @@ bind(retry_load); \ LDXR(result, addr); \ OP(rscratch1, result, incr); \ - STXR(rscratch1, rscratch1, addr); \ - cbnzw(rscratch1, retry_load); \ - if (prev->is_valid() && prev != result) \ - mov(prev, result); \ + STXR(rscratch2, rscratch1, addr); \ + cbnzw(rscratch2, retry_load); \ + if (prev->is_valid() && prev != result) { \ + IOP(prev, rscratch1, incr); \ + } \ } -ATOMIC_OP(ldxr, add, stxr) -ATOMIC_OP(ldxrw, addw, stxrw) +ATOMIC_OP(ldxr, add, sub, stxr) +ATOMIC_OP(ldxrw, addw, subw, stxrw) #undef ATOMIC_OP diff -r eb1661ea942c -r 6675700073c1 hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp --- a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -107,9 +107,7 @@ // Biased locking support // lock_reg and obj_reg must be loaded up with the appropriate values. // swap_reg is killed. - // tmp_reg is optional. If it is supplied (i.e., != noreg) it will - // be killed; if not supplied, push/pop will be used internally to - // allocate a temporary (inefficient, avoid if possible). + // tmp_reg must be supplied and must not be rscratch1 or rscratch2 // Optional slow case is for implementations (interpreter and C1) which branch to // slow case directly. Leaves condition codes set for C2's Fast_Lock node. // Returns offset of first potentially-faulting instruction for null @@ -126,10 +124,10 @@ // Helper functions for statistics gathering. // Unconditional atomic increment. 
- void atomic_incw(Register counter_addr, Register tmp); - void atomic_incw(Address counter_addr, Register tmp1, Register tmp2) { + void atomic_incw(Register counter_addr, Register tmp, Register tmp2); + void atomic_incw(Address counter_addr, Register tmp1, Register tmp2, Register tmp3) { lea(tmp1, counter_addr); - atomic_incw(tmp1, tmp2); + atomic_incw(tmp1, tmp2, tmp3); } // Load Effective Address void lea(Register r, const Address &a) { @@ -1057,6 +1055,7 @@ void add(Register Rd, Register Rn, RegisterOrConstant increment); void addw(Register Rd, Register Rn, RegisterOrConstant increment); void sub(Register Rd, Register Rn, RegisterOrConstant decrement); + void subw(Register Rd, Register Rn, RegisterOrConstant decrement); void adrp(Register reg1, const Address &dest, unsigned long &byte_offset); diff -r eb1661ea942c -r 6675700073c1 hotspot/src/cpu/aarch64/vm/sharedRuntime_aarch64.cpp --- a/hotspot/src/cpu/aarch64/vm/sharedRuntime_aarch64.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/cpu/aarch64/vm/sharedRuntime_aarch64.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -1774,6 +1774,7 @@ const Register obj_reg = r19; // Will contain the oop const Register lock_reg = r13; // Address of compiler lock object (BasicLock) const Register old_hdr = r13; // value of old header at unlock time + const Register tmp = c_rarg3; Label slow_path_lock; Label lock_done; @@ -1795,7 +1796,7 @@ __ ldr(obj_reg, Address(oop_handle_reg, 0)); if (UseBiasedLocking) { - __ biased_locking_enter(lock_reg, obj_reg, swap_reg, rscratch2, false, lock_done, &slow_path_lock); + __ biased_locking_enter(lock_reg, obj_reg, swap_reg, tmp, false, lock_done, &slow_path_lock); } // Load (object->mark() | 1) into swap_reg %r0 diff -r eb1661ea942c -r 6675700073c1 hotspot/src/cpu/aarch64/vm/templateInterpreter_aarch64.cpp --- a/hotspot/src/cpu/aarch64/vm/templateInterpreter_aarch64.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/cpu/aarch64/vm/templateInterpreter_aarch64.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -1913,15 +1913,18 @@ } void TemplateInterpreterGenerator::count_bytecode() { + Register rscratch3 = r0; __ push(rscratch1); __ push(rscratch2); + __ push(rscratch3); Label L; __ mov(rscratch2, (address) &BytecodeCounter::_counter_value); __ bind(L); __ ldxr(rscratch1, rscratch2); __ add(rscratch1, rscratch1, 1); - __ stxr(rscratch1, rscratch1, rscratch2); - __ cbnzw(rscratch1, L); + __ stxr(rscratch3, rscratch1, rscratch2); + __ cbnzw(rscratch3, L); + __ pop(rscratch3); __ pop(rscratch2); __ pop(rscratch1); } diff -r eb1661ea942c -r 6675700073c1 hotspot/src/cpu/x86/vm/assembler_x86.cpp --- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -1674,6 +1674,13 @@ emit_simd_arith(0x2A, dst, src, VEX_SIMD_F3, true); } +void Assembler::cvtsi2ssq(XMMRegister dst, Register src) { + NOT_LP64(assert(VM_Version::supports_sse(), "")); + int encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F3, true); + emit_int8(0x2A); + emit_int8((unsigned char)(0xC0 | encode)); +} + void Assembler::cvtss2sd(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse2(), "")); emit_simd_arith(0x5A, dst, src, VEX_SIMD_F3); @@ -6604,13 +6611,6 @@ emit_operand(dst, src); } -void Assembler::cvtsi2ssq(XMMRegister dst, Register src) { - NOT_LP64(assert(VM_Version::supports_sse(), "")); - int encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F3, true); - emit_int8(0x2A); - emit_int8((unsigned char)(0xC0 | encode)); -} - void 
Assembler::cvtsi2ssq(XMMRegister dst, Address src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); if (VM_Version::supports_evex()) { diff -r eb1661ea942c -r 6675700073c1 hotspot/src/cpu/x86/vm/interp_masm_x86.cpp --- a/hotspot/src/cpu/x86/vm/interp_masm_x86.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/cpu/x86/vm/interp_masm_x86.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -355,8 +355,8 @@ case ctos: // fall through case stos: // fall through case itos: movl(rax, val_addr); break; - case ftos: movflt(xmm0, val_addr); break; - case dtos: movdbl(xmm0, val_addr); break; + case ftos: load_float(val_addr); break; + case dtos: load_double(val_addr); break; case vtos: /* nothing to do */ break; default : ShouldNotReachHere(); } @@ -376,8 +376,8 @@ case ctos: // fall through case stos: // fall through case itos: movl(rax, val_addr); break; - case ftos: fld_s(val_addr); break; - case dtos: fld_d(val_addr); break; + case ftos: load_float(val_addr); break; + case dtos: load_double(val_addr); break; case vtos: /* nothing to do */ break; default : ShouldNotReachHere(); } @@ -578,6 +578,26 @@ push(r); } +void InterpreterMacroAssembler::push_f(XMMRegister r) { + subptr(rsp, wordSize); + movflt(Address(rsp, 0), r); +} + +void InterpreterMacroAssembler::pop_f(XMMRegister r) { + movflt(r, Address(rsp, 0)); + addptr(rsp, wordSize); +} + +void InterpreterMacroAssembler::push_d(XMMRegister r) { + subptr(rsp, 2 * wordSize); + movdbl(Address(rsp, 0), r); +} + +void InterpreterMacroAssembler::pop_d(XMMRegister r) { + movdbl(r, Address(rsp, 0)); + addptr(rsp, 2 * Interpreter::stackElementSize); +} + #ifdef _LP64 void InterpreterMacroAssembler::pop_i(Register r) { // XXX can't use pop currently, upper half non clean @@ -590,31 +610,11 @@ addptr(rsp, 2 * Interpreter::stackElementSize); } -void InterpreterMacroAssembler::pop_f(XMMRegister r) { - movflt(r, Address(rsp, 0)); - addptr(rsp, wordSize); -} - -void InterpreterMacroAssembler::pop_d(XMMRegister r) { - movdbl(r, Address(rsp, 0)); - addptr(rsp, 2 * Interpreter::stackElementSize); -} - void InterpreterMacroAssembler::push_l(Register r) { subptr(rsp, 2 * wordSize); movq(Address(rsp, 0), r); } -void InterpreterMacroAssembler::push_f(XMMRegister r) { - subptr(rsp, wordSize); - movflt(Address(rsp, 0), r); -} - -void InterpreterMacroAssembler::push_d(XMMRegister r) { - subptr(rsp, 2 * wordSize); - movdbl(Address(rsp, 0), r); -} - void InterpreterMacroAssembler::pop(TosState state) { switch (state) { case atos: pop_ptr(); break; @@ -623,8 +623,8 @@ case stos: case itos: pop_i(); break; case ltos: pop_l(); break; - case ftos: pop_f(); break; - case dtos: pop_d(); break; + case ftos: pop_f(xmm0); break; + case dtos: pop_d(xmm0); break; case vtos: /* nothing to do */ break; default: ShouldNotReachHere(); } @@ -640,8 +640,8 @@ case stos: case itos: push_i(); break; case ltos: push_l(); break; - case ftos: push_f(); break; - case dtos: push_d(); break; + case ftos: push_f(xmm0); break; + case dtos: push_d(xmm0); break; case vtos: /* nothing to do */ break; default : ShouldNotReachHere(); } @@ -675,8 +675,20 @@ case stos: // fall through case itos: pop_i(rax); break; case ltos: pop_l(rax, rdx); break; - case ftos: pop_f(); break; - case dtos: pop_d(); break; + case ftos: + if (UseSSE >= 1) { + pop_f(xmm0); + } else { + pop_f(); + } + break; + case dtos: + if (UseSSE >= 2) { + pop_d(xmm0); + } else { + pop_d(); + } + break; case vtos: /* nothing to do */ break; default : ShouldNotReachHere(); } @@ -695,7 +707,7 @@ fstp_s(Address(rsp, 0)); } -void 
InterpreterMacroAssembler::push_d(Register r) { +void InterpreterMacroAssembler::push_d() { // Do not schedule for no AGI! Never write beyond rsp! subptr(rsp, 2 * wordSize); fstp_d(Address(rsp, 0)); @@ -711,8 +723,20 @@ case stos: // fall through case itos: push_i(rax); break; case ltos: push_l(rax, rdx); break; - case ftos: push_f(); break; - case dtos: push_d(rax); break; + case ftos: + if (UseSSE >= 1) { + push_f(xmm0); + } else { + push_f(); + } + break; + case dtos: + if (UseSSE >= 2) { + push_d(xmm0); + } else { + push_d(); + } + break; case vtos: /* nothing to do */ break; default : ShouldNotReachHere(); } @@ -995,22 +1019,6 @@ leave(); // remove frame anchor pop(ret_addr); // get return address mov(rsp, rbx); // set sp to sender sp -#ifndef _LP64 - if (UseSSE) { - // float and double are returned in xmm register in SSE-mode - if (state == ftos && UseSSE >= 1) { - subptr(rsp, wordSize); - fstp_s(Address(rsp, 0)); - movflt(xmm0, Address(rsp, 0)); - addptr(rsp, wordSize); - } else if (state == dtos && UseSSE >= 2) { - subptr(rsp, 2*wordSize); - fstp_d(Address(rsp, 0)); - movdbl(xmm0, Address(rsp, 0)); - addptr(rsp, 2*wordSize); - } - } -#endif // _LP64 } #endif // !CC_INTERP @@ -1783,7 +1791,10 @@ void InterpreterMacroAssembler::verify_FPU(int stack_depth, TosState state) { #ifndef _LP64 - if (state == ftos || state == dtos) MacroAssembler::verify_FPU(stack_depth); + if ((state == ftos && UseSSE < 1) || + (state == dtos && UseSSE < 2)) { + MacroAssembler::verify_FPU(stack_depth); + } #endif } diff -r eb1661ea942c -r 6675700073c1 hotspot/src/cpu/x86/vm/interp_masm_x86.hpp --- a/hotspot/src/cpu/x86/vm/interp_masm_x86.hpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/cpu/x86/vm/interp_masm_x86.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -140,20 +140,20 @@ void push_ptr(Register r = rax); void push_i(Register r = rax); + void push_f(XMMRegister r); + void pop_f(XMMRegister r); + void pop_d(XMMRegister r); + void push_d(XMMRegister r); #ifdef _LP64 void pop_l(Register r = rax); - void pop_f(XMMRegister r = xmm0); - void pop_d(XMMRegister r = xmm0); void push_l(Register r = rax); - void push_f(XMMRegister r = xmm0); - void push_d(XMMRegister r = xmm0); #else void pop_l(Register lo = rax, Register hi = rdx); void pop_f(); void pop_d(); void push_l(Register lo = rax, Register hi = rdx); - void push_d(Register r = rax); + void push_d(); void push_f(); #endif // _LP64 diff -r eb1661ea942c -r 6675700073c1 hotspot/src/cpu/x86/vm/interpreterGenerator_x86.hpp --- a/hotspot/src/cpu/x86/vm/interpreterGenerator_x86.hpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/cpu/x86/vm/interpreterGenerator_x86.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -42,6 +42,12 @@ address generate_Reference_get_entry(); address generate_CRC32_update_entry(); address generate_CRC32_updateBytes_entry(AbstractInterpreter::MethodKind kind); +#ifndef _LP64 + address generate_Float_intBitsToFloat_entry(); + address generate_Float_floatToRawIntBits_entry(); + address generate_Double_longBitsToDouble_entry(); + address generate_Double_doubleToRawLongBits_entry(); +#endif void lock_method(void); void generate_stack_overflow_check(void); diff -r eb1661ea942c -r 6675700073c1 hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp --- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -3314,6 +3314,42 @@ fincstp(); } +void MacroAssembler::load_float(Address src) { + if (UseSSE >= 1) { + movflt(xmm0, src); + } else { + 
LP64_ONLY(ShouldNotReachHere()); + NOT_LP64(fld_s(src)); + } +} + +void MacroAssembler::store_float(Address dst) { + if (UseSSE >= 1) { + movflt(dst, xmm0); + } else { + LP64_ONLY(ShouldNotReachHere()); + NOT_LP64(fstp_s(dst)); + } +} + +void MacroAssembler::load_double(Address src) { + if (UseSSE >= 2) { + movdbl(xmm0, src); + } else { + LP64_ONLY(ShouldNotReachHere()); + NOT_LP64(fld_d(src)); + } +} + +void MacroAssembler::store_double(Address dst) { + if (UseSSE >= 2) { + movdbl(dst, xmm0); + } else { + LP64_ONLY(ShouldNotReachHere()); + NOT_LP64(fstp_d(dst)); + } +} + void MacroAssembler::fremr(Register tmp) { save_rax(tmp); { Label L; diff -r eb1661ea942c -r 6675700073c1 hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp --- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -471,6 +471,22 @@ // Pop ST (ffree & fincstp combined) void fpop(); + // Load float value from 'address'. If UseSSE >= 1, the value is loaded into + // register xmm0. Otherwise, the value is loaded onto the FPU stack. + void load_float(Address src); + + // Store float value to 'address'. If UseSSE >= 1, the value is stored + // from register xmm0. Otherwise, the value is stored from the FPU stack. + void store_float(Address dst); + + // Load double value from 'address'. If UseSSE >= 2, the value is loaded into + // register xmm0. Otherwise, the value is loaded onto the FPU stack. + void load_double(Address src); + + // Store double value to 'address'. If UseSSE >= 2, the value is stored + // from register xmm0. Otherwise, the value is stored from the FPU stack. + void store_double(Address dst); + // pushes double TOS element of FPU stack on CPU stack; pops from FPU stack void push_fTOS(); diff -r eb1661ea942c -r 6675700073c1 hotspot/src/cpu/x86/vm/templateInterpreter_x86_32.cpp --- a/hotspot/src/cpu/x86/vm/templateInterpreter_x86_32.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/cpu/x86/vm/templateInterpreter_x86_32.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -170,22 +170,12 @@ __ MacroAssembler::verify_FPU(0, "generate_return_entry_for compiled"); } - // In SSE mode, interpreter returns FP results in xmm0 but they need - // to end up back on the FPU so it can operate on them. - if (state == ftos && UseSSE >= 1) { - __ subptr(rsp, wordSize); - __ movflt(Address(rsp, 0), xmm0); - __ fld_s(Address(rsp, 0)); - __ addptr(rsp, wordSize); - } else if (state == dtos && UseSSE >= 2) { - __ subptr(rsp, 2*wordSize); - __ movdbl(Address(rsp, 0), xmm0); - __ fld_d(Address(rsp, 0)); - __ addptr(rsp, 2*wordSize); + if (state == ftos) { + __ MacroAssembler::verify_FPU(UseSSE >= 1 ? 0 : 1, "generate_return_entry_for in interpreter"); + } else if (state == dtos) { + __ MacroAssembler::verify_FPU(UseSSE >= 2 ? 0 : 1, "generate_return_entry_for in interpreter"); } - __ MacroAssembler::verify_FPU(state == ftos || state == dtos ? 
1 : 0, "generate_return_entry_for in interpreter"); - // Restore stack bottom in case i2c adjusted stack __ movptr(rsp, Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize)); // and NULL it as marker that rsp is now tos until next java call @@ -217,21 +207,12 @@ address TemplateInterpreterGenerator::generate_deopt_entry_for(TosState state, int step) { address entry = __ pc(); - // In SSE mode, FP results are in xmm0 - if (state == ftos && UseSSE > 0) { - __ subptr(rsp, wordSize); - __ movflt(Address(rsp, 0), xmm0); - __ fld_s(Address(rsp, 0)); - __ addptr(rsp, wordSize); - } else if (state == dtos && UseSSE >= 2) { - __ subptr(rsp, 2*wordSize); - __ movdbl(Address(rsp, 0), xmm0); - __ fld_d(Address(rsp, 0)); - __ addptr(rsp, 2*wordSize); + if (state == ftos) { + __ MacroAssembler::verify_FPU(UseSSE >= 1 ? 0 : 1, "generate_deopt_entry_for in interpreter"); + } else if (state == dtos) { + __ MacroAssembler::verify_FPU(UseSSE >= 2 ? 0 : 1, "generate_deopt_entry_for in interpreter"); } - __ MacroAssembler::verify_FPU(state == ftos || state == dtos ? 1 : 0, "generate_deopt_entry_for in interpreter"); - // The stack is not extended by deopt but we must NULL last_sp as this // entry is like a "return". __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD); @@ -735,7 +716,7 @@ if (UseCRC32Intrinsics) { address entry = __ pc(); - // rbx,: Method* + // rbx: Method* // rsi: senderSP must preserved for slow path, set SP to it on fast path // rdx: scratch // rdi: scratch @@ -841,6 +822,124 @@ return generate_native_entry(false); } +/** + * Method entry for static native method: + * java.lang.Float.intBitsToFloat(int bits) + */ +address InterpreterGenerator::generate_Float_intBitsToFloat_entry() { + address entry; + + if (UseSSE >= 1) { + entry = __ pc(); + + // rsi: the sender's SP + + // Skip safepoint check (compiler intrinsic versions of this method + // do not perform safepoint checks either). + + // Load 'bits' into xmm0 (interpreter returns results in xmm0) + __ movflt(xmm0, Address(rsp, wordSize)); + + // Return + __ pop(rdi); // get return address + __ mov(rsp, rsi); // set rsp to the sender's SP + __ jmp(rdi); + } else { + entry = generate_native_entry(false); + } + + return entry; +} + +/** + * Method entry for static native method: + * java.lang.Float.floatToRawIntBits(float value) + */ +address InterpreterGenerator::generate_Float_floatToRawIntBits_entry() { + address entry; + + if (UseSSE >= 1) { + entry = __ pc(); + + // rsi: the sender's SP + + // Skip safepoint check (compiler intrinsic versions of this method + // do not perform safepoint checks either). + + // Load the parameter (a floating-point value) into rax. + __ movl(rax, Address(rsp, wordSize)); + + // Return + __ pop(rdi); // get return address + __ mov(rsp, rsi); // set rsp to the sender's SP + __ jmp(rdi); + } else { + entry = generate_native_entry(false); + } + + return entry; +} + + +/** + * Method entry for static native method: + * java.lang.Double.longBitsToDouble(long bits) + */ +address InterpreterGenerator::generate_Double_longBitsToDouble_entry() { + address entry; + + if (UseSSE >= 2) { + entry = __ pc(); + + // rsi: the sender's SP + + // Skip safepoint check (compiler intrinsic versions of this method + // do not perform safepoint checks either). 
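The generate_Float_intBitsToFloat_entry()/floatToRawIntBits entries above (and the Double variants that follow) only move a raw bit pattern between a stack slot and xmm0 or rax/rdx; no numeric conversion takes place, and the safepoint check is skipped just as in the compiled intrinsics. The same operation in portable terms (sketch; std::bit_cast needs C++20):

    #include <bit>
    #include <cstdint>
    #include <cstdio>

    int main() {
      // intBitsToFloat: reinterpret the 32-bit pattern, never convert.
      float one = std::bit_cast<float>(0x3f800000u);
      // floatToRawIntBits: the reverse direction; NaN payloads are preserved.
      std::uint32_t neg_zero_bits = std::bit_cast<std::uint32_t>(-0.0f);
      std::printf("%f 0x%08x\n", one, neg_zero_bits);  // 1.000000 0x80000000
    }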
+ + // Load 'bits' into xmm0 (interpreter returns results in xmm0) + __ movdbl(xmm0, Address(rsp, wordSize)); + + // Return + __ pop(rdi); // get return address + __ mov(rsp, rsi); // set rsp to the sender's SP + __ jmp(rdi); + } else { + entry = generate_native_entry(false); + } + + return entry; +} + +/** + * Method entry for static native method: + * java.lang.Double.doubleToRawLongBits(double value) + */ +address InterpreterGenerator::generate_Double_doubleToRawLongBits_entry() { + address entry; + + if (UseSSE >= 2) { + entry = __ pc(); + + // rsi: the sender's SP + + // Skip safepoint check (compiler intrinsic versions of this method + // do not perform safepoint checks either). + + // Load the parameter (a floating-point value) into rax. + __ movl(rdx, Address(rsp, 2*wordSize)); + __ movl(rax, Address(rsp, wordSize)); + + // Return + __ pop(rdi); // get return address + __ mov(rsp, rsi); // set rsp to the sender's SP + __ jmp(rdi); + } else { + entry = generate_native_entry(false); + } + + return entry; +} + // // Interpreter stub for calling a native method. (asm interpreter) // This sets up a somewhat different looking stack for calling the native method @@ -1090,7 +1189,7 @@ double_handler.addr()); __ jcc(Assembler::notEqual, L); __ bind(push_double); - __ push(dtos); + __ push_d(); // FP values are returned using the FPU, so push FPU contents (even if UseSSE > 0). __ bind(L); } __ push(ltos); diff -r eb1661ea942c -r 6675700073c1 hotspot/src/cpu/x86/vm/templateInterpreter_x86_64.cpp --- a/hotspot/src/cpu/x86/vm/templateInterpreter_x86_64.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/cpu/x86/vm/templateInterpreter_x86_64.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -1707,10 +1707,10 @@ address& vep) { assert(t->is_valid() && t->tos_in() == vtos, "illegal template"); Label L; - aep = __ pc(); __ push_ptr(); __ jmp(L); - fep = __ pc(); __ push_f(); __ jmp(L); - dep = __ pc(); __ push_d(); __ jmp(L); - lep = __ pc(); __ push_l(); __ jmp(L); + aep = __ pc(); __ push_ptr(); __ jmp(L); + fep = __ pc(); __ push_f(xmm0); __ jmp(L); + dep = __ pc(); __ push_d(xmm0); __ jmp(L); + lep = __ pc(); __ push_l(); __ jmp(L); bep = cep = sep = iep = __ pc(); __ push_i(); vep = __ pc(); diff -r eb1661ea942c -r 6675700073c1 hotspot/src/cpu/x86/vm/templateTable_x86.cpp --- a/hotspot/src/cpu/x86/vm/templateTable_x86.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/cpu/x86/vm/templateTable_x86.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -349,53 +349,60 @@ void TemplateTable::fconst(int value) { transition(vtos, ftos); + if (UseSSE >= 1) { + static float one = 1.0f, two = 2.0f; + switch (value) { + case 0: + __ xorps(xmm0, xmm0); + break; + case 1: + __ movflt(xmm0, ExternalAddress((address) &one)); + break; + case 2: + __ movflt(xmm0, ExternalAddress((address) &two)); + break; + default: + ShouldNotReachHere(); + break; + } + } else { #ifdef _LP64 - static float one = 1.0f, two = 2.0f; - switch (value) { - case 0: - __ xorps(xmm0, xmm0); - break; - case 1: - __ movflt(xmm0, ExternalAddress((address) &one)); - break; - case 2: - __ movflt(xmm0, ExternalAddress((address) &two)); - break; - default: ShouldNotReachHere(); - break; +#else + if (value == 0) { __ fldz(); + } else if (value == 1) { __ fld1(); + } else if (value == 2) { __ fld1(); __ fld1(); __ faddp(); // should do a better solution here + } else { ShouldNotReachHere(); + } +#endif // _LP64 } -#else - if (value == 0) { __ fldz(); - } else if (value == 1) { __ fld1(); - } else if (value == 2) { __ fld1(); __ fld1(); __ faddp(); // should do a 
better solution here - } else { ShouldNotReachHere(); - } -#endif } void TemplateTable::dconst(int value) { transition(vtos, dtos); + if (UseSSE >= 2) { + static double one = 1.0; + switch (value) { + case 0: + __ xorpd(xmm0, xmm0); + break; + case 1: + __ movdbl(xmm0, ExternalAddress((address) &one)); + break; + default: + ShouldNotReachHere(); + break; + } + } else { #ifdef _LP64 - static double one = 1.0; - switch (value) { - case 0: - __ xorpd(xmm0, xmm0); - break; - case 1: - __ movdbl(xmm0, ExternalAddress((address) &one)); - break; - default: ShouldNotReachHere(); - break; +#else + if (value == 0) { __ fldz(); + } else if (value == 1) { __ fld1(); + } else { ShouldNotReachHere(); + } +#endif } - -#else - if (value == 0) { __ fldz(); - } else if (value == 1) { __ fld1(); - } else { ShouldNotReachHere(); - } -#endif } void TemplateTable::bipush() { @@ -454,8 +461,7 @@ __ jccb(Assembler::notEqual, notFloat); // ftos - LP64_ONLY(__ movflt(xmm0, Address(rcx, rbx, Address::times_8, base_offset))); - NOT_LP64(__ fld_s( Address(rcx, rbx, Address::times_ptr, base_offset))); + __ load_float(Address(rcx, rbx, Address::times_ptr, base_offset)); __ push(ftos); __ jmp(Done); @@ -522,8 +528,7 @@ __ jccb(Assembler::notEqual, Long); // dtos - LP64_ONLY(__ movdbl(xmm0, Address(rcx, rbx, Address::times_8, base_offset))); - NOT_LP64(__ fld_d( Address(rcx, rbx, Address::times_ptr, base_offset))); + __ load_double(Address(rcx, rbx, Address::times_ptr, base_offset)); __ push(dtos); __ jmpb(Done); @@ -617,15 +622,13 @@ void TemplateTable::fload() { transition(vtos, ftos); locals_index(rbx); - LP64_ONLY(__ movflt(xmm0, faddress(rbx))); - NOT_LP64(__ fld_s(faddress(rbx))); + __ load_float(faddress(rbx)); } void TemplateTable::dload() { transition(vtos, dtos); locals_index(rbx); - LP64_ONLY(__ movdbl(xmm0, daddress(rbx))); - NOT_LP64(__ fld_d(daddress(rbx))); + __ load_double(daddress(rbx)); } void TemplateTable::aload() { @@ -657,15 +660,13 @@ void TemplateTable::wide_fload() { transition(vtos, ftos); locals_index_wide(rbx); - LP64_ONLY(__ movflt(xmm0, faddress(rbx))); - NOT_LP64(__ fld_s(faddress(rbx))); + __ load_float(faddress(rbx)); } void TemplateTable::wide_dload() { transition(vtos, dtos); locals_index_wide(rbx); - LP64_ONLY(__ movdbl(xmm0, daddress(rbx))); - NOT_LP64(__ fld_d(daddress(rbx))); + __ load_double(daddress(rbx)); } void TemplateTable::wide_aload() { @@ -726,10 +727,9 @@ // rax: index // rdx: array index_check(rdx, rax); // kills rbx - LP64_ONLY(__ movflt(xmm0, Address(rdx, rax, - Address::times_4, - arrayOopDesc::base_offset_in_bytes(T_FLOAT)))); - NOT_LP64(__ fld_s(Address(rdx, rax, Address::times_4, arrayOopDesc::base_offset_in_bytes(T_FLOAT)))); + __ load_float(Address(rdx, rax, + Address::times_4, + arrayOopDesc::base_offset_in_bytes(T_FLOAT))); } void TemplateTable::daload() { @@ -737,10 +737,9 @@ // rax: index // rdx: array index_check(rdx, rax); // kills rbx - LP64_ONLY(__ movdbl(xmm0, Address(rdx, rax, - Address::times_8, - arrayOopDesc::base_offset_in_bytes(T_DOUBLE)))); - NOT_LP64(__ fld_d(Address(rdx, rax, Address::times_8, arrayOopDesc::base_offset_in_bytes(T_DOUBLE)))); + __ load_double(Address(rdx, rax, + Address::times_8, + arrayOopDesc::base_offset_in_bytes(T_DOUBLE))); } void TemplateTable::aaload() { @@ -807,14 +806,12 @@ void TemplateTable::fload(int n) { transition(vtos, ftos); - LP64_ONLY(__ movflt(xmm0, faddress(n))); - NOT_LP64(__ fld_s(faddress(n))); + __ load_float(faddress(n)); } void TemplateTable::dload(int n) { transition(vtos, dtos); - LP64_ONLY(__ 
movdbl(xmm0, daddress(n))); - NOT_LP64(__ fld_d(daddress(n))); + __ load_double(daddress(n)); } void TemplateTable::aload(int n) { @@ -919,15 +916,13 @@ void TemplateTable::fstore() { transition(ftos, vtos); locals_index(rbx); - LP64_ONLY(__ movflt(faddress(rbx), xmm0)); - NOT_LP64(__ fstp_s(faddress(rbx))); + __ store_float(faddress(rbx)); } void TemplateTable::dstore() { transition(dtos, vtos); locals_index(rbx); - LP64_ONLY(__ movdbl(daddress(rbx), xmm0)); - NOT_LP64(__ fstp_d(daddress(rbx))); + __ store_double(daddress(rbx)); } void TemplateTable::astore() { @@ -956,7 +951,7 @@ void TemplateTable::wide_fstore() { #ifdef _LP64 transition(vtos, vtos); - __ pop_f(); + __ pop_f(xmm0); locals_index_wide(rbx); __ movflt(faddress(rbx), xmm0); #else @@ -967,7 +962,7 @@ void TemplateTable::wide_dstore() { #ifdef _LP64 transition(vtos, vtos); - __ pop_d(); + __ pop_d(xmm0); locals_index_wide(rbx); __ movdbl(daddress(rbx), xmm0); #else @@ -1011,29 +1006,21 @@ void TemplateTable::fastore() { transition(ftos, vtos); __ pop_i(rbx); - // xmm0: value + // value is in UseSSE >= 1 ? xmm0 : ST(0) // rbx: index // rdx: array index_check(rdx, rbx); // prefer index in rbx - LP64_ONLY(__ movflt(Address(rdx, rbx, - Address::times_4, - arrayOopDesc::base_offset_in_bytes(T_FLOAT)), - xmm0)); - NOT_LP64(__ fstp_s(Address(rdx, rbx, Address::times_4, arrayOopDesc::base_offset_in_bytes(T_FLOAT)))); + __ store_float(Address(rdx, rbx, Address::times_4, arrayOopDesc::base_offset_in_bytes(T_FLOAT))); } void TemplateTable::dastore() { transition(dtos, vtos); __ pop_i(rbx); - // xmm0: value + // value is in UseSSE >= 2 ? xmm0 : ST(0) // rbx: index // rdx: array index_check(rdx, rbx); // prefer index in rbx - LP64_ONLY(__ movdbl(Address(rdx, rbx, - Address::times_8, - arrayOopDesc::base_offset_in_bytes(T_DOUBLE)), - xmm0)); - NOT_LP64(__ fstp_d(Address(rdx, rbx, Address::times_8, arrayOopDesc::base_offset_in_bytes(T_DOUBLE)))); + __ store_double(Address(rdx, rbx, Address::times_8, arrayOopDesc::base_offset_in_bytes(T_DOUBLE))); } void TemplateTable::aastore() { @@ -1134,14 +1121,12 @@ void TemplateTable::fstore(int n) { transition(ftos, vtos); - LP64_ONLY(__ movflt(faddress(n), xmm0)); - NOT_LP64(__ fstp_s(faddress(n))); + __ store_float(faddress(n)); } void TemplateTable::dstore(int n) { transition(dtos, vtos); - LP64_ONLY(__ movdbl(daddress(n), xmm0)); - NOT_LP64(__ fstp_d(daddress(n))); + __ store_double(daddress(n)); } @@ -1425,82 +1410,127 @@ void TemplateTable::fop2(Operation op) { transition(ftos, ftos); + + if (UseSSE >= 1) { + switch (op) { + case add: + __ addss(xmm0, at_rsp()); + __ addptr(rsp, Interpreter::stackElementSize); + break; + case sub: + __ movflt(xmm1, xmm0); + __ pop_f(xmm0); + __ subss(xmm0, xmm1); + break; + case mul: + __ mulss(xmm0, at_rsp()); + __ addptr(rsp, Interpreter::stackElementSize); + break; + case div: + __ movflt(xmm1, xmm0); + __ pop_f(xmm0); + __ divss(xmm0, xmm1); + break; + case rem: + // On x86_64 platforms the SharedRuntime::frem method is called to perform the + // modulo operation. The frem method calls the function + // double fmod(double x, double y) in math.h. The documentation of fmod states: + // "If x or y is a NaN, a NaN is returned." without specifying what type of NaN + // (signalling or quiet) is returned. + // + // On x86_32 platforms the FPU is used to perform the modulo operation. The + // reason is that on 32-bit Windows the sign of modulo operations diverges from + // what is considered the standard (e.g., -0.0f % -3.14f is 0.0f (and not -0.0f). 
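The rem comments above are the reason x86_32 keeps the FPU path: a conforming fmod() gives the result the sign of the dividend, matching Java's % semantics, whereas the 32-bit Windows runtime referred to returns +0.0f for -0.0f % -3.14f. A quick standalone check of the standard behaviour (sketch; what a particular libm prints is exactly the point in question):

    #include <cmath>
    #include <cstdio>

    int main() {
      double r = std::fmod(-0.0, -3.14);
      // C99/C11 Annex F: fmod(+/-0, y) returns +/-0, so signbit should be 1 here.
      std::printf("fmod(-0.0, -3.14) = %g, signbit = %d\n", r, (int)std::signbit(r));
      // NaN operands propagate, but the standard leaves open whether the result
      // is a signalling or a quiet NaN -- the ambiguity the comment mentions.
      std::printf("fmod(NaN, 2.0) is NaN: %d\n", (int)std::isnan(std::fmod(NAN, 2.0)));
    }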
+ // The fprem instruction used on x86_32 is functionally equivalent to + // SharedRuntime::frem in that it returns a NaN. #ifdef _LP64 - switch (op) { - case add: - __ addss(xmm0, at_rsp()); - __ addptr(rsp, Interpreter::stackElementSize); - break; - case sub: - __ movflt(xmm1, xmm0); - __ pop_f(xmm0); - __ subss(xmm0, xmm1); - break; - case mul: - __ mulss(xmm0, at_rsp()); - __ addptr(rsp, Interpreter::stackElementSize); - break; - case div: - __ movflt(xmm1, xmm0); - __ pop_f(xmm0); - __ divss(xmm0, xmm1); - break; - case rem: - __ movflt(xmm1, xmm0); - __ pop_f(xmm0); - __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::frem), 2); - break; - default: + __ movflt(xmm1, xmm0); + __ pop_f(xmm0); + __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::frem), 2); +#else + __ push_f(xmm0); + __ pop_f(); + __ fld_s(at_rsp()); + __ fremr(rax); + __ f2ieee(); + __ pop(rax); // pop second operand off the stack + __ push_f(); + __ pop_f(xmm0); +#endif + break; + default: + ShouldNotReachHere(); + break; + } + } else { +#ifdef _LP64 ShouldNotReachHere(); - break; - } #else - switch (op) { + switch (op) { case add: __ fadd_s (at_rsp()); break; case sub: __ fsubr_s(at_rsp()); break; case mul: __ fmul_s (at_rsp()); break; case div: __ fdivr_s(at_rsp()); break; case rem: __ fld_s (at_rsp()); __ fremr(rax); break; default : ShouldNotReachHere(); + } + __ f2ieee(); + __ pop(rax); // pop second operand off the stack +#endif // _LP64 } - __ f2ieee(); - __ pop(rax); // pop float thing off -#endif } void TemplateTable::dop2(Operation op) { transition(dtos, dtos); + if (UseSSE >= 2) { + switch (op) { + case add: + __ addsd(xmm0, at_rsp()); + __ addptr(rsp, 2 * Interpreter::stackElementSize); + break; + case sub: + __ movdbl(xmm1, xmm0); + __ pop_d(xmm0); + __ subsd(xmm0, xmm1); + break; + case mul: + __ mulsd(xmm0, at_rsp()); + __ addptr(rsp, 2 * Interpreter::stackElementSize); + break; + case div: + __ movdbl(xmm1, xmm0); + __ pop_d(xmm0); + __ divsd(xmm0, xmm1); + break; + case rem: + // Similar to fop2(), the modulo operation is performed using the + // SharedRuntime::drem method (on x86_64 platforms) or using the + // FPU (on x86_32 platforms) for the same reasons as mentioned in fop2(). #ifdef _LP64 - switch (op) { - case add: - __ addsd(xmm0, at_rsp()); - __ addptr(rsp, 2 * Interpreter::stackElementSize); - break; - case sub: - __ movdbl(xmm1, xmm0); - __ pop_d(xmm0); - __ subsd(xmm0, xmm1); - break; - case mul: - __ mulsd(xmm0, at_rsp()); - __ addptr(rsp, 2 * Interpreter::stackElementSize); - break; - case div: - __ movdbl(xmm1, xmm0); - __ pop_d(xmm0); - __ divsd(xmm0, xmm1); - break; - case rem: - __ movdbl(xmm1, xmm0); - __ pop_d(xmm0); - __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::drem), 2); - break; - default: + __ movdbl(xmm1, xmm0); + __ pop_d(xmm0); + __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::drem), 2); +#else + __ push_d(xmm0); + __ pop_d(); + __ fld_d(at_rsp()); + __ fremr(rax); + __ d2ieee(); + __ pop(rax); + __ pop(rdx); + __ push_d(); + __ pop_d(xmm0); +#endif + break; + default: + ShouldNotReachHere(); + break; + } + } else { +#ifdef _LP64 ShouldNotReachHere(); - break; - } #else - switch (op) { + switch (op) { case add: __ fadd_d (at_rsp()); break; case sub: __ fsubr_d(at_rsp()); break; case mul: { @@ -1543,12 +1573,13 @@ } case rem: __ fld_d (at_rsp()); __ fremr(rax); break; default : ShouldNotReachHere(); + } + __ d2ieee(); + // Pop double precision number from rsp. 
+ __ pop(rax); + __ pop(rdx); +#endif } - __ d2ieee(); - // Pop double precision number from rsp. - __ pop(rax); - __ pop(rdx); -#endif } void TemplateTable::ineg() { @@ -1562,7 +1593,6 @@ NOT_LP64(__ lneg(rdx, rax)); } -#ifdef _LP64 // Note: 'double' and 'long long' have 32-bits alignment on x86. static jlong* double_quadword(jlong *adr, jlong lo, jlong hi) { // Use the expression (adr)&(~0xF) to provide 128-bits aligned address @@ -1577,26 +1607,30 @@ // Buffer for 128-bits masks used by SSE instructions. static jlong float_signflip_pool[2*2]; static jlong double_signflip_pool[2*2]; -#endif void TemplateTable::fneg() { transition(ftos, ftos); -#ifdef _LP64 - static jlong *float_signflip = double_quadword(&float_signflip_pool[1], 0x8000000080000000, 0x8000000080000000); - __ xorps(xmm0, ExternalAddress((address) float_signflip)); -#else - __ fchs(); -#endif + if (UseSSE >= 1) { + static jlong *float_signflip = double_quadword(&float_signflip_pool[1], 0x8000000080000000, 0x8000000080000000); + __ xorps(xmm0, ExternalAddress((address) float_signflip)); + } else { + LP64_ONLY(ShouldNotReachHere()); + NOT_LP64(__ fchs()); + } } void TemplateTable::dneg() { transition(dtos, dtos); + if (UseSSE >= 2) { + static jlong *double_signflip = double_quadword(&double_signflip_pool[1], 0x8000000000000000, 0x8000000000000000); + __ xorpd(xmm0, ExternalAddress((address) double_signflip)); + } else { #ifdef _LP64 - static jlong *double_signflip = double_quadword(&double_signflip_pool[1], 0x8000000000000000, 0x8000000000000000); - __ xorpd(xmm0, ExternalAddress((address) double_signflip)); + ShouldNotReachHere(); #else - __ fchs(); + __ fchs(); #endif + } } void TemplateTable::iinc() { @@ -1798,18 +1832,26 @@ __ extend_sign(rdx, rax); break; case Bytecodes::_i2f: - __ push(rax); // store int on tos - __ fild_s(at_rsp()); // load int to ST0 - __ f2ieee(); // truncate to float size - __ pop(rcx); // adjust rsp + if (UseSSE >= 1) { + __ cvtsi2ssl(xmm0, rax); + } else { + __ push(rax); // store int on tos + __ fild_s(at_rsp()); // load int to ST0 + __ f2ieee(); // truncate to float size + __ pop(rcx); // adjust rsp + } break; case Bytecodes::_i2d: + if (UseSSE >= 2) { + __ cvtsi2sdl(xmm0, rax); + } else { __ push(rax); // add one slot for d2ieee() __ push(rax); // store int on tos __ fild_s(at_rsp()); // load int to ST0 __ d2ieee(); // truncate to double size __ pop(rcx); // adjust rsp __ pop(rcx); + } break; case Bytecodes::_i2b: __ shll(rax, 24); // truncate upper 24 bits @@ -1829,50 +1871,102 @@ /* nothing to do */ break; case Bytecodes::_l2f: + // On 64-bit platforms, the cvtsi2ssq instruction is used to convert + // 64-bit long values to floats. On 32-bit platforms it is not possible + // to use that instruction with 64-bit operands, therefore the FPU is + // used to perform the conversion. __ push(rdx); // store long on tos __ push(rax); __ fild_d(at_rsp()); // load long to ST0 __ f2ieee(); // truncate to float size __ pop(rcx); // adjust rsp __ pop(rcx); + if (UseSSE >= 1) { + __ push_f(); + __ pop_f(xmm0); + } break; case Bytecodes::_l2d: + // On 32-bit platforms the FPU is used for conversion because on + // 32-bit platforms it is not not possible to use the cvtsi2sdq + // instruction with 64-bit operands. 
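The _l2f/_l2d comments above rest on an ISA restriction: the scalar conversions from a 64-bit integer (cvtsi2ss/cvtsi2sd with a quadword source) exist only in 64-bit mode, so 32-bit builds convert on the x87 stack and then shuttle the result into xmm0. The corresponding compiler intrinsics make the split visible (sketch; build with SSE2 enabled):

    #include <xmmintrin.h>   // SSE:  _mm_cvtsi32_ss
    #include <emmintrin.h>   // SSE2: _mm_cvtsi32_sd
    #include <cstdio>

    int main() {
      // i2f / i2d from a 32-bit integer work in both 32- and 64-bit mode.
      __m128  f = _mm_cvtsi32_ss(_mm_setzero_ps(), 42);
      __m128d d = _mm_cvtsi32_sd(_mm_setzero_pd(), 42);
    #if defined(__x86_64__) || defined(_M_X64)
      // l2f / l2d from a 64-bit integer: these forms exist only in 64-bit mode,
      // which is why the 32-bit interpreter falls back to fild_d on the FPU.
      f = _mm_cvtsi64_ss(_mm_setzero_ps(), 1LL << 40);
      d = _mm_cvtsi64_sd(_mm_setzero_pd(), 1LL << 40);
    #endif
      std::printf("%f %f\n", _mm_cvtss_f32(f), _mm_cvtsd_f64(d));
    }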
__ push(rdx); // store long on tos __ push(rax); __ fild_d(at_rsp()); // load long to ST0 __ d2ieee(); // truncate to double size __ pop(rcx); // adjust rsp __ pop(rcx); + if (UseSSE >= 2) { + __ push_d(); + __ pop_d(xmm0); + } break; case Bytecodes::_f2i: - __ push(rcx); // reserve space for argument - __ fstp_s(at_rsp()); // pass float argument on stack + // SharedRuntime::f2i does not differentiate between sNaNs and qNaNs + // as it returns 0 for any NaN. + if (UseSSE >= 1) { + __ push_f(xmm0); + } else { + __ push(rcx); // reserve space for argument + __ fstp_s(at_rsp()); // pass float argument on stack + } __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::f2i), 1); break; case Bytecodes::_f2l: - __ push(rcx); // reserve space for argument - __ fstp_s(at_rsp()); // pass float argument on stack + // SharedRuntime::f2l does not differentiate between sNaNs and qNaNs + // as it returns 0 for any NaN. + if (UseSSE >= 1) { + __ push_f(xmm0); + } else { + __ push(rcx); // reserve space for argument + __ fstp_s(at_rsp()); // pass float argument on stack + } __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::f2l), 1); break; case Bytecodes::_f2d: - /* nothing to do */ + if (UseSSE < 1) { + /* nothing to do */ + } else if (UseSSE == 1) { + __ push_f(xmm0); + __ pop_f(); + } else { // UseSSE >= 2 + __ cvtss2sd(xmm0, xmm0); + } break; case Bytecodes::_d2i: - __ push(rcx); // reserve space for argument - __ push(rcx); - __ fstp_d(at_rsp()); // pass double argument on stack + if (UseSSE >= 2) { + __ push_d(xmm0); + } else { + __ push(rcx); // reserve space for argument + __ push(rcx); + __ fstp_d(at_rsp()); // pass double argument on stack + } __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::d2i), 2); break; case Bytecodes::_d2l: - __ push(rcx); // reserve space for argument - __ push(rcx); - __ fstp_d(at_rsp()); // pass double argument on stack + if (UseSSE >= 2) { + __ push_d(xmm0); + } else { + __ push(rcx); // reserve space for argument + __ push(rcx); + __ fstp_d(at_rsp()); // pass double argument on stack + } __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::d2l), 2); break; case Bytecodes::_d2f: - __ push(rcx); // reserve space for f2ieee() - __ f2ieee(); // truncate to float size - __ pop(rcx); // adjust rsp + if (UseSSE <= 1) { + __ push(rcx); // reserve space for f2ieee() + __ f2ieee(); // truncate to float size + __ pop(rcx); // adjust rsp + if (UseSSE == 1) { + // The cvtsd2ss instruction is not available if UseSSE==1, therefore + // the conversion is performed using the FPU in this case. + __ push_f(); + __ pop_f(xmm0); + } + } else { // UseSSE >= 2 + __ cvtsd2ss(xmm0, xmm0); + } break; default : ShouldNotReachHere(); @@ -1901,42 +1995,47 @@ } void TemplateTable::float_cmp(bool is_float, int unordered_result) { -#ifdef _LP64 - Label done; - if (is_float) { - // XXX get rid of pop here, use ... reg, mem32 - __ pop_f(xmm1); - __ ucomiss(xmm1, xmm0); - } else { - // XXX get rid of pop here, use ... reg, mem64 - __ pop_d(xmm1); - __ ucomisd(xmm1, xmm0); - } - if (unordered_result < 0) { - __ movl(rax, -1); - __ jccb(Assembler::parity, done); - __ jccb(Assembler::below, done); - __ setb(Assembler::notEqual, rdx); - __ movzbl(rax, rdx); + if ((is_float && UseSSE >= 1) || + (!is_float && UseSSE >= 2)) { + Label done; + if (is_float) { + // XXX get rid of pop here, use ... reg, mem32 + __ pop_f(xmm1); + __ ucomiss(xmm1, xmm0); + } else { + // XXX get rid of pop here, use ... 
reg, mem64 + __ pop_d(xmm1); + __ ucomisd(xmm1, xmm0); + } + if (unordered_result < 0) { + __ movl(rax, -1); + __ jccb(Assembler::parity, done); + __ jccb(Assembler::below, done); + __ setb(Assembler::notEqual, rdx); + __ movzbl(rax, rdx); + } else { + __ movl(rax, 1); + __ jccb(Assembler::parity, done); + __ jccb(Assembler::above, done); + __ movl(rax, 0); + __ jccb(Assembler::equal, done); + __ decrementl(rax); + } + __ bind(done); } else { - __ movl(rax, 1); - __ jccb(Assembler::parity, done); - __ jccb(Assembler::above, done); - __ movl(rax, 0); - __ jccb(Assembler::equal, done); - __ decrementl(rax); - } - __ bind(done); +#ifdef _LP64 + ShouldNotReachHere(); #else - if (is_float) { - __ fld_s(at_rsp()); - } else { - __ fld_d(at_rsp()); - __ pop(rdx); + if (is_float) { + __ fld_s(at_rsp()); + } else { + __ fld_d(at_rsp()); + __ pop(rdx); + } + __ pop(rcx); + __ fcmp2int(rax, unordered_result < 0); +#endif // _LP64 } - __ pop(rcx); - __ fcmp2int(rax, unordered_result < 0); -#endif } void TemplateTable::branch(bool is_jsr, bool is_wide) { @@ -2014,6 +2113,7 @@ __ pop(rcx); __ pop(rdx); __ movptr(rax, Address(rcx, Method::method_counters_offset())); + __ testptr(rax, rax); __ jcc(Assembler::zero, dispatch); __ bind(has_counters); @@ -2747,8 +2847,7 @@ __ jcc(Assembler::notEqual, notFloat); // ftos - LP64_ONLY(__ movflt(xmm0, field)); - NOT_LP64(__ fld_s(field)); + __ load_float(field); __ push(ftos); // Rewrite bytecode to be faster if (!is_static && rc == may_rewrite) { @@ -2762,8 +2861,7 @@ __ jcc(Assembler::notEqual, notDouble); #endif // dtos - LP64_ONLY(__ movdbl(xmm0, field)); - NOT_LP64(__ fld_d(field)); + __ load_double(field); __ push(dtos); // Rewrite bytecode to be faster if (!is_static && rc == may_rewrite) { @@ -3045,8 +3143,7 @@ { __ pop(ftos); if (!is_static) pop_and_check_object(obj); - NOT_LP64( __ fstp_s(field);) - LP64_ONLY( __ movflt(field, xmm0);) + __ store_float(field); if (!is_static && rc == may_rewrite) { patch_bytecode(Bytecodes::_fast_fputfield, bc, rbx, true, byte_no); } @@ -3063,8 +3160,7 @@ { __ pop(dtos); if (!is_static) pop_and_check_object(obj); - NOT_LP64( __ fstp_d(field);) - LP64_ONLY( __ movdbl(field, xmm0);) + __ store_double(field); if (!is_static && rc == may_rewrite) { patch_bytecode(Bytecodes::_fast_dputfield, bc, rbx, true, byte_no); } @@ -3122,8 +3218,8 @@ case Bytecodes::_fast_sputfield: // fall through case Bytecodes::_fast_cputfield: // fall through case Bytecodes::_fast_iputfield: __ push_i(rax); break; - case Bytecodes::_fast_dputfield: __ push_d(); break; - case Bytecodes::_fast_fputfield: __ push_f(); break; + case Bytecodes::_fast_dputfield: __ push(dtos); break; + case Bytecodes::_fast_fputfield: __ push(ftos); break; case Bytecodes::_fast_lputfield: __ push_l(rax); break; default: @@ -3146,8 +3242,8 @@ case Bytecodes::_fast_sputfield: // fall through case Bytecodes::_fast_cputfield: // fall through case Bytecodes::_fast_iputfield: __ pop_i(rax); break; - case Bytecodes::_fast_dputfield: __ pop_d(); break; - case Bytecodes::_fast_fputfield: __ pop_f(); break; + case Bytecodes::_fast_dputfield: __ pop(dtos); break; + case Bytecodes::_fast_fputfield: __ pop(ftos); break; case Bytecodes::_fast_lputfield: __ pop_l(rax); break; } __ bind(L2); @@ -3211,12 +3307,10 @@ __ movw(field, rax); break; case Bytecodes::_fast_fputfield: - NOT_LP64( __ fstp_s(field); ) - LP64_ONLY( __ movflt(field, xmm0);) + __ store_float(field); break; case Bytecodes::_fast_dputfield: - NOT_LP64( __ fstp_d(field); ) - LP64_ONLY( __ movdbl(field, xmm0);) + __ 
store_double(field); break; default: ShouldNotReachHere(); @@ -3301,12 +3395,10 @@ __ load_unsigned_short(rax, field); break; case Bytecodes::_fast_fgetfield: - LP64_ONLY(__ movflt(xmm0, field)); - NOT_LP64(__ fld_s(field)); + __ load_float(field); break; case Bytecodes::_fast_dgetfield: - LP64_ONLY(__ movdbl(xmm0, field)); - NOT_LP64(__ fld_d(field)); + __ load_double(field); break; default: ShouldNotReachHere(); @@ -3346,8 +3438,7 @@ __ verify_oop(rax); break; case ftos: - LP64_ONLY(__ movflt(xmm0, field)); - NOT_LP64(__ fld_s(field)); + __ load_float(field); break; default: ShouldNotReachHere(); diff -r eb1661ea942c -r 6675700073c1 hotspot/src/os/aix/vm/perfMemory_aix.cpp --- a/hotspot/src/os/aix/vm/perfMemory_aix.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/os/aix/vm/perfMemory_aix.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001, 2014, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2015, Oracle and/or its affiliates. All rights reserved. * Copyright 2012, 2013 SAP AG. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * @@ -454,13 +454,27 @@ *saved_cwd_fd = result; } - // Set the current directory to dirname by using the fd of the directory. + // Set the current directory to dirname by using the fd of the directory and + // handle errors, otherwise shared memory files will be created in cwd. result = fchdir(fd); - - return dirp; + if (result == OS_ERR) { + if (PrintMiscellaneous && Verbose) { + warning("could not change to directory %s", dirname); + } + if (*saved_cwd_fd != -1) { + ::close(*saved_cwd_fd); + *saved_cwd_fd = -1; + } + // Close the directory. + os::closedir(dirp); + return NULL; + } else { + return dirp; + } } // Close the directory and restore the current working directory. +// static void close_directory_secure_cwd(DIR* dirp, int saved_cwd_fd) { int result; diff -r eb1661ea942c -r 6675700073c1 hotspot/src/os/bsd/vm/perfMemory_bsd.cpp --- a/hotspot/src/os/bsd/vm/perfMemory_bsd.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/os/bsd/vm/perfMemory_bsd.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001, 2014, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -375,10 +375,23 @@ *saved_cwd_fd = result; } - // Set the current directory to dirname by using the fd of the directory. + // Set the current directory to dirname by using the fd of the directory and + // handle errors, otherwise shared memory files will be created in cwd. result = fchdir(fd); - - return dirp; + if (result == OS_ERR) { + if (PrintMiscellaneous && Verbose) { + warning("could not change to directory %s", dirname); + } + if (*saved_cwd_fd != -1) { + ::close(*saved_cwd_fd); + *saved_cwd_fd = -1; + } + // Close the directory. + os::closedir(dirp); + return NULL; + } else { + return dirp; + } } // Close the directory and restore the current working directory. 
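The perfMemory changes above (repeated for each POSIX port) stop ignoring a failed fchdir(): if the call fails but the code falls through, the shared-memory file ends up being created in whatever the old working directory was. Reduced to a standalone sketch with illustrative names that do not match the VM's helpers:

    #include <cstdio>
    #include <dirent.h>
    #include <fcntl.h>
    #include <unistd.h>

    // Open 'dirname', remember the old cwd in *saved_cwd_fd, and chdir into it.
    // On any failure clean up and return NULL so the caller cannot continue in
    // the wrong directory.
    DIR* open_directory_and_enter(const char* dirname, int* saved_cwd_fd) {
      DIR* dirp = opendir(dirname);
      if (dirp == nullptr) return nullptr;

      *saved_cwd_fd = open(".", O_RDONLY);   // lets the caller fchdir() back later

      if (fchdir(dirfd(dirp)) == -1) {       // handle the error instead of ignoring it
        std::fprintf(stderr, "could not change to directory %s\n", dirname);
        if (*saved_cwd_fd != -1) {
          close(*saved_cwd_fd);
          *saved_cwd_fd = -1;
        }
        closedir(dirp);
        return nullptr;
      }
      return dirp;
    }

    int main() {
      int saved = -1;
      DIR* d = open_directory_and_enter("/tmp", &saved);
      std::printf("entered: %d\n", d != nullptr);
      if (d != nullptr) {
        closedir(d);
        if (saved != -1) { fchdir(saved); close(saved); }
      }
    }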
diff -r eb1661ea942c -r 6675700073c1 hotspot/src/os/linux/vm/os_linux.cpp --- a/hotspot/src/os/linux/vm/os_linux.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/os/linux/vm/os_linux.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -5785,9 +5785,11 @@ status = pthread_mutex_unlock(_mutex); assert(status == 0, "invariant"); } else { + // must capture correct index before unlocking + int index = _cur_index; status = pthread_mutex_unlock(_mutex); assert(status == 0, "invariant"); - status = pthread_cond_signal(&_cond[_cur_index]); + status = pthread_cond_signal(&_cond[index]); assert(status == 0, "invariant"); } } else { diff -r eb1661ea942c -r 6675700073c1 hotspot/src/os/linux/vm/perfMemory_linux.cpp --- a/hotspot/src/os/linux/vm/perfMemory_linux.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/os/linux/vm/perfMemory_linux.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001, 2014, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -374,10 +374,23 @@ *saved_cwd_fd = result; } - // Set the current directory to dirname by using the fd of the directory. + // Set the current directory to dirname by using the fd of the directory and + // handle errors, otherwise shared memory files will be created in cwd. result = fchdir(fd); - - return dirp; + if (result == OS_ERR) { + if (PrintMiscellaneous && Verbose) { + warning("could not change to directory %s", dirname); + } + if (*saved_cwd_fd != -1) { + ::close(*saved_cwd_fd); + *saved_cwd_fd = -1; + } + // Close the directory. + os::closedir(dirp); + return NULL; + } else { + return dirp; + } } // Close the directory and restore the current working directory. diff -r eb1661ea942c -r 6675700073c1 hotspot/src/os/solaris/vm/perfMemory_solaris.cpp --- a/hotspot/src/os/solaris/vm/perfMemory_solaris.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/os/solaris/vm/perfMemory_solaris.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001, 2014, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -377,10 +377,23 @@ *saved_cwd_fd = result; } - // Set the current directory to dirname by using the fd of the directory. + // Set the current directory to dirname by using the fd of the directory and + // handle errors, otherwise shared memory files will be created in cwd. result = fchdir(fd); - - return dirp; + if (result == OS_ERR) { + if (PrintMiscellaneous && Verbose) { + warning("could not change to directory %s", dirname); + } + if (*saved_cwd_fd != -1) { + ::close(*saved_cwd_fd); + *saved_cwd_fd = -1; + } + // Close the directory. + os::closedir(dirp); + return NULL; + } else { + return dirp; + } } // Close the directory and restore the current working directory. 
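Illustrative aside (not part of the changeset): the os_linux.cpp hunk above captures _cur_index into a local while the mutex is still held because, once pthread_mutex_unlock() returns, the parked thread can wake, reset _cur_index to -1 or re-park on the other condition variable; a late read could then signal the wrong condvar or index out of range. A reduced model of the unpark path, in which the field names match the hunk but the surrounding type and logic are simplified:

#include <pthread.h>

struct ParkerModel {
  pthread_mutex_t _mutex     = PTHREAD_MUTEX_INITIALIZER;
  pthread_cond_t  _cond[2]   = { PTHREAD_COND_INITIALIZER, PTHREAD_COND_INITIALIZER };
  volatile int    _cur_index = -1;   // which condvar the parked thread waits on, -1 if none

  void unpark() {
    pthread_mutex_lock(&_mutex);
    if (_cur_index != -1) {
      int index = _cur_index;          // must capture correct index before unlocking
      pthread_mutex_unlock(&_mutex);
      pthread_cond_signal(&_cond[index]);
    } else {
      pthread_mutex_unlock(&_mutex);   // nobody is parked; nothing to signal
    }
  }
};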
diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/classfile/systemDictionary.cpp --- a/hotspot/src/share/vm/classfile/systemDictionary.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/classfile/systemDictionary.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -2680,187 +2680,3 @@ #endif // INCLUDE_TRACE } -#ifndef PRODUCT - -// statistics code -class ClassStatistics: AllStatic { - private: - static int nclasses; // number of classes - static int nmethods; // number of methods - static int nmethoddata; // number of methodData - static int class_size; // size of class objects in words - static int method_size; // size of method objects in words - static int debug_size; // size of debug info in methods - static int methoddata_size; // size of methodData objects in words - - static void do_class(Klass* k) { - nclasses++; - class_size += k->size(); - if (k->oop_is_instance()) { - InstanceKlass* ik = (InstanceKlass*)k; - class_size += ik->methods()->size(); - class_size += ik->constants()->size(); - class_size += ik->local_interfaces()->size(); - class_size += ik->transitive_interfaces()->size(); - // We do not have to count implementors, since we only store one! - // SSS: How should these be accounted now that they have moved? - // class_size += ik->fields()->length(); - } - } - - static void do_method(Method* m) { - nmethods++; - method_size += m->size(); - // class loader uses same objArray for empty vectors, so don't count these - if (m->has_stackmap_table()) { - method_size += m->stackmap_data()->size(); - } - - MethodData* mdo = m->method_data(); - if (mdo != NULL) { - nmethoddata++; - methoddata_size += mdo->size(); - } - } - - public: - static void print() { - SystemDictionary::classes_do(do_class); - SystemDictionary::methods_do(do_method); - tty->print_cr("Class statistics:"); - tty->print_cr("%d classes (%d bytes)", nclasses, class_size * oopSize); - tty->print_cr("%d methods (%d bytes = %d base + %d debug info)", nmethods, - (method_size + debug_size) * oopSize, method_size * oopSize, debug_size * oopSize); - tty->print_cr("%d methoddata (%d bytes)", nmethoddata, methoddata_size * oopSize); - } -}; - - -int ClassStatistics::nclasses = 0; -int ClassStatistics::nmethods = 0; -int ClassStatistics::nmethoddata = 0; -int ClassStatistics::class_size = 0; -int ClassStatistics::method_size = 0; -int ClassStatistics::debug_size = 0; -int ClassStatistics::methoddata_size = 0; - -void SystemDictionary::print_class_statistics() { - ResourceMark rm; - ClassStatistics::print(); -} - - -class MethodStatistics: AllStatic { - public: - enum { - max_parameter_size = 10 - }; - private: - - static int _number_of_methods; - static int _number_of_final_methods; - static int _number_of_static_methods; - static int _number_of_native_methods; - static int _number_of_synchronized_methods; - static int _number_of_profiled_methods; - static int _number_of_bytecodes; - static int _parameter_size_profile[max_parameter_size]; - static int _bytecodes_profile[Bytecodes::number_of_java_codes]; - - static void initialize() { - _number_of_methods = 0; - _number_of_final_methods = 0; - _number_of_static_methods = 0; - _number_of_native_methods = 0; - _number_of_synchronized_methods = 0; - _number_of_profiled_methods = 0; - _number_of_bytecodes = 0; - for (int i = 0; i < max_parameter_size ; i++) _parameter_size_profile[i] = 0; - for (int j = 0; j < Bytecodes::number_of_java_codes; j++) _bytecodes_profile [j] = 0; - }; - - static void do_method(Method* m) { - _number_of_methods++; - // collect flag info - if 
(m->is_final() ) _number_of_final_methods++; - if (m->is_static() ) _number_of_static_methods++; - if (m->is_native() ) _number_of_native_methods++; - if (m->is_synchronized()) _number_of_synchronized_methods++; - if (m->method_data() != NULL) _number_of_profiled_methods++; - // collect parameter size info (add one for receiver, if any) - _parameter_size_profile[MIN2(m->size_of_parameters() + (m->is_static() ? 0 : 1), max_parameter_size - 1)]++; - // collect bytecodes info - { - Thread *thread = Thread::current(); - HandleMark hm(thread); - BytecodeStream s(methodHandle(thread, m)); - Bytecodes::Code c; - while ((c = s.next()) >= 0) { - _number_of_bytecodes++; - _bytecodes_profile[c]++; - } - } - } - - public: - static void print() { - initialize(); - SystemDictionary::methods_do(do_method); - // generate output - tty->cr(); - tty->print_cr("Method statistics (static):"); - // flag distribution - tty->cr(); - tty->print_cr("%6d final methods %6.1f%%", _number_of_final_methods , _number_of_final_methods * 100.0F / _number_of_methods); - tty->print_cr("%6d static methods %6.1f%%", _number_of_static_methods , _number_of_static_methods * 100.0F / _number_of_methods); - tty->print_cr("%6d native methods %6.1f%%", _number_of_native_methods , _number_of_native_methods * 100.0F / _number_of_methods); - tty->print_cr("%6d synchronized methods %6.1f%%", _number_of_synchronized_methods, _number_of_synchronized_methods * 100.0F / _number_of_methods); - tty->print_cr("%6d profiled methods %6.1f%%", _number_of_profiled_methods, _number_of_profiled_methods * 100.0F / _number_of_methods); - // parameter size profile - tty->cr(); - { int tot = 0; - int avg = 0; - for (int i = 0; i < max_parameter_size; i++) { - int n = _parameter_size_profile[i]; - tot += n; - avg += n*i; - tty->print_cr("parameter size = %1d: %6d methods %5.1f%%", i, n, n * 100.0F / _number_of_methods); - } - assert(tot == _number_of_methods, "should be the same"); - tty->print_cr(" %6d methods 100.0%%", _number_of_methods); - tty->print_cr("(average parameter size = %3.1f including receiver, if any)", (float)avg / _number_of_methods); - } - // bytecodes profile - tty->cr(); - { int tot = 0; - for (int i = 0; i < Bytecodes::number_of_java_codes; i++) { - if (Bytecodes::is_defined(i)) { - Bytecodes::Code c = Bytecodes::cast(i); - int n = _bytecodes_profile[c]; - tot += n; - tty->print_cr("%9d %7.3f%% %s", n, n * 100.0F / _number_of_bytecodes, Bytecodes::name(c)); - } - } - assert(tot == _number_of_bytecodes, "should be the same"); - tty->print_cr("%9d 100.000%%", _number_of_bytecodes); - } - tty->cr(); - } -}; - -int MethodStatistics::_number_of_methods; -int MethodStatistics::_number_of_final_methods; -int MethodStatistics::_number_of_static_methods; -int MethodStatistics::_number_of_native_methods; -int MethodStatistics::_number_of_synchronized_methods; -int MethodStatistics::_number_of_profiled_methods; -int MethodStatistics::_number_of_bytecodes; -int MethodStatistics::_parameter_size_profile[MethodStatistics::max_parameter_size]; -int MethodStatistics::_bytecodes_profile[Bytecodes::number_of_java_codes]; - - -void SystemDictionary::print_method_statistics() { - MethodStatistics::print(); -} - -#endif // PRODUCT diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/classfile/systemDictionary.hpp --- a/hotspot/src/share/vm/classfile/systemDictionary.hpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/classfile/systemDictionary.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -366,8 +366,6 @@ // Printing static void print(bool details 
= true); static void print_shared(bool details = true); - static void print_class_statistics() PRODUCT_RETURN; - static void print_method_statistics() PRODUCT_RETURN; // Number of contained klasses // This is both fully loaded classes and classes in the process diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/code/codeCache.cpp --- a/hotspot/src/share/vm/code/codeCache.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/code/codeCache.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -746,14 +746,17 @@ void CodeCache::gc_epilogue() { assert_locked_or_safepoint(CodeCache_lock); NMethodIterator iter; - while(iter.next_alive()) { + while(iter.next()) { nmethod* nm = iter.method(); - assert(!nm->is_unloaded(), "Tautology"); - if (needs_cache_clean()) { - nm->cleanup_inline_caches(); + if (!nm->is_zombie()) { + if (needs_cache_clean()) { + // Clean ICs of unloaded nmethods as well because they may reference other + // unloaded nmethods that may be flushed earlier in the sweeper cycle. + nm->cleanup_inline_caches(); + } + DEBUG_ONLY(nm->verify()); + DEBUG_ONLY(nm->verify_oop_relocations()); } - DEBUG_ONLY(nm->verify()); - DEBUG_ONLY(nm->verify_oop_relocations()); } set_needs_cache_clean(false); prune_scavenge_root_nmethods(); @@ -993,29 +996,6 @@ return number_of_marked_CodeBlobs; } -void CodeCache::make_marked_nmethods_zombies() { - assert(SafepointSynchronize::is_at_safepoint(), "must be at a safepoint"); - NMethodIterator iter; - while(iter.next_alive()) { - nmethod* nm = iter.method(); - if (nm->is_marked_for_deoptimization()) { - - // If the nmethod has already been made non-entrant and it can be converted - // then zombie it now. Otherwise make it non-entrant and it will eventually - // be zombied when it is no longer seen on the stack. Note that the nmethod - // might be "entrant" and not on the stack and so could be zombied immediately - // but we can't tell because we don't track it on stack until it becomes - // non-entrant. 
- - if (nm->is_not_entrant() && nm->can_not_entrant_be_converted()) { - nm->make_zombie(); - } else { - nm->make_not_entrant(); - } - } - } -} - void CodeCache::make_marked_nmethods_not_entrant() { assert_locked_or_safepoint(CodeCache_lock); NMethodIterator iter; @@ -1072,7 +1052,7 @@ // Deoptimize all activations depending on marked nmethods Deoptimization::deoptimize_dependents(); - // Make the dependent methods not entrant (in VM_Deoptimize they are made zombies) + // Make the dependent methods not entrant make_marked_nmethods_not_entrant(); } } @@ -1102,7 +1082,7 @@ // Deoptimize all activations depending on marked nmethods Deoptimization::deoptimize_dependents(); - // Make the dependent methods not entrant (in VM_Deoptimize they are made zombies) + // Make the dependent methods not entrant make_marked_nmethods_not_entrant(); } } diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/code/codeCache.hpp --- a/hotspot/src/share/vm/code/codeCache.hpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/code/codeCache.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -225,7 +225,6 @@ public: static void mark_all_nmethods_for_deoptimization(); static int mark_for_deoptimization(Method* dependee); - static void make_marked_nmethods_zombies(); static void make_marked_nmethods_not_entrant(); // Flushing and deoptimization diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/code/compiledIC.cpp --- a/hotspot/src/share/vm/code/compiledIC.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/code/compiledIC.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -343,8 +343,8 @@ // Kill any leftover stub we might have too clear_ic_stub(); if (is_optimized()) { - set_ic_destination(entry); - } else { + set_ic_destination(entry); + } else { set_ic_destination_and_value(entry, (void*)NULL); } } else { diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/code/compiledIC.hpp --- a/hotspot/src/share/vm/code/compiledIC.hpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/code/compiledIC.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -214,7 +214,7 @@ // // They all takes a TRAP argument, since they can cause a GC if the inline-cache buffer is full. // - void set_to_clean(); // Can only be called during a safepoint operation + void set_to_clean(); void set_to_monomorphic(CompiledICInfo& info); void clear_ic_stub(); diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/code/nmethod.cpp --- a/hotspot/src/share/vm/code/nmethod.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/code/nmethod.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -1021,7 +1021,6 @@ void nmethod::cleanup_inline_caches() { - assert_locked_or_safepoint(CompiledIC_lock); // If the method is not entrant or zombie then a JMP is plastered over the @@ -1037,7 +1036,8 @@ // In fact, why are we bothering to look at oops in a non-entrant method?? } - // Find all calls in an nmethod, and clear the ones that points to zombie methods + // Find all calls in an nmethod and clear the ones that point to non-entrant, + // zombie and unloaded nmethods. 
ResourceMark rm; RelocIterator iter(this, low_boundary); while(iter.next()) { @@ -1049,7 +1049,7 @@ CodeBlob *cb = CodeCache::find_blob_unsafe(ic->ic_destination()); if( cb != NULL && cb->is_nmethod() ) { nmethod* nm = (nmethod*)cb; - // Clean inline caches pointing to both zombie and not_entrant methods + // Clean inline caches pointing to zombie, non-entrant and unloaded methods if (!nm->is_in_use() || (nm->method()->code() != nm)) ic->set_to_clean(); } break; @@ -1059,7 +1059,7 @@ CodeBlob *cb = CodeCache::find_blob_unsafe(csc->destination()); if( cb != NULL && cb->is_nmethod() ) { nmethod* nm = (nmethod*)cb; - // Clean inline caches pointing to both zombie and not_entrant methods + // Clean inline caches pointing to zombie, non-entrant and unloaded methods if (!nm->is_in_use() || (nm->method()->code() != nm)) csc->set_to_clean(); } break; @@ -2529,7 +2529,7 @@ // Hmm. OSR methods can be deopted but not marked as zombie or not_entrant // seems odd. - if( is_zombie() || is_not_entrant() ) + if (is_zombie() || is_not_entrant() || is_unloaded()) return; // Make sure all the entry points are correctly aligned for patching. diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/compiler/compileBroker.cpp --- a/hotspot/src/share/vm/compiler/compileBroker.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/compiler/compileBroker.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -1399,6 +1399,28 @@ // do the compilation if (method->is_native()) { if (!PreferInterpreterNativeStubs || method->is_method_handle_intrinsic()) { + // The following native methods: + // + // java.lang.Float.intBitsToFloat + // java.lang.Float.floatToRawIntBits + // java.lang.Double.longBitsToDouble + // java.lang.Double.doubleToRawLongBits + // + // are called through the interpreter even if interpreter native stubs + // are not preferred (i.e., calling through adapter handlers is preferred). + // The reason is that on x86_32 signaling NaNs (sNaNs) are not preserved + // if the version of the methods from the native libraries is called. + // As the interpreter and the C2-intrinsified version of the methods preserves + // sNaNs, that would result in an inconsistent way of handling of sNaNs. + if ((UseSSE >= 1 && + (method->intrinsic_id() == vmIntrinsics::_intBitsToFloat || + method->intrinsic_id() == vmIntrinsics::_floatToRawIntBits)) || + (UseSSE >= 2 && + (method->intrinsic_id() == vmIntrinsics::_longBitsToDouble || + method->intrinsic_id() == vmIntrinsics::_doubleToRawLongBits))) { + return NULL; + } + // To properly handle the appendix argument for out-of-line calls we are using a small trampoline that // pops off the appendix argument and jumps to the target (see gen_special_dispatch in SharedRuntime). 
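Illustrative aside (not part of the changeset): the compileBroker.cpp comment above hinges on the fact that moving a float value through the x87 FPU stack converts a signaling NaN into a quiet NaN, while SSE moves and the interpreter/C2 intrinsics keep the bit pattern intact. A small stand-alone C++ demonstration, assuming a 32-bit build whose float loads and stores go through x87 (it is not VM code):

#include <cstdint>
#include <cstring>
#include <cstdio>

int main() {
  const uint32_t snan_bits = 0x7f800001u;   // a signaling NaN bit pattern
  float f;
  std::memcpy(&f, &snan_bits, sizeof f);    // reinterpret the bits as a float
  volatile float g = f;                     // with x87 codegen this round trip goes
  float h = g;                              // through the FPU stack and may quiet the NaN
  uint32_t out;
  std::memcpy(&out, &h, sizeof out);
  // On an x87 path 'out' can come back as 0x7fc00001 (quiet NaN); on SSE it stays
  // 0x7f800001. That is why the hunk above keeps intBitsToFloat/floatToRawIntBits
  // (and the double variants) on the interpreter/intrinsic path instead of letting
  // the native library versions run on x86_32.
  std::printf("in=0x%08x out=0x%08x\n", snan_bits, out);
  return 0;
}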
// diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/gc/cms/concurrentMarkSweepGeneration.cpp --- a/hotspot/src/share/vm/gc/cms/concurrentMarkSweepGeneration.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/gc/cms/concurrentMarkSweepGeneration.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -620,7 +620,7 @@ // Support for parallelizing survivor space rescan if ((CMSParallelRemarkEnabled && CMSParallelSurvivorRemarkEnabled) || CMSParallelInitialMarkEnabled) { const size_t max_plab_samples = - _young_gen->max_survivor_size() / (ThreadLocalAllocBuffer::min_size() * HeapWordSize); + _young_gen->max_survivor_size() / (PLAB::min_size() * HeapWordSize); _survivor_plab_array = NEW_C_HEAP_ARRAY(ChunkArray, ParallelGCThreads, mtGC); _survivor_chunk_array = NEW_C_HEAP_ARRAY(HeapWord*, max_plab_samples, mtGC); @@ -3005,7 +3005,7 @@ COMPILER2_PRESENT(DerivedPointerTableDeactivate dpt_deact;) if (CMSParallelInitialMarkEnabled) { // The parallel version. - FlexibleWorkGang* workers = gch->workers(); + WorkGang* workers = gch->workers(); assert(workers != NULL, "Need parallel worker threads."); uint n_workers = workers->active_workers(); @@ -4488,7 +4488,7 @@ // workers to be taken from the active workers in the work gang. CMSParRemarkTask(CMSCollector* collector, CompactibleFreeListSpace* cms_space, - uint n_workers, FlexibleWorkGang* workers, + uint n_workers, WorkGang* workers, OopTaskQueueSet* task_queues, StrongRootsScope* strong_roots_scope): CMSParMarkTask("Rescan roots and grey objects in parallel", @@ -5061,7 +5061,7 @@ // Parallel version of remark void CMSCollector::do_remark_parallel() { GenCollectedHeap* gch = GenCollectedHeap::heap(); - FlexibleWorkGang* workers = gch->workers(); + WorkGang* workers = gch->workers(); assert(workers != NULL, "Need parallel worker threads."); // Choose to use the number of GC workers most recently set // into "active_workers". @@ -5236,6 +5236,16 @@ //////////////////////////////////////////////////////// // Parallel Reference Processing Task Proxy Class //////////////////////////////////////////////////////// +class AbstractGangTaskWOopQueues : public AbstractGangTask { + OopTaskQueueSet* _queues; + ParallelTaskTerminator _terminator; + public: + AbstractGangTaskWOopQueues(const char* name, OopTaskQueueSet* queues, uint n_threads) : + AbstractGangTask(name), _queues(queues), _terminator(n_threads, _queues) {} + ParallelTaskTerminator* terminator() { return &_terminator; } + OopTaskQueueSet* queues() { return _queues; } +}; + class CMSRefProcTaskProxy: public AbstractGangTaskWOopQueues { typedef AbstractRefProcTaskExecutor::ProcessTask ProcessTask; CMSCollector* _collector; @@ -5372,7 +5382,7 @@ void CMSRefProcTaskExecutor::execute(ProcessTask& task) { GenCollectedHeap* gch = GenCollectedHeap::heap(); - FlexibleWorkGang* workers = gch->workers(); + WorkGang* workers = gch->workers(); assert(workers != NULL, "Need parallel worker threads."); CMSRefProcTaskProxy rp_task(task, &_collector, _collector.ref_processor()->span(), @@ -5385,7 +5395,7 @@ { GenCollectedHeap* gch = GenCollectedHeap::heap(); - FlexibleWorkGang* workers = gch->workers(); + WorkGang* workers = gch->workers(); assert(workers != NULL, "Need parallel worker threads."); CMSRefEnqueueTaskProxy enq_task(task); workers->run_task(&enq_task); @@ -5419,7 +5429,7 @@ // balance_all_queues() and balance_queues()). 
GenCollectedHeap* gch = GenCollectedHeap::heap(); uint active_workers = ParallelGCThreads; - FlexibleWorkGang* workers = gch->workers(); + WorkGang* workers = gch->workers(); if (workers != NULL) { active_workers = workers->active_workers(); // The expectation is that active_workers will have already diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/gc/cms/parNewGeneration.cpp --- a/hotspot/src/share/vm/gc/cms/parNewGeneration.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/gc/cms/parNewGeneration.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -25,7 +25,7 @@ #include "precompiled.hpp" #include "gc/cms/compactibleFreeListSpace.hpp" #include "gc/cms/concurrentMarkSweepGeneration.hpp" -#include "gc/cms/parNewGeneration.hpp" +#include "gc/cms/parNewGeneration.inline.hpp" #include "gc/cms/parOopClosures.inline.hpp" #include "gc/serial/defNewGeneration.inline.hpp" #include "gc/shared/adaptiveSizePolicy.hpp" @@ -248,8 +248,7 @@ } } if (buf_space != NULL) { - plab->set_word_size(buf_size); - plab->set_buf(buf_space); + plab->set_buf(buf_space, buf_size); record_survivor_plab(buf_space, buf_size); obj = plab->allocate_aligned(word_sz, SurvivorAlignmentInBytes); // Note that we cannot compare buf_size < word_sz below @@ -803,7 +802,7 @@ void ParNewRefProcTaskExecutor::execute(ProcessTask& task) { GenCollectedHeap* gch = GenCollectedHeap::heap(); - FlexibleWorkGang* workers = gch->workers(); + WorkGang* workers = gch->workers(); assert(workers != NULL, "Need parallel worker threads."); _state_set.reset(workers->active_workers(), _young_gen.promotion_failed()); ParNewRefProcTaskProxy rp_task(task, _young_gen, _old_gen, @@ -816,7 +815,7 @@ void ParNewRefProcTaskExecutor::execute(EnqueueTask& task) { GenCollectedHeap* gch = GenCollectedHeap::heap(); - FlexibleWorkGang* workers = gch->workers(); + WorkGang* workers = gch->workers(); assert(workers != NULL, "Need parallel worker threads."); ParNewRefEnqueueTaskProxy enq_task(task); workers->run_task(&enq_task); @@ -890,7 +889,7 @@ _gc_timer->register_gc_start(); AdaptiveSizePolicy* size_policy = gch->gen_policy()->size_policy(); - FlexibleWorkGang* workers = gch->workers(); + WorkGang* workers = gch->workers(); assert(workers != NULL, "Need workgang for parallel work"); uint active_workers = AdaptiveSizePolicy::calc_active_workers(workers->total_workers(), diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/gc/cms/parNewGeneration.hpp --- a/hotspot/src/share/vm/gc/cms/parNewGeneration.hpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/gc/cms/parNewGeneration.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -169,11 +169,7 @@ // Allocate a to-space block of size "sz", or else return NULL. HeapWord* alloc_in_to_space_slow(size_t word_sz); - HeapWord* alloc_in_to_space(size_t word_sz) { - HeapWord* obj = to_space_alloc_buffer()->allocate_aligned(word_sz, SurvivorAlignmentInBytes); - if (obj != NULL) return obj; - else return alloc_in_to_space_slow(word_sz); - } + inline HeapWord* alloc_in_to_space(size_t word_sz); HeapWord* young_old_boundary() { return _young_old_boundary; } diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/gc/cms/parNewGeneration.inline.hpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hotspot/src/share/vm/gc/cms/parNewGeneration.inline.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef SHARE_VM_GC_CMS_PARNEWGENERATION_INLINE_HPP +#define SHARE_VM_GC_CMS_PARNEWGENERATION_INLINE_HPP + +#include "gc/cms/parNewGeneration.hpp" +#include "gc/shared/plab.inline.hpp" +#include "utilities/globalDefinitions.hpp" + +inline HeapWord* ParScanThreadState::alloc_in_to_space(size_t word_sz) { + HeapWord* obj = to_space_alloc_buffer()->allocate_aligned(word_sz, SurvivorAlignmentInBytes); + if (obj != NULL) return obj; + else return alloc_in_to_space_slow(word_sz); +} +#endif // SHARE_VM_GC_CMS_PARNEWGENERATION_INLINE_HPP diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/gc/cms/yieldingWorkgroup.cpp --- a/hotspot/src/share/vm/gc/cms/yieldingWorkgroup.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/gc/cms/yieldingWorkgroup.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -26,20 +26,45 @@ #include "gc/cms/yieldingWorkgroup.hpp" #include "utilities/macros.hpp" -// Forward declaration of classes declared here. - -class GangWorker; -class WorkData; +YieldingFlexibleGangWorker::YieldingFlexibleGangWorker(YieldingFlexibleWorkGang* gang, int id) + : AbstractGangWorker(gang, id) {} YieldingFlexibleWorkGang::YieldingFlexibleWorkGang( - const char* name, uint workers, bool are_GC_task_threads) : - FlexibleWorkGang(name, workers, are_GC_task_threads, false), - _yielded_workers(0) {} + const char* name, uint workers, bool are_GC_task_threads) : + AbstractWorkGang(name, workers, are_GC_task_threads, false), + _yielded_workers(0), + _started_workers(0), + _finished_workers(0), + _sequence_number(0), + _task(NULL) { + + // Other initialization. 
+ _monitor = new Monitor(/* priority */ Mutex::leaf, + /* name */ "WorkGroup monitor", + /* allow_vm_block */ are_GC_task_threads, + Monitor::_safepoint_check_sometimes); + + assert(monitor() != NULL, "Failed to allocate monitor"); +} -GangWorker* YieldingFlexibleWorkGang::allocate_worker(uint which) { - YieldingFlexibleGangWorker* new_member = - new YieldingFlexibleGangWorker(this, which); - return (YieldingFlexibleGangWorker*) new_member; +AbstractGangWorker* YieldingFlexibleWorkGang::allocate_worker(uint which) { + return new YieldingFlexibleGangWorker(this, which); +} + +void YieldingFlexibleWorkGang::internal_worker_poll(YieldingWorkData* data) const { + assert(data != NULL, "worker data is null"); + data->set_task(task()); + data->set_sequence_number(sequence_number()); +} + +void YieldingFlexibleWorkGang::internal_note_start() { + assert(monitor()->owned_by_self(), "note_finish is an internal method"); + _started_workers += 1; +} + +void YieldingFlexibleWorkGang::internal_note_finish() { + assert(monitor()->owned_by_self(), "note_finish is an internal method"); + _finished_workers += 1; } // Run a task; returns when the task is done, or the workers yield, @@ -292,37 +317,37 @@ /////////////////////////////// void YieldingFlexibleGangWorker::loop() { int previous_sequence_number = 0; - Monitor* gang_monitor = gang()->monitor(); + Monitor* gang_monitor = yf_gang()->monitor(); MutexLockerEx ml(gang_monitor, Mutex::_no_safepoint_check_flag); - WorkData data; + YieldingWorkData data; int id; while (true) { // Check if there is work to do. - gang()->internal_worker_poll(&data); + yf_gang()->internal_worker_poll(&data); if (data.task() != NULL && data.sequence_number() != previous_sequence_number) { // There is work to be done. // First check if we need to become active or if there // are already the requisite number of workers - if (gang()->started_workers() == yf_gang()->active_workers()) { + if (yf_gang()->started_workers() == yf_gang()->active_workers()) { // There are already enough workers, we do not need to // to run; fall through and wait on monitor. } else { // We need to pitch in and do the work. - assert(gang()->started_workers() < yf_gang()->active_workers(), + assert(yf_gang()->started_workers() < yf_gang()->active_workers(), "Unexpected state"); - id = gang()->started_workers(); - gang()->internal_note_start(); + id = yf_gang()->started_workers(); + yf_gang()->internal_note_start(); // Now, release the gang mutex and do the work. 
{ MutexUnlockerEx mul(gang_monitor, Mutex::_no_safepoint_check_flag); data.task()->work(id); // This might include yielding } // Reacquire monitor and note completion of this worker - gang()->internal_note_finish(); + yf_gang()->internal_note_finish(); // Update status of task based on whether all workers have // finished or some have yielded - assert(data.task() == gang()->task(), "Confused task binding"); - if (gang()->finished_workers() == yf_gang()->active_workers()) { + assert(data.task() == yf_gang()->task(), "Confused task binding"); + if (yf_gang()->finished_workers() == yf_gang()->active_workers()) { switch (data.yf_task()->status()) { case ABORTING: { data.yf_task()->set_status(ABORTED); @@ -338,7 +363,7 @@ } gang_monitor->notify_all(); // Notify overseer } else { // at least one worker is still working or yielded - assert(gang()->finished_workers() < yf_gang()->active_workers(), + assert(yf_gang()->finished_workers() < yf_gang()->active_workers(), "Counts inconsistent"); switch (data.yf_task()->status()) { case ACTIVE: { @@ -347,7 +372,7 @@ break; } case YIELDING: { - if (gang()->finished_workers() + yf_gang()->yielded_workers() + if (yf_gang()->finished_workers() + yf_gang()->yielded_workers() == yf_gang()->active_workers()) { data.yf_task()->set_status(YIELDED); gang_monitor->notify_all(); // notify overseer diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/gc/cms/yieldingWorkgroup.hpp --- a/hotspot/src/share/vm/gc/cms/yieldingWorkgroup.hpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/gc/cms/yieldingWorkgroup.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -29,6 +29,7 @@ #include "utilities/macros.hpp" // Forward declarations +class YieldingFlexibleGangTask; class YieldingFlexibleWorkGang; // Status of tasks @@ -43,13 +44,32 @@ COMPLETED }; +class YieldingWorkData: public StackObj { + // This would be a struct, but I want accessor methods. +private: + AbstractGangTask* _task; + int _sequence_number; +public: + // Constructor and destructor + YieldingWorkData() : _task(NULL), _sequence_number(0) {} + ~YieldingWorkData() {} + + // Accessors and modifiers + AbstractGangTask* task() const { return _task; } + void set_task(AbstractGangTask* value) { _task = value; } + int sequence_number() const { return _sequence_number; } + void set_sequence_number(int value) { _sequence_number = value; } + + YieldingFlexibleGangTask* yf_task() const { + return (YieldingFlexibleGangTask*)_task; + } +}; + // Class YieldingFlexibleGangWorker: // Several instances of this class run in parallel as workers for a gang. -class YieldingFlexibleGangWorker: public GangWorker { +class YieldingFlexibleGangWorker: public AbstractGangWorker { public: - // Ctor - YieldingFlexibleGangWorker(AbstractWorkGang* gang, int id) : - GangWorker(gang, id) { } + YieldingFlexibleGangWorker(YieldingFlexibleWorkGang* gang, int id); public: YieldingFlexibleWorkGang* yf_gang() const @@ -108,9 +128,6 @@ friend class YieldingFlexibleWorkGang; friend class YieldingFlexibleGangWorker; - NOT_PRODUCT(virtual bool is_YieldingFlexibleGang_task() const { - return true; - }) void set_status(Status s) { _status = s; @@ -160,7 +177,7 @@ // YieldingGangWorkers, and provides infrastructure // supporting yielding to the "GangOverseer", // being the thread that orchestrates the WorkGang via run_task(). -class YieldingFlexibleWorkGang: public FlexibleWorkGang { +class YieldingFlexibleWorkGang: public AbstractWorkGang { // Here's the public interface to this class. public: // Constructor and destructor. 
@@ -168,12 +185,10 @@ bool are_GC_task_threads); YieldingFlexibleGangTask* yielding_task() const { - assert(task() == NULL || task()->is_YieldingFlexibleGang_task(), - "Incorrect cast"); - return (YieldingFlexibleGangTask*)task(); + return task(); } // Allocate a worker and return a pointer to it. - GangWorker* allocate_worker(uint which); + AbstractGangWorker* allocate_worker(uint which); // Run a task; returns when the task is done, or the workers yield, // or the task is aborted. @@ -216,6 +231,42 @@ private: friend class YieldingFlexibleGangWorker; void reset(); // NYI + + + // The monitor which protects these data, + // and notifies of changes in it. + Monitor* _monitor; + // Accessors for fields + Monitor* monitor() const { + return _monitor; + } + + // The number of started workers. + uint _started_workers; + // The number of finished workers. + uint _finished_workers; + + uint started_workers() const { + return _started_workers; + } + uint finished_workers() const { + return _finished_workers; + } + + // A sequence number for the current task. + int _sequence_number; + int sequence_number() const { + return _sequence_number; + } + + YieldingFlexibleGangTask* _task; + YieldingFlexibleGangTask* task() const { + return _task; + } + + void internal_worker_poll(YieldingWorkData* data) const; + void internal_note_start(); + void internal_note_finish(); }; #endif // SHARE_VM_GC_CMS_YIELDINGWORKGROUP_HPP diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/gc/g1/concurrentMark.cpp --- a/hotspot/src/share/vm/gc/g1/concurrentMark.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/gc/g1/concurrentMark.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -629,7 +629,7 @@ gclog_or_tty->print_cr("CL Sleep Factor %1.4lf", cleanup_sleep_factor()); #endif - _parallel_workers = new FlexibleWorkGang("G1 Marker", + _parallel_workers = new WorkGang("G1 Marker", _max_parallel_marking_threads, false, true); if (_parallel_workers == NULL) { vm_exit_during_initialization("Failed necessary allocation."); @@ -3088,29 +3088,6 @@ } #endif -template -inline void CMTask::process_grey_object(oop obj) { - assert(scan || obj->is_typeArray(), "Skipping scan of grey non-typeArray"); - assert(_nextMarkBitMap->isMarked((HeapWord*) obj), "invariant"); - - if (_cm->verbose_high()) { - gclog_or_tty->print_cr("[%u] processing grey object " PTR_FORMAT, - _worker_id, p2i((void*) obj)); - } - - size_t obj_size = obj->size(); - _words_scanned += obj_size; - - if (scan) { - obj->oop_iterate(_cm_oop_closure); - } - statsOnly( ++_objs_scanned ); - check_limits(); -} - -template void CMTask::process_grey_object(oop); -template void CMTask::process_grey_object(oop); - // Closure for iteration over bitmaps class CMBitMapClosure : public BitMapClosure { private: diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/gc/g1/concurrentMark.hpp --- a/hotspot/src/share/vm/gc/g1/concurrentMark.hpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/gc/g1/concurrentMark.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -451,7 +451,7 @@ double* _accum_task_vtime; // Accumulated task vtime - FlexibleWorkGang* _parallel_workers; + WorkGang* _parallel_workers; ForceOverflowSettings _force_overflow_conc; ForceOverflowSettings _force_overflow_stw; @@ -1126,7 +1126,7 @@ inline void deal_with_reference(oop obj); // It scans an object and visits its children. - void scan_object(oop obj) { process_grey_object(obj); } + inline void scan_object(oop obj); // It pushes an object on the local queue. 
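Illustrative aside (not part of the changeset): the concurrentMark hunks here replace the out-of-line CMTask::process_grey_object definition (and its two explicit instantiations) with a definition in concurrentMark.inline.hpp, shown just below, mirroring the new parNewGeneration.inline.hpp earlier in the changeset. Template and inline member bodies have to be visible at every call site, which is what the *.inline.hpp split provides. A generic single-file sketch with made-up names:

#include <cstdio>

// --- would live in foo.hpp: declaration only ---
struct Foo {
  template <bool Verbose>
  void process(int x);
};

// --- would live in foo.inline.hpp and be included by every caller ---
template <bool Verbose>
inline void Foo::process(int x) {
  if (Verbose) {
    std::printf("processing %d\n", x);
  }
}

// Because call sites see the body, no explicit instantiations (like the two
// removed from concurrentMark.cpp above) are required.
int main() {
  Foo f;
  f.process<true>(42);
  f.process<false>(7);
  return 0;
}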
inline void push(oop obj); diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/gc/g1/concurrentMark.inline.hpp --- a/hotspot/src/share/vm/gc/g1/concurrentMark.inline.hpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/gc/g1/concurrentMark.inline.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -232,6 +232,9 @@ } } +// It scans an object and visits its children. +inline void CMTask::scan_object(oop obj) { process_grey_object(obj); } + inline void CMTask::push(oop obj) { HeapWord* objAddr = (HeapWord*) obj; assert(_g1h->is_in_g1_reserved(objAddr), "invariant"); @@ -299,6 +302,28 @@ return objAddr < global_finger; } +template +inline void CMTask::process_grey_object(oop obj) { + assert(scan || obj->is_typeArray(), "Skipping scan of grey non-typeArray"); + assert(_nextMarkBitMap->isMarked((HeapWord*) obj), "invariant"); + + if (_cm->verbose_high()) { + gclog_or_tty->print_cr("[%u] processing grey object " PTR_FORMAT, + _worker_id, p2i((void*) obj)); + } + + size_t obj_size = obj->size(); + _words_scanned += obj_size; + + if (scan) { + obj->oop_iterate(_cm_oop_closure); + } + statsOnly( ++_objs_scanned ); + check_limits(); +} + + + inline void CMTask::make_reference_grey(oop obj, HeapRegion* hr) { if (_cm->par_mark_and_count(obj, hr, _marked_bytes_array, _card_bm)) { diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/gc/g1/g1AllocRegion.cpp --- a/hotspot/src/share/vm/gc/g1/g1AllocRegion.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/gc/g1/g1AllocRegion.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -46,10 +46,11 @@ _dummy_region = dummy_region; } -void G1AllocRegion::fill_up_remaining_space(HeapRegion* alloc_region, - bool bot_updates) { +size_t G1AllocRegion::fill_up_remaining_space(HeapRegion* alloc_region, + bool bot_updates) { assert(alloc_region != NULL && alloc_region != _dummy_region, "pre-condition"); + size_t result = 0; // Other threads might still be trying to allocate using a CAS out // of the region we are trying to retire, as they can do so without @@ -73,6 +74,7 @@ // If the allocation was successful we should fill in the space. CollectedHeap::fill_with_object(dummy, free_word_size); alloc_region->set_pre_dummy_top(dummy); + result += free_word_size * HeapWordSize; break; } @@ -81,13 +83,18 @@ // allocation and they fill up the region. In that case, we can // just get out of the loop. 
} + result += alloc_region->free(); + assert(alloc_region->free() / HeapWordSize < min_word_size_to_fill, "post-condition"); + return result; } -void G1AllocRegion::retire(bool fill_up) { +size_t G1AllocRegion::retire(bool fill_up) { assert(_alloc_region != NULL, ar_ext_msg(this, "not initialized properly")); + size_t result = 0; + trace("retiring"); HeapRegion* alloc_region = _alloc_region; if (alloc_region != _dummy_region) { @@ -98,7 +105,7 @@ ar_ext_msg(this, "the alloc region should never be empty")); if (fill_up) { - fill_up_remaining_space(alloc_region, _bot_updates); + result = fill_up_remaining_space(alloc_region, _bot_updates); } assert(alloc_region->used() >= _used_bytes_before, @@ -109,6 +116,8 @@ _alloc_region = _dummy_region; } trace("retired"); + + return result; } HeapWord* G1AllocRegion::new_alloc_region_and_allocate(size_t word_size, @@ -196,11 +205,11 @@ } #if G1_ALLOC_REGION_TRACING -void G1AllocRegion::trace(const char* str, size_t word_size, HeapWord* result) { +void G1AllocRegion::trace(const char* str, size_t min_word_size, size_t desired_word_size, size_t actual_word_size, HeapWord* result) { // All the calls to trace that set either just the size or the size // and the result are considered part of level 2 tracing and are // skipped during level 1 tracing. - if ((word_size == 0 && result == NULL) || (G1_ALLOC_REGION_TRACING > 1)) { + if ((actual_word_size == 0 && result == NULL) || (G1_ALLOC_REGION_TRACING > 1)) { const size_t buffer_length = 128; char hr_buffer[buffer_length]; char rest_buffer[buffer_length]; @@ -217,10 +226,10 @@ if (G1_ALLOC_REGION_TRACING > 1) { if (result != NULL) { - jio_snprintf(rest_buffer, buffer_length, SIZE_FORMAT " " PTR_FORMAT, - word_size, result); - } else if (word_size != 0) { - jio_snprintf(rest_buffer, buffer_length, SIZE_FORMAT, word_size); + jio_snprintf(rest_buffer, buffer_length, "min " SIZE_FORMAT " desired " SIZE_FORMAT " actual " SIZE_FORMAT " " PTR_FORMAT, + min_word_size, desired_word_size, actual_word_size, result); + } else if (min_word_size != 0) { + jio_snprintf(rest_buffer, buffer_length, "min " SIZE_FORMAT " desired " SIZE_FORMAT, min_word_size, desired_word_size); } else { jio_snprintf(rest_buffer, buffer_length, ""); } @@ -251,26 +260,25 @@ _g1h->retire_mutator_alloc_region(alloc_region, allocated_bytes); } -HeapRegion* SurvivorGCAllocRegion::allocate_new_region(size_t word_size, - bool force) { +HeapRegion* G1GCAllocRegion::allocate_new_region(size_t word_size, + bool force) { assert(!force, "not supported for GC alloc regions"); - return _g1h->new_gc_alloc_region(word_size, count(), InCSetState::Young); + return _g1h->new_gc_alloc_region(word_size, count(), _purpose); } -void SurvivorGCAllocRegion::retire_region(HeapRegion* alloc_region, - size_t allocated_bytes) { - _g1h->retire_gc_alloc_region(alloc_region, allocated_bytes, InCSetState::Young); +void G1GCAllocRegion::retire_region(HeapRegion* alloc_region, + size_t allocated_bytes) { + _g1h->retire_gc_alloc_region(alloc_region, allocated_bytes, _purpose); } -HeapRegion* OldGCAllocRegion::allocate_new_region(size_t word_size, - bool force) { - assert(!force, "not supported for GC alloc regions"); - return _g1h->new_gc_alloc_region(word_size, count(), InCSetState::Old); -} - -void OldGCAllocRegion::retire_region(HeapRegion* alloc_region, - size_t allocated_bytes) { - _g1h->retire_gc_alloc_region(alloc_region, allocated_bytes, InCSetState::Old); +size_t G1GCAllocRegion::retire(bool fill_up) { + HeapRegion* retired = get(); + size_t end_waste = 
G1AllocRegion::retire(fill_up); + // Do not count retirement of the dummy allocation region. + if (retired != NULL) { + _stats->add_region_end_waste(end_waste / HeapWordSize); + } + return end_waste; } HeapRegion* OldGCAllocRegion::release() { diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/gc/g1/g1AllocRegion.hpp --- a/hotspot/src/share/vm/gc/g1/g1AllocRegion.hpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/gc/g1/g1AllocRegion.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -26,6 +26,8 @@ #define SHARE_VM_GC_G1_G1ALLOCREGION_HPP #include "gc/g1/heapRegion.hpp" +#include "gc/g1/g1EvacStats.hpp" +#include "gc/g1/g1InCSetState.hpp" class G1CollectedHeap; @@ -102,16 +104,22 @@ static inline HeapWord* par_allocate(HeapRegion* alloc_region, size_t word_size, bool bot_updates); + // Perform a MT-safe allocation out of the given region, with the given + // minimum and desired size. Returns the actual size allocated (between + // minimum and desired size) in actual_word_size if the allocation has been + // successful. + static inline HeapWord* par_allocate(HeapRegion* alloc_region, + size_t min_word_size, + size_t desired_word_size, + size_t* actual_word_size, + bool bot_updates); // Ensure that the region passed as a parameter has been filled up // so that noone else can allocate out of it any more. - static void fill_up_remaining_space(HeapRegion* alloc_region, - bool bot_updates); - - // Retire the active allocating region. If fill_up is true then make - // sure that the region is full before we retire it so that noone - // else can allocate out of it. - void retire(bool fill_up); + // Returns the number of bytes that have been wasted by filled up + // the space. + static size_t fill_up_remaining_space(HeapRegion* alloc_region, + bool bot_updates); // After a region is allocated by alloc_new_region, this // method is used to set it as the active alloc_region @@ -126,6 +134,12 @@ void fill_in_ext_msg(ar_ext_msg* msg, const char* message); protected: + // Retire the active allocating region. If fill_up is true then make + // sure that the region is full before we retire it so that no one + // else can allocate out of it. + // Returns the number of bytes that have been filled up during retire. + virtual size_t retire(bool fill_up); + // For convenience as subclasses use it. static G1CollectedHeap* _g1h; @@ -154,7 +168,18 @@ // First-level allocation: Should be called without holding a // lock. It will try to allocate lock-free out of the active region, // or return NULL if it was unable to. - inline HeapWord* attempt_allocation(size_t word_size, bool bot_updates); + inline HeapWord* attempt_allocation(size_t word_size, + bool bot_updates); + // Perform an allocation out of the current allocation region, with the given + // minimum and desired size. Returns the actual size allocated (between + // minimum and desired size) in actual_word_size if the allocation has been + // successful. + // Should be called without holding a lock. It will try to allocate lock-free + // out of the active region, or return NULL if it was unable to. + inline HeapWord* attempt_allocation(size_t min_word_size, + size_t desired_word_size, + size_t* actual_word_size, + bool bot_updates); // Second-level allocation: Should be called while holding a // lock. It will try to first allocate lock-free out of the active @@ -164,6 +189,14 @@ // it conform to its locking protocol. 
inline HeapWord* attempt_allocation_locked(size_t word_size, bool bot_updates); + // Same as attempt_allocation_locked(size_t, bool), but allowing specification + // of minimum word size of the block in min_word_size, and the maximum word + // size of the allocation in desired_word_size. The actual size of the block is + // returned in actual_word_size. + inline HeapWord* attempt_allocation_locked(size_t min_word_size, + size_t desired_word_size, + size_t* actual_word_size, + bool bot_updates); // Should be called to allocate a new region even if the max of this // type of regions has been reached. Should only be called if other @@ -186,9 +219,17 @@ virtual HeapRegion* release(); #if G1_ALLOC_REGION_TRACING - void trace(const char* str, size_t word_size = 0, HeapWord* result = NULL); + void trace(const char* str, + size_t min_word_size = 0, + size_t desired_word_size = 0, + size_t actual_word_size = 0, + HeapWord* result = NULL); #else // G1_ALLOC_REGION_TRACING - void trace(const char* str, size_t word_size = 0, HeapWord* result = NULL) { } + void trace(const char* str, + size_t min_word_size = 0, + size_t desired_word_size = 0, + size_t actual_word_size = 0, + HeapWord* result = NULL) { } #endif // G1_ALLOC_REGION_TRACING }; @@ -201,22 +242,33 @@ : G1AllocRegion("Mutator Alloc Region", false /* bot_updates */) { } }; -class SurvivorGCAllocRegion : public G1AllocRegion { +// Common base class for allocation regions used during GC. +class G1GCAllocRegion : public G1AllocRegion { protected: + G1EvacStats* _stats; + InCSetState::in_cset_state_t _purpose; + virtual HeapRegion* allocate_new_region(size_t word_size, bool force); virtual void retire_region(HeapRegion* alloc_region, size_t allocated_bytes); + + virtual size_t retire(bool fill_up); public: - SurvivorGCAllocRegion() - : G1AllocRegion("Survivor GC Alloc Region", false /* bot_updates */) { } + G1GCAllocRegion(const char* name, bool bot_updates, G1EvacStats* stats, InCSetState::in_cset_state_t purpose) + : G1AllocRegion(name, bot_updates), _stats(stats), _purpose(purpose) { + assert(stats != NULL, "Must pass non-NULL PLAB statistics"); + } }; -class OldGCAllocRegion : public G1AllocRegion { -protected: - virtual HeapRegion* allocate_new_region(size_t word_size, bool force); - virtual void retire_region(HeapRegion* alloc_region, size_t allocated_bytes); +class SurvivorGCAllocRegion : public G1GCAllocRegion { public: - OldGCAllocRegion() - : G1AllocRegion("Old GC Alloc Region", true /* bot_updates */) { } + SurvivorGCAllocRegion(G1EvacStats* stats) + : G1GCAllocRegion("Survivor GC Alloc Region", false /* bot_updates */, stats, InCSetState::Young) { } +}; + +class OldGCAllocRegion : public G1GCAllocRegion { +public: + OldGCAllocRegion(G1EvacStats* stats) + : G1GCAllocRegion("Old GC Alloc Region", true /* bot_updates */, stats, InCSetState::Old) { } // This specialization of release() makes sure that the last card that has // been allocated into has been completely filled by a dummy object. 
This diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/gc/g1/g1AllocRegion.inline.hpp --- a/hotspot/src/share/vm/gc/g1/g1AllocRegion.inline.hpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/gc/g1/g1AllocRegion.inline.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -40,52 +40,74 @@ } } +inline HeapWord* G1AllocRegion::par_allocate(HeapRegion* alloc_region, size_t word_size, bool bot_updates) { + size_t temp; + return par_allocate(alloc_region, word_size, word_size, &temp, bot_updates); +} + inline HeapWord* G1AllocRegion::par_allocate(HeapRegion* alloc_region, - size_t word_size, + size_t min_word_size, + size_t desired_word_size, + size_t* actual_word_size, bool bot_updates) { assert(alloc_region != NULL, err_msg("pre-condition")); assert(!alloc_region->is_empty(), err_msg("pre-condition")); if (!bot_updates) { - return alloc_region->par_allocate_no_bot_updates(word_size); + return alloc_region->par_allocate_no_bot_updates(min_word_size, desired_word_size, actual_word_size); } else { - return alloc_region->par_allocate(word_size); + return alloc_region->par_allocate(min_word_size, desired_word_size, actual_word_size); } } -inline HeapWord* G1AllocRegion::attempt_allocation(size_t word_size, +inline HeapWord* G1AllocRegion::attempt_allocation(size_t word_size, bool bot_updates) { + size_t temp; + return attempt_allocation(word_size, word_size, &temp, bot_updates); +} + +inline HeapWord* G1AllocRegion::attempt_allocation(size_t min_word_size, + size_t desired_word_size, + size_t* actual_word_size, bool bot_updates) { assert(bot_updates == _bot_updates, ar_ext_msg(this, "pre-condition")); HeapRegion* alloc_region = _alloc_region; assert(alloc_region != NULL, ar_ext_msg(this, "not initialized properly")); - HeapWord* result = par_allocate(alloc_region, word_size, bot_updates); + HeapWord* result = par_allocate(alloc_region, min_word_size, desired_word_size, actual_word_size, bot_updates); if (result != NULL) { - trace("alloc", word_size, result); + trace("alloc", min_word_size, desired_word_size, *actual_word_size, result); return result; } - trace("alloc failed", word_size); + trace("alloc failed", min_word_size, desired_word_size); return NULL; } -inline HeapWord* G1AllocRegion::attempt_allocation_locked(size_t word_size, +inline HeapWord* G1AllocRegion::attempt_allocation_locked(size_t word_size, bool bot_updates) { + size_t temp; + return attempt_allocation_locked(word_size, word_size, &temp, bot_updates); +} + +inline HeapWord* G1AllocRegion::attempt_allocation_locked(size_t min_word_size, + size_t desired_word_size, + size_t* actual_word_size, bool bot_updates) { // First we have to redo the allocation, assuming we're holding the // appropriate lock, in case another thread changed the region while // we were waiting to get the lock. 
- HeapWord* result = attempt_allocation(word_size, bot_updates); + HeapWord* result = attempt_allocation(min_word_size, desired_word_size, actual_word_size, bot_updates); if (result != NULL) { return result; } retire(true /* fill_up */); - result = new_alloc_region_and_allocate(word_size, false /* force */); + result = new_alloc_region_and_allocate(desired_word_size, false /* force */); if (result != NULL) { - trace("alloc locked (second attempt)", word_size, result); + *actual_word_size = desired_word_size; + trace("alloc locked (second attempt)", min_word_size, desired_word_size, *actual_word_size, result); return result; } - trace("alloc locked failed", word_size); + trace("alloc locked failed", min_word_size, desired_word_size); return NULL; } @@ -94,13 +116,13 @@ assert(bot_updates == _bot_updates, ar_ext_msg(this, "pre-condition")); assert(_alloc_region != NULL, ar_ext_msg(this, "not initialized properly")); - trace("forcing alloc"); + trace("forcing alloc", word_size, word_size); HeapWord* result = new_alloc_region_and_allocate(word_size, true /* force */); if (result != NULL) { - trace("alloc forced", word_size, result); + trace("alloc forced", word_size, word_size, word_size, result); return result; } - trace("alloc forced failed", word_size); + trace("alloc forced failed", word_size, word_size); return NULL; } diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/gc/g1/g1Allocator.cpp --- a/hotspot/src/share/vm/gc/g1/g1Allocator.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/gc/g1/g1Allocator.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -24,12 +24,20 @@ #include "precompiled.hpp" #include "gc/g1/g1Allocator.inline.hpp" +#include "gc/g1/g1AllocRegion.inline.hpp" #include "gc/g1/g1CollectedHeap.inline.hpp" #include "gc/g1/g1CollectorPolicy.hpp" #include "gc/g1/g1MarkSweep.hpp" #include "gc/g1/heapRegion.inline.hpp" #include "gc/g1/heapRegionSet.inline.hpp" +G1DefaultAllocator::G1DefaultAllocator(G1CollectedHeap* heap) : + G1Allocator(heap), + _retained_old_gc_alloc_region(NULL), + _survivor_gc_alloc_region(heap->alloc_buffer_stats(InCSetState::Young)), + _old_gc_alloc_region(heap->alloc_buffer_stats(InCSetState::Old)) { +} + void G1DefaultAllocator::init_mutator_alloc_region() { assert(_mutator_alloc_region.get() == NULL, "pre-condition"); _mutator_alloc_region.init(); @@ -79,6 +87,8 @@ void G1DefaultAllocator::init_gc_alloc_regions(EvacuationInfo& evacuation_info) { assert_at_safepoint(true /* should_be_vm_thread */); + G1Allocator::init_gc_alloc_regions(evacuation_info); + _survivor_gc_alloc_region.init(); _old_gc_alloc_region.init(); reuse_retained_old_region(evacuation_info, @@ -101,10 +111,8 @@ _retained_old_gc_alloc_region->record_retained_region(); } - if (ResizePLAB) { - _g1h->alloc_buffer_stats(InCSetState::Young)->adjust_desired_plab_sz(); - _g1h->alloc_buffer_stats(InCSetState::Old)->adjust_desired_plab_sz(); - } + _g1h->alloc_buffer_stats(InCSetState::Young)->adjust_desired_plab_sz(); + _g1h->alloc_buffer_stats(InCSetState::Old)->adjust_desired_plab_sz(); } void G1DefaultAllocator::abandon_gc_alloc_regions() { @@ -136,78 +144,159 @@ HeapWord* G1Allocator::par_allocate_during_gc(InCSetState dest, size_t word_size, AllocationContext_t context) { + size_t temp = 0; + HeapWord* result = par_allocate_during_gc(dest, word_size, word_size, &temp, context); + assert(result == NULL || temp == word_size, + err_msg("Requested " SIZE_FORMAT " words, but got " SIZE_FORMAT " at " PTR_FORMAT, + word_size, temp, p2i(result))); + return result; +} + +HeapWord* 
G1Allocator::par_allocate_during_gc(InCSetState dest, + size_t min_word_size, + size_t desired_word_size, + size_t* actual_word_size, + AllocationContext_t context) { switch (dest.value()) { case InCSetState::Young: - return survivor_attempt_allocation(word_size, context); + return survivor_attempt_allocation(min_word_size, desired_word_size, actual_word_size, context); case InCSetState::Old: - return old_attempt_allocation(word_size, context); + return old_attempt_allocation(min_word_size, desired_word_size, actual_word_size, context); default: ShouldNotReachHere(); return NULL; // Keep some compilers happy } } -HeapWord* G1Allocator::survivor_attempt_allocation(size_t word_size, +bool G1Allocator::survivor_is_full(AllocationContext_t context) const { + return _survivor_is_full; +} + +bool G1Allocator::old_is_full(AllocationContext_t context) const { + return _old_is_full; +} + +void G1Allocator::set_survivor_full(AllocationContext_t context) { + _survivor_is_full = true; +} + +void G1Allocator::set_old_full(AllocationContext_t context) { + _old_is_full = true; +} + +HeapWord* G1Allocator::survivor_attempt_allocation(size_t min_word_size, + size_t desired_word_size, + size_t* actual_word_size, AllocationContext_t context) { - assert(!_g1h->is_humongous(word_size), + assert(!_g1h->is_humongous(desired_word_size), "we should not be seeing humongous-size allocations in this path"); - HeapWord* result = survivor_gc_alloc_region(context)->attempt_allocation(word_size, + HeapWord* result = survivor_gc_alloc_region(context)->attempt_allocation(min_word_size, + desired_word_size, + actual_word_size, false /* bot_updates */); - if (result == NULL) { + if (result == NULL && !survivor_is_full(context)) { MutexLockerEx x(FreeList_lock, Mutex::_no_safepoint_check_flag); - result = survivor_gc_alloc_region(context)->attempt_allocation_locked(word_size, + result = survivor_gc_alloc_region(context)->attempt_allocation_locked(min_word_size, + desired_word_size, + actual_word_size, false /* bot_updates */); + if (result == NULL) { + set_survivor_full(context); + } } if (result != NULL) { - _g1h->dirty_young_block(result, word_size); + _g1h->dirty_young_block(result, *actual_word_size); } return result; } -HeapWord* G1Allocator::old_attempt_allocation(size_t word_size, +HeapWord* G1Allocator::old_attempt_allocation(size_t min_word_size, + size_t desired_word_size, + size_t* actual_word_size, AllocationContext_t context) { - assert(!_g1h->is_humongous(word_size), + assert(!_g1h->is_humongous(desired_word_size), "we should not be seeing humongous-size allocations in this path"); - HeapWord* result = old_gc_alloc_region(context)->attempt_allocation(word_size, + HeapWord* result = old_gc_alloc_region(context)->attempt_allocation(min_word_size, + desired_word_size, + actual_word_size, true /* bot_updates */); - if (result == NULL) { + if (result == NULL && !old_is_full(context)) { MutexLockerEx x(FreeList_lock, Mutex::_no_safepoint_check_flag); - result = old_gc_alloc_region(context)->attempt_allocation_locked(word_size, + result = old_gc_alloc_region(context)->attempt_allocation_locked(min_word_size, + desired_word_size, + actual_word_size, true /* bot_updates */); + if (result == NULL) { + set_old_full(context); + } } return result; } +void G1Allocator::init_gc_alloc_regions(EvacuationInfo& evacuation_info) { + _survivor_is_full = false; + _old_is_full = false; +} + G1PLABAllocator::G1PLABAllocator(G1Allocator* allocator) : _g1h(G1CollectedHeap::heap()), _allocator(allocator), 
_survivor_alignment_bytes(calc_survivor_alignment_bytes()) { + for (size_t i = 0; i < ARRAY_SIZE(_direct_allocated); i++) { + _direct_allocated[i] = 0; + } +} + +bool G1PLABAllocator::may_throw_away_buffer(size_t const allocation_word_sz, size_t const buffer_size) const { + return (allocation_word_sz * 100 < buffer_size * ParallelGCBufferWastePct); } HeapWord* G1PLABAllocator::allocate_direct_or_new_plab(InCSetState dest, size_t word_sz, - AllocationContext_t context) { - size_t gclab_word_size = _g1h->desired_plab_sz(dest); - if (word_sz * 100 < gclab_word_size * ParallelGCBufferWastePct) { + AllocationContext_t context, + bool* plab_refill_failed) { + size_t plab_word_size = G1CollectedHeap::heap()->desired_plab_sz(dest); + size_t required_in_plab = PLAB::size_required_for_allocation(word_sz); + + // Only get a new PLAB if the allocation fits and it would not waste more than + // ParallelGCBufferWastePct in the existing buffer. + if ((required_in_plab <= plab_word_size) && + may_throw_away_buffer(required_in_plab, plab_word_size)) { + G1PLAB* alloc_buf = alloc_buffer(dest, context); alloc_buf->retire(); - HeapWord* buf = _allocator->par_allocate_during_gc(dest, gclab_word_size, context); - if (buf == NULL) { - return NULL; // Let caller handle allocation failure. + size_t actual_plab_size = 0; + HeapWord* buf = _allocator->par_allocate_during_gc(dest, + required_in_plab, + plab_word_size, + &actual_plab_size, + context); + + assert(buf == NULL || ((actual_plab_size >= required_in_plab) && (actual_plab_size <= plab_word_size)), + err_msg("Requested at minimum " SIZE_FORMAT ", desired " SIZE_FORMAT " words, but got " SIZE_FORMAT " at " PTR_FORMAT, + required_in_plab, plab_word_size, actual_plab_size, p2i(buf))); + + if (buf != NULL) { + alloc_buf->set_buf(buf, actual_plab_size); + + HeapWord* const obj = alloc_buf->allocate(word_sz); + assert(obj != NULL, err_msg("PLAB should have been big enough, tried to allocate " + SIZE_FORMAT " requiring " SIZE_FORMAT " PLAB size " SIZE_FORMAT, + word_sz, required_in_plab, plab_word_size)); + return obj; } // Otherwise. - alloc_buf->set_word_size(gclab_word_size); - alloc_buf->set_buf(buf); - - HeapWord* const obj = alloc_buf->allocate(word_sz); - assert(obj != NULL, "buffer was definitely big enough..."); - return obj; - } else { - return _allocator->par_allocate_during_gc(dest, word_sz, context); + *plab_refill_failed = true; } + // Try direct allocation. 
+ HeapWord* result = _allocator->par_allocate_during_gc(dest, word_sz, context); + if (result != NULL) { + _direct_allocated[dest.value()] += word_sz; + } + return result; } void G1PLABAllocator::undo_allocation(InCSetState dest, HeapWord* obj, size_t word_sz, AllocationContext_t context) { @@ -225,11 +314,14 @@ _alloc_buffers[InCSetState::Old] = &_tenured_alloc_buffer; } -void G1DefaultPLABAllocator::retire_alloc_buffers() { +void G1DefaultPLABAllocator::flush_and_retire_stats() { for (uint state = 0; state < InCSetState::Num; state++) { G1PLAB* const buf = _alloc_buffers[state]; if (buf != NULL) { - buf->flush_and_retire_stats(_g1h->alloc_buffer_stats(state)); + G1EvacStats* stats = _g1h->alloc_buffer_stats(state); + buf->flush_and_retire_stats(stats); + stats->add_direct_allocated(_direct_allocated[state]); + _direct_allocated[state] = 0; } } } diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/gc/g1/g1Allocator.hpp --- a/hotspot/src/share/vm/gc/g1/g1Allocator.hpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/gc/g1/g1Allocator.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -38,23 +38,36 @@ // Also keeps track of retained regions across GCs. class G1Allocator : public CHeapObj { friend class VMStructs; +private: + bool _survivor_is_full; + bool _old_is_full; protected: G1CollectedHeap* _g1h; virtual MutatorAllocRegion* mutator_alloc_region(AllocationContext_t context) = 0; + virtual bool survivor_is_full(AllocationContext_t context) const; + virtual bool old_is_full(AllocationContext_t context) const; + + virtual void set_survivor_full(AllocationContext_t context); + virtual void set_old_full(AllocationContext_t context); + // Accessors to the allocation regions. virtual SurvivorGCAllocRegion* survivor_gc_alloc_region(AllocationContext_t context) = 0; virtual OldGCAllocRegion* old_gc_alloc_region(AllocationContext_t context) = 0; // Allocation attempt during GC for a survivor object / PLAB. - inline HeapWord* survivor_attempt_allocation(size_t word_size, + inline HeapWord* survivor_attempt_allocation(size_t min_word_size, + size_t desired_word_size, + size_t* actual_word_size, AllocationContext_t context); // Allocation attempt during GC for an old object / PLAB. 
- inline HeapWord* old_attempt_allocation(size_t word_size, + inline HeapWord* old_attempt_allocation(size_t min_word_size, + size_t desired_word_size, + size_t* actual_word_size, AllocationContext_t context); public: - G1Allocator(G1CollectedHeap* heap) : _g1h(heap) { } + G1Allocator(G1CollectedHeap* heap) : _g1h(heap), _survivor_is_full(false), _old_is_full(false) { } virtual ~G1Allocator() { } static G1Allocator* create_allocator(G1CollectedHeap* g1h); @@ -66,7 +79,7 @@ virtual void init_mutator_alloc_region() = 0; virtual void release_mutator_alloc_region() = 0; - virtual void init_gc_alloc_regions(EvacuationInfo& evacuation_info) = 0; + virtual void init_gc_alloc_regions(EvacuationInfo& evacuation_info); virtual void release_gc_alloc_regions(EvacuationInfo& evacuation_info) = 0; virtual void abandon_gc_alloc_regions() = 0; @@ -93,6 +106,12 @@ size_t word_size, AllocationContext_t context); + HeapWord* par_allocate_during_gc(InCSetState dest, + size_t min_word_size, + size_t desired_word_size, + size_t* actual_word_size, + AllocationContext_t context); + virtual size_t used_in_alloc_regions() = 0; }; @@ -114,7 +133,7 @@ HeapRegion* _retained_old_gc_alloc_region; public: - G1DefaultAllocator(G1CollectedHeap* heap) : G1Allocator(heap), _retained_old_gc_alloc_region(NULL) { } + G1DefaultAllocator(G1CollectedHeap* heap); virtual void init_mutator_alloc_region(); virtual void release_mutator_alloc_region(); @@ -163,8 +182,12 @@ guarantee(_retired, "Allocation buffer has not been retired"); } - virtual void set_buf(HeapWord* buf) { - PLAB::set_buf(buf); + // The amount of space in words wasted within the PLAB including + // waste due to refills and alignment. + size_t wasted() const { return _wasted; } + + virtual void set_buf(HeapWord* buf, size_t word_size) { + PLAB::set_buf(buf, word_size); _retired = false; } @@ -198,7 +221,10 @@ // architectures have a special compare against zero instructions. const uint _survivor_alignment_bytes; - virtual void retire_alloc_buffers() = 0; + // Number of words allocated directly (not counting PLAB allocation). + size_t _direct_allocated[InCSetState::Num]; + + virtual void flush_and_retire_stats() = 0; virtual G1PLAB* alloc_buffer(InCSetState dest, AllocationContext_t context) = 0; // Calculate the survivor space object alignment in bytes. Returns that or 0 if @@ -215,6 +241,11 @@ } } + HeapWord* allocate_new_plab(InCSetState dest, + size_t word_sz, + AllocationContext_t context); + + bool may_throw_away_buffer(size_t const allocation_word_sz, size_t const buffer_size) const; public: G1PLABAllocator(G1Allocator* allocator); virtual ~G1PLABAllocator() { } @@ -225,31 +256,28 @@ // Allocate word_sz words in dest, either directly into the regions or by // allocating a new PLAB. Returns the address of the allocated memory, NULL if - // not successful. + // not successful. Plab_refill_failed indicates whether an attempt to refill the + // PLAB failed or not. HeapWord* allocate_direct_or_new_plab(InCSetState dest, size_t word_sz, - AllocationContext_t context); + AllocationContext_t context, + bool* plab_refill_failed); // Allocate word_sz words in the PLAB of dest. Returns the address of the // allocated memory, NULL if not successful. 
- HeapWord* plab_allocate(InCSetState dest, - size_t word_sz, - AllocationContext_t context) { - G1PLAB* buffer = alloc_buffer(dest, context); - if (_survivor_alignment_bytes == 0 || !dest.is_young()) { - return buffer->allocate(word_sz); - } else { - return buffer->allocate_aligned(word_sz, _survivor_alignment_bytes); - } - } + inline HeapWord* plab_allocate(InCSetState dest, + size_t word_sz, + AllocationContext_t context); - HeapWord* allocate(InCSetState dest, size_t word_sz, - AllocationContext_t context) { + HeapWord* allocate(InCSetState dest, + size_t word_sz, + AllocationContext_t context, + bool* refill_failed) { HeapWord* const obj = plab_allocate(dest, word_sz, context); if (obj != NULL) { return obj; } - return allocate_direct_or_new_plab(dest, word_sz, context); + return allocate_direct_or_new_plab(dest, word_sz, context, refill_failed); } void undo_allocation(InCSetState dest, HeapWord* obj, size_t word_sz, AllocationContext_t context); @@ -273,7 +301,7 @@ return _alloc_buffers[dest.value()]; } - virtual void retire_alloc_buffers(); + virtual void flush_and_retire_stats(); virtual void waste(size_t& wasted, size_t& undo_wasted); }; diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/gc/g1/g1Allocator.inline.hpp --- a/hotspot/src/share/vm/gc/g1/g1Allocator.inline.hpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/gc/g1/g1Allocator.inline.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -27,6 +27,7 @@ #include "gc/g1/g1Allocator.hpp" #include "gc/g1/g1AllocRegion.inline.hpp" +#include "gc/shared/plab.inline.hpp" HeapWord* G1Allocator::attempt_allocation(size_t word_size, AllocationContext_t context) { return mutator_alloc_region(context)->attempt_allocation(word_size, false /* bot_updates */); @@ -43,4 +44,15 @@ return mutator_alloc_region(context)->attempt_allocation_force(word_size, false /* bot_updates */); } +inline HeapWord* G1PLABAllocator::plab_allocate(InCSetState dest, + size_t word_sz, + AllocationContext_t context) { + G1PLAB* buffer = alloc_buffer(dest, context); + if (_survivor_alignment_bytes == 0 || !dest.is_young()) { + return buffer->allocate(word_sz); + } else { + return buffer->allocate_aligned(word_sz, _survivor_alignment_bytes); + } +} + #endif // SHARE_VM_GC_G1_G1ALLOCATOR_HPP diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/gc/g1/g1Allocator_ext.cpp --- a/hotspot/src/share/vm/gc/g1/g1Allocator_ext.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/gc/g1/g1Allocator_ext.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -23,7 +23,7 @@ */ #include "precompiled.hpp" -#include "gc/g1/g1Allocator.hpp" +#include "gc/g1/g1Allocator.inline.hpp" #include "gc/g1/g1CollectedHeap.hpp" G1Allocator* G1Allocator::create_allocator(G1CollectedHeap* g1h) { diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/gc/g1/g1BlockOffsetTable.inline.hpp --- a/hotspot/src/share/vm/gc/g1/g1BlockOffsetTable.inline.hpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/gc/g1/g1BlockOffsetTable.inline.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -26,7 +26,7 @@ #define SHARE_VM_GC_G1_G1BLOCKOFFSETTABLE_INLINE_HPP #include "gc/g1/g1BlockOffsetTable.hpp" -#include "gc/g1/heapRegion.inline.hpp" +#include "gc/g1/heapRegion.hpp" #include "gc/shared/space.hpp" inline HeapWord* G1BlockOffsetTable::block_start(const void* addr) { diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/gc/g1/g1CollectedHeap.cpp --- a/hotspot/src/share/vm/gc/g1/g1CollectedHeap.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/gc/g1/g1CollectedHeap.cpp Thu Sep 03 16:14:02 2015 -0700 
@@ -1944,8 +1944,8 @@ _young_list(new YoungList(this)), _gc_time_stamp(0), _summary_bytes_used(0), - _survivor_plab_stats(YoungPLABSize, PLABWeight), - _old_plab_stats(OldPLABSize, PLABWeight), + _survivor_evac_stats(YoungPLABSize, PLABWeight), + _old_evac_stats(OldPLABSize, PLABWeight), _expand_heap_after_alloc_failure(true), _surviving_young_words(NULL), _old_marking_cycles_started(0), @@ -1960,7 +1960,7 @@ _gc_tracer_stw(new (ResourceObj::C_HEAP, mtGC) G1NewTracer()), _gc_tracer_cm(new (ResourceObj::C_HEAP, mtGC) G1OldTracer()) { - _workers = new FlexibleWorkGang("GC Thread", ParallelGCThreads, + _workers = new WorkGang("GC Thread", ParallelGCThreads, /* are_GC_task_threads */true, /* are_ConcurrentGC_threads */false); _workers->initialize_workers(); @@ -3504,6 +3504,13 @@ return G1HeapSummary(heap_summary, used(), eden_used_bytes, eden_capacity_bytes, survivor_used_bytes); } +G1EvacSummary G1CollectedHeap::create_g1_evac_summary(G1EvacStats* stats) { + return G1EvacSummary(stats->allocated(), stats->wasted(), stats->undo_wasted(), + stats->unused(), stats->used(), stats->region_end_waste(), + stats->regions_filled(), stats->direct_allocated(), + stats->failure_used(), stats->failure_waste()); +} + void G1CollectedHeap::trace_heap(GCWhen::Type when, const GCTracer* gc_tracer) { const G1HeapSummary& heap_summary = create_g1_heap_summary(); gc_tracer->report_gc_heap_summary(when, heap_summary); @@ -3753,8 +3760,7 @@ cl.flush_rem_set_entries(); } -void -G1CollectedHeap::setup_surviving_young_words() { +void G1CollectedHeap::setup_surviving_young_words() { assert(_surviving_young_words == NULL, "pre-condition"); uint array_length = g1_policy()->young_cset_region_length(); _surviving_young_words = NEW_C_HEAP_ARRAY(size_t, (size_t) array_length, mtGC); @@ -3770,17 +3776,15 @@ #endif // !ASSERT } -void -G1CollectedHeap::update_surviving_young_words(size_t* surv_young_words) { - MutexLockerEx x(ParGCRareEvent_lock, Mutex::_no_safepoint_check_flag); +void G1CollectedHeap::update_surviving_young_words(size_t* surv_young_words) { + assert_at_safepoint(true); uint array_length = g1_policy()->young_cset_region_length(); for (uint i = 0; i < array_length; ++i) { _surviving_young_words[i] += surv_young_words[i]; } } -void -G1CollectedHeap::cleanup_surviving_young_words() { +void G1CollectedHeap::cleanup_surviving_young_words() { guarantee( _surviving_young_words != NULL, "pre-condition" ); FREE_C_HEAP_ARRAY(size_t, _surviving_young_words); _surviving_young_words = NULL; @@ -4375,6 +4379,13 @@ } class G1ParEvacuateFollowersClosure : public VoidClosure { +private: + double _start_term; + double _term_time; + size_t _term_attempts; + + void start_term_time() { _term_attempts++; _start_term = os::elapsedTime(); } + void end_term_time() { _term_time += os::elapsedTime() - _start_term; } protected: G1CollectedHeap* _g1h; G1ParScanThreadState* _par_scan_state; @@ -4391,19 +4402,23 @@ RefToScanQueueSet* queues, ParallelTaskTerminator* terminator) : _g1h(g1h), _par_scan_state(par_scan_state), - _queues(queues), _terminator(terminator) {} + _queues(queues), _terminator(terminator), + _start_term(0.0), _term_time(0.0), _term_attempts(0) {} void do_void(); + double term_time() const { return _term_time; } + size_t term_attempts() const { return _term_attempts; } + private: inline bool offer_termination(); }; bool G1ParEvacuateFollowersClosure::offer_termination() { G1ParScanThreadState* const pss = par_scan_state(); - pss->start_term_time(); + start_term_time(); const bool res = terminator()->offer_termination(); 
- pss->end_term_time(); + end_term_time(); return res; } @@ -4444,15 +4459,17 @@ class G1ParTask : public AbstractGangTask { protected: G1CollectedHeap* _g1h; - RefToScanQueueSet *_queues; + G1ParScanThreadState** _pss; + RefToScanQueueSet* _queues; G1RootProcessor* _root_processor; ParallelTaskTerminator _terminator; uint _n_workers; public: - G1ParTask(G1CollectedHeap* g1h, RefToScanQueueSet *task_queues, G1RootProcessor* root_processor, uint n_workers) + G1ParTask(G1CollectedHeap* g1h, G1ParScanThreadState** per_thread_states, RefToScanQueueSet *task_queues, G1RootProcessor* root_processor, uint n_workers) : AbstractGangTask("G1 collection"), _g1h(g1h), + _pss(per_thread_states), _queues(task_queues), _root_processor(root_processor), _terminator(n_workers, _queues), @@ -4499,7 +4516,8 @@ void work(uint worker_id) { if (worker_id >= _n_workers) return; // no work needed this round - _g1h->g1_policy()->phase_times()->record_time_secs(G1GCPhaseTimes::GCWorkerStart, worker_id, os::elapsedTime()); + double start_sec = os::elapsedTime(); + _g1h->g1_policy()->phase_times()->record_time_secs(G1GCPhaseTimes::GCWorkerStart, worker_id, start_sec); { ResourceMark rm; @@ -4507,23 +4525,24 @@ ReferenceProcessor* rp = _g1h->ref_processor_stw(); - G1ParScanThreadState pss(_g1h, worker_id, rp); + G1ParScanThreadState* pss = _pss[worker_id]; + pss->set_ref_processor(rp); bool only_young = _g1h->collector_state()->gcs_are_young(); // Non-IM young GC. - G1ParCopyClosure scan_only_root_cl(_g1h, &pss, rp); + G1ParCopyClosure scan_only_root_cl(_g1h, pss, rp); G1CLDClosure scan_only_cld_cl(&scan_only_root_cl, only_young, // Only process dirty klasses. false); // No need to claim CLDs. // IM young GC. // Strong roots closures. - G1ParCopyClosure scan_mark_root_cl(_g1h, &pss, rp); + G1ParCopyClosure scan_mark_root_cl(_g1h, pss, rp); G1CLDClosure scan_mark_cld_cl(&scan_mark_root_cl, false, // Process all klasses. true); // Need to claim CLDs. // Weak roots closures. - G1ParCopyClosure scan_mark_weak_root_cl(_g1h, &pss, rp); + G1ParCopyClosure scan_mark_weak_root_cl(_g1h, pss, rp); G1CLDClosure scan_mark_weak_cld_cl(&scan_mark_weak_root_cl, false, // Process all klasses. true); // Need to claim CLDs. 
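The hunk above moves termination timing out of G1ParScanThreadState and into G1ParEvacuateFollowersClosure itself: every call to offer_termination() bumps an attempt counter and accumulates the time spent waiting in the terminator. The following is a minimal standalone sketch of that accumulate-around-a-call pattern, not HotSpot code; it uses std::chrono in place of os::elapsedTime(), and the TerminationTimer/offer_termination_timed names are illustrative only.

// Standalone sketch of the timing pattern introduced above (assumptions noted in the lead-in).
#include <chrono>
#include <cstddef>

class TerminationTimer {
  double _start_term;      // seconds, start of the current termination attempt
  double _term_time;       // accumulated termination wait time in seconds
  size_t _term_attempts;   // number of termination attempts so far

  static double now_sec() {
    using namespace std::chrono;
    return duration<double>(steady_clock::now().time_since_epoch()).count();
  }

 public:
  TerminationTimer() : _start_term(0.0), _term_time(0.0), _term_attempts(0) {}

  void start_term_time() { _term_attempts++; _start_term = now_sec(); }
  void end_term_time()   { _term_time += now_sec() - _start_term; }

  double term_time() const     { return _term_time; }
  size_t term_attempts() const { return _term_attempts; }
};

// Usage mirroring offer_termination(): only the wait is timed, every attempt is counted.
bool offer_termination_timed(TerminationTimer& t, bool (*offer)()) {
  t.start_term_time();
  const bool finished = offer();
  t.end_term_time();
  return finished;
}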
@@ -4554,8 +4573,7 @@ weak_cld_cl = &scan_only_cld_cl; } - pss.start_strong_roots(); - + double start_strong_roots_sec = os::elapsedTime(); _root_processor->evacuate_roots(strong_root_cl, weak_root_cl, strong_cld_cl, @@ -4563,32 +4581,45 @@ trace_metadata, worker_id); - G1ParPushHeapRSClosure push_heap_rs_cl(_g1h, &pss); + G1ParPushHeapRSClosure push_heap_rs_cl(_g1h, pss); _root_processor->scan_remembered_sets(&push_heap_rs_cl, weak_root_cl, worker_id); - pss.end_strong_roots(); - + double strong_roots_sec = os::elapsedTime() - start_strong_roots_sec; + + double term_sec = 0.0; + size_t evac_term_attempts = 0; { double start = os::elapsedTime(); - G1ParEvacuateFollowersClosure evac(_g1h, &pss, _queues, &_terminator); + G1ParEvacuateFollowersClosure evac(_g1h, pss, _queues, &_terminator); evac.do_void(); + + evac_term_attempts = evac.term_attempts(); + term_sec = evac.term_time(); double elapsed_sec = os::elapsedTime() - start; - double term_sec = pss.term_time(); _g1h->g1_policy()->phase_times()->add_time_secs(G1GCPhaseTimes::ObjCopy, worker_id, elapsed_sec - term_sec); _g1h->g1_policy()->phase_times()->record_time_secs(G1GCPhaseTimes::Termination, worker_id, term_sec); - _g1h->g1_policy()->phase_times()->record_thread_work_item(G1GCPhaseTimes::Termination, worker_id, pss.term_attempts()); + _g1h->g1_policy()->phase_times()->record_thread_work_item(G1GCPhaseTimes::Termination, worker_id, evac_term_attempts); } - _g1h->g1_policy()->record_thread_age_table(pss.age_table()); - _g1h->update_surviving_young_words(pss.surviving_young_words()+1); + + assert(pss->queue_is_empty(), "should be empty"); if (PrintTerminationStats) { MutexLockerEx x(ParGCRareEvent_lock, Mutex::_no_safepoint_check_flag); - pss.print_termination_stats(); + size_t lab_waste; + size_t lab_undo_waste; + pss->waste(lab_waste, lab_undo_waste); + _g1h->print_termination_stats(gclog_or_tty, + worker_id, + (os::elapsedTime() - start_sec) * 1000.0, /* elapsed time */ + strong_roots_sec * 1000.0, /* strong roots time */ + term_sec * 1000.0, /* evac term time */ + evac_term_attempts, /* evac term attempts */ + lab_waste, /* alloc buffer waste */ + lab_undo_waste /* undo waste */ + ); } - assert(pss.queue_is_empty(), "should be empty"); - // Close the inner scope so that the ResourceMark and HandleMark // destructors are executed here and are included as part of the // "GC Worker Time". 
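With the per-worker timing now collected inside work() (elapsed time, strong-roots time, termination time and attempts, plus PLAB waste queried from the scan state), the PrintTerminationStats path hands those raw numbers to G1CollectedHeap::print_termination_stats, shown in the next hunk. The sketch below illustrates, outside of HotSpot, how one printed row is derived from those inputs: times become both milliseconds and a percentage of the worker's elapsed time, and waste in words becomes KiB. The constants are assumptions (a 64-bit VM where a HeapWord is 8 bytes and K is 1024).

// Illustrative only: how one "GC Termination Stats" row is computed from the per-worker numbers.
#include <cstddef>
#include <cstdio>

static const size_t HeapWordSizeBytes = 8;   // assumption: 64-bit HeapWord
static const size_t K = 1024;

void print_worker_row(unsigned worker_id,
                      double elapsed_ms,
                      double strong_roots_ms,
                      double term_ms,
                      size_t term_attempts,
                      size_t alloc_buffer_waste_words,
                      size_t undo_waste_words) {
  // Times are reported absolutely and as a percentage of this worker's elapsed time;
  // waste is reported in KiB, split into alloc-buffer waste and undo waste.
  std::printf("%3u %9.2f %9.2f %6.2f %9.2f %6.2f %8zu %7zu %7zu %7zu\n",
              worker_id,
              elapsed_ms,
              strong_roots_ms, strong_roots_ms * 100.0 / elapsed_ms,
              term_ms,         term_ms * 100.0 / elapsed_ms,
              term_attempts,
              (alloc_buffer_waste_words + undo_waste_words) * HeapWordSizeBytes / K,
              alloc_buffer_waste_words * HeapWordSizeBytes / K,
              undo_waste_words * HeapWordSizeBytes / K);
}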
@@ -4597,6 +4628,31 @@ } }; +void G1CollectedHeap::print_termination_stats_hdr(outputStream* const st) { + st->print_raw_cr("GC Termination Stats"); + st->print_raw_cr(" elapsed --strong roots-- -------termination------- ------waste (KiB)------"); + st->print_raw_cr("thr ms ms % ms % attempts total alloc undo"); + st->print_raw_cr("--- --------- --------- ------ --------- ------ -------- ------- ------- -------"); +} + +void G1CollectedHeap::print_termination_stats(outputStream* const st, + uint worker_id, + double elapsed_ms, + double strong_roots_ms, + double term_ms, + size_t term_attempts, + size_t alloc_buffer_waste, + size_t undo_waste) const { + st->print_cr("%3d %9.2f %9.2f %6.2f " + "%9.2f %6.2f " SIZE_FORMAT_W(8) " " + SIZE_FORMAT_W(7) " " SIZE_FORMAT_W(7) " " SIZE_FORMAT_W(7), + worker_id, elapsed_ms, strong_roots_ms, strong_roots_ms * 100 / elapsed_ms, + term_ms, term_ms * 100 / elapsed_ms, term_attempts, + (alloc_buffer_waste + undo_waste) * HeapWordSize / K, + alloc_buffer_waste * HeapWordSize / K, + undo_waste * HeapWordSize / K); +} + class G1StringSymbolTableUnlinkTask : public AbstractGangTask { private: BoolObjectClosure* _is_alive; @@ -5125,17 +5181,20 @@ class G1STWRefProcTaskExecutor: public AbstractRefProcTaskExecutor { private: - G1CollectedHeap* _g1h; - RefToScanQueueSet* _queues; - FlexibleWorkGang* _workers; - uint _active_workers; + G1CollectedHeap* _g1h; + G1ParScanThreadState** _pss; + RefToScanQueueSet* _queues; + WorkGang* _workers; + uint _active_workers; public: G1STWRefProcTaskExecutor(G1CollectedHeap* g1h, - FlexibleWorkGang* workers, + G1ParScanThreadState** per_thread_states, + WorkGang* workers, RefToScanQueueSet *task_queues, uint n_workers) : _g1h(g1h), + _pss(per_thread_states), _queues(task_queues), _workers(workers), _active_workers(n_workers) @@ -5154,17 +5213,20 @@ typedef AbstractRefProcTaskExecutor::ProcessTask ProcessTask; ProcessTask& _proc_task; G1CollectedHeap* _g1h; - RefToScanQueueSet *_task_queues; + G1ParScanThreadState** _pss; + RefToScanQueueSet* _task_queues; ParallelTaskTerminator* _terminator; public: G1STWRefProcTaskProxy(ProcessTask& proc_task, - G1CollectedHeap* g1h, - RefToScanQueueSet *task_queues, - ParallelTaskTerminator* terminator) : + G1CollectedHeap* g1h, + G1ParScanThreadState** per_thread_states, + RefToScanQueueSet *task_queues, + ParallelTaskTerminator* terminator) : AbstractGangTask("Process reference objects in parallel"), _proc_task(proc_task), _g1h(g1h), + _pss(per_thread_states), _task_queues(task_queues), _terminator(terminator) {} @@ -5176,11 +5238,12 @@ G1STWIsAliveClosure is_alive(_g1h); - G1ParScanThreadState pss(_g1h, worker_id, NULL); - - G1ParScanExtRootClosure only_copy_non_heap_cl(_g1h, &pss, NULL); - - G1ParScanAndMarkExtRootClosure copy_mark_non_heap_cl(_g1h, &pss, NULL); + G1ParScanThreadState* pss = _pss[worker_id]; + pss->set_ref_processor(NULL); + + G1ParScanExtRootClosure only_copy_non_heap_cl(_g1h, pss, NULL); + + G1ParScanAndMarkExtRootClosure copy_mark_non_heap_cl(_g1h, pss, NULL); OopClosure* copy_non_heap_cl = &only_copy_non_heap_cl; @@ -5190,10 +5253,10 @@ } // Keep alive closure. - G1CopyingKeepAliveClosure keep_alive(_g1h, copy_non_heap_cl, &pss); + G1CopyingKeepAliveClosure keep_alive(_g1h, copy_non_heap_cl, pss); // Complete GC closure - G1ParEvacuateFollowersClosure drain_queue(_g1h, &pss, _task_queues, _terminator); + G1ParEvacuateFollowersClosure drain_queue(_g1h, pss, _task_queues, _terminator); // Call the reference processing task's work routine. 
_proc_task.work(worker_id, is_alive, keep_alive, drain_queue); @@ -5212,7 +5275,7 @@ assert(_workers != NULL, "Need parallel worker threads."); ParallelTaskTerminator terminator(_active_workers, _queues); - G1STWRefProcTaskProxy proc_task_proxy(proc_task, _g1h, _queues, &terminator); + G1STWRefProcTaskProxy proc_task_proxy(proc_task, _g1h, _pss, _queues, &terminator); _workers->run_task(&proc_task_proxy); } @@ -5254,15 +5317,17 @@ class G1ParPreserveCMReferentsTask: public AbstractGangTask { protected: - G1CollectedHeap* _g1h; - RefToScanQueueSet *_queues; + G1CollectedHeap* _g1h; + G1ParScanThreadState** _pss; + RefToScanQueueSet* _queues; ParallelTaskTerminator _terminator; uint _n_workers; public: - G1ParPreserveCMReferentsTask(G1CollectedHeap* g1h, uint workers, RefToScanQueueSet *task_queues) : + G1ParPreserveCMReferentsTask(G1CollectedHeap* g1h, G1ParScanThreadState** per_thread_states, int workers, RefToScanQueueSet *task_queues) : AbstractGangTask("ParPreserveCMReferents"), _g1h(g1h), + _pss(per_thread_states), _queues(task_queues), _terminator(workers, _queues), _n_workers(workers) @@ -5272,12 +5337,13 @@ ResourceMark rm; HandleMark hm; - G1ParScanThreadState pss(_g1h, worker_id, NULL); - assert(pss.queue_is_empty(), "both queue and overflow should be empty"); - - G1ParScanExtRootClosure only_copy_non_heap_cl(_g1h, &pss, NULL); - - G1ParScanAndMarkExtRootClosure copy_mark_non_heap_cl(_g1h, &pss, NULL); + G1ParScanThreadState* pss = _pss[worker_id]; + pss->set_ref_processor(NULL); + assert(pss->queue_is_empty(), "both queue and overflow should be empty"); + + G1ParScanExtRootClosure only_copy_non_heap_cl(_g1h, pss, NULL); + + G1ParScanAndMarkExtRootClosure copy_mark_non_heap_cl(_g1h, pss, NULL); OopClosure* copy_non_heap_cl = &only_copy_non_heap_cl; @@ -5291,7 +5357,7 @@ // Copying keep alive closure. Applied to referent objects that need // to be copied. - G1CopyingKeepAliveClosure keep_alive(_g1h, copy_non_heap_cl, &pss); + G1CopyingKeepAliveClosure keep_alive(_g1h, copy_non_heap_cl, pss); ReferenceProcessor* rp = _g1h->ref_processor_cm(); @@ -5324,15 +5390,15 @@ } // Drain the queue - which may cause stealing - G1ParEvacuateFollowersClosure drain_queue(_g1h, &pss, _queues, &_terminator); + G1ParEvacuateFollowersClosure drain_queue(_g1h, pss, _queues, &_terminator); drain_queue.do_void(); // Allocation buffers were retired at the end of G1ParEvacuateFollowersClosure - assert(pss.queue_is_empty(), "should be"); + assert(pss->queue_is_empty(), "should be"); } }; // Weak Reference processing during an evacuation pause (part 1). -void G1CollectedHeap::process_discovered_references() { +void G1CollectedHeap::process_discovered_references(G1ParScanThreadState** per_thread_states) { double ref_proc_start = os::elapsedTime(); ReferenceProcessor* rp = _ref_processor_stw; @@ -5362,6 +5428,7 @@ uint no_of_gc_workers = workers()->active_workers(); G1ParPreserveCMReferentsTask keep_cm_referents(this, + per_thread_states, no_of_gc_workers, _task_queues); @@ -5376,16 +5443,17 @@ // JNI refs. // Use only a single queue for this PSS. - G1ParScanThreadState pss(this, 0, NULL); - assert(pss.queue_is_empty(), "pre-condition"); + G1ParScanThreadState* pss = per_thread_states[0]; + pss->set_ref_processor(NULL); + assert(pss->queue_is_empty(), "pre-condition"); // We do not embed a reference processor in the copying/scanning // closures while we're actually processing the discovered // reference objects. 
- G1ParScanExtRootClosure only_copy_non_heap_cl(this, &pss, NULL); - - G1ParScanAndMarkExtRootClosure copy_mark_non_heap_cl(this, &pss, NULL); + G1ParScanExtRootClosure only_copy_non_heap_cl(this, pss, NULL); + + G1ParScanAndMarkExtRootClosure copy_mark_non_heap_cl(this, pss, NULL); OopClosure* copy_non_heap_cl = &only_copy_non_heap_cl; @@ -5395,10 +5463,10 @@ } // Keep alive closure. - G1CopyingKeepAliveClosure keep_alive(this, copy_non_heap_cl, &pss); + G1CopyingKeepAliveClosure keep_alive(this, copy_non_heap_cl, pss); // Serial Complete GC closure - G1STWDrainQueueClosure drain_queue(this, &pss); + G1STWDrainQueueClosure drain_queue(this, pss); // Setup the soft refs policy... rp->setup_policy(false); @@ -5417,7 +5485,7 @@ assert(rp->num_q() == no_of_gc_workers, "sanity"); assert(no_of_gc_workers <= rp->max_num_q(), "sanity"); - G1STWRefProcTaskExecutor par_task_executor(this, workers(), _task_queues, no_of_gc_workers); + G1STWRefProcTaskExecutor par_task_executor(this, per_thread_states, workers(), _task_queues, no_of_gc_workers); stats = rp->process_discovered_references(&is_alive, &keep_alive, &drain_queue, @@ -5429,14 +5497,14 @@ _gc_tracer_stw->report_gc_reference_stats(stats); // We have completed copying any necessary live referent objects. - assert(pss.queue_is_empty(), "both queue and overflow should be empty"); + assert(pss->queue_is_empty(), "both queue and overflow should be empty"); double ref_proc_time = os::elapsedTime() - ref_proc_start; g1_policy()->phase_times()->record_ref_proc_time(ref_proc_time * 1000.0); } // Weak Reference processing during an evacuation pause (part 2). -void G1CollectedHeap::enqueue_discovered_references() { +void G1CollectedHeap::enqueue_discovered_references(G1ParScanThreadState** per_thread_states) { double ref_enq_start = os::elapsedTime(); ReferenceProcessor* rp = _ref_processor_stw; @@ -5455,7 +5523,7 @@ assert(rp->num_q() == n_workers, "sanity"); assert(n_workers <= rp->max_num_q(), "sanity"); - G1STWRefProcTaskExecutor par_task_executor(this, workers(), _task_queues, n_workers); + G1STWRefProcTaskExecutor par_task_executor(this, per_thread_states, workers(), _task_queues, n_workers); rp->enqueue_discovered_references(&par_task_executor); } @@ -5491,9 +5559,14 @@ double start_par_time_sec = os::elapsedTime(); double end_par_time_sec; + G1ParScanThreadState** per_thread_states = NEW_C_HEAP_ARRAY(G1ParScanThreadState*, n_workers, mtGC); + for (uint i = 0; i < n_workers; i++) { + per_thread_states[i] = new_par_scan_state(i); + } + { G1RootProcessor root_processor(this, n_workers); - G1ParTask g1_par_task(this, _task_queues, &root_processor, n_workers); + G1ParTask g1_par_task(this, per_thread_states, _task_queues, &root_processor, n_workers); // InitialMark needs claim bits to keep track of the marked-through CLDs. if (collector_state()->during_initial_mark_pause()) { ClassLoaderDataGraph::clear_claimed_marks(); @@ -5501,7 +5574,7 @@ // The individual threads will set their evac-failure closures. if (PrintTerminationStats) { - G1ParScanThreadState::print_termination_stats_hdr(); + print_termination_stats_hdr(gclog_or_tty); } workers()->run_task(&g1_par_task); @@ -5528,7 +5601,7 @@ // as we may have to copy some 'reachable' referent // objects (and their reachable sub-graphs) that were // not copied during the pause. 
- process_discovered_references(); + process_discovered_references(per_thread_states); if (G1StringDedup::is_enabled()) { double fixup_start = os::elapsedTime(); @@ -5544,6 +5617,14 @@ _allocator->release_gc_alloc_regions(evacuation_info); g1_rem_set()->cleanup_after_oops_into_collection_set_do(); + for (uint i = 0; i < n_workers; i++) { + G1ParScanThreadState* pss = per_thread_states[i]; + delete pss; + } + FREE_C_HEAP_ARRAY(G1ParScanThreadState*, per_thread_states); + + record_obj_copy_mem_stats(); + // Reset and re-enable the hot card cache. // Note the counts for the cards in the regions in the // collection set are reset when the collection set is freed. @@ -5568,12 +5649,17 @@ // will log these updates (and dirty their associated // cards). We need these updates logged to update any // RSets. - enqueue_discovered_references(); + enqueue_discovered_references(per_thread_states); redirty_logged_cards(); COMPILER2_PRESENT(DerivedPointerTable::update_pointers()); } +void G1CollectedHeap::record_obj_copy_mem_stats() { + _gc_tracer_stw->report_evacuation_statistics(create_g1_evac_summary(&_survivor_evac_stats), + create_g1_evac_summary(&_old_evac_stats)); +} + void G1CollectedHeap::free_region(HeapRegion* hr, FreeRegionList* free_list, bool par, @@ -5972,6 +6058,11 @@ cur->set_evacuation_failed(false); // The region is now considered to be old. cur->set_old(); + // Do some allocation statistics accounting. Regions that failed evacuation + // are always made old, so there is no need to update anything in the young + // gen statistics, but we need to update old gen statistics. + size_t used_words = cur->marked_bytes() / HeapWordSize; + _old_evac_stats.add_failure_used_and_waste(used_words, HeapRegion::GrainWords - used_words); _old_set.add(cur); evacuation_info.increment_collectionset_used_after(cur->used()); } @@ -6217,6 +6308,10 @@ } } +bool G1CollectedHeap::is_old_gc_alloc_region(HeapRegion* hr) { + return _allocator->is_retained_old_region(hr); +} + void G1CollectedHeap::set_region_short_lived_locked(HeapRegion* hr) { _young_list->push_region(hr); } diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/gc/g1/g1CollectedHeap.hpp --- a/hotspot/src/share/vm/gc/g1/g1CollectedHeap.hpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/gc/g1/g1CollectedHeap.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -28,12 +28,12 @@ #include "gc/g1/concurrentMark.hpp" #include "gc/g1/evacuationInfo.hpp" #include "gc/g1/g1AllocationContext.hpp" -#include "gc/g1/g1Allocator.hpp" #include "gc/g1/g1BiasedArray.hpp" #include "gc/g1/g1CollectorState.hpp" #include "gc/g1/g1HRPrinter.hpp" #include "gc/g1/g1InCSetState.hpp" #include "gc/g1/g1MonitoringSupport.hpp" +#include "gc/g1/g1EvacStats.hpp" #include "gc/g1/g1SATBCardTableModRefBS.hpp" #include "gc/g1/g1YCTypes.hpp" #include "gc/g1/hSpaceCounters.hpp" @@ -41,6 +41,7 @@ #include "gc/g1/heapRegionSet.hpp" #include "gc/shared/barrierSet.hpp" #include "gc/shared/collectedHeap.hpp" +#include "gc/shared/plab.hpp" #include "memory/memRegion.hpp" #include "utilities/stack.hpp" @@ -54,6 +55,7 @@ class HRRSCleanupTask; class GenerationSpec; class OopsInHeapRegionClosure; +class G1ParScanThreadState; class G1KlassScanClosure; class G1ParScanThreadState; class ObjectClosure; @@ -75,7 +77,9 @@ class EvacuationFailedInfo; class nmethod; class Ticks; -class FlexibleWorkGang; +class WorkGang; +class G1Allocator; +class G1ArchiveAllocator; typedef OverflowTaskQueue RefToScanQueue; typedef GenericTaskQueueSet RefToScanQueueSet; @@ -184,8 +188,7 @@ friend class 
VM_G1IncCollectionPause; friend class VMStructs; friend class MutatorAllocRegion; - friend class SurvivorGCAllocRegion; - friend class OldGCAllocRegion; + friend class G1GCAllocRegion; // Closures used in implementation. friend class G1ParScanThreadState; @@ -200,7 +203,7 @@ friend class G1CheckCSetFastTableClosure; private: - FlexibleWorkGang* _workers; + WorkGang* _workers; static size_t _humongous_object_threshold_in_words; @@ -245,7 +248,7 @@ // The sequence of all heap regions in the heap. HeapRegionManager _hrm; - // Handles non-humongous allocations in the G1CollectedHeap. + // Manages all allocations with regions except humongous object allocations. G1Allocator* _allocator; // Outside of GC pauses, the number of bytes used in all regions other @@ -263,11 +266,11 @@ // Statistics for each allocation context AllocationContextStats _allocation_context_stats; - // PLAB sizing policy for survivors. - PLABStats _survivor_plab_stats; + // GC allocation statistics policy for survivors. + G1EvacStats _survivor_evac_stats; - // PLAB sizing policy for tenured objects. - PLABStats _old_plab_stats; + // GC allocation statistics policy for tenured objects. + G1EvacStats _old_evac_stats; // It specifies whether we should attempt to expand the heap after a // region allocation failure. If heap expansion fails we set this to @@ -581,14 +584,14 @@ // Process any reference objects discovered during // an incremental evacuation pause. - void process_discovered_references(); + void process_discovered_references(G1ParScanThreadState** per_thread_states); // Enqueue any remaining discovered references // after processing. - void enqueue_discovered_references(); + void enqueue_discovered_references(G1ParScanThreadState** per_thread_states); public: - FlexibleWorkGang* workers() const { return _workers; } + WorkGang* workers() const { return _workers; } G1Allocator* allocator() { return _allocator; @@ -606,7 +609,7 @@ bool expand(size_t expand_bytes); // Returns the PLAB statistics for a given destination. - inline PLABStats* alloc_buffer_stats(InCSetState dest); + inline G1EvacStats* alloc_buffer_stats(InCSetState dest); // Determines PLAB size for a given destination. inline size_t desired_plab_sz(InCSetState dest); @@ -680,6 +683,9 @@ // Allocates a new heap region instance. HeapRegion* new_heap_region(uint hrs_index, MemRegion mr); + // Allocates a new per thread par scan state for the given thread id. + G1ParScanThreadState* new_par_scan_state(uint worker_id); + // Allocate the highest free region in the reserved heap. This will commit // regions as necessary. HeapRegion* alloc_highest_free_region(); @@ -789,6 +795,20 @@ // Actually do the work of evacuating the collection set. void evacuate_collection_set(EvacuationInfo& evacuation_info); + // Print the header for the per-thread termination statistics. + static void print_termination_stats_hdr(outputStream* const st); + // Print actual per-thread termination statistics. + void print_termination_stats(outputStream* const st, + uint worker_id, + double elapsed_ms, + double strong_roots_ms, + double term_ms, + size_t term_attempts, + size_t alloc_buffer_waste, + size_t undo_waste) const; + // Update object copying statistics. + void record_obj_copy_mem_stats(); + // The g1 remembered set of the heap. G1RemSet* _g1_rem_set; @@ -1195,9 +1215,7 @@ // Determine whether the given region is one that we are using as an // old GC alloc region. 
- bool is_old_gc_alloc_region(HeapRegion* hr) { - return _allocator->is_retained_old_region(hr); - } + bool is_old_gc_alloc_region(HeapRegion* hr); // Perform a collection of the heap; intended for use in implementing // "System.gc". This probably implies as full a collection as the @@ -1566,6 +1584,7 @@ const VerifyOption vo) const; G1HeapSummary create_g1_heap_summary(); + G1EvacSummary create_g1_evac_summary(G1EvacStats* stats); // Printing diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/gc/g1/g1CollectedHeap.inline.hpp --- a/hotspot/src/share/vm/gc/g1/g1CollectedHeap.inline.hpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/gc/g1/g1CollectedHeap.inline.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -35,12 +35,12 @@ #include "gc/shared/taskqueue.hpp" #include "runtime/orderAccess.inline.hpp" -PLABStats* G1CollectedHeap::alloc_buffer_stats(InCSetState dest) { +G1EvacStats* G1CollectedHeap::alloc_buffer_stats(InCSetState dest) { switch (dest.value()) { case InCSetState::Young: - return &_survivor_plab_stats; + return &_survivor_evac_stats; case InCSetState::Old: - return &_old_plab_stats; + return &_old_evac_stats; default: ShouldNotReachHere(); return NULL; // Keep some compilers happy diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/gc/g1/g1CollectedHeap_ext.cpp --- a/hotspot/src/share/vm/gc/g1/g1CollectedHeap_ext.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/gc/g1/g1CollectedHeap_ext.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -24,6 +24,7 @@ #include "precompiled.hpp" #include "gc/g1/g1CollectedHeap.hpp" +#include "gc/g1/g1ParScanThreadState.hpp" #include "gc/g1/heapRegion.inline.hpp" bool G1CollectedHeap::copy_allocation_context_stats(const jint* contexts, @@ -37,3 +38,7 @@ MemRegion mr) { return new HeapRegion(hrs_index, bot_shared(), mr); } + +G1ParScanThreadState* G1CollectedHeap::new_par_scan_state(uint worker_id) { + return new G1ParScanThreadState(this, worker_id); +} diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/gc/g1/g1CollectorPolicy.cpp --- a/hotspot/src/share/vm/gc/g1/g1CollectorPolicy.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/gc/g1/g1CollectorPolicy.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -1582,7 +1582,7 @@ G1CollectorPolicy::record_concurrent_mark_cleanup_end() { _collectionSetChooser->clear(); - FlexibleWorkGang* workers = _g1->workers(); + WorkGang* workers = _g1->workers(); uint n_workers = workers->active_workers(); uint n_regions = _g1->num_regions(); diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/gc/g1/g1CollectorPolicy.hpp --- a/hotspot/src/share/vm/gc/g1/g1CollectorPolicy.hpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/gc/g1/g1CollectorPolicy.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -26,8 +26,8 @@ #define SHARE_VM_GC_G1_G1COLLECTORPOLICY_HPP #include "gc/g1/collectionSetChooser.hpp" -#include "gc/g1/g1Allocator.hpp" #include "gc/g1/g1CollectorState.hpp" +#include "gc/g1/g1InCSetState.hpp" #include "gc/g1/g1MMUTracker.hpp" #include "gc/shared/collectorPolicy.hpp" diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/gc/g1/g1CollectorPolicy_ext.hpp --- a/hotspot/src/share/vm/gc/g1/g1CollectorPolicy_ext.hpp Thu Sep 03 14:24:41 2015 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2014, 2015, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
- * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - * - */ - -#ifndef SHARE_VM_GC_G1_G1COLLECTORPOLICY_EXT_HPP -#define SHARE_VM_GC_G1_G1COLLECTORPOLICY_EXT_HPP - -#include "gc/g1/g1CollectorPolicy.hpp" - -class G1CollectorPolicyExt : public G1CollectorPolicy { }; - -#endif // SHARE_VM_GC_G1_G1COLLECTORPOLICY_EXT_HPP diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/gc/g1/g1EvacStats.cpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hotspot/src/share/vm/gc/g1/g1EvacStats.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ * + */ + +#include "precompiled.hpp" +#include "gc/g1/g1EvacStats.hpp" +#include "gc/shared/gcId.hpp" +#include "trace/tracing.hpp" + +void G1EvacStats::adjust_desired_plab_sz() { + if (PrintPLAB) { + gclog_or_tty->print(" (allocated = " SIZE_FORMAT " wasted = " SIZE_FORMAT " " + "unused = " SIZE_FORMAT " used = " SIZE_FORMAT " " + "undo_waste = " SIZE_FORMAT " region_end_waste = " SIZE_FORMAT " " + "regions filled = %u direct_allocated = " SIZE_FORMAT " " + "failure_used = " SIZE_FORMAT " failure_waste = " SIZE_FORMAT ") ", + _allocated, _wasted, _unused, used(), _undo_wasted, _region_end_waste, + _regions_filled, _direct_allocated, _failure_used, _failure_waste); + } + + if (ResizePLAB) { + + assert(is_object_aligned(max_size()) && min_size() <= max_size(), + "PLAB clipping computation may be incorrect"); + + if (_allocated == 0) { + assert((_unused == 0), + err_msg("Inconsistency in PLAB stats: " + "_allocated: "SIZE_FORMAT", " + "_wasted: "SIZE_FORMAT", " + "_region_end_waste: "SIZE_FORMAT", " + "_unused: "SIZE_FORMAT", " + "_used : "SIZE_FORMAT, + _allocated, _wasted, _region_end_waste, _unused, used())); + _allocated = 1; + } + // We account region end waste fully to PLAB allocation. This is not completely fair, + // but is a conservative assumption because PLABs may be sized flexibly while we + // cannot adjust direct allocations. + // In some cases, wasted_frac may become > 1 but that just reflects the problem + // with region_end_waste. + double wasted_frac = (double)(_unused + _wasted + _region_end_waste) / (double)_allocated; + size_t target_refills = (size_t)((wasted_frac * TargetSurvivorRatio) / TargetPLABWastePct); + if (target_refills == 0) { + target_refills = 1; + } + size_t cur_plab_sz = used() / target_refills; + // Take historical weighted average + _filter.sample(cur_plab_sz); + // Clip from above and below, and align to object boundary + size_t plab_sz; + plab_sz = MAX2(min_size(), (size_t)_filter.average()); + plab_sz = MIN2(max_size(), plab_sz); + plab_sz = align_object_size(plab_sz); + // Latch the result + _desired_net_plab_sz = plab_sz; + if (PrintPLAB) { + gclog_or_tty->print_cr(" (plab_sz = " SIZE_FORMAT " desired_plab_sz = " SIZE_FORMAT ") ", cur_plab_sz, plab_sz); + } + } + // Clear accumulators for next round. + reset(); +} + diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/gc/g1/g1EvacStats.hpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hotspot/src/share/vm/gc/g1/g1EvacStats.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef SHARE_VM_gc_G1_G1EVACSTATS_HPP +#define SHARE_VM_gc_G1_G1EVACSTATS_HPP + +#include "gc/shared/plab.hpp" +#include "runtime/atomic.hpp" + +// Records various memory allocation statistics gathered during evacuation. +class G1EvacStats : public PLABStats { + private: + size_t _region_end_waste; // Number of words wasted due to skipping to the next region. + uint _regions_filled; // Number of regions filled completely. + size_t _direct_allocated; // Number of words allocated directly into the regions. + + // Number of words in live objects remaining in regions that ultimately suffered an + // evacuation failure. This is used in the regions when the regions are made old regions. + size_t _failure_used; + // Number of words wasted in regions which failed evacuation. This is the sum of space + // for objects successfully copied out of the regions (now dead space) plus waste at the + // end of regions. + size_t _failure_waste; + + virtual void reset() { + PLABStats::reset(); + _region_end_waste = 0; + _regions_filled = 0; + _direct_allocated = 0; + _failure_used = 0; + _failure_waste = 0; + } + + public: + G1EvacStats(size_t desired_plab_sz_, unsigned wt) : PLABStats(desired_plab_sz_, wt), + _region_end_waste(0), _regions_filled(0), _direct_allocated(0), + _failure_used(0), _failure_waste(0) { + } + + virtual void adjust_desired_plab_sz(); + + size_t allocated() const { return _allocated; } + size_t wasted() const { return _wasted; } + size_t unused() const { return _unused; } + size_t used() const { return allocated() - (wasted() + unused()); } + size_t undo_wasted() const { return _undo_wasted; } + + uint regions_filled() const { return _regions_filled; } + size_t region_end_waste() const { return _region_end_waste; } + size_t direct_allocated() const { return _direct_allocated; } + + // Amount of space in heapwords used in the failing regions when an evacuation failure happens. + size_t failure_used() const { return _failure_used; } + // Amount of space in heapwords wasted (unused) in the failing regions when an evacuation failure happens. 
+ size_t failure_waste() const { return _failure_waste; } + + void add_direct_allocated(size_t value) { + Atomic::add_ptr(value, &_direct_allocated); + } + + void add_region_end_waste(size_t value) { + Atomic::add_ptr(value, &_region_end_waste); + Atomic::add_ptr(1, &_regions_filled); + } + + void add_failure_used_and_waste(size_t used, size_t waste) { + Atomic::add_ptr(used, &_failure_used); + Atomic::add_ptr(waste, &_failure_waste); + } +}; + +#endif // SHARE_VM_gc_G1_G1EVACSTATS_HPP diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/gc/g1/g1OopClosures.cpp --- a/hotspot/src/share/vm/gc/g1/g1OopClosures.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/gc/g1/g1OopClosures.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -32,7 +32,11 @@ G1ParCopyHelper::G1ParCopyHelper(G1CollectedHeap* g1, G1ParScanThreadState* par_scan_state) : G1ParClosureSuper(g1, par_scan_state), _scanned_klass(NULL), - _cm(_g1->concurrent_mark()) {} + _cm(_g1->concurrent_mark()) { } + +G1ParCopyHelper::G1ParCopyHelper(G1CollectedHeap* g1) : + G1ParClosureSuper(g1), _scanned_klass(NULL), + _cm(_g1->concurrent_mark()) { } G1ParClosureSuper::G1ParClosureSuper(G1CollectedHeap* g1) : _g1(g1), _par_scan_state(NULL), _worker_id(UINT_MAX) { } diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/gc/g1/g1OopClosures.hpp --- a/hotspot/src/share/vm/gc/g1/g1OopClosures.hpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/gc/g1/g1OopClosures.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -76,15 +76,13 @@ class G1ParScanClosure : public G1ParClosureSuper { public: - G1ParScanClosure(G1CollectedHeap* g1, ReferenceProcessor* rp) : - G1ParClosureSuper(g1) { - assert(_ref_processor == NULL, "sanity"); - _ref_processor = rp; - } + G1ParScanClosure(G1CollectedHeap* g1) : G1ParClosureSuper(g1) { } template void do_oop_nv(T* p); virtual void do_oop(oop* p) { do_oop_nv(p); } virtual void do_oop(narrowOop* p) { do_oop_nv(p); } + + void set_ref_processor(ReferenceProcessor* ref_processor) { _ref_processor = ref_processor; } }; // Add back base class for metadata @@ -104,6 +102,7 @@ void mark_forwarded_object(oop from_obj, oop to_obj); public: G1ParCopyHelper(G1CollectedHeap* g1, G1ParScanThreadState* par_scan_state); + G1ParCopyHelper(G1CollectedHeap* g1); void set_scanned_klass(Klass* k) { _scanned_klass = k; } template void do_klass_barrier(T* p, oop new_obj); @@ -132,6 +131,10 @@ assert(_ref_processor == NULL, "sanity"); } + G1ParCopyClosure(G1CollectedHeap* g1) : G1ParCopyHelper(g1) { + assert(_ref_processor == NULL, "sanity"); + } + template void do_oop_nv(T* p) { do_oop_work(p); } virtual void do_oop(oop* p) { do_oop_nv(p); } virtual void do_oop(narrowOop* p) { do_oop_nv(p); } diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/gc/g1/g1ParScanThreadState.cpp --- a/hotspot/src/share/vm/gc/g1/g1ParScanThreadState.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/gc/g1/g1ParScanThreadState.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -23,6 +23,7 @@ */ #include "precompiled.hpp" +#include "gc/g1/g1Allocator.inline.hpp" #include "gc/g1/g1CollectedHeap.inline.hpp" #include "gc/g1/g1OopClosures.inline.hpp" #include "gc/g1/g1ParScanThreadState.inline.hpp" @@ -31,17 +32,19 @@ #include "oops/oop.inline.hpp" #include "runtime/prefetch.inline.hpp" -G1ParScanThreadState::G1ParScanThreadState(G1CollectedHeap* g1h, uint worker_id, ReferenceProcessor* rp) +G1ParScanThreadState::G1ParScanThreadState(G1CollectedHeap* g1h, uint worker_id) : _g1h(g1h), _refs(g1h->task_queue(worker_id)), _dcq(&g1h->dirty_card_queue_set()), 
_ct_bs(g1h->g1_barrier_set()), _g1_rem(g1h->g1_rem_set()), - _hash_seed(17), _worker_id(worker_id), - _term_attempts(0), + _hash_seed(17), + _worker_id(worker_id), _tenuring_threshold(g1h->g1_policy()->tenuring_threshold()), - _age_table(false), _scanner(g1h, rp), - _strong_roots_time(0), _term_time(0) { + _age_table(false), + _scanner(g1h), + _old_gen_is_full(false) +{ _scanner.set_par_scan_thread_state(this); // we allocate G1YoungSurvRateNumRegions plus one entries, since // we "sacrifice" entry 0 to keep track of surviving bytes for @@ -66,38 +69,20 @@ // need to be moved to the next space. _dest[InCSetState::Young] = InCSetState::Old; _dest[InCSetState::Old] = InCSetState::Old; - - _start = os::elapsedTime(); } G1ParScanThreadState::~G1ParScanThreadState() { - _plab_allocator->retire_alloc_buffers(); + // Update allocation statistics. + _plab_allocator->flush_and_retire_stats(); delete _plab_allocator; + _g1h->g1_policy()->record_thread_age_table(&_age_table); + // Update heap statistics. + _g1h->update_surviving_young_words(_surviving_young_words); FREE_C_HEAP_ARRAY(size_t, _surviving_young_words_base); } -void G1ParScanThreadState::print_termination_stats_hdr(outputStream* const st) { - st->print_raw_cr("GC Termination Stats"); - st->print_raw_cr(" elapsed --strong roots-- -------termination------- ------waste (KiB)------"); - st->print_raw_cr("thr ms ms % ms % attempts total alloc undo"); - st->print_raw_cr("--- --------- --------- ------ --------- ------ -------- ------- ------- -------"); -} - -void G1ParScanThreadState::print_termination_stats(outputStream* const st) const { - const double elapsed_ms = elapsed_time() * 1000.0; - const double s_roots_ms = strong_roots_time() * 1000.0; - const double term_ms = term_time() * 1000.0; - size_t alloc_buffer_waste = 0; - size_t undo_waste = 0; - _plab_allocator->waste(alloc_buffer_waste, undo_waste); - st->print_cr("%3u %9.2f %9.2f %6.2f " - "%9.2f %6.2f " SIZE_FORMAT_W(8) " " - SIZE_FORMAT_W(7) " " SIZE_FORMAT_W(7) " " SIZE_FORMAT_W(7), - _worker_id, elapsed_ms, s_roots_ms, s_roots_ms * 100 / elapsed_ms, - term_ms, term_ms * 100 / elapsed_ms, term_attempts(), - (alloc_buffer_waste + undo_waste) * HeapWordSize / K, - alloc_buffer_waste * HeapWordSize / K, - undo_waste * HeapWordSize / K); +void G1ParScanThreadState::waste(size_t& wasted, size_t& undo_wasted) { + _plab_allocator->waste(wasted, undo_wasted); } #ifdef ASSERT @@ -152,26 +137,38 @@ HeapWord* G1ParScanThreadState::allocate_in_next_plab(InCSetState const state, InCSetState* dest, size_t word_sz, - AllocationContext_t const context) { + AllocationContext_t const context, + bool previous_plab_refill_failed) { assert(state.is_in_cset_or_humongous(), err_msg("Unexpected state: " CSETSTATE_FORMAT, state.value())); assert(dest->is_in_cset_or_humongous(), err_msg("Unexpected dest: " CSETSTATE_FORMAT, dest->value())); // Right now we only have two types of regions (young / old) so // let's keep the logic here simple. We can generalize it when necessary. if (dest->is_young()) { + bool plab_refill_in_old_failed = false; HeapWord* const obj_ptr = _plab_allocator->allocate(InCSetState::Old, word_sz, - context); - if (obj_ptr == NULL) { - return NULL; - } + context, + &plab_refill_in_old_failed); // Make sure that we won't attempt to copy any other objects out // of a survivor region (given that apparently we cannot allocate - // any new ones) to avoid coming into this slow path. 
- _tenuring_threshold = 0; - dest->set_old(); + // any new ones) to avoid coming into this slow path again and again. + // Only consider failed PLAB refill here: failed inline allocations are + // typically large, so not indicative of remaining space. + if (previous_plab_refill_failed) { + _tenuring_threshold = 0; + } + + if (obj_ptr != NULL) { + dest->set_old(); + } else { + // We just failed to allocate in old gen. The same idea as explained above + // for making survivor gen unavailable for allocation applies for old gen. + _old_gen_is_full = plab_refill_in_old_failed; + } return obj_ptr; } else { + _old_gen_is_full = previous_plab_refill_failed; assert(dest->is_old(), err_msg("Unexpected dest: " CSETSTATE_FORMAT, dest->value())); // no other space to try. return NULL; @@ -202,14 +199,20 @@ uint age = 0; InCSetState dest_state = next_state(state, old_mark, age); + // The second clause is to prevent premature evacuation failure in case there + // is still space in survivor, but old gen is full. + if (_old_gen_is_full && dest_state.is_old()) { + return handle_evacuation_failure_par(old, old_mark); + } HeapWord* obj_ptr = _plab_allocator->plab_allocate(dest_state, word_sz, context); // PLAB allocations should succeed most of the time, so we'll // normally check against NULL once and that's it. if (obj_ptr == NULL) { - obj_ptr = _plab_allocator->allocate_direct_or_new_plab(dest_state, word_sz, context); + bool plab_refill_failed = false; + obj_ptr = _plab_allocator->allocate_direct_or_new_plab(dest_state, word_sz, context, &plab_refill_failed); if (obj_ptr == NULL) { - obj_ptr = allocate_in_next_plab(state, &dest_state, word_sz, context); + obj_ptr = allocate_in_next_plab(state, &dest_state, word_sz, context, plab_refill_failed); if (obj_ptr == NULL) { // This will either forward-to-self, or detect that someone else has // installed a forwarding pointer. @@ -253,7 +256,7 @@ } else { obj->set_mark(old_mark->set_age(age)); } - age_table()->add(age, word_sz); + _age_table.add(age, word_sz); } else { obj->set_mark(old_mark); } @@ -271,8 +274,7 @@ obj); } - size_t* const surv_young_words = surviving_young_words(); - surv_young_words[young_index] += word_sz; + _surviving_young_words[young_index] += word_sz; if (obj->is_objArray() && arrayOop(obj)->length() >= ParGCArrayScanChunk) { // We keep track of the next start index in the length field of diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/gc/g1/g1ParScanThreadState.hpp --- a/hotspot/src/share/vm/gc/g1/g1ParScanThreadState.hpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/gc/g1/g1ParScanThreadState.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -35,16 +35,17 @@ #include "memory/allocation.hpp" #include "oops/oop.hpp" +class G1PLABAllocator; class HeapRegion; class outputStream; -class G1ParScanThreadState : public StackObj { +class G1ParScanThreadState : public CHeapObj { private: G1CollectedHeap* _g1h; RefToScanQueue* _refs; DirtyCardQueue _dcq; G1SATBCardTableModRefBS* _ct_bs; - G1RemSet* _g1_rem; + G1RemSet* _g1_rem; G1PLABAllocator* _plab_allocator; @@ -57,20 +58,16 @@ int _hash_seed; uint _worker_id; - size_t _term_attempts; - - double _start; - double _start_strong_roots; - double _strong_roots_time; - double _start_term; - double _term_time; - // Map from young-age-index (0 == not young, 1 is youngest) to // surviving words. 
base is what we get back from the malloc call size_t* _surviving_young_words_base; // this points into the array, as we use the first few entries for padding size_t* _surviving_young_words; + // Indicates whether in the last generation (old) there is no more space + // available for allocation. + bool _old_gen_is_full; + #define PADDING_ELEM_NUM (DEFAULT_CACHE_LINE_SIZE / sizeof(size_t)) DirtyCardQueue& dirty_card_queue() { return _dcq; } @@ -85,10 +82,10 @@ } public: - G1ParScanThreadState(G1CollectedHeap* g1h, uint worker_id, ReferenceProcessor* rp); + G1ParScanThreadState(G1CollectedHeap* g1h, uint worker_id); ~G1ParScanThreadState(); - ageTable* age_table() { return &_age_table; } + void set_ref_processor(ReferenceProcessor* rp) { _scanner.set_ref_processor(rp); } #ifdef ASSERT bool queue_is_empty() const { return _refs->is_empty(); } @@ -114,40 +111,14 @@ uint worker_id() { return _worker_id; } - size_t term_attempts() const { return _term_attempts; } - void note_term_attempt() { _term_attempts++; } - - void start_strong_roots() { - _start_strong_roots = os::elapsedTime(); - } - void end_strong_roots() { - _strong_roots_time += (os::elapsedTime() - _start_strong_roots); - } - double strong_roots_time() const { return _strong_roots_time; } - - void start_term_time() { - note_term_attempt(); - _start_term = os::elapsedTime(); - } - void end_term_time() { - _term_time += (os::elapsedTime() - _start_term); - } - double term_time() const { return _term_time; } - - double elapsed_time() const { - return os::elapsedTime() - _start; - } - - // Print the header for the per-thread termination statistics. - static void print_termination_stats_hdr(outputStream* const st = gclog_or_tty); - - // Print actual per-thread termination statistics. - void print_termination_stats(outputStream* const st = gclog_or_tty) const; + // Returns the current amount of waste due to alignment or not being able to fit + // objects within LABs and the undo waste. + virtual void waste(size_t& wasted, size_t& undo_wasted); size_t* surviving_young_words() { - // We add on to hide entry 0 which accumulates surviving words for + // We add one to hide entry 0 which accumulates surviving words for // age -1 regions (i.e. non-young ones) - return _surviving_young_words; + return _surviving_young_words + 1; } private: @@ -190,12 +161,16 @@ // Tries to allocate word_sz in the PLAB of the next "generation" after trying to // allocate into dest. State is the original (source) cset state for the object - // that is allocated for. + // that is allocated for. Previous_plab_refill_failed indicates whether previously + // a PLAB refill into "state" failed. // Returns a non-NULL pointer if successful, and updates dest if required. + // Also determines whether we should continue to try to allocate into the various + // generations or just end trying to allocate. 
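Illustrative sketch (not part of this changeset): the fallback policy described above, restated as standalone C++ with hypothetical stand-ins for InCSetState and the PLAB allocator. A previously failed PLAB refill, not a failed large direct allocation, is the signal used to stop copying into survivor space and to record that the old generation is full.

#include <cstddef>

// Hypothetical simplified types; the real code uses InCSetState and G1PLABAllocator.
enum class Dest { Young, Old };

struct CopyContext {
  bool old_gen_is_full = false;
  unsigned tenuring_threshold = 15;
};

// Mirrors the decision flow of allocate_in_next_plab(): if the object was headed for
// survivor space, retry the copy into old; only a failed PLAB refill is taken as
// evidence that a space is effectively exhausted.
void* allocate_in_next_space(Dest& dest, size_t word_sz, bool previous_plab_refill_failed,
                             CopyContext& ctx,
                             void* (*plab_allocate)(Dest, size_t, bool* refill_failed)) {
  if (dest == Dest::Young) {
    bool refill_in_old_failed = false;
    void* obj = plab_allocate(Dest::Old, word_sz, &refill_in_old_failed);
    if (previous_plab_refill_failed) {
      ctx.tenuring_threshold = 0;   // stop copying further objects into survivor regions
    }
    if (obj != nullptr) {
      dest = Dest::Old;             // the object will be evacuated to old instead
    } else {
      ctx.old_gen_is_full = refill_in_old_failed;
    }
    return obj;
  }
  // dest was already Old: there is no other space left to try.
  ctx.old_gen_is_full = previous_plab_refill_failed;
  return nullptr;
}
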
HeapWord* allocate_in_next_plab(InCSetState const state, InCSetState* dest, size_t word_sz, - AllocationContext_t const context); + AllocationContext_t const context, + bool previous_plab_refill_failed); inline InCSetState next_state(InCSetState const state, markOop const m, uint& age); public: diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/gc/g1/g1RootProcessor.cpp --- a/hotspot/src/share/vm/gc/g1/g1RootProcessor.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/gc/g1/g1RootProcessor.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -115,7 +115,7 @@ G1RootProcessor::G1RootProcessor(G1CollectedHeap* g1h, uint n_workers) : _g1h(g1h), - _process_strong_tasks(new SubTasksDone(G1RP_PS_NumElements)), + _process_strong_tasks(G1RP_PS_NumElements), _srs(n_workers), _lock(Mutex::leaf, "G1 Root Scanning barrier lock", false, Monitor::_safepoint_check_never), _n_workers_discovered_strong_classes(0) {} @@ -158,7 +158,7 @@ { // Now the CM ref_processor roots. G1GCParPhaseTimesTracker x(phase_times, G1GCPhaseTimes::CMRefRoots, worker_i); - if (!_process_strong_tasks->is_task_claimed(G1RP_PS_refProcessor_oops_do)) { + if (!_process_strong_tasks.is_task_claimed(G1RP_PS_refProcessor_oops_do)) { // We need to treat the discovered reference lists of the // concurrent mark ref processor as roots and keep entries // (which are added by the marking threads) on them live @@ -201,12 +201,12 @@ // as implicitly live). { G1GCParPhaseTimesTracker x(phase_times, G1GCPhaseTimes::SATBFiltering, worker_i); - if (!_process_strong_tasks->is_task_claimed(G1RP_PS_filter_satb_buffers) && _g1h->collector_state()->mark_in_progress()) { + if (!_process_strong_tasks.is_task_claimed(G1RP_PS_filter_satb_buffers) && _g1h->collector_state()->mark_in_progress()) { JavaThread::satb_mark_queue_set().filter_thread_buffers(); } } - _process_strong_tasks->all_tasks_completed(n_workers()); + _process_strong_tasks.all_tasks_completed(n_workers()); } void G1RootProcessor::process_strong_roots(OopClosure* oops, @@ -216,7 +216,7 @@ process_java_roots(oops, clds, clds, NULL, blobs, NULL, 0); process_vm_roots(oops, NULL, NULL, 0); - _process_strong_tasks->all_tasks_completed(n_workers()); + _process_strong_tasks.all_tasks_completed(n_workers()); } void G1RootProcessor::process_all_roots(OopClosure* oops, @@ -226,11 +226,11 @@ process_java_roots(oops, NULL, clds, clds, NULL, NULL, 0); process_vm_roots(oops, oops, NULL, 0); - if (!_process_strong_tasks->is_task_claimed(G1RP_PS_CodeCache_oops_do)) { + if (!_process_strong_tasks.is_task_claimed(G1RP_PS_CodeCache_oops_do)) { CodeCache::blobs_do(blobs); } - _process_strong_tasks->all_tasks_completed(n_workers()); + _process_strong_tasks.all_tasks_completed(n_workers()); } void G1RootProcessor::process_java_roots(OopClosure* strong_roots, @@ -246,7 +246,7 @@ // let the thread process the weak CLDs and nmethods. 
{ G1GCParPhaseTimesTracker x(phase_times, G1GCPhaseTimes::CLDGRoots, worker_i); - if (!_process_strong_tasks->is_task_claimed(G1RP_PS_ClassLoaderDataGraph_oops_do)) { + if (!_process_strong_tasks.is_task_claimed(G1RP_PS_ClassLoaderDataGraph_oops_do)) { ClassLoaderDataGraph::roots_cld_do(strong_clds, weak_clds); } } @@ -264,49 +264,49 @@ uint worker_i) { { G1GCParPhaseTimesTracker x(phase_times, G1GCPhaseTimes::UniverseRoots, worker_i); - if (!_process_strong_tasks->is_task_claimed(G1RP_PS_Universe_oops_do)) { + if (!_process_strong_tasks.is_task_claimed(G1RP_PS_Universe_oops_do)) { Universe::oops_do(strong_roots); } } { G1GCParPhaseTimesTracker x(phase_times, G1GCPhaseTimes::JNIRoots, worker_i); - if (!_process_strong_tasks->is_task_claimed(G1RP_PS_JNIHandles_oops_do)) { + if (!_process_strong_tasks.is_task_claimed(G1RP_PS_JNIHandles_oops_do)) { JNIHandles::oops_do(strong_roots); } } { G1GCParPhaseTimesTracker x(phase_times, G1GCPhaseTimes::ObjectSynchronizerRoots, worker_i); - if (!_process_strong_tasks-> is_task_claimed(G1RP_PS_ObjectSynchronizer_oops_do)) { + if (!_process_strong_tasks.is_task_claimed(G1RP_PS_ObjectSynchronizer_oops_do)) { ObjectSynchronizer::oops_do(strong_roots); } } { G1GCParPhaseTimesTracker x(phase_times, G1GCPhaseTimes::FlatProfilerRoots, worker_i); - if (!_process_strong_tasks->is_task_claimed(G1RP_PS_FlatProfiler_oops_do)) { + if (!_process_strong_tasks.is_task_claimed(G1RP_PS_FlatProfiler_oops_do)) { FlatProfiler::oops_do(strong_roots); } } { G1GCParPhaseTimesTracker x(phase_times, G1GCPhaseTimes::ManagementRoots, worker_i); - if (!_process_strong_tasks->is_task_claimed(G1RP_PS_Management_oops_do)) { + if (!_process_strong_tasks.is_task_claimed(G1RP_PS_Management_oops_do)) { Management::oops_do(strong_roots); } } { G1GCParPhaseTimesTracker x(phase_times, G1GCPhaseTimes::JVMTIRoots, worker_i); - if (!_process_strong_tasks->is_task_claimed(G1RP_PS_jvmti_oops_do)) { + if (!_process_strong_tasks.is_task_claimed(G1RP_PS_jvmti_oops_do)) { JvmtiExport::oops_do(strong_roots); } } { G1GCParPhaseTimesTracker x(phase_times, G1GCPhaseTimes::SystemDictionaryRoots, worker_i); - if (!_process_strong_tasks->is_task_claimed(G1RP_PS_SystemDictionary_oops_do)) { + if (!_process_strong_tasks.is_task_claimed(G1RP_PS_SystemDictionary_oops_do)) { SystemDictionary::roots_oops_do(strong_roots, weak_roots); } } diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/gc/g1/g1RootProcessor.hpp --- a/hotspot/src/share/vm/gc/g1/g1RootProcessor.hpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/gc/g1/g1RootProcessor.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -45,7 +45,7 @@ // worker thread call the process_roots methods. class G1RootProcessor : public StackObj { G1CollectedHeap* _g1h; - SubTasksDone* _process_strong_tasks; + SubTasksDone _process_strong_tasks; StrongRootsScope _srs; // Used to implement the Thread work barrier. diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/gc/g1/heapRegion.hpp --- a/hotspot/src/share/vm/gc/g1/heapRegion.hpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/gc/g1/heapRegion.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -109,7 +109,7 @@ // evacuation pauses between two cleanups, which is _highly_ unlikely. class G1OffsetTableContigSpace: public CompactibleSpace { friend class VMStructs; - HeapWord* _top; + HeapWord* volatile _top; HeapWord* volatile _scan_top; protected: G1BlockOffsetArrayContigSpace _offsets; @@ -134,10 +134,18 @@ // Reset the G1OffsetTableContigSpace. 
virtual void initialize(MemRegion mr, bool clear_space, bool mangle_space); - HeapWord** top_addr() { return &_top; } - // Allocation helpers (return NULL if full). - inline HeapWord* allocate_impl(size_t word_size, HeapWord* end_value); - inline HeapWord* par_allocate_impl(size_t word_size, HeapWord* end_value); + HeapWord* volatile* top_addr() { return &_top; } + // Try to allocate at least min_word_size and up to desired_size from this Space. + // Returns NULL if not possible, otherwise sets actual_word_size to the amount of + // space allocated. + // This version assumes that all allocation requests to this Space are properly + // synchronized. + inline HeapWord* allocate_impl(size_t min_word_size, size_t desired_word_size, size_t* actual_word_size); + // Try to allocate at least min_word_size and up to desired_size from this Space. + // Returns NULL if not possible, otherwise sets actual_word_size to the amount of + // space allocated. + // This version synchronizes with other calls to par_allocate_impl(). + inline HeapWord* par_allocate_impl(size_t min_word_size, size_t desired_word_size, size_t* actual_word_size); public: void reset_after_compaction() { set_top(compaction_top()); } @@ -179,9 +187,14 @@ HeapWord* block_start(const void* p); HeapWord* block_start_const(const void* p) const; - // Add offset table update. + // Allocation (return NULL if full). Assumes the caller has established + // mutually exclusive access to the space. + HeapWord* allocate(size_t min_word_size, size_t desired_word_size, size_t* actual_word_size); + // Allocation (return NULL if full). Enforces mutual exclusion internally. + HeapWord* par_allocate(size_t min_word_size, size_t desired_word_size, size_t* actual_word_size); + virtual HeapWord* allocate(size_t word_size); - HeapWord* par_allocate(size_t word_size); + virtual HeapWord* par_allocate(size_t word_size); HeapWord* saved_mark_word() const { ShouldNotReachHere(); return NULL; } @@ -351,8 +364,9 @@ // Override for scan_and_forward support. void prepare_for_compaction(CompactPoint* cp); - inline HeapWord* par_allocate_no_bot_updates(size_t word_size); + inline HeapWord* par_allocate_no_bot_updates(size_t min_word_size, size_t desired_word_size, size_t* word_size); inline HeapWord* allocate_no_bot_updates(size_t word_size); + inline HeapWord* allocate_no_bot_updates(size_t min_word_size, size_t desired_word_size, size_t* actual_size); // If this region is a member of a HeapRegionManager, the index in that // sequence, otherwise -1. diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/gc/g1/heapRegion.inline.hpp --- a/hotspot/src/share/vm/gc/g1/heapRegion.inline.hpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/gc/g1/heapRegion.inline.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -32,33 +32,39 @@ #include "oops/oop.inline.hpp" #include "runtime/atomic.inline.hpp" -// This version requires locking. 
-inline HeapWord* G1OffsetTableContigSpace::allocate_impl(size_t size, - HeapWord* const end_value) { +inline HeapWord* G1OffsetTableContigSpace::allocate_impl(size_t min_word_size, + size_t desired_word_size, + size_t* actual_size) { HeapWord* obj = top(); - if (pointer_delta(end_value, obj) >= size) { - HeapWord* new_top = obj + size; + size_t available = pointer_delta(end(), obj); + size_t want_to_allocate = MIN2(available, desired_word_size); + if (want_to_allocate >= min_word_size) { + HeapWord* new_top = obj + want_to_allocate; set_top(new_top); assert(is_aligned(obj) && is_aligned(new_top), "checking alignment"); + *actual_size = want_to_allocate; return obj; } else { return NULL; } } -// This version is lock-free. -inline HeapWord* G1OffsetTableContigSpace::par_allocate_impl(size_t size, - HeapWord* const end_value) { +inline HeapWord* G1OffsetTableContigSpace::par_allocate_impl(size_t min_word_size, + size_t desired_word_size, + size_t* actual_size) { do { HeapWord* obj = top(); - if (pointer_delta(end_value, obj) >= size) { - HeapWord* new_top = obj + size; + size_t available = pointer_delta(end(), obj); + size_t want_to_allocate = MIN2(available, desired_word_size); + if (want_to_allocate >= min_word_size) { + HeapWord* new_top = obj + want_to_allocate; HeapWord* result = (HeapWord*)Atomic::cmpxchg_ptr(new_top, top_addr(), obj); // result can be one of two: // the old top value: the exchange succeeded // otherwise: the new value of the top is returned. if (result == obj) { assert(is_aligned(obj) && is_aligned(new_top), "checking alignment"); + *actual_size = want_to_allocate; return obj; } } else { @@ -67,20 +73,34 @@ } while (true); } -inline HeapWord* G1OffsetTableContigSpace::allocate(size_t size) { - HeapWord* res = allocate_impl(size, end()); +inline HeapWord* G1OffsetTableContigSpace::allocate(size_t min_word_size, + size_t desired_word_size, + size_t* actual_size) { + HeapWord* res = allocate_impl(min_word_size, desired_word_size, actual_size); if (res != NULL) { - _offsets.alloc_block(res, size); + _offsets.alloc_block(res, *actual_size); } return res; } +inline HeapWord* G1OffsetTableContigSpace::allocate(size_t word_size) { + size_t temp; + return allocate(word_size, word_size, &temp); +} + +inline HeapWord* G1OffsetTableContigSpace::par_allocate(size_t word_size) { + size_t temp; + return par_allocate(word_size, word_size, &temp); +} + // Because of the requirement of keeping "_offsets" up to date with the // allocations, we sequentialize these with a lock. Therefore, best if // this is used for larger LAB allocations only. 
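Illustrative sketch (not part of this changeset): the new allocate-at-least-min, at-most-desired contract of par_allocate_impl(), written with std::atomic in place of HotSpot's Atomic::cmpxchg_ptr. The CAS loop retries whenever another thread advances top first, and reports the amount actually handed out.

#include <algorithm>
#include <atomic>
#include <cstddef>

typedef unsigned long HeapWordLike;   // stand-in for HeapWord

// Lock-free bump-pointer allocation: take up to desired_word_size words,
// succeeding as long as at least min_word_size words remain in the space.
HeapWordLike* par_allocate_sketch(std::atomic<HeapWordLike*>& top, HeapWordLike* end,
                                  size_t min_word_size, size_t desired_word_size,
                                  size_t* actual_word_size) {
  while (true) {
    HeapWordLike* obj = top.load(std::memory_order_relaxed);
    size_t available = static_cast<size_t>(end - obj);
    size_t want = std::min(available, desired_word_size);
    if (want < min_word_size) {
      return nullptr;                                // not enough space left
    }
    HeapWordLike* new_top = obj + want;
    // CAS publishes the new top; on failure another thread won the race, so retry.
    if (top.compare_exchange_weak(obj, new_top, std::memory_order_relaxed)) {
      *actual_word_size = want;
      return obj;
    }
  }
}
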
-inline HeapWord* G1OffsetTableContigSpace::par_allocate(size_t size) { +inline HeapWord* G1OffsetTableContigSpace::par_allocate(size_t min_word_size, + size_t desired_word_size, + size_t* actual_size) { MutexLocker x(&_par_alloc_lock); - return allocate(size); + return allocate(min_word_size, desired_word_size, actual_size); } inline HeapWord* G1OffsetTableContigSpace::block_start(const void* p) { @@ -128,14 +148,23 @@ return pointer_delta(next, addr); } -inline HeapWord* HeapRegion::par_allocate_no_bot_updates(size_t word_size) { +inline HeapWord* HeapRegion::par_allocate_no_bot_updates(size_t min_word_size, + size_t desired_word_size, + size_t* actual_word_size) { assert(is_young(), "we can only skip BOT updates on young regions"); - return par_allocate_impl(word_size, end()); + return par_allocate_impl(min_word_size, desired_word_size, actual_word_size); } inline HeapWord* HeapRegion::allocate_no_bot_updates(size_t word_size) { + size_t temp; + return allocate_no_bot_updates(word_size, word_size, &temp); +} + +inline HeapWord* HeapRegion::allocate_no_bot_updates(size_t min_word_size, + size_t desired_word_size, + size_t* actual_word_size) { assert(is_young(), "we can only skip BOT updates on young regions"); - return allocate_impl(word_size, end()); + return allocate_impl(min_word_size, desired_word_size, actual_word_size); } inline void HeapRegion::note_start_of_marking() { diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/gc/g1/heapRegionManager.cpp --- a/hotspot/src/share/vm/gc/g1/heapRegionManager.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/gc/g1/heapRegionManager.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -428,7 +428,7 @@ uncommit_regions(idx_last_found + num_last_found - to_remove, to_remove); - cur -= num_last_found; + cur = idx_last_found; removed += to_remove; } diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/gc/g1/vmStructs_g1.hpp --- a/hotspot/src/share/vm/gc/g1/vmStructs_g1.hpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/gc/g1/vmStructs_g1.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -34,7 +34,7 @@ static_field(HeapRegion, GrainBytes, size_t) \ static_field(HeapRegion, LogOfHRGrainBytes, int) \ \ - nonstatic_field(G1OffsetTableContigSpace, _top, HeapWord*) \ + nonstatic_field(G1OffsetTableContigSpace, _top, HeapWord* volatile) \ \ nonstatic_field(G1HeapRegionTable, _base, address) \ nonstatic_field(G1HeapRegionTable, _length, size_t) \ diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/gc/shared/collectorPolicy.cpp --- a/hotspot/src/share/vm/gc/shared/collectorPolicy.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/gc/shared/collectorPolicy.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -225,6 +225,10 @@ return align_size_up(3 * _space_alignment, _gen_alignment); } +size_t GenCollectorPolicy::old_gen_size_lower_bound() { + return align_size_up(_space_alignment, _gen_alignment); +} + #ifdef ASSERT void GenCollectorPolicy::assert_flags() { CollectorPolicy::assert_flags(); @@ -284,7 +288,7 @@ // Make sure the heap is large enough for two generations size_t smallest_new_size = young_gen_size_lower_bound(); - size_t smallest_heap_size = align_size_up(smallest_new_size + align_size_up(_space_alignment, _gen_alignment), + size_t smallest_heap_size = align_size_up(smallest_new_size + old_gen_size_lower_bound(), _heap_alignment); if (MaxHeapSize < smallest_heap_size) { FLAG_SET_ERGO(size_t, MaxHeapSize, smallest_heap_size); @@ -356,6 +360,7 @@ vm_exit_during_initialization("Invalid young gen ratio specified"); } + OldSize = 
MAX2(OldSize, old_gen_size_lower_bound()); if (!is_size_aligned(OldSize, _gen_alignment)) { // Setting OldSize directly to preserve information about the possible // setting of OldSize on the command line. diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/gc/shared/collectorPolicy.hpp --- a/hotspot/src/share/vm/gc/shared/collectorPolicy.hpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/gc/shared/collectorPolicy.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -282,6 +282,8 @@ size_t young_gen_size_lower_bound(); + size_t old_gen_size_lower_bound(); + HeapWord* mem_allocate_work(size_t size, bool is_tlab, bool* gc_overhead_limit_was_exceeded); diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/gc/shared/gcHeapSummary.hpp --- a/hotspot/src/share/vm/gc/shared/gcHeapSummary.hpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/gc/shared/gcHeapSummary.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -189,4 +189,44 @@ }; +class G1EvacSummary : public StackObj { +private: + size_t _allocated; // Total allocated + size_t _wasted; // of which wasted (internal fragmentation) + size_t _undo_wasted; // of which wasted on undo (is not used for calculation of PLAB size) + size_t _unused; // Unused in last buffer + size_t _used; + + size_t _region_end_waste; // Number of words wasted due to skipping to the next region. + uint _regions_filled; // Number of regions filled completely. + size_t _direct_allocated; // Number of words allocated directly into the regions. + + // Number of words in live objects remaining in regions that ultimately suffered an + // evacuation failure. This is used in the regions when the regions are made old regions. + size_t _failure_used; + // Number of words wasted in regions which failed evacuation. This is the sum of space + // for objects successfully copied out of the regions (now dead space) plus waste at the + // end of regions. 
+ size_t _failure_waste; +public: + G1EvacSummary(size_t allocated, size_t wasted, size_t undo_wasted, size_t unused, + size_t used, size_t region_end_waste, uint regions_filled, size_t direct_allocated, + size_t failure_used, size_t failure_waste) : + _allocated(allocated), _wasted(wasted), _undo_wasted(undo_wasted), _unused(unused), + _used(used), _region_end_waste(region_end_waste), _regions_filled(regions_filled), + _direct_allocated(direct_allocated), _failure_used(failure_used), _failure_waste(failure_waste) + { } + + size_t allocated() const { return _allocated; } + size_t wasted() const { return _wasted; } + size_t undo_wasted() const { return _undo_wasted; } + size_t unused() const { return _unused; } + size_t used() const { return _used; } + size_t region_end_waste() const { return _region_end_waste; } + uint regions_filled() const { return _regions_filled; } + size_t direct_allocated() const { return _direct_allocated; } + size_t failure_used() const { return _failure_used; } + size_t failure_waste() const { return _failure_waste; } +}; + #endif // SHARE_VM_GC_SHARED_GCHEAPSUMMARY_HPP diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/gc/shared/gcTrace.cpp --- a/hotspot/src/share/vm/gc/shared/gcTrace.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/gc/shared/gcTrace.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -252,4 +252,12 @@ send_evacuation_failed_event(ef_info); ef_info.reset(); } + +void G1NewTracer::report_evacuation_statistics(const G1EvacSummary& young_summary, const G1EvacSummary& old_summary) const { + assert_set_gc_id(); + + send_young_evacuation_statistics(young_summary); + send_old_evacuation_statistics(old_summary); +} + #endif diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/gc/shared/gcTrace.hpp --- a/hotspot/src/share/vm/gc/shared/gcTrace.hpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/gc/shared/gcTrace.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -45,6 +45,7 @@ class MetaspaceSummary; class PSHeapSummary; class G1HeapSummary; +class G1EvacSummary; class ReferenceProcessorStats; class TimePartitions; class BoolObjectClosure; @@ -257,10 +258,14 @@ void report_evacuation_info(EvacuationInfo* info); void report_evacuation_failed(EvacuationFailedInfo& ef_info); + void report_evacuation_statistics(const G1EvacSummary& young_summary, const G1EvacSummary& old_summary) const; private: void send_g1_young_gc_event(); void send_evacuation_info_event(EvacuationInfo* info); void send_evacuation_failed_event(const EvacuationFailedInfo& ef_info) const; + + void send_young_evacuation_statistics(const G1EvacSummary& summary) const; + void send_old_evacuation_statistics(const G1EvacSummary& summary) const; }; #endif diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/gc/shared/gcTraceSend.cpp --- a/hotspot/src/share/vm/gc/shared/gcTraceSend.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/gc/shared/gcTraceSend.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -234,6 +234,37 @@ e.commit(); } } + +static TraceStructG1EvacStats create_g1_evacstats(unsigned gcid, const G1EvacSummary& summary) { + TraceStructG1EvacStats s; + s.set_gcId(gcid); + s.set_allocated(summary.allocated() * HeapWordSize); + s.set_wasted(summary.wasted() * HeapWordSize); + s.set_used(summary.used() * HeapWordSize); + s.set_undoWaste(summary.undo_wasted() * HeapWordSize); + s.set_regionEndWaste(summary.region_end_waste() * HeapWordSize); + s.set_regionsRefilled(summary.regions_filled()); + s.set_directAllocated(summary.direct_allocated() * HeapWordSize); + 
s.set_failureUsed(summary.failure_used() * HeapWordSize); + s.set_failureWaste(summary.failure_waste() * HeapWordSize); + return s; +} + +void G1NewTracer::send_young_evacuation_statistics(const G1EvacSummary& summary) const { + EventGCG1EvacuationYoungStatistics surv_evt; + if (surv_evt.should_commit()) { + surv_evt.set_stats(create_g1_evacstats(_shared_gc_info.gc_id().id(), summary)); + surv_evt.commit(); + } +} + +void G1NewTracer::send_old_evacuation_statistics(const G1EvacSummary& summary) const { + EventGCG1EvacuationOldStatistics old_evt; + if (old_evt.should_commit()) { + old_evt.set_stats(create_g1_evacstats(_shared_gc_info.gc_id().id(), summary)); + old_evt.commit(); + } +} #endif static TraceStructVirtualSpace to_trace_struct(const VirtualSpaceSummary& summary) { diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/gc/shared/genCollectedHeap.cpp --- a/hotspot/src/share/vm/gc/shared/genCollectedHeap.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/gc/shared/genCollectedHeap.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -86,7 +86,7 @@ { assert(policy != NULL, "Sanity check"); if (UseConcMarkSweepGC) { - _workers = new FlexibleWorkGang("GC Thread", ParallelGCThreads, + _workers = new WorkGang("GC Thread", ParallelGCThreads, /* are_GC_task_threads */true, /* are_ConcurrentGC_threads */false); _workers->initialize_workers(); diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/gc/shared/genCollectedHeap.hpp --- a/hotspot/src/share/vm/gc/shared/genCollectedHeap.hpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/gc/shared/genCollectedHeap.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -30,9 +30,9 @@ #include "gc/shared/collectorPolicy.hpp" #include "gc/shared/generation.hpp" -class FlexibleWorkGang; class StrongRootsScope; class SubTasksDone; +class WorkGang; // A "GenCollectedHeap" is a CollectedHeap that uses generational // collection. It has two generations, young and old. @@ -90,7 +90,7 @@ // In block contents verification, the number of header words to skip NOT_PRODUCT(static size_t _skip_header_HeapWords;) - FlexibleWorkGang* _workers; + WorkGang* _workers; protected: // Helper functions for allocation @@ -124,7 +124,7 @@ public: GenCollectedHeap(GenCollectorPolicy *policy); - FlexibleWorkGang* workers() const { return _workers; } + WorkGang* workers() const { return _workers; } GCStats* gc_stats(Generation* generation) const; diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/gc/shared/plab.cpp --- a/hotspot/src/share/vm/gc/shared/plab.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/gc/shared/plab.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -24,7 +24,7 @@ #include "precompiled.hpp" #include "gc/shared/collectedHeap.hpp" -#include "gc/shared/plab.hpp" +#include "gc/shared/plab.inline.hpp" #include "gc/shared/threadLocalAllocBuffer.hpp" #include "oops/arrayOop.hpp" #include "oops/oop.inline.hpp" diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/gc/shared/plab.hpp --- a/hotspot/src/share/vm/gc/shared/plab.hpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/gc/shared/plab.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -27,7 +27,6 @@ #include "gc/shared/gcUtil.hpp" #include "memory/allocation.hpp" -#include "runtime/atomic.hpp" #include "utilities/globalDefinitions.hpp" // Forward declarations. @@ -75,6 +74,8 @@ PLAB(size_t word_sz); virtual ~PLAB() {} + static size_t size_required_for_allocation(size_t word_size) { return word_size + AlignmentReserve; } + // Minimum PLAB size. static size_t min_size(); // Maximum PLAB size. 
@@ -95,7 +96,7 @@ } // Allocate the object aligned to "alignment_in_bytes". - HeapWord* allocate_aligned(size_t word_sz, unsigned short alignment_in_bytes); + inline HeapWord* allocate_aligned(size_t word_sz, unsigned short alignment_in_bytes); // Undo any allocation in the buffer, which is required to be of the // "obj" of the given "word_sz". @@ -108,13 +109,6 @@ size_t waste() { return _wasted; } size_t undo_waste() { return _undo_wasted; } - // Should only be done if we are about to reset with a new buffer of the - // given size. - void set_word_size(size_t new_word_sz) { - assert(new_word_sz > AlignmentReserve, "Too small"); - _word_sz = new_word_sz; - } - // The number of words of unallocated space remaining in the buffer. size_t words_remaining() { assert(_end >= _top, "Negative buffer"); @@ -126,7 +120,10 @@ } // Sets the space of the buffer to be [buf, space+word_sz()). - virtual void set_buf(HeapWord* buf) { + virtual void set_buf(HeapWord* buf, size_t new_word_sz) { + assert(new_word_sz > AlignmentReserve, "Too small"); + _word_sz = new_word_sz; + _bottom = buf; _top = _bottom; _hard_end = _bottom + word_sz(); @@ -149,7 +146,8 @@ }; // PLAB book-keeping. -class PLABStats VALUE_OBJ_CLASS_SPEC { +class PLABStats : public CHeapObj { + protected: size_t _allocated; // Total allocated size_t _wasted; // of which wasted (internal fragmentation) size_t _undo_wasted; // of which wasted on undo (is not used for calculation of PLAB size) @@ -158,7 +156,7 @@ AdaptiveWeightedAverage _filter; // Integrator with decay - void reset() { + virtual void reset() { _allocated = 0; _wasted = 0; _undo_wasted = 0; @@ -174,6 +172,8 @@ _filter(wt) { } + virtual ~PLABStats() { } + static const size_t min_size() { return PLAB::min_size(); } @@ -187,23 +187,15 @@ // Updates the current desired PLAB size. Computes the new desired PLAB size with one gc worker thread, // updates _desired_plab_sz and clears sensor accumulators. 
- void adjust_desired_plab_sz(); + virtual void adjust_desired_plab_sz(); - void add_allocated(size_t v) { - Atomic::add_ptr(v, &_allocated); - } + inline void add_allocated(size_t v); - void add_unused(size_t v) { - Atomic::add_ptr(v, &_unused); - } + inline void add_unused(size_t v); - void add_wasted(size_t v) { - Atomic::add_ptr(v, &_wasted); - } + inline void add_wasted(size_t v); - void add_undo_wasted(size_t v) { - Atomic::add_ptr(v, &_undo_wasted); - } + inline void add_undo_wasted(size_t v); }; #endif // SHARE_VM_GC_SHARED_PLAB_HPP diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/gc/shared/plab.inline.hpp --- a/hotspot/src/share/vm/gc/shared/plab.inline.hpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/gc/shared/plab.inline.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -27,9 +27,10 @@ #include "gc/shared/collectedHeap.inline.hpp" #include "gc/shared/plab.hpp" +#include "memory/allocation.inline.hpp" +#include "runtime/atomic.inline.hpp" -HeapWord* PLAB::allocate_aligned(size_t word_sz, unsigned short alignment_in_bytes) { - +inline HeapWord* PLAB::allocate_aligned(size_t word_sz, unsigned short alignment_in_bytes) { HeapWord* res = CollectedHeap::align_allocation_or_fail(_top, _end, alignment_in_bytes); if (res == NULL) { return NULL; @@ -41,4 +42,20 @@ return allocate(word_sz); } +void PLABStats::add_allocated(size_t v) { + Atomic::add_ptr(v, &_allocated); +} + +void PLABStats::add_unused(size_t v) { + Atomic::add_ptr(v, &_unused); +} + +void PLABStats::add_wasted(size_t v) { + Atomic::add_ptr(v, &_wasted); +} + +void PLABStats::add_undo_wasted(size_t v) { + Atomic::add_ptr(v, &_undo_wasted); +} + #endif // SHARE_VM_GC_SHARED_PLAB_INLINE_HPP diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/gc/shared/workgroup.cpp --- a/hotspot/src/share/vm/gc/shared/workgroup.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/gc/shared/workgroup.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -28,58 +28,25 @@ #include "memory/allocation.inline.hpp" #include "runtime/atomic.inline.hpp" #include "runtime/os.hpp" +#include "runtime/semaphore.hpp" +#include "runtime/thread.inline.hpp" // Definitions of WorkGang methods. -AbstractWorkGang::AbstractWorkGang(const char* name, - bool are_GC_task_threads, - bool are_ConcurrentGC_threads) : - _name(name), - _are_GC_task_threads(are_GC_task_threads), - _are_ConcurrentGC_threads(are_ConcurrentGC_threads) { - - assert(!(are_GC_task_threads && are_ConcurrentGC_threads), - "They cannot both be STW GC and Concurrent threads" ); - - // Other initialization. - _monitor = new Monitor(/* priority */ Mutex::leaf, - /* name */ "WorkGroup monitor", - /* allow_vm_block */ are_GC_task_threads, - Monitor::_safepoint_check_sometimes); - assert(monitor() != NULL, "Failed to allocate monitor"); - _task = NULL; - _sequence_number = 0; - _started_workers = 0; - _finished_workers = 0; -} - -WorkGang::WorkGang(const char* name, - uint workers, - bool are_GC_task_threads, - bool are_ConcurrentGC_threads) : - AbstractWorkGang(name, are_GC_task_threads, are_ConcurrentGC_threads) { - _total_workers = workers; -} - -GangWorker* WorkGang::allocate_worker(uint which) { - GangWorker* new_worker = new GangWorker(this, which); - return new_worker; -} - // The current implementation will exit if the allocation // of any worker fails. Still, return a boolean so that // a future implementation can possibly do a partial // initialization of the workers and report such to the // caller. 
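Illustrative sketch (not part of this changeset): the add_allocated()/add_wasted() helpers moved into plab.inline.hpp are plain atomic accumulators shared by all GC worker threads; the equivalent in portable C++ with std::atomic:

#include <atomic>
#include <cstddef>

// Per-collection PLAB statistics, updated concurrently by many worker threads,
// so every accumulator is bumped with an atomic add.
struct PlabStatsSketch {
  std::atomic<size_t> allocated{0};
  std::atomic<size_t> wasted{0};
  std::atomic<size_t> undo_wasted{0};
  std::atomic<size_t> unused{0};

  void add_allocated(size_t v)   { allocated.fetch_add(v, std::memory_order_relaxed); }
  void add_wasted(size_t v)      { wasted.fetch_add(v, std::memory_order_relaxed); }
  void add_undo_wasted(size_t v) { undo_wasted.fetch_add(v, std::memory_order_relaxed); }
  void add_unused(size_t v)      { unused.fetch_add(v, std::memory_order_relaxed); }
};
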
-bool WorkGang::initialize_workers() { +bool AbstractWorkGang::initialize_workers() { if (TraceWorkGang) { tty->print_cr("Constructing work gang %s with %d threads", name(), total_workers()); } - _gang_workers = NEW_C_HEAP_ARRAY(GangWorker*, total_workers(), mtInternal); - if (gang_workers() == NULL) { + _workers = NEW_C_HEAP_ARRAY(AbstractGangWorker*, total_workers(), mtInternal); + if (_workers == NULL) { vm_exit_out_of_memory(0, OOM_MALLOC_ERROR, "Cannot create GangWorker array."); return false; } @@ -90,9 +57,9 @@ worker_type = os::pgc_thread; } for (uint worker = 0; worker < total_workers(); worker += 1) { - GangWorker* new_worker = allocate_worker(worker); + AbstractGangWorker* new_worker = allocate_worker(worker); assert(new_worker != NULL, "Failed to allocate GangWorker"); - _gang_workers[worker] = new_worker; + _workers[worker] = new_worker; if (new_worker == NULL || !os::create_thread(new_worker, worker_type)) { vm_exit_out_of_memory(0, OOM_MALLOC_ERROR, "Cannot create worker GC thread. Out of system resources."); @@ -105,110 +72,208 @@ return true; } -GangWorker* AbstractWorkGang::gang_worker(uint i) const { +AbstractGangWorker* AbstractWorkGang::worker(uint i) const { // Array index bounds checking. - GangWorker* result = NULL; - assert(gang_workers() != NULL, "No workers for indexing"); + AbstractGangWorker* result = NULL; + assert(_workers != NULL, "No workers for indexing"); assert(i < total_workers(), "Worker index out of bounds"); - result = _gang_workers[i]; + result = _workers[i]; assert(result != NULL, "Indexing to null worker"); return result; } -void WorkGang::run_task(AbstractGangTask* task) { - run_task(task, total_workers()); -} - -void WorkGang::run_task(AbstractGangTask* task, uint no_of_parallel_workers) { - // This thread is executed by the VM thread which does not block - // on ordinary MutexLocker's. - MutexLockerEx ml(monitor(), Mutex::_no_safepoint_check_flag); - if (TraceWorkGang) { - tty->print_cr("Running work gang %s task %s", name(), task->name()); - } - // Tell all the workers to run a task. - assert(task != NULL, "Running a null task"); - // Initialize. - _task = task; - _sequence_number += 1; - _started_workers = 0; - _finished_workers = 0; - // Tell the workers to get to work. - monitor()->notify_all(); - // Wait for them to be finished - while (finished_workers() < no_of_parallel_workers) { - if (TraceWorkGang) { - tty->print_cr("Waiting in work gang %s: %u/%u finished sequence %d", - name(), finished_workers(), no_of_parallel_workers, - _sequence_number); - } - monitor()->wait(/* no_safepoint_check */ true); - } - _task = NULL; - if (TraceWorkGang) { - tty->print_cr("\nFinished work gang %s: %u/%u sequence %d", - name(), finished_workers(), no_of_parallel_workers, - _sequence_number); - Thread* me = Thread::current(); - tty->print_cr(" T: " PTR_FORMAT " VM_thread: %d", p2i(me), me->is_VM_thread()); - } -} - -void FlexibleWorkGang::run_task(AbstractGangTask* task) { - // If active_workers() is passed, _finished_workers - // must only be incremented for workers that find non_null - // work (as opposed to all those that just check that the - // task is not null). 
- WorkGang::run_task(task, (uint) active_workers()); -} - -void AbstractWorkGang::internal_worker_poll(WorkData* data) const { - assert(monitor()->owned_by_self(), "worker_poll is an internal method"); - assert(data != NULL, "worker data is null"); - data->set_task(task()); - data->set_sequence_number(sequence_number()); -} - -void AbstractWorkGang::internal_note_start() { - assert(monitor()->owned_by_self(), "note_finish is an internal method"); - _started_workers += 1; -} - -void AbstractWorkGang::internal_note_finish() { - assert(monitor()->owned_by_self(), "note_finish is an internal method"); - _finished_workers += 1; -} - void AbstractWorkGang::print_worker_threads_on(outputStream* st) const { - uint num_thr = total_workers(); - for (uint i = 0; i < num_thr; i++) { - gang_worker(i)->print_on(st); + uint workers = total_workers(); + for (uint i = 0; i < workers; i++) { + worker(i)->print_on(st); st->cr(); } } void AbstractWorkGang::threads_do(ThreadClosure* tc) const { assert(tc != NULL, "Null ThreadClosure"); - uint num_thr = total_workers(); - for (uint i = 0; i < num_thr; i++) { - tc->do_thread(gang_worker(i)); + uint workers = total_workers(); + for (uint i = 0; i < workers; i++) { + tc->do_thread(worker(i)); } } -// GangWorker methods. +// WorkGang dispatcher implemented with semaphores. +// +// Semaphores don't require the worker threads to re-claim the lock when they wake up. +// This helps lowering the latency when starting and stopping the worker threads. +class SemaphoreGangTaskDispatcher : public GangTaskDispatcher { + // The task currently being dispatched to the GangWorkers. + AbstractGangTask* _task; + + volatile uint _started; + volatile uint _not_finished; + + // Semaphore used to start the GangWorkers. + Semaphore* _start_semaphore; + // Semaphore used to notify the coordinator that all workers are done. + Semaphore* _end_semaphore; + +public: + SemaphoreGangTaskDispatcher() : + _task(NULL), + _started(0), + _not_finished(0), + _start_semaphore(new Semaphore()), + _end_semaphore(new Semaphore()) +{ } + + ~SemaphoreGangTaskDispatcher() { + delete _start_semaphore; + delete _end_semaphore; + } + + void coordinator_execute_on_workers(AbstractGangTask* task, uint num_workers) { + // No workers are allowed to read the state variables until they have been signaled. + _task = task; + _not_finished = num_workers; + + // Dispatch 'num_workers' number of tasks. + _start_semaphore->signal(num_workers); + + // Wait for the last worker to signal the coordinator. + _end_semaphore->wait(); + + // No workers are allowed to read the state variables after the coordinator has been signaled. + assert(_not_finished == 0, err_msg("%d not finished workers?", _not_finished)); + _task = NULL; + _started = 0; + + } + + WorkData worker_wait_for_task() { + // Wait for the coordinator to dispatch a task. + _start_semaphore->wait(); + + uint num_started = (uint) Atomic::add(1, (volatile jint*)&_started); + + // Subtract one to get a zero-indexed worker id. + uint worker_id = num_started - 1; + + return WorkData(_task, worker_id); + } + + void worker_done_with_task() { + // Mark that the worker is done with the task. + // The worker is not allowed to read the state variables after this line. + uint not_finished = (uint) Atomic::add(-1, (volatile jint*)&_not_finished); + + // The last worker signals to the coordinator that all work is completed. 
+ if (not_finished == 0) { + _end_semaphore->signal(); + } + } +}; + +class MutexGangTaskDispatcher : public GangTaskDispatcher { + AbstractGangTask* _task; + + volatile uint _started; + volatile uint _finished; + volatile uint _num_workers; + + Monitor* _monitor; -GangWorker::GangWorker(AbstractWorkGang* gang, uint id) { + public: + MutexGangTaskDispatcher() + : _task(NULL), + _monitor(new Monitor(Monitor::leaf, "WorkGang dispatcher lock", false, Monitor::_safepoint_check_never)), + _started(0), + _finished(0), + _num_workers(0) {} + + ~MutexGangTaskDispatcher() { + delete _monitor; + } + + void coordinator_execute_on_workers(AbstractGangTask* task, uint num_workers) { + MutexLockerEx ml(_monitor, Mutex::_no_safepoint_check_flag); + + _task = task; + _num_workers = num_workers; + + // Tell the workers to get to work. + _monitor->notify_all(); + + // Wait for them to finish. + while (_finished < _num_workers) { + _monitor->wait(/* no_safepoint_check */ true); + } + + _task = NULL; + _num_workers = 0; + _started = 0; + _finished = 0; + } + + WorkData worker_wait_for_task() { + MonitorLockerEx ml(_monitor, Mutex::_no_safepoint_check_flag); + + while (_num_workers == 0 || _started == _num_workers) { + _monitor->wait(/* no_safepoint_check */ true); + } + + _started++; + + // Subtract one to get a zero-indexed worker id. + uint worker_id = _started - 1; + + return WorkData(_task, worker_id); + } + + void worker_done_with_task() { + MonitorLockerEx ml(_monitor, Mutex::_no_safepoint_check_flag); + + _finished++; + + if (_finished == _num_workers) { + // This will wake up all workers and not only the coordinator. + _monitor->notify_all(); + } + } +}; + +static GangTaskDispatcher* create_dispatcher() { + if (UseSemaphoreGCThreadsSynchronization) { + return new SemaphoreGangTaskDispatcher(); + } + + return new MutexGangTaskDispatcher(); +} + +WorkGang::WorkGang(const char* name, + uint workers, + bool are_GC_task_threads, + bool are_ConcurrentGC_threads) : + AbstractWorkGang(name, workers, are_GC_task_threads, are_ConcurrentGC_threads), + _dispatcher(create_dispatcher()) +{ } + +AbstractGangWorker* WorkGang::allocate_worker(uint worker_id) { + return new GangWorker(this, worker_id); +} + +void WorkGang::run_task(AbstractGangTask* task) { + _dispatcher->coordinator_execute_on_workers(task, active_workers()); +} + +AbstractGangWorker::AbstractGangWorker(AbstractWorkGang* gang, uint id) { _gang = gang; set_id(id); set_name("%s#%d", gang->name(), id); } -void GangWorker::run() { +void AbstractGangWorker::run() { initialize(); loop(); } -void GangWorker::initialize() { +void AbstractGangWorker::initialize() { this->initialize_thread_local_storage(); this->record_stack_base_and_size(); this->initialize_named_thread(); @@ -224,112 +289,59 @@ " of a work gang"); } -void GangWorker::loop() { - int previous_sequence_number = 0; - Monitor* gang_monitor = gang()->monitor(); - for ( ; ; ) { - WorkData data; - int part; // Initialized below. - { - // Grab the gang mutex. - MutexLocker ml(gang_monitor); - // Wait for something to do. - // Polling outside the while { wait } avoids missed notifies - // in the outer loop. 
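Illustrative sketch (not part of this changeset): the dispatch protocol of SemaphoreGangTaskDispatcher, modelled with C++20 std::counting_semaphore. The coordinator releases the start semaphore once per dispatched worker and blocks on the end semaphore; workers claim zero-based ids with an atomic counter, and the last worker to finish releases the end semaphore.

#include <atomic>
#include <semaphore>
#include <utility>

struct TaskLike {
  virtual void work(unsigned worker_id) = 0;
  virtual ~TaskLike() = default;
};

class SemaphoreDispatcherSketch {
  TaskLike* _task = nullptr;
  std::atomic<unsigned> _started{0};
  std::atomic<unsigned> _not_finished{0};
  std::counting_semaphore<1024> _start{0};   // released once per dispatched worker
  std::binary_semaphore _end{0};             // released by the last finishing worker

public:
  // Coordinator: hand 'task' to 'num_workers' workers and block until all are done.
  void coordinator_execute_on_workers(TaskLike* task, unsigned num_workers) {
    _task = task;
    _not_finished.store(num_workers);
    _start.release(num_workers);
    _end.acquire();
    _task = nullptr;
    _started.store(0);
  }

  // Worker: block until a task is dispatched, then claim a zero-based worker id.
  std::pair<TaskLike*, unsigned> worker_wait_for_task() {
    _start.acquire();
    unsigned id = _started.fetch_add(1);     // fetch_add returns the previous value
    return {_task, id};
  }

  // Worker: the last one to finish wakes the coordinator.
  void worker_done_with_task() {
    if (_not_finished.fetch_sub(1) == 1) {
      _end.release();
    }
  }
};
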
- gang()->internal_worker_poll(&data); - if (TraceWorkGang) { - tty->print("Polled outside for work in gang %s worker %u", - gang()->name(), id()); - tty->print(" sequence: %d (prev: %d)", - data.sequence_number(), previous_sequence_number); - if (data.task() != NULL) { - tty->print(" task: %s", data.task()->name()); - } else { - tty->print(" task: NULL"); - } - tty->cr(); - } - for ( ; /* break */; ) { - // Check for new work. - if ((data.task() != NULL) && - (data.sequence_number() != previous_sequence_number)) { - if (gang()->needs_more_workers()) { - gang()->internal_note_start(); - gang_monitor->notify_all(); - part = gang()->started_workers() - 1; - break; - } - } - // Nothing to do. - gang_monitor->wait(/* no_safepoint_check */ true); - gang()->internal_worker_poll(&data); - if (TraceWorkGang) { - tty->print("Polled inside for work in gang %s worker %u", - gang()->name(), id()); - tty->print(" sequence: %d (prev: %d)", - data.sequence_number(), previous_sequence_number); - if (data.task() != NULL) { - tty->print(" task: %s", data.task()->name()); - } else { - tty->print(" task: NULL"); - } - tty->cr(); - } - } - // Drop gang mutex. - } - if (TraceWorkGang) { - tty->print("Work for work gang %s id %u task %s part %d", - gang()->name(), id(), data.task()->name(), part); - } - assert(data.task() != NULL, "Got null task"); - data.task()->work(part); - { - if (TraceWorkGang) { - tty->print("Finish for work gang %s id %u task %s part %d", - gang()->name(), id(), data.task()->name(), part); - } - // Grab the gang mutex. - MutexLocker ml(gang_monitor); - gang()->internal_note_finish(); - // Tell the gang you are done. - gang_monitor->notify_all(); - // Drop the gang mutex. - } - previous_sequence_number = data.sequence_number(); - } -} - -bool GangWorker::is_GC_task_thread() const { +bool AbstractGangWorker::is_GC_task_thread() const { return gang()->are_GC_task_threads(); } -bool GangWorker::is_ConcurrentGC_thread() const { +bool AbstractGangWorker::is_ConcurrentGC_thread() const { return gang()->are_ConcurrentGC_threads(); } -void GangWorker::print_on(outputStream* st) const { +void AbstractGangWorker::print_on(outputStream* st) const { st->print("\"%s\" ", name()); Thread::print_on(st); st->cr(); } -// Printing methods +WorkData GangWorker::wait_for_task() { + return gang()->dispatcher()->worker_wait_for_task(); +} -const char* AbstractWorkGang::name() const { - return _name; +void GangWorker::signal_task_done() { + gang()->dispatcher()->worker_done_with_task(); +} + +void GangWorker::print_task_started(WorkData data) { + if (TraceWorkGang) { + tty->print_cr("Running work gang %s task %s worker %u", name(), data._task->name(), data._worker_id); + } } -#ifndef PRODUCT - -const char* AbstractGangTask::name() const { - return _name; +void GangWorker::print_task_done(WorkData data) { + if (TraceWorkGang) { + tty->print_cr("\nFinished work gang %s task %s worker %u", name(), data._task->name(), data._worker_id); + Thread* me = Thread::current(); + tty->print_cr(" T: " PTR_FORMAT " VM_thread: %d", p2i(me), me->is_VM_thread()); + } } -#endif /* PRODUCT */ +void GangWorker::run_task(WorkData data) { + print_task_started(data); + + data._task->work(data._worker_id); + + print_task_done(data); +} -// FlexibleWorkGang +void GangWorker::loop() { + while (true) { + WorkData data = wait_for_task(); + run_task(data); + + signal_task_done(); + } +} // *** WorkGangBarrierSync diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/gc/shared/workgroup.hpp --- 
a/hotspot/src/share/vm/gc/shared/workgroup.hpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/gc/shared/workgroup.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -25,282 +25,111 @@ #ifndef SHARE_VM_GC_SHARED_WORKGROUP_HPP #define SHARE_VM_GC_SHARED_WORKGROUP_HPP -#include "gc/shared/taskqueue.hpp" -#include "runtime/thread.inline.hpp" +#include "memory/allocation.hpp" +#include "runtime/globals.hpp" +#include "runtime/thread.hpp" +#include "utilities/debug.hpp" +#include "utilities/globalDefinitions.hpp" // Task class hierarchy: // AbstractGangTask -// AbstractGangTaskWOopQueues // // Gang/Group class hierarchy: // AbstractWorkGang // WorkGang -// FlexibleWorkGang -// YieldingFlexibleWorkGang (defined in another file) +// YieldingFlexibleWorkGang (defined in another file) // // Worker class hierarchy: -// GangWorker (subclass of WorkerThread) +// AbstractGangWorker (subclass of WorkerThread) +// GangWorker // YieldingFlexibleGangWorker (defined in another file) // Forward declarations of classes defined here +class AbstractGangWorker; +class Semaphore; class WorkGang; -class GangWorker; -class YieldingFlexibleGangWorker; -class YieldingFlexibleGangTask; -class WorkData; -class AbstractWorkGang; // An abstract task to be worked on by a gang. // You subclass this to supply your own work() method class AbstractGangTask VALUE_OBJ_CLASS_SPEC { -public: + const char* _name; + + public: + AbstractGangTask(const char* name) : _name(name) {} + // The abstract work method. // The argument tells you which member of the gang you are. virtual void work(uint worker_id) = 0; // Debugging accessor for the name. - const char* name() const PRODUCT_RETURN_(return NULL;); - int counter() { return _counter; } - void set_counter(int value) { _counter = value; } - int *address_of_counter() { return &_counter; } - - // RTTI - NOT_PRODUCT(virtual bool is_YieldingFlexibleGang_task() const { - return false; - }) + const char* name() const { return _name; } +}; -private: - NOT_PRODUCT(const char* _name;) - // ??? Should a task have a priority associated with it? - // ??? Or can the run method adjust priority as needed? - int _counter; - -protected: - // Constructor and desctructor: only construct subclasses. - AbstractGangTask(const char* name) - { - NOT_PRODUCT(_name = name); - _counter = 0; - } - ~AbstractGangTask() { } - -public: +struct WorkData { + AbstractGangTask* _task; + uint _worker_id; + WorkData(AbstractGangTask* task, uint worker_id) : _task(task), _worker_id(worker_id) {} }; -class AbstractGangTaskWOopQueues : public AbstractGangTask { - OopTaskQueueSet* _queues; - ParallelTaskTerminator _terminator; +// Interface to handle the synchronization between the coordinator thread and the worker threads, +// when a task is dispatched out to the worker threads. +class GangTaskDispatcher : public CHeapObj { public: - AbstractGangTaskWOopQueues(const char* name, OopTaskQueueSet* queues, uint n_threads) : - AbstractGangTask(name), _queues(queues), _terminator(n_threads, _queues) {} - ParallelTaskTerminator* terminator() { return &_terminator; } - OopTaskQueueSet* queues() { return _queues; } + virtual ~GangTaskDispatcher() {} + + // Coordinator API. + + // Distributes the task out to num_workers workers. + // Returns when the task has been completed by all workers. + virtual void coordinator_execute_on_workers(AbstractGangTask* task, uint num_workers) = 0; + + // Worker API. + + // Waits for a task to become available to the worker. + // Returns when the worker has been assigned a task. 
+ virtual WorkData worker_wait_for_task() = 0; + + // Signal to the coordinator that the worker is done with the assigned task. + virtual void worker_done_with_task() = 0; }; +// The work gang is the collection of workers to execute tasks. +// The number of workers run for a task is "_active_workers" +// while "_total_workers" is the number of available of workers. +class AbstractWorkGang : public CHeapObj { + protected: + // The array of worker threads for this gang. + AbstractGangWorker** _workers; + // The count of the number of workers in the gang. + uint _total_workers; + // The currently active workers in this gang. + uint _active_workers; + // Printing support. + const char* _name; -// Class AbstractWorkGang: -// An abstract class representing a gang of workers. -// You subclass this to supply an implementation of run_task(). -class AbstractWorkGang: public CHeapObj { -protected: - // Work gangs are never deleted, so no need to cleanup. - ~AbstractWorkGang() { ShouldNotReachHere(); } -public: - // Constructor. - AbstractWorkGang(const char* name, bool are_GC_task_threads, - bool are_ConcurrentGC_threads); - // Run a task, returns when the task is done (or terminated). - virtual void run_task(AbstractGangTask* task) = 0; - // Return true if more workers should be applied to the task. - virtual bool needs_more_workers() const { return true; } -public: - // Debugging. - const char* name() const; -protected: + private: // Initialize only instance data. const bool _are_GC_task_threads; const bool _are_ConcurrentGC_threads; - // Printing support. - const char* _name; - // The monitor which protects these data, - // and notifies of changes in it. - Monitor* _monitor; - // The count of the number of workers in the gang. - uint _total_workers; - // The array of worker threads for this gang. - // This is only needed for cleaning up. - GangWorker** _gang_workers; - // The task for this gang. - AbstractGangTask* _task; - // A sequence number for the current task. - int _sequence_number; - // The number of started workers. - uint _started_workers; - // The number of finished workers. - uint _finished_workers; -public: - // Accessors for fields - Monitor* monitor() const { - return _monitor; - } - uint total_workers() const { - return _total_workers; - } - virtual uint active_workers() const { - return _total_workers; - } - GangWorker** gang_workers() const { - return _gang_workers; - } - AbstractGangTask* task() const { - return _task; - } - int sequence_number() const { - return _sequence_number; - } - uint started_workers() const { - return _started_workers; - } - uint finished_workers() const { - return _finished_workers; - } - bool are_GC_task_threads() const { - return _are_GC_task_threads; - } - bool are_ConcurrentGC_threads() const { - return _are_ConcurrentGC_threads; - } - // Predicates. - bool is_idle() const { - return (task() == NULL); - } - // Return the Ith gang worker. - GangWorker* gang_worker(uint i) const; - - void threads_do(ThreadClosure* tc) const; - - // Printing - void print_worker_threads_on(outputStream *st) const; - void print_worker_threads() const { - print_worker_threads_on(tty); - } - -protected: - friend class GangWorker; - friend class YieldingFlexibleGangWorker; - // Note activation and deactivation of workers. - // These methods should only be called with the mutex held. 
- void internal_worker_poll(WorkData* data) const; - void internal_note_start(); - void internal_note_finish(); -}; -class WorkData: public StackObj { - // This would be a struct, but I want accessor methods. -private: - AbstractGangTask* _task; - int _sequence_number; -public: - // Constructor and destructor - WorkData() { - _task = NULL; - _sequence_number = 0; - } - ~WorkData() { - } - AbstractGangTask* task() const { return _task; } - void set_task(AbstractGangTask* value) { _task = value; } - int sequence_number() const { return _sequence_number; } - void set_sequence_number(int value) { _sequence_number = value; } - - YieldingFlexibleGangTask* yf_task() const { - return (YieldingFlexibleGangTask*)_task; - } -}; - -// Class WorkGang: -class WorkGang: public AbstractWorkGang { -public: - // Constructor - WorkGang(const char* name, uint workers, - bool are_GC_task_threads, bool are_ConcurrentGC_threads); - // Run a task, returns when the task is done (or terminated). - virtual void run_task(AbstractGangTask* task); - void run_task(AbstractGangTask* task, uint no_of_parallel_workers); - // Allocate a worker and return a pointer to it. - virtual GangWorker* allocate_worker(uint which); - // Initialize workers in the gang. Return true if initialization - // succeeded. The type of the worker can be overridden in a derived - // class with the appropriate implementation of allocate_worker(). - bool initialize_workers(); -}; - -// Class GangWorker: -// Several instances of this class run in parallel as workers for a gang. -class GangWorker: public WorkerThread { -public: - // Constructors and destructor. - GangWorker(AbstractWorkGang* gang, uint id); + public: + AbstractWorkGang(const char* name, uint workers, bool are_GC_task_threads, bool are_ConcurrentGC_threads) : + _name(name), + _total_workers(workers), + _active_workers(UseDynamicNumberOfGCThreads ? 1U : workers), + _are_GC_task_threads(are_GC_task_threads), + _are_ConcurrentGC_threads(are_ConcurrentGC_threads) + { } - // The only real method: run a task for the gang. - virtual void run(); - // Predicate for Thread - virtual bool is_GC_task_thread() const; - virtual bool is_ConcurrentGC_thread() const; - // Printing - void print_on(outputStream* st) const; - virtual void print() const { print_on(tty); } -protected: - AbstractWorkGang* _gang; - - virtual void initialize(); - virtual void loop(); - -public: - AbstractWorkGang* gang() const { return _gang; } -}; + // Initialize workers in the gang. Return true if initialization succeeded. + bool initialize_workers(); -// Dynamic number of worker threads -// -// This type of work gang is used to run different numbers of -// worker threads at different times. The -// number of workers run for a task is "_active_workers" -// instead of "_total_workers" in a WorkGang. The method -// "needs_more_workers()" returns true until "_active_workers" -// have been started and returns false afterwards. The -// implementation of "needs_more_workers()" in WorkGang always -// returns true so that all workers are started. The method -// "loop()" in GangWorker was modified to ask "needs_more_workers()" -// in its loop to decide if it should start working on a task. -// A worker in "loop()" waits for notification on the WorkGang -// monitor and execution of each worker as it checks for work -// is serialized via the same monitor. 
The "needs_more_workers()" -// call is serialized and additionally the calculation for the -// "part" (effectively the worker id for executing the task) is -// serialized to give each worker a unique "part". Workers that -// are not needed for this tasks (i.e., "_active_workers" have -// been started before it, continue to wait for work. + bool are_GC_task_threads() const { return _are_GC_task_threads; } + bool are_ConcurrentGC_threads() const { return _are_ConcurrentGC_threads; } -class FlexibleWorkGang: public WorkGang { - // The currently active workers in this gang. - // This is a number that is dynamically adjusted - // and checked in the run_task() method at each invocation. - // As described above _active_workers determines the number - // of threads started on a task. It must also be used to - // determine completion. + uint total_workers() const { return _total_workers; } - protected: - uint _active_workers; - public: - // Constructor and destructor. - FlexibleWorkGang(const char* name, uint workers, - bool are_GC_task_threads, - bool are_ConcurrentGC_threads) : - WorkGang(name, workers, are_GC_task_threads, are_ConcurrentGC_threads), - _active_workers(UseDynamicNumberOfGCThreads ? 1U : workers) {} - - // Accessors for fields. virtual uint active_workers() const { assert(_active_workers <= _total_workers, err_msg("_active_workers: %u > _total_workers: %u", _active_workers, _total_workers)); @@ -317,10 +146,90 @@ assert(UseDynamicNumberOfGCThreads || _active_workers == _total_workers, "Unless dynamic should use total workers"); } + + // Return the Ith worker. + AbstractGangWorker* worker(uint i) const; + + void threads_do(ThreadClosure* tc) const; + + // Debugging. + const char* name() const { return _name; } + + // Printing + void print_worker_threads_on(outputStream *st) const; + void print_worker_threads() const { + print_worker_threads_on(tty); + } + + protected: + virtual AbstractGangWorker* allocate_worker(uint which) = 0; +}; + +// An class representing a gang of workers. +class WorkGang: public AbstractWorkGang { + // To get access to the GangTaskDispatcher instance. + friend class GangWorker; + + // Never deleted. + ~WorkGang(); + + GangTaskDispatcher* const _dispatcher; + GangTaskDispatcher* dispatcher() const { + return _dispatcher; + } + +public: + WorkGang(const char* name, + uint workers, + bool are_GC_task_threads, + bool are_ConcurrentGC_threads); + + // Run a task, returns when the task is done. virtual void run_task(AbstractGangTask* task); - virtual bool needs_more_workers() const { - return _started_workers < _active_workers; - } + +protected: + virtual AbstractGangWorker* allocate_worker(uint which); +}; + +// Several instances of this class run in parallel as workers for a gang. +class AbstractGangWorker: public WorkerThread { +public: + AbstractGangWorker(AbstractWorkGang* gang, uint id); + + // The only real method: run a task for the gang. 
+ virtual void run(); + // Predicate for Thread + virtual bool is_GC_task_thread() const; + virtual bool is_ConcurrentGC_thread() const; + // Printing + void print_on(outputStream* st) const; + virtual void print() const { print_on(tty); } + +protected: + AbstractWorkGang* _gang; + + virtual void initialize(); + virtual void loop() = 0; + + AbstractWorkGang* gang() const { return _gang; } +}; + +class GangWorker: public AbstractGangWorker { +public: + GangWorker(WorkGang* gang, uint id) : AbstractGangWorker(gang, id) {} + +protected: + virtual void loop(); + +private: + WorkData wait_for_task(); + void run_task(WorkData work); + void signal_task_done(); + + void print_task_started(WorkData data); + void print_task_done(WorkData data); + + WorkGang* gang() const { return (WorkGang*)_gang; } }; // A class that acts as a synchronisation barrier. Workers enter diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/interpreter/abstractInterpreter.hpp --- a/hotspot/src/share/vm/interpreter/abstractInterpreter.hpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/interpreter/abstractInterpreter.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -90,6 +90,10 @@ java_util_zip_CRC32_update, // implementation of java.util.zip.CRC32.update() java_util_zip_CRC32_updateBytes, // implementation of java.util.zip.CRC32.updateBytes() java_util_zip_CRC32_updateByteBuffer, // implementation of java.util.zip.CRC32.updateByteBuffer() + java_lang_Float_intBitsToFloat, // implementation of java.lang.Float.intBitsToFloat() + java_lang_Float_floatToRawIntBits, // implementation of java.lang.Float.floatToRawIntBits() + java_lang_Double_longBitsToDouble, // implementation of java.lang.Double.longBitsToDouble() + java_lang_Double_doubleToRawLongBits, // implementation of java.lang.Double.doubleToRawLongBits() number_of_method_entries, invalid = -1 }; diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/interpreter/interpreter.cpp --- a/hotspot/src/share/vm/interpreter/interpreter.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/interpreter/interpreter.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -234,7 +234,15 @@ case vmIntrinsics::_updateByteBufferCRC32 : return java_util_zip_CRC32_updateByteBuffer; } } -#endif + + switch(m->intrinsic_id()) { + case vmIntrinsics::_intBitsToFloat: return java_lang_Float_intBitsToFloat; + case vmIntrinsics::_floatToRawIntBits: return java_lang_Float_floatToRawIntBits; + case vmIntrinsics::_longBitsToDouble: return java_lang_Double_longBitsToDouble; + case vmIntrinsics::_doubleToRawLongBits: return java_lang_Double_doubleToRawLongBits; + } + +#endif // CC_INTERP // Native method? // Note: This test must come _before_ the test for intrinsic @@ -559,6 +567,25 @@ : // fall thru case Interpreter::java_util_zip_CRC32_updateByteBuffer : entry_point = generate_CRC32_updateBytes_entry(kind); break; +#if defined(TARGET_ARCH_x86) && !defined(_LP64) + // On x86_32 platforms, a special entry is generated for the following four methods. + // On other platforms the normal entry is used to enter these methods. 
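The four new method entries added above (Float.intBitsToFloat, Float.floatToRawIntBits, Double.longBitsToDouble, Double.doubleToRawLongBits) are pure bit reinterpretations, which is why a trivial special entry can stand in for the normal native call. As a rough illustration of the semantics only (standalone C++, not interpreter code):

    #include <cstdint>
    #include <cstring>

    // "Raw" bit reinterpretation between int/float and long/double; memcpy is
    // the portable way to express this in C++.
    static float int_bits_to_float(std::int32_t bits) {
      float f;
      std::memcpy(&f, &bits, sizeof f);
      return f;
    }

    static std::int64_t double_to_raw_long_bits(double d) {
      std::int64_t bits;
      std::memcpy(&bits, &d, sizeof bits);
      return bits;
    }

The "raw" variants do not canonicalize NaN patterns, so the reinterpretation is exact in both directions.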
+ case Interpreter::java_lang_Float_intBitsToFloat + : entry_point = generate_Float_intBitsToFloat_entry(); break; + case Interpreter::java_lang_Float_floatToRawIntBits + : entry_point = generate_Float_floatToRawIntBits_entry(); break; + case Interpreter::java_lang_Double_longBitsToDouble + : entry_point = generate_Double_longBitsToDouble_entry(); break; + case Interpreter::java_lang_Double_doubleToRawLongBits + : entry_point = generate_Double_doubleToRawLongBits_entry(); break; +#else + case Interpreter::java_lang_Float_intBitsToFloat: + case Interpreter::java_lang_Float_floatToRawIntBits: + case Interpreter::java_lang_Double_longBitsToDouble: + case Interpreter::java_lang_Double_doubleToRawLongBits: + entry_point = generate_native_entry(false); + break; +#endif // defined(TARGET_ARCH_x86) && !defined(_LP64) #endif // CC_INTERP default: fatal(err_msg("unexpected method kind: %d", kind)); diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/interpreter/templateInterpreter.cpp --- a/hotspot/src/share/vm/interpreter/templateInterpreter.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/interpreter/templateInterpreter.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -397,34 +397,39 @@ // all non-native method kinds method_entry(zerolocals) - method_entry(zerolocals_synchronized) - method_entry(empty) - method_entry(accessor) - method_entry(abstract) - method_entry(java_lang_math_sin ) - method_entry(java_lang_math_cos ) - method_entry(java_lang_math_tan ) - method_entry(java_lang_math_abs ) - method_entry(java_lang_math_sqrt ) - method_entry(java_lang_math_log ) - method_entry(java_lang_math_log10) - method_entry(java_lang_math_exp ) - method_entry(java_lang_math_pow ) - method_entry(java_lang_ref_reference_get) + method_entry(zerolocals_synchronized) + method_entry(empty) + method_entry(accessor) + method_entry(abstract) + method_entry(java_lang_math_sin ) + method_entry(java_lang_math_cos ) + method_entry(java_lang_math_tan ) + method_entry(java_lang_math_abs ) + method_entry(java_lang_math_sqrt ) + method_entry(java_lang_math_log ) + method_entry(java_lang_math_log10) + method_entry(java_lang_math_exp ) + method_entry(java_lang_math_pow ) + method_entry(java_lang_ref_reference_get) - if (UseCRC32Intrinsics) { - method_entry(java_util_zip_CRC32_update) - method_entry(java_util_zip_CRC32_updateBytes) - method_entry(java_util_zip_CRC32_updateByteBuffer) - } + if (UseCRC32Intrinsics) { + method_entry(java_util_zip_CRC32_update) + method_entry(java_util_zip_CRC32_updateBytes) + method_entry(java_util_zip_CRC32_updateByteBuffer) + } + + method_entry(java_lang_Float_intBitsToFloat); + method_entry(java_lang_Float_floatToRawIntBits); + method_entry(java_lang_Double_longBitsToDouble); + method_entry(java_lang_Double_doubleToRawLongBits); initialize_method_handle_entries(); // all native method kinds (must be one contiguous block) Interpreter::_native_entry_begin = Interpreter::code()->code_end(); method_entry(native) - method_entry(native_synchronized) - Interpreter::_native_entry_end = Interpreter::code()->code_end(); + method_entry(native_synchronized) + Interpreter::_native_entry_end = Interpreter::code()->code_end(); #undef method_entry diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/memory/metaspace.hpp --- a/hotspot/src/share/vm/memory/metaspace.hpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/memory/metaspace.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -254,7 +254,7 @@ // Debugging support void verify(); - static void print_compressed_class_space(outputStream* st, const 
char* requested_addr = 0); + static void print_compressed_class_space(outputStream* st, const char* requested_addr = 0) NOT_LP64({}); class AllocRecordClosure : public StackObj { public: diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/memory/universe.cpp --- a/hotspot/src/share/vm/memory/universe.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/memory/universe.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -77,7 +77,7 @@ #if INCLUDE_ALL_GCS #include "gc/cms/cmsCollectorPolicy.hpp" #include "gc/g1/g1CollectedHeap.inline.hpp" -#include "gc/g1/g1CollectorPolicy_ext.hpp" +#include "gc/g1/g1CollectorPolicy.hpp" #include "gc/parallel/parallelScavengeHeap.hpp" #include "gc/shared/adaptiveSizePolicy.hpp" #endif // INCLUDE_ALL_GCS @@ -694,13 +694,29 @@ return JNI_OK; } -template -jint Universe::create_heap() { +CollectedHeap* Universe::create_heap() { assert(_collectedHeap == NULL, "Heap already created"); - Policy* policy = new Policy(); - policy->initialize_all(); - _collectedHeap = new Heap(policy); - return _collectedHeap->initialize(); +#if !INCLUDE_ALL_GCS + if (UseParallelGC) { + fatal("UseParallelGC not supported in this VM."); + } else if (UseG1GC) { + fatal("UseG1GC not supported in this VM."); + } else if (UseConcMarkSweepGC) { + fatal("UseConcMarkSweepGC not supported in this VM."); +#else + if (UseParallelGC) { + return Universe::create_heap_with_policy(); + } else if (UseG1GC) { + return Universe::create_heap_with_policy(); + } else if (UseConcMarkSweepGC) { + return Universe::create_heap_with_policy(); +#endif + } else if (UseSerialGC) { + return Universe::create_heap_with_policy(); + } + + ShouldNotReachHere(); + return NULL; } // Choose the heap base address and oop encoding mode @@ -714,27 +730,12 @@ jint Universe::initialize_heap() { jint status = JNI_ERR; -#if !INCLUDE_ALL_GCS - if (UseParallelGC) { - fatal("UseParallelGC not supported in this VM."); - } else if (UseG1GC) { - fatal("UseG1GC not supported in this VM."); - } else if (UseConcMarkSweepGC) { - fatal("UseConcMarkSweepGC not supported in this VM."); -#else - if (UseParallelGC) { - status = Universe::create_heap(); - } else if (UseG1GC) { - status = Universe::create_heap(); - } else if (UseConcMarkSweepGC) { - status = Universe::create_heap(); -#endif - } else if (UseSerialGC) { - status = Universe::create_heap(); - } else { - ShouldNotReachHere(); + _collectedHeap = create_heap_ext(); + if (_collectedHeap == NULL) { + _collectedHeap = create_heap(); } + status = _collectedHeap->initialize(); if (status != JNI_OK) { return status; } diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/memory/universe.hpp --- a/hotspot/src/share/vm/memory/universe.hpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/memory/universe.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -214,7 +214,9 @@ static size_t _heap_capacity_at_last_gc; static size_t _heap_used_at_last_gc; - template static jint create_heap(); + template static CollectedHeap* create_heap_with_policy(); + static CollectedHeap* create_heap(); + static CollectedHeap* create_heap_ext(); static jint initialize_heap(); static void initialize_basic_type_mirrors(TRAPS); static void fixup_mirrors(TRAPS); diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/memory/universe.inline.hpp --- a/hotspot/src/share/vm/memory/universe.inline.hpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/memory/universe.inline.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -49,4 +49,11 @@ _allocation_context_notification_obj = obj; } +template +CollectedHeap* 
Universe::create_heap_with_policy() { + Policy* policy = new Policy(); + policy->initialize_all(); + return new Heap(policy); +} + #endif // SHARE_VM_MEMORY_UNIVERSE_INLINE_HPP diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/memory/universe_ext.cpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hotspot/src/share/vm/memory/universe_ext.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#include "precompiled.hpp" +#include "memory/universe.hpp" + +CollectedHeap* Universe::create_heap_ext() { + return NULL; +} diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/oops/symbol.cpp --- a/hotspot/src/share/vm/oops/symbol.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/oops/symbol.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -35,7 +35,7 @@ Symbol::Symbol(const u1* name, int length, int refcount) { _refcount = refcount; _length = length; - _identity_hash = os::random(); + _identity_hash = (short)os::random(); for (int i = 0; i < _length; i++) { byte_at_put(i, name[i]); } diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/oops/symbol.hpp --- a/hotspot/src/share/vm/oops/symbol.hpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/oops/symbol.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -106,23 +106,18 @@ #define PERM_REFCOUNT -1 #endif -// We separate the fields in SymbolBase from Symbol::_body so that -// Symbol::size(int) can correctly calculate the space needed. -class SymbolBase : public MetaspaceObj { - public: +class Symbol : public MetaspaceObj { + friend class VMStructs; + friend class SymbolTable; + friend class MoveSymbols; + + private: ATOMIC_SHORT_PAIR( volatile short _refcount, // needs atomic operation unsigned short _length // number of UTF8 characters in the symbol (does not need atomic op) ); - int _identity_hash; -}; - -class Symbol : private SymbolBase { - friend class VMStructs; - friend class SymbolTable; - friend class MoveSymbols; - private: - jbyte _body[1]; + short _identity_hash; + jbyte _body[2]; enum { // max_symbol_length is constrained by type of _length @@ -130,7 +125,7 @@ }; static int size(int length) { - size_t sz = heap_word_size(sizeof(SymbolBase) + (length > 0 ? length : 0)); + size_t sz = heap_word_size(sizeof(Symbol) + (length > 2 ? length - 2 : 0)); return align_object_size(sz); } @@ -154,8 +149,11 @@ // Returns the largest size symbol we can safely hold. 
static int max_length() { return max_symbol_length; } - - int identity_hash() { return _identity_hash; } + unsigned identity_hash() { + unsigned addr_bits = (unsigned)((uintptr_t)this >> (LogMinObjAlignmentInBytes + 3)); + return ((unsigned)_identity_hash & 0xffff) | + ((addr_bits ^ (_length << 8) ^ (( _body[0] << 8) | _body[1])) << 16); + } // For symbol table alternate hashing unsigned int new_hash(juint seed); diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/opto/arraycopynode.cpp --- a/hotspot/src/share/vm/opto/arraycopynode.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/opto/arraycopynode.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -626,3 +626,75 @@ return CallNode::may_modify_arraycopy_helper(dest_t, t_oop, phase); } + +bool ArrayCopyNode::may_modify_helper(const TypeOopPtr *t_oop, Node* n, PhaseTransform *phase) { + if (n->is_Proj()) { + n = n->in(0); + if (n->is_Call() && n->as_Call()->may_modify(t_oop, phase)) { + return true; + } + } + return false; +} + +bool ArrayCopyNode::may_modify(const TypeOopPtr *t_oop, MemBarNode* mb, PhaseTransform *phase) { + Node* mem = mb->in(TypeFunc::Memory); + + if (mem->is_MergeMem()) { + Node* n = mem->as_MergeMem()->memory_at(Compile::AliasIdxRaw); + if (may_modify_helper(t_oop, n, phase)) { + return true; + } else if (n->is_Phi()) { + for (uint i = 1; i < n->req(); i++) { + if (n->in(i) != NULL) { + if (may_modify_helper(t_oop, n->in(i), phase)) { + return true; + } + } + } + } + } + + return false; +} + +// Does this array copy modify offsets between offset_lo and offset_hi +// in the destination array +// if must_modify is false, return true if the copy could write +// between offset_lo and offset_hi +// if must_modify is true, return true if the copy is guaranteed to +// write between offset_lo and offset_hi +bool ArrayCopyNode::modifies(intptr_t offset_lo, intptr_t offset_hi, PhaseTransform* phase, bool must_modify) { + assert(_kind == ArrayCopy || _kind == CopyOf || _kind == CopyOfRange, "only for real array copies"); + + Node* dest = in(ArrayCopyNode::Dest); + Node* src_pos = in(ArrayCopyNode::SrcPos); + Node* dest_pos = in(ArrayCopyNode::DestPos); + Node* len = in(ArrayCopyNode::Length); + + const TypeInt *dest_pos_t = phase->type(dest_pos)->isa_int(); + const TypeInt *len_t = phase->type(len)->isa_int(); + const TypeAryPtr* ary_t = phase->type(dest)->isa_aryptr(); + + if (dest_pos_t != NULL && len_t != NULL && ary_t != NULL) { + BasicType ary_elem = ary_t->klass()->as_array_klass()->element_type()->basic_type(); + uint header = arrayOopDesc::base_offset_in_bytes(ary_elem); + uint elemsize = type2aelembytes(ary_elem); + + intptr_t dest_pos_plus_len_lo = (((intptr_t)dest_pos_t->_lo) + len_t->_lo) * elemsize + header; + intptr_t dest_pos_plus_len_hi = (((intptr_t)dest_pos_t->_hi) + len_t->_hi) * elemsize + header; + intptr_t dest_pos_lo = ((intptr_t)dest_pos_t->_lo) * elemsize + header; + intptr_t dest_pos_hi = ((intptr_t)dest_pos_t->_hi) * elemsize + header; + + if (must_modify) { + if (offset_lo >= dest_pos_hi && offset_hi < dest_pos_plus_len_lo) { + return true; + } + } else { + if (offset_hi >= dest_pos_lo && offset_lo < dest_pos_plus_len_hi) { + return true; + } + } + } + return false; +} diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/opto/arraycopynode.hpp --- a/hotspot/src/share/vm/opto/arraycopynode.hpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/opto/arraycopynode.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -108,6 +108,7 @@ BasicType copy_type, const Type* value_type, int count); bool 
finish_transform(PhaseGVN *phase, bool can_reshape, Node* ctl, Node *mem); + static bool may_modify_helper(const TypeOopPtr *t_oop, Node* n, PhaseTransform *phase); public: @@ -162,6 +163,9 @@ bool is_alloc_tightly_coupled() const { return _alloc_tightly_coupled; } + static bool may_modify(const TypeOopPtr *t_oop, MemBarNode* mb, PhaseTransform *phase); + bool modifies(intptr_t offset_lo, intptr_t offset_hi, PhaseTransform* phase, bool must_modify); + #ifndef PRODUCT virtual void dump_spec(outputStream *st) const; virtual void dump_compact_spec(outputStream* st) const; diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/opto/c2compiler.cpp --- a/hotspot/src/share/vm/opto/c2compiler.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/opto/c2compiler.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -161,7 +161,7 @@ vmIntrinsics::ID id = method->intrinsic_id(); assert(id != vmIntrinsics::_none, "must be a VM intrinsic"); - if (id < vmIntrinsics::FIRST_ID || id >= vmIntrinsics::LAST_COMPILER_INLINE) { + if (id < vmIntrinsics::FIRST_ID || id > vmIntrinsics::LAST_COMPILER_INLINE) { return false; } diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/opto/callnode.cpp --- a/hotspot/src/share/vm/opto/callnode.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/opto/callnode.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -742,7 +742,7 @@ // bool CallNode::may_modify(const TypeOopPtr *t_oop, PhaseTransform *phase) { assert((t_oop != NULL), "sanity"); - if (is_call_to_arraycopystub()) { + if (is_call_to_arraycopystub() && strcmp(_name, "unsafe_arraycopy") != 0) { const TypeTuple* args = _tf->domain(); Node* dest = NULL; // Stubs that can be called once an ArrayCopyNode is expanded have diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/opto/chaitin.cpp --- a/hotspot/src/share/vm/opto/chaitin.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/opto/chaitin.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -990,9 +990,13 @@ // FOUR registers! 
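The new ArrayCopyNode::modifies() above is essentially an interval test against the byte range the copy writes in the destination. A standalone model of that test follows; CopyBounds and its field names are invented, and the four bounds correspond to dest_pos_lo, dest_pos_hi, dest_pos_plus_len_lo and dest_pos_plus_len_hi in the patch:

    #include <cstdint>

    struct CopyBounds {
      std::intptr_t write_lo_min;   // dest_pos_lo:  start of the possibly-written region
      std::intptr_t write_lo_max;   // dest_pos_hi:  start of the certainly-written region
      std::intptr_t write_hi_min;   // dest_pos_plus_len_lo: end of the certainly-written region
      std::intptr_t write_hi_max;   // dest_pos_plus_len_hi: end of the possibly-written region
    };

    // must_modify == true : is [offset_lo, offset_hi] guaranteed to be overwritten?
    // must_modify == false: could the copy touch some byte in [offset_lo, offset_hi]?
    static bool modifies(std::intptr_t offset_lo, std::intptr_t offset_hi,
                         const CopyBounds& b, bool must_modify) {
      if (must_modify) {
        return offset_lo >= b.write_lo_max && offset_hi < b.write_hi_min;
      }
      return offset_hi >= b.write_lo_min && offset_lo < b.write_hi_max;
    }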
#ifdef ASSERT if (is_vect) { - assert(lrgmask.is_aligned_sets(lrg.num_regs()), "vector should be aligned"); - assert(!lrg._fat_proj, "sanity"); - assert(RegMask::num_registers(kreg) == lrg.num_regs(), "sanity"); + if (lrg.num_regs() != 0) { + assert(lrgmask.is_aligned_sets(lrg.num_regs()), "vector should be aligned"); + assert(!lrg._fat_proj, "sanity"); + assert(RegMask::num_registers(kreg) == lrg.num_regs(), "sanity"); + } else { + assert(n->is_Phi(), "not all inputs processed only if Phi"); + } } #endif if (!is_vect && lrg.num_regs() == 2 && !lrg._fat_proj && rm.is_misaligned_pair()) { diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/opto/compile.hpp --- a/hotspot/src/share/vm/opto/compile.hpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/opto/compile.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -93,7 +93,7 @@ public: void set_idx(node_idx_t idx) { - _idx_clone_orig = _idx_clone_orig & 0xFFFFFFFF00000000 | idx; + _idx_clone_orig = _idx_clone_orig & CONST64(0xFFFFFFFF00000000) | idx; } node_idx_t idx() const { return (node_idx_t)(_idx_clone_orig & 0xFFFFFFFF); } diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/opto/library_call.cpp --- a/hotspot/src/share/vm/opto/library_call.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/opto/library_call.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -2730,7 +2730,22 @@ load_store = _gvn.transform(new CompareAndSwapPNode(control(), mem, adr, newval, oldval)); } } - post_barrier(control(), load_store, base, adr, alias_idx, newval, T_OBJECT, true); + if (kind == LS_cmpxchg) { + // Emit the post barrier only when the actual store happened. + // This makes sense to check only for compareAndSet that can fail to set the value. + // CAS success path is marked more likely since we anticipate this is a performance + // critical path, while CAS failure path can use the penalty for going through unlikely + // path as backoff. Which is still better than doing a store barrier there. + IdealKit ideal(this); + ideal.if_then(load_store, BoolTest::ne, ideal.ConI(0), PROB_STATIC_FREQUENT); { + sync_kit(ideal); + post_barrier(ideal.ctrl(), load_store, base, adr, alias_idx, newval, T_OBJECT, true); + ideal.sync_kit(this); + } ideal.end_if(); + final_sync(ideal); + } else { + post_barrier(control(), load_store, base, adr, alias_idx, newval, T_OBJECT, true); + } break; default: fatal(err_msg_res("unexpected type %d: %s", type, type2name(type))); diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/opto/loopnode.cpp --- a/hotspot/src/share/vm/opto/loopnode.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/opto/loopnode.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -1175,7 +1175,7 @@ //============================================================================= //------------------------------is_member-------------------------------------- // Is 'l' a member of 'this'? -int IdealLoopTree::is_member( const IdealLoopTree *l ) const { +bool IdealLoopTree::is_member(const IdealLoopTree *l) const { while( l->_nest > _nest ) l = l->_parent; return l == this; } diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/opto/loopnode.hpp --- a/hotspot/src/share/vm/opto/loopnode.hpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/opto/loopnode.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -384,7 +384,7 @@ { } // Is 'l' a member of 'this'? - int is_member( const IdealLoopTree *l ) const; // Test for nested membership + bool is_member(const IdealLoopTree *l) const; // Test for nested membership // Set loop nesting depth. Accumulate has_call bits. 
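Regarding the library_call.cpp change above (post barrier emitted only when the compareAndSwap actually stored): a standalone sketch of the shape the intrinsic now produces, with cas_oop_with_barrier and post_barrier invented as placeholders:

    #include <atomic>

    static void post_barrier(void** field, void* new_value) {   // placeholder for the
      (void)field; (void)new_value;                              // card mark / G1 barrier
    }

    static bool cas_oop_with_barrier(std::atomic<void*>& field,
                                     void* expected, void* new_value) {
      bool ok = field.compare_exchange_strong(expected, new_value);
      if (ok) {                       // success path, assumed to be the likely one
        // Only a successful CAS publishes a new reference, so only then is the
        // post barrier needed; the failure path skips the store barrier entirely.
        post_barrier(reinterpret_cast<void**>(&field), new_value);
      }
      return ok;
    }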
int set_nest( uint depth ); @@ -1086,6 +1086,8 @@ bool split_up( Node *n, Node *blk1, Node *blk2 ); void sink_use( Node *use, Node *post_loop ); Node *place_near_use( Node *useblock ) const; + Node* try_move_store_before_loop(Node* n, Node *n_ctrl); + void try_move_store_after_loop(Node* n); bool _created_loop_node; public: diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/opto/loopopts.cpp --- a/hotspot/src/share/vm/opto/loopopts.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/opto/loopopts.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -653,6 +653,209 @@ return iff->in(1); } +#ifdef ASSERT +static void enqueue_cfg_uses(Node* m, Unique_Node_List& wq) { + for (DUIterator_Fast imax, i = m->fast_outs(imax); i < imax; i++) { + Node* u = m->fast_out(i); + if (u->is_CFG()) { + if (u->Opcode() == Op_NeverBranch) { + u = ((NeverBranchNode*)u)->proj_out(0); + enqueue_cfg_uses(u, wq); + } else { + wq.push(u); + } + } + } +} +#endif + +// Try moving a store out of a loop, right before the loop +Node* PhaseIdealLoop::try_move_store_before_loop(Node* n, Node *n_ctrl) { + // Store has to be first in the loop body + IdealLoopTree *n_loop = get_loop(n_ctrl); + if (n->is_Store() && n_loop != _ltree_root && n_loop->is_loop()) { + assert(n->in(0), "store should have control set"); + Node* address = n->in(MemNode::Address); + Node* value = n->in(MemNode::ValueIn); + Node* mem = n->in(MemNode::Memory); + IdealLoopTree* address_loop = get_loop(get_ctrl(address)); + IdealLoopTree* value_loop = get_loop(get_ctrl(value)); + + // - address and value must be loop invariant + // - memory must be a memory Phi for the loop + // - Store must be the only store on this memory slice in the + // loop: if there's another store following this one then value + // written at iteration i by the second store could be overwritten + // at iteration i+n by the first store: it's not safe to move the + // first store out of the loop + // - nothing must observe the Phi memory: it guarantees no read + // before the store and no early exit out of the loop + // With those conditions, we are also guaranteed the store post + // dominates the loop head. Otherwise there would be extra Phi + // involved between the loop's Phi and the store. + + if (!n_loop->is_member(address_loop) && + !n_loop->is_member(value_loop) && + mem->is_Phi() && mem->in(0) == n_loop->_head && + mem->outcnt() == 1 && + mem->in(LoopNode::LoopBackControl) == n) { + +#ifdef ASSERT + // Verify that store's control does post dominate loop entry and + // that there's no early exit of the loop before the store. + bool ctrl_ok = false; + { + // Follow control from loop head until n, we exit the loop or + // we reach the tail + ResourceMark rm; + Unique_Node_List wq; + wq.push(n_loop->_head); + assert(n_loop->_tail != NULL, "need a tail"); + for (uint next = 0; next < wq.size(); ++next) { + Node *m = wq.at(next); + if (m == n->in(0)) { + ctrl_ok = true; + continue; + } + assert(!has_ctrl(m), "should be CFG"); + if (!n_loop->is_member(get_loop(m)) || m == n_loop->_tail) { + ctrl_ok = false; + break; + } + enqueue_cfg_uses(m, wq); + } + } + assert(ctrl_ok, "bad control"); +#endif + + // move the Store + _igvn.replace_input_of(mem, LoopNode::LoopBackControl, mem); + _igvn.replace_input_of(n, 0, n_loop->_head->in(LoopNode::EntryControl)); + _igvn.replace_input_of(n, MemNode::Memory, mem->in(LoopNode::EntryControl)); + // Disconnect the phi now. An empty phi can confuse other + // optimizations in this pass of loop opts. 
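try_move_store_before_loop above hoists a loop-invariant store in front of the loop when its address and value are loop invariant and nothing in the loop observes the memory state (a sinking counterpart, try_move_store_after_loop, follows below). At the source level the hoisting case corresponds roughly to the following; Box, field and the explicit guard are invented, and in the C2 graph it is the placement of the moved store rather than a guard that ensures it only executes when the loop body would have run:

    // Source-level illustration only.
    struct Box { int field; };

    static void original(Box* b, int x, int n) {
      for (int i = 0; i < n; i++) {
        b->field = x;     // address and value are loop invariant; nothing in the
      }                   // loop reads the field, so the store can be hoisted
    }

    static void transformed(Box* b, int x, int n) {
      if (n > 0) {        // stands in for "the loop body executes at least once"
        b->field = x;
      }
      for (int i = 0; i < n; i++) {
        // loop body now free of the store
      }
    }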
+ _igvn.replace_node(mem, mem->in(LoopNode::EntryControl)); + n_loop->_body.yank(mem); + + IdealLoopTree* new_loop = get_loop(n->in(0)); + set_ctrl_and_loop(n, n->in(0)); + + return n; + } + } + return NULL; +} + +// Try moving a store out of a loop, right after the loop +void PhaseIdealLoop::try_move_store_after_loop(Node* n) { + if (n->is_Store()) { + assert(n->in(0), "store should have control set"); + Node *n_ctrl = get_ctrl(n); + IdealLoopTree *n_loop = get_loop(n_ctrl); + // Store must be in a loop + if (n_loop != _ltree_root && !n_loop->_irreducible) { + Node* address = n->in(MemNode::Address); + Node* value = n->in(MemNode::ValueIn); + IdealLoopTree* address_loop = get_loop(get_ctrl(address)); + // address must be loop invariant + if (!n_loop->is_member(address_loop)) { + // Store must be last on this memory slice in the loop and + // nothing in the loop must observe it + Node* phi = NULL; + for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) { + Node* u = n->fast_out(i); + if (has_ctrl(u)) { // control use? + IdealLoopTree *u_loop = get_loop(get_ctrl(u)); + if (!n_loop->is_member(u_loop)) { + continue; + } + if (u->is_Phi() && u->in(0) == n_loop->_head) { + assert(_igvn.type(u) == Type::MEMORY, "bad phi"); + assert(phi == NULL, "already found"); + phi = u; + continue; + } + } + phi = NULL; + break; + } + if (phi != NULL) { + // Nothing in the loop before the store (next iteration) + // must observe the stored value + bool mem_ok = true; + { + ResourceMark rm; + Unique_Node_List wq; + wq.push(phi); + for (uint next = 0; next < wq.size() && mem_ok; ++next) { + Node *m = wq.at(next); + for (DUIterator_Fast imax, i = m->fast_outs(imax); i < imax && mem_ok; i++) { + Node* u = m->fast_out(i); + if (u->is_Store() || u->is_Phi()) { + if (u != n) { + wq.push(u); + mem_ok = (wq.size() <= 10); + } + } else { + mem_ok = false; + break; + } + } + } + } + if (mem_ok) { + // Move the Store out of the loop creating clones along + // all paths out of the loop that observe the stored value + _igvn.rehash_node_delayed(phi); + int count = phi->replace_edge(n, n->in(MemNode::Memory)); + assert(count > 0, "inconsistent phi"); + for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) { + Node* u = n->fast_out(i); + Node* c = get_ctrl(u); + + if (u->is_Phi()) { + c = u->in(0)->in(u->find_edge(n)); + } + IdealLoopTree *u_loop = get_loop(c); + assert (!n_loop->is_member(u_loop), "only the phi should have been a use in the loop"); + while(true) { + Node* next_c = find_non_split_ctrl(idom(c)); + if (n_loop->is_member(get_loop(next_c))) { + break; + } + c = next_c; + } + + Node* st = n->clone(); + st->set_req(0, c); + _igvn.register_new_node_with_optimizer(st); + + set_ctrl(st, c); + IdealLoopTree* new_loop = get_loop(c); + assert(new_loop != n_loop, "should be moved out of loop"); + if (new_loop->_child == NULL) new_loop->_body.push(st); + + _igvn.replace_input_of(u, u->find_edge(n), st); + --imax; + --i; + } + + + assert(n->outcnt() == 0, "all uses should be gone"); + _igvn.replace_input_of(n, MemNode::Memory, C->top()); + // Disconnect the phi now. An empty phi can confuse other + // optimizations in this pass of loop opts.. + if (phi->in(LoopNode::LoopBackControl) == phi) { + _igvn.replace_node(phi, phi->in(LoopNode::EntryControl)); + n_loop->_body.yank(phi); + } + } + } + } + } + } +} + //------------------------------split_if_with_blocks_pre----------------------- // Do the real work in a non-recursive function. 
Data nodes want to be // cloned in the pre-order so they can feed each other nicely. @@ -683,6 +886,11 @@ Node *n_ctrl = get_ctrl(n); if( !n_ctrl ) return n; // Dead node + Node* res = try_move_store_before_loop(n, n_ctrl); + if (res != NULL) { + return n; + } + // Attempt to remix address expressions for loop invariants Node *m = remix_address_expressions( n ); if( m ) return m; @@ -691,16 +899,18 @@ // Returns the block to clone thru. Node *n_blk = has_local_phi_input( n ); if( !n_blk ) return n; + // Do not clone the trip counter through on a CountedLoop // (messes up the canonical shape). if( n_blk->is_CountedLoop() && n->Opcode() == Op_AddI ) return n; // Check for having no control input; not pinned. Allow // dominating control. - if( n->in(0) ) { + if (n->in(0)) { Node *dom = idom(n_blk); - if( dom_lca( n->in(0), dom ) != n->in(0) ) + if (dom_lca(n->in(0), dom) != n->in(0)) { return n; + } } // Policy: when is it profitable. You must get more wins than // policy before it is considered profitable. Policy is usually 0, @@ -1029,6 +1239,8 @@ } } + try_move_store_after_loop(n); + // Check for Opaque2's who's loop has disappeared - who's input is in the // same loop nest as their output. Remove 'em, they are no longer useful. if( n_op == Op_Opaque2 && diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/opto/macro.cpp --- a/hotspot/src/share/vm/opto/macro.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/opto/macro.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -324,18 +324,28 @@ return in; } else if (in->is_Call()) { CallNode *call = in->as_Call(); - if (!call->may_modify(tinst, phase)) { - mem = call->in(TypeFunc::Memory); + if (call->may_modify(tinst, phase)) { + assert(call->is_ArrayCopy(), "ArrayCopy is the only call node that doesn't make allocation escape"); + + if (call->as_ArrayCopy()->modifies(offset, offset, phase, false)) { + return in; + } } mem = in->in(TypeFunc::Memory); } else if (in->is_MemBar()) { + if (ArrayCopyNode::may_modify(tinst, in->as_MemBar(), phase)) { + assert(in->in(0)->is_Proj() && in->in(0)->in(0)->is_ArrayCopy(), "should be arraycopy"); + ArrayCopyNode* ac = in->in(0)->in(0)->as_ArrayCopy(); + assert(ac->is_clonebasic(), "Only basic clone is a non escaping clone"); + return ac; + } mem = in->in(TypeFunc::Memory); } else { assert(false, "unexpected projection"); } } else if (mem->is_Store()) { const TypePtr* atype = mem->as_Store()->adr_type(); - int adr_idx = Compile::current()->get_alias_index(atype); + int adr_idx = phase->C->get_alias_index(atype); if (adr_idx == alias_idx) { assert(atype->isa_oopptr(), "address type must be oopptr"); int adr_offset = atype->offset(); @@ -373,7 +383,7 @@ adr = mem->in(3); // Destination array } const TypePtr* atype = adr->bottom_type()->is_ptr(); - int adr_idx = Compile::current()->get_alias_index(atype); + int adr_idx = phase->C->get_alias_index(atype); if (adr_idx == alias_idx) { assert(false, "Object is not scalar replaceable if a LoadStore node access its field"); return NULL; @@ -386,12 +396,63 @@ } } +// Generate loads from source of the arraycopy for fields of +// destination needed at a deoptimization point +Node* PhaseMacroExpand::make_arraycopy_load(ArrayCopyNode* ac, intptr_t offset, Node* ctl, BasicType ft, const Type *ftype, AllocateNode *alloc) { + BasicType bt = ft; + const Type *type = ftype; + if (ft == T_NARROWOOP) { + bt = T_OBJECT; + type = ftype->make_oopptr(); + } + Node* res = NULL; + if (ac->is_clonebasic()) { + Node* base = ac->in(ArrayCopyNode::Src)->in(AddPNode::Base); + Node* adr = 
_igvn.transform(new AddPNode(base, base, MakeConX(offset))); + const TypePtr* adr_type = _igvn.type(base)->is_ptr()->add_offset(offset); + Node* m = ac->in(TypeFunc::Memory); + while (m->is_MergeMem()) { + m = m->as_MergeMem()->memory_at(C->get_alias_index(adr_type)); + if (m->is_Proj() && m->in(0)->is_MemBar()) { + m = m->in(0)->in(TypeFunc::Memory); + } + } + res = LoadNode::make(_igvn, ctl, m, adr, adr_type, type, bt, MemNode::unordered, LoadNode::Pinned); + } else { + if (ac->modifies(offset, offset, &_igvn, true)) { + assert(ac->in(ArrayCopyNode::Dest) == alloc->result_cast(), "arraycopy destination should be allocation's result"); + uint shift = exact_log2(type2aelembytes(bt)); + Node* diff = _igvn.transform(new SubINode(ac->in(ArrayCopyNode::SrcPos), ac->in(ArrayCopyNode::DestPos))); +#ifdef _LP64 + diff = _igvn.transform(new ConvI2LNode(diff)); +#endif + diff = _igvn.transform(new LShiftXNode(diff, intcon(shift))); + + Node* off = _igvn.transform(new AddXNode(MakeConX(offset), diff)); + Node* base = ac->in(ArrayCopyNode::Src); + Node* adr = _igvn.transform(new AddPNode(base, base, off)); + const TypePtr* adr_type = _igvn.type(base)->is_ptr()->add_offset(offset); + Node* m = ac->in(TypeFunc::Memory); + res = LoadNode::make(_igvn, ctl, m, adr, adr_type, type, bt, MemNode::unordered, LoadNode::Pinned); + } + } + if (res != NULL) { + res = _igvn.transform(res); + if (ftype->isa_narrowoop()) { + // PhaseMacroExpand::scalar_replacement adds DecodeN nodes + res = _igvn.transform(new EncodePNode(res, ftype)); + } + return res; + } + return NULL; +} + // // Given a Memory Phi, compute a value Phi containing the values from stores // on the input paths. -// Note: this function is recursive, its depth is limied by the "level" argument +// Note: this function is recursive, its depth is limited by the "level" argument // Returns the computed Phi, or NULL if it cannot compute it. -Node *PhaseMacroExpand::value_from_mem_phi(Node *mem, BasicType ft, const Type *phi_type, const TypeOopPtr *adr_t, Node *alloc, Node_Stack *value_phis, int level) { +Node *PhaseMacroExpand::value_from_mem_phi(Node *mem, BasicType ft, const Type *phi_type, const TypeOopPtr *adr_t, AllocateNode *alloc, Node_Stack *value_phis, int level) { assert(mem->is_Phi(), "sanity"); int alias_idx = C->get_alias_index(adr_t); int offset = adr_t->offset(); @@ -458,6 +519,12 @@ assert(val->in(0)->is_LoadStore() || val->in(0)->Opcode() == Op_EncodeISOArray, "sanity"); assert(false, "Object is not scalar replaceable if a LoadStore node access its field"); return NULL; + } else if (val->is_ArrayCopy()) { + Node* res = make_arraycopy_load(val->as_ArrayCopy(), offset, val->in(0), ft, phi_type, alloc); + if (res == NULL) { + return NULL; + } + values.at_put(j, res); } else { #ifdef ASSERT val->dump(); @@ -479,7 +546,7 @@ } // Search the last value stored into the object's field. 
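make_arraycopy_load() above answers "what value does destination offset X hold at this deoptimization point?" by loading from the copy's source instead. For the real-arraycopy case it only rewrites the offset by the element distance between SrcPos and DestPos; a standalone model of that arithmetic (names invented, offsets in bytes and already including the array header):

    #include <cstdint>

    static std::intptr_t source_offset_for(std::intptr_t dest_offset_bytes,
                                           std::int64_t src_pos,
                                           std::int64_t dest_pos,
                                           int log2_elem_size) {
      std::int64_t diff = src_pos - dest_pos;                       // distance in elements
      return dest_offset_bytes +
             static_cast<std::intptr_t>(diff << log2_elem_size);    // same slot in the source
    }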
-Node *PhaseMacroExpand::value_from_mem(Node *sfpt_mem, BasicType ft, const Type *ftype, const TypeOopPtr *adr_t, Node *alloc) { +Node *PhaseMacroExpand::value_from_mem(Node *sfpt_mem, Node *sfpt_ctl, BasicType ft, const Type *ftype, const TypeOopPtr *adr_t, AllocateNode *alloc) { assert(adr_t->is_known_instance_field(), "instance required"); int instance_id = adr_t->instance_id(); assert((uint)instance_id == alloc->_idx, "wrong allocation"); @@ -538,6 +605,8 @@ } else { done = true; } + } else if (mem->is_ArrayCopy()) { + done = true; } else { assert(false, "unexpected node"); } @@ -562,6 +631,13 @@ value_phis.pop(); } } + } else if (mem->is_ArrayCopy()) { + Node* ctl = mem->in(0); + if (sfpt_ctl->is_Proj() && sfpt_ctl->as_Proj()->is_uncommon_trap_proj(Deoptimization::Reason_none)) { + // pin the loads in the uncommon trap path + ctl = sfpt_ctl; + } + return make_arraycopy_load(mem->as_ArrayCopy(), offset, ctl, ft, ftype, alloc); } } // Something go wrong. @@ -738,6 +814,7 @@ while (safepoints.length() > 0) { SafePointNode* sfpt = safepoints.pop(); Node* mem = sfpt->memory(); + Node* ctl = sfpt->control(); assert(sfpt->jvms() != NULL, "missed JVMS"); // Fields of scalar objs are referenced only at the end // of regular debuginfo at the last (youngest) JVMS. @@ -789,7 +866,7 @@ const TypeOopPtr *field_addr_type = res_type->add_offset(offset)->isa_oopptr(); - Node *field_val = value_from_mem(mem, basic_elem_type, field_type, field_addr_type, alloc); + Node *field_val = value_from_mem(mem, ctl, basic_elem_type, field_type, field_addr_type, alloc); if (field_val == NULL) { // We weren't able to find a value for this field, // give up on eliminating this allocation. diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/opto/macro.hpp --- a/hotspot/src/share/vm/opto/macro.hpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/opto/macro.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -85,8 +85,8 @@ Node* length, const TypeFunc* slow_call_type, address slow_call_address); - Node *value_from_mem(Node *mem, BasicType ft, const Type *ftype, const TypeOopPtr *adr_t, Node *alloc); - Node *value_from_mem_phi(Node *mem, BasicType ft, const Type *ftype, const TypeOopPtr *adr_t, Node *alloc, Node_Stack *value_phis, int level); + Node *value_from_mem(Node *mem, Node *ctl, BasicType ft, const Type *ftype, const TypeOopPtr *adr_t, AllocateNode *alloc); + Node *value_from_mem_phi(Node *mem, BasicType ft, const Type *ftype, const TypeOopPtr *adr_t, AllocateNode *alloc, Node_Stack *value_phis, int level); bool eliminate_boxing_node(CallStaticJavaNode *boxing); bool eliminate_allocate_node(AllocateNode *alloc); @@ -200,6 +200,8 @@ Node* old_eden_top, Node* new_eden_top, Node* length); + Node* make_arraycopy_load(ArrayCopyNode* ac, intptr_t offset, Node* ctl, BasicType ft, const Type *ftype, AllocateNode *alloc); + public: PhaseMacroExpand(PhaseIterGVN &igvn) : Phase(Macro_Expand), _igvn(igvn), _has_locks(false) { _igvn.set_delay_transform(true); diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/opto/memnode.cpp --- a/hotspot/src/share/vm/opto/memnode.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/opto/memnode.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -108,37 +108,6 @@ #endif -static bool membar_for_arraycopy_helper(const TypeOopPtr *t_oop, Node* n, PhaseTransform *phase) { - if (n->is_Proj()) { - n = n->in(0); - if (n->is_Call() && n->as_Call()->may_modify(t_oop, phase)) { - return true; - } - } - return false; -} - -static bool membar_for_arraycopy(const TypeOopPtr *t_oop, MemBarNode* mb, 
PhaseTransform *phase) { - Node* mem = mb->in(TypeFunc::Memory); - - if (mem->is_MergeMem()) { - Node* n = mem->as_MergeMem()->memory_at(Compile::AliasIdxRaw); - if (membar_for_arraycopy_helper(t_oop, n, phase)) { - return true; - } else if (n->is_Phi()) { - for (uint i = 1; i < n->req(); i++) { - if (n->in(i) != NULL) { - if (membar_for_arraycopy_helper(t_oop, n->in(i), phase)) { - return true; - } - } - } - } - } - - return false; -} - Node *MemNode::optimize_simple_memory_chain(Node *mchain, const TypeOopPtr *t_oop, Node *load, PhaseGVN *phase) { assert((t_oop != NULL), "sanity"); bool is_instance = t_oop->is_known_instance_field(); @@ -183,7 +152,7 @@ } } } else if (proj_in->is_MemBar()) { - if (membar_for_arraycopy(t_oop, proj_in->as_MemBar(), phase)) { + if (ArrayCopyNode::may_modify(t_oop, proj_in->as_MemBar(), phase)) { break; } result = proj_in->in(TypeFunc::Memory); @@ -545,35 +514,12 @@ Node* dest = ac->in(ArrayCopyNode::Dest); if (dest == ld_base) { - Node* src_pos = ac->in(ArrayCopyNode::SrcPos); - Node* dest_pos = ac->in(ArrayCopyNode::DestPos); - Node* len = ac->in(ArrayCopyNode::Length); - - const TypeInt *dest_pos_t = phase->type(dest_pos)->isa_int(); const TypeX *ld_offs_t = phase->type(ld_offs)->isa_intptr_t(); - const TypeInt *len_t = phase->type(len)->isa_int(); - const TypeAryPtr* ary_t = phase->type(dest)->isa_aryptr(); - - if (dest_pos_t != NULL && ld_offs_t != NULL && len_t != NULL && ary_t != NULL) { - BasicType ary_elem = ary_t->klass()->as_array_klass()->element_type()->basic_type(); - uint header = arrayOopDesc::base_offset_in_bytes(ary_elem); - uint elemsize = type2aelembytes(ary_elem); - - intptr_t dest_pos_plus_len_lo = (((intptr_t)dest_pos_t->_lo) + len_t->_lo) * elemsize + header; - intptr_t dest_pos_plus_len_hi = (((intptr_t)dest_pos_t->_hi) + len_t->_hi) * elemsize + header; - intptr_t dest_pos_lo = ((intptr_t)dest_pos_t->_lo) * elemsize + header; - intptr_t dest_pos_hi = ((intptr_t)dest_pos_t->_hi) * elemsize + header; - - if (can_see_stored_value) { - if (ld_offs_t->_lo >= dest_pos_hi && ld_offs_t->_hi < dest_pos_plus_len_lo) { - return ac; - } - } else { - if (ld_offs_t->_hi < dest_pos_lo || ld_offs_t->_lo >= dest_pos_plus_len_hi) { - mem = ac->in(TypeFunc::Memory); - } - return ac; - } + if (ac->modifies(ld_offs_t->_lo, ld_offs_t->_hi, phase, can_see_stored_value)) { + return ac; + } + if (!can_see_stored_value) { + mem = ac->in(TypeFunc::Memory); } } } @@ -703,7 +649,7 @@ continue; // (a) advance through independent call memory } } else if (mem->is_Proj() && mem->in(0)->is_MemBar()) { - if (membar_for_arraycopy(addr_t, mem->in(0)->as_MemBar(), phase)) { + if (ArrayCopyNode::may_modify(addr_t, mem->in(0)->as_MemBar(), phase)) { break; } mem = mem->in(0)->in(TypeFunc::Memory); @@ -883,18 +829,17 @@ // Is the value loaded previously stored by an arraycopy? If so return // a load node that reads from the source array so we may be able to // optimize out the ArrayCopy node later. 
-Node* MemNode::can_see_arraycopy_value(Node* st, PhaseTransform* phase) const { +Node* LoadNode::can_see_arraycopy_value(Node* st, PhaseTransform* phase) const { Node* ld_adr = in(MemNode::Address); intptr_t ld_off = 0; AllocateNode* ld_alloc = AllocateNode::Ideal_allocation(ld_adr, phase, ld_off); Node* ac = find_previous_arraycopy(phase, ld_alloc, st, true); if (ac != NULL) { assert(ac->is_ArrayCopy(), "what kind of node can this be?"); - assert(is_Load(), "only for loads"); - + + Node* ld = clone(); if (ac->as_ArrayCopy()->is_clonebasic()) { assert(ld_alloc != NULL, "need an alloc"); - Node* ld = clone(); Node* addp = in(MemNode::Address)->clone(); assert(addp->is_AddP(), "address must be addp"); assert(addp->in(AddPNode::Base) == ac->in(ArrayCopyNode::Dest)->in(AddPNode::Base), "strange pattern"); @@ -906,9 +851,7 @@ assert(ld_alloc->in(0) != NULL, "alloc must have control"); ld->set_req(0, ld_alloc->in(0)); } - return ld; } else { - Node* ld = clone(); Node* addp = in(MemNode::Address)->clone(); assert(addp->in(AddPNode::Base) == addp->in(AddPNode::Address), "should be"); addp->set_req(AddPNode::Base, ac->in(ArrayCopyNode::Src)); @@ -933,8 +876,10 @@ assert(ac->in(0) != NULL, "alloc must have control"); ld->set_req(0, ac->in(0)); } - return ld; } + // load depends on the tests that validate the arraycopy + ld->as_Load()->_depends_only_on_test = Pinned; + return ld; } return NULL; } @@ -2426,40 +2371,47 @@ Node* mem = in(MemNode::Memory); Node* address = in(MemNode::Address); - // Back-to-back stores to same address? Fold em up. Generally // unsafe if I have intervening uses... Also disallowed for StoreCM // since they must follow each StoreP operation. Redundant StoreCMs // are eliminated just before matching in final_graph_reshape. - if (mem->is_Store() && mem->in(MemNode::Address)->eqv_uncast(address) && - mem->Opcode() != Op_StoreCM) { - // Looking at a dead closed cycle of memory? - assert(mem != mem->in(MemNode::Memory), "dead loop in StoreNode::Ideal"); - - assert(Opcode() == mem->Opcode() || - phase->C->get_alias_index(adr_type()) == Compile::AliasIdxRaw, - "no mismatched stores, except on raw memory"); - - if (mem->outcnt() == 1 && // check for intervening uses - mem->as_Store()->memory_size() <= this->memory_size()) { - // If anybody other than 'this' uses 'mem', we cannot fold 'mem' away. - // For example, 'mem' might be the final state at a conditional return. - // Or, 'mem' might be used by some node which is live at the same time - // 'this' is live, which might be unschedulable. So, require exactly - // ONE user, the 'this' store, until such time as we clone 'mem' for - // each of 'mem's uses (thus making the exactly-1-user-rule hold true). - if (can_reshape) { // (%%% is this an anachronism?) - set_req_X(MemNode::Memory, mem->in(MemNode::Memory), - phase->is_IterGVN()); - } else { - // It's OK to do this in the parser, since DU info is always accurate, - // and the parser always refers to nodes via SafePointNode maps. - set_req(MemNode::Memory, mem->in(MemNode::Memory)); + { + Node* st = mem; + // If Store 'st' has more than one use, we cannot fold 'st' away. + // For example, 'st' might be the final state at a conditional + // return. Or, 'st' might be used by some node which is live at + // the same time 'st' is live, which might be unschedulable. So, + // require exactly ONE user until such time as we clone 'mem' for + // each of 'mem's uses (thus making the exactly-1-user-rule hold + // true). 
+ while (st->is_Store() && st->outcnt() == 1 && st->Opcode() != Op_StoreCM) { + // Looking at a dead closed cycle of memory? + assert(st != st->in(MemNode::Memory), "dead loop in StoreNode::Ideal"); + assert(Opcode() == st->Opcode() || + st->Opcode() == Op_StoreVector || + Opcode() == Op_StoreVector || + phase->C->get_alias_index(adr_type()) == Compile::AliasIdxRaw || + (Opcode() == Op_StoreL && st->Opcode() == Op_StoreI), // expanded ClearArrayNode + err_msg_res("no mismatched stores, except on raw memory: %s %s", NodeClassNames[Opcode()], NodeClassNames[st->Opcode()])); + + if (st->in(MemNode::Address)->eqv_uncast(address) && + st->as_Store()->memory_size() <= this->memory_size()) { + Node* use = st->raw_out(0); + phase->igvn_rehash_node_delayed(use); + if (can_reshape) { + use->set_req_X(MemNode::Memory, st->in(MemNode::Memory), phase->is_IterGVN()); + } else { + // It's OK to do this in the parser, since DU info is always accurate, + // and the parser always refers to nodes via SafePointNode maps. + use->set_req(MemNode::Memory, st->in(MemNode::Memory)); + } + return this; } - return this; + st = st->in(MemNode::Memory); } } + // Capture an unaliased, unconditional, simple store into an initializer. // Or, if it is independent of the allocation, hoist it above the allocation. if (ReduceFieldZeroing && /*can_reshape &&*/ diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/opto/memnode.hpp --- a/hotspot/src/share/vm/opto/memnode.hpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/opto/memnode.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -126,7 +126,6 @@ // Can this node (load or store) accurately see a stored value in // the given memory state? (The state may or may not be in(Memory).) Node* can_see_stored_value(Node* st, PhaseTransform* phase) const; - Node* can_see_arraycopy_value(Node* st, PhaseTransform* phase) const; #ifndef PRODUCT static void dump_adr_type(const Node* mem, const TypePtr* adr_type, outputStream *st); @@ -252,6 +251,9 @@ protected: const Type* load_array_final_field(const TypeKlassPtr *tkls, ciKlass* klass) const; + + Node* can_see_arraycopy_value(Node* st, PhaseTransform* phase) const; + // depends_only_on_test is almost always true, and needs to be almost always // true to enable key hoisting & commoning optimizations. 
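The StoreNode::Ideal rewrite above generalizes back-to-back store elimination: instead of only inspecting the immediately preceding store, it walks up a chain of single-use stores to the same address and drops the earlier, dead one. At the source level this is ordinary dead-store removal, roughly (field names invented):

    // Illustration only.
    struct Obj { int f; };

    static void original(Obj* o) {
      o->f = 1;    // dead: overwritten below with no intervening read, and the
      o->f = 2;    // earlier store has exactly one memory use, so it can be folded away
    }

    static void transformed(Obj* o) {
      o->f = 2;
    }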
However, for the // special case of RawPtr loads from TLS top & end, and other loads performed by diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/prims/jvmtiRedefineClasses.cpp --- a/hotspot/src/share/vm/prims/jvmtiRedefineClasses.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/prims/jvmtiRedefineClasses.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -3771,7 +3771,7 @@ // Deoptimize all activations depending on marked nmethods Deoptimization::deoptimize_dependents(); - // Make the dependent methods not entrant (in VM_Deoptimize they are made zombies) + // Make the dependent methods not entrant CodeCache::make_marked_nmethods_not_entrant(); // From now on we know that the dependency information is complete diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/runtime/arguments.cpp --- a/hotspot/src/share/vm/runtime/arguments.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/runtime/arguments.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -81,8 +81,6 @@ bool Arguments::_has_profile = false; size_t Arguments::_conservative_max_heap_alignment = 0; size_t Arguments::_min_heap_size = 0; -uintx Arguments::_min_heap_free_ratio = 0; -uintx Arguments::_max_heap_free_ratio = 0; Arguments::Mode Arguments::_mode = _mixed; bool Arguments::_java_compiler = false; bool Arguments::_xdebug_mode = false; @@ -1614,11 +1612,9 @@ // unless the user actually sets these flags. if (FLAG_IS_DEFAULT(MinHeapFreeRatio)) { FLAG_SET_DEFAULT(MinHeapFreeRatio, 0); - _min_heap_free_ratio = MinHeapFreeRatio; } if (FLAG_IS_DEFAULT(MaxHeapFreeRatio)) { FLAG_SET_DEFAULT(MaxHeapFreeRatio, 100); - _max_heap_free_ratio = MaxHeapFreeRatio; } } @@ -3978,15 +3974,6 @@ return JNI_OK; } -// Any custom code post the 'CommandLineFlagConstraint::AfterErgo' constraint check -// can be done here. We pass a flag that specifies whether -// the check passed successfully -void Arguments::post_after_ergo_constraint_check(bool check_passed) { - // This does not set the flag itself, but stores the value in a safe place for later usage. 
- _min_heap_free_ratio = MinHeapFreeRatio; - _max_heap_free_ratio = MaxHeapFreeRatio; -} - int Arguments::PropertyList_count(SystemProperty* pl) { int count = 0; while(pl != NULL) { diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/runtime/arguments.hpp --- a/hotspot/src/share/vm/runtime/arguments.hpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/runtime/arguments.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -288,10 +288,6 @@ static uintx _min_heap_size; - // Used to store original flag values - static uintx _min_heap_free_ratio; - static uintx _max_heap_free_ratio; - // -Xrun arguments static AgentLibraryList _libraryList; static void add_init_library(const char* name, char* options) @@ -463,8 +459,6 @@ static jint apply_ergo(); // Adjusts the arguments after the OS have adjusted the arguments static jint adjust_after_os(); - // Set any arguments that need to be set after the 'CommandLineFlagConstraint::AfterErgo' constraint check - static void post_after_ergo_constraint_check(bool check_passed); static void set_gc_specific_flags(); static inline bool gc_selected(); // whether a gc has been selected @@ -526,10 +520,6 @@ static size_t min_heap_size() { return _min_heap_size; } static void set_min_heap_size(size_t v) { _min_heap_size = v; } - // Returns the original values of -XX:MinHeapFreeRatio and -XX:MaxHeapFreeRatio - static uintx min_heap_free_ratio() { return _min_heap_free_ratio; } - static uintx max_heap_free_ratio() { return _max_heap_free_ratio; } - // -Xrun static AgentLibrary* libraries() { return _libraryList.first(); } static bool init_libraries_at_startup() { return !_libraryList.is_empty(); } diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/runtime/commandLineFlagConstraintList.cpp --- a/hotspot/src/share/vm/runtime/commandLineFlagConstraintList.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/runtime/commandLineFlagConstraintList.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -45,8 +45,8 @@ _constraint=func; } - Flag::Error apply_bool(bool* value, bool verbose) { - return _constraint(verbose, value); + Flag::Error apply_bool(bool value, bool verbose) { + return _constraint(value, verbose); } }; @@ -61,8 +61,8 @@ _constraint=func; } - Flag::Error apply_int(int* value, bool verbose) { - return _constraint(verbose, value); + Flag::Error apply_int(int value, bool verbose) { + return _constraint(value, verbose); } }; @@ -77,8 +77,8 @@ _constraint=func; } - Flag::Error apply_intx(intx* value, bool verbose) { - return _constraint(verbose, value); + Flag::Error apply_intx(intx value, bool verbose) { + return _constraint(value, verbose); } }; @@ -93,8 +93,8 @@ _constraint=func; } - Flag::Error apply_uint(uint* value, bool verbose) { - return _constraint(verbose, value); + Flag::Error apply_uint(uint value, bool verbose) { + return _constraint(value, verbose); } }; @@ -109,8 +109,8 @@ _constraint=func; } - Flag::Error apply_uintx(uintx* value, bool verbose) { - return _constraint(verbose, value); + Flag::Error apply_uintx(uintx value, bool verbose) { + return _constraint(value, verbose); } }; @@ -125,8 +125,8 @@ _constraint=func; } - Flag::Error apply_uint64_t(uint64_t* value, bool verbose) { - return _constraint(verbose, value); + Flag::Error apply_uint64_t(uint64_t value, bool verbose) { + return _constraint(value, verbose); } }; @@ -141,8 +141,8 @@ _constraint=func; } - Flag::Error apply_size_t(size_t* value, bool verbose) { - return _constraint(verbose, value); + Flag::Error apply_size_t(size_t value, bool verbose) { + return _constraint(value, 
verbose); } }; @@ -157,8 +157,8 @@ _constraint=func; } - Flag::Error apply_double(double* value, bool verbose) { - return _constraint(verbose, value); + Flag::Error apply_double(double value, bool verbose) { + return _constraint(value, verbose); } }; @@ -226,7 +226,6 @@ // Check the ranges of all flags that have them or print them out and exit if requested void CommandLineFlagConstraintList::init(void) { - _constraints = new (ResourceObj::C_HEAP, mtInternal) GrowableArray(INITIAL_CONSTRAINTS_SIZE, true); emit_constraint_no(NULL RUNTIME_FLAGS(EMIT_CONSTRAINT_DEVELOPER_FLAG, @@ -306,40 +305,6 @@ // Check constraints for specific constraint type. bool CommandLineFlagConstraintList::check_constraints(CommandLineFlagConstraint::ConstraintType type) { -//#define PRINT_CONSTRAINTS_SIZES -#ifdef PRINT_CONSTRAINTS_SIZES - { - size_t size_constraints = sizeof(CommandLineFlagConstraintList); - for (int i=0; iname(); - Flag* flag = Flag::find_flag(name, strlen(name), true, true); - if (flag->is_bool()) { - size_constraints += sizeof(CommandLineFlagConstraintFunc_bool); - size_constraints += sizeof(CommandLineFlagConstraint*); - } else if (flag->is_intx()) { - size_constraints += sizeof(CommandLineFlagConstraintFunc_intx); - size_constraints += sizeof(CommandLineFlagConstraint*); - } else if (flag->is_uintx()) { - size_constraints += sizeof(CommandLineFlagConstraintFunc_uintx); - size_constraints += sizeof(CommandLineFlagConstraint*); - } else if (flag->is_uint64_t()) { - size_constraints += sizeof(CommandLineFlagConstraintFunc_uint64_t); - size_constraints += sizeof(CommandLineFlagConstraint*); - } else if (flag->is_size_t()) { - size_constraints += sizeof(CommandLineFlagConstraintFunc_size_t); - size_constraints += sizeof(CommandLineFlagConstraint*); - } else if (flag->is_double()) { - size_constraints += sizeof(CommandLineFlagConstraintFunc_double); - size_constraints += sizeof(CommandLineFlagConstraint*); - } - } - fprintf(stderr, "Size of %d constraints: " SIZE_FORMAT " bytes\n", - length(), size_constraints); - } -#endif // PRINT_CONSTRAINTS_SIZES - // Skip if we already checked. 
if (type < _validating_type) { return true; @@ -350,27 +315,36 @@ for (int i=0; itype()) continue; - const char*name = constraint->name(); + const char* name = constraint->name(); Flag* flag = Flag::find_flag(name, strlen(name), true, true); + // We must check for NULL here as lp64_product flags on 32 bit architecture + // can generate constraint check (despite that they are declared as constants), + // but they will not be returned by Flag::find_flag() if (flag != NULL) { if (flag->is_bool()) { bool value = flag->get_bool(); - if (constraint->apply_bool(&value, true) != Flag::SUCCESS) status = false; + if (constraint->apply_bool(value, true) != Flag::SUCCESS) status = false; + } else if (flag->is_int()) { + int value = flag->get_int(); + if (constraint->apply_int(value, true) != Flag::SUCCESS) status = false; + } else if (flag->is_uint()) { + uint value = flag->get_uint(); + if (constraint->apply_uint(value, true) != Flag::SUCCESS) status = false; } else if (flag->is_intx()) { intx value = flag->get_intx(); - if (constraint->apply_intx(&value, true) != Flag::SUCCESS) status = false; + if (constraint->apply_intx(value, true) != Flag::SUCCESS) status = false; } else if (flag->is_uintx()) { uintx value = flag->get_uintx(); - if (constraint->apply_uintx(&value, true) != Flag::SUCCESS) status = false; + if (constraint->apply_uintx(value, true) != Flag::SUCCESS) status = false; } else if (flag->is_uint64_t()) { uint64_t value = flag->get_uint64_t(); - if (constraint->apply_uint64_t(&value, true) != Flag::SUCCESS) status = false; + if (constraint->apply_uint64_t(value, true) != Flag::SUCCESS) status = false; } else if (flag->is_size_t()) { size_t value = flag->get_size_t(); - if (constraint->apply_size_t(&value, true) != Flag::SUCCESS) status = false; + if (constraint->apply_size_t(value, true) != Flag::SUCCESS) status = false; } else if (flag->is_double()) { double value = flag->get_double(); - if (constraint->apply_double(&value, true) != Flag::SUCCESS) status = false; + if (constraint->apply_double(value, true) != Flag::SUCCESS) status = false; } } } diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/runtime/commandLineFlagConstraintList.hpp --- a/hotspot/src/share/vm/runtime/commandLineFlagConstraintList.hpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/runtime/commandLineFlagConstraintList.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -39,14 +39,14 @@ * "runtime/commandLineFlagConstraintsRuntime.hpp" for the functions themselves. 
*/ -typedef Flag::Error (*CommandLineFlagConstraintFunc_bool)(bool verbose, bool* value); -typedef Flag::Error (*CommandLineFlagConstraintFunc_int)(bool verbose, int* value); -typedef Flag::Error (*CommandLineFlagConstraintFunc_intx)(bool verbose, intx* value); -typedef Flag::Error (*CommandLineFlagConstraintFunc_uint)(bool verbose, uint* value); -typedef Flag::Error (*CommandLineFlagConstraintFunc_uintx)(bool verbose, uintx* value); -typedef Flag::Error (*CommandLineFlagConstraintFunc_uint64_t)(bool verbose, uint64_t* value); -typedef Flag::Error (*CommandLineFlagConstraintFunc_size_t)(bool verbose, size_t* value); -typedef Flag::Error (*CommandLineFlagConstraintFunc_double)(bool verbose, double* value); +typedef Flag::Error (*CommandLineFlagConstraintFunc_bool)(bool value, bool verbose); +typedef Flag::Error (*CommandLineFlagConstraintFunc_int)(int value, bool verbose); +typedef Flag::Error (*CommandLineFlagConstraintFunc_intx)(intx value, bool verbose); +typedef Flag::Error (*CommandLineFlagConstraintFunc_uint)(uint value, bool verbose); +typedef Flag::Error (*CommandLineFlagConstraintFunc_uintx)(uintx value, bool verbose); +typedef Flag::Error (*CommandLineFlagConstraintFunc_uint64_t)(uint64_t value, bool verbose); +typedef Flag::Error (*CommandLineFlagConstraintFunc_size_t)(size_t value, bool verbose); +typedef Flag::Error (*CommandLineFlagConstraintFunc_double)(double value, bool verbose); class CommandLineFlagConstraint : public CHeapObj { public: @@ -70,14 +70,14 @@ ~CommandLineFlagConstraint() {}; const char* name() const { return _name; } ConstraintType type() const { return _validate_type; } - virtual Flag::Error apply_bool(bool* value, bool verbose = true) { ShouldNotReachHere(); return Flag::ERR_OTHER; }; - virtual Flag::Error apply_int(int* value, bool verbose = true) { ShouldNotReachHere(); return Flag::ERR_OTHER; }; - virtual Flag::Error apply_intx(intx* value, bool verbose = true) { ShouldNotReachHere(); return Flag::ERR_OTHER; }; - virtual Flag::Error apply_uint(uint* value, bool verbose = true) { ShouldNotReachHere(); return Flag::ERR_OTHER; }; - virtual Flag::Error apply_uintx(uintx* value, bool verbose = true) { ShouldNotReachHere(); return Flag::ERR_OTHER; }; - virtual Flag::Error apply_uint64_t(uint64_t* value, bool verbose = true) { ShouldNotReachHere(); return Flag::ERR_OTHER; }; - virtual Flag::Error apply_size_t(size_t* value, bool verbose = true) { ShouldNotReachHere(); return Flag::ERR_OTHER; }; - virtual Flag::Error apply_double(double* value, bool verbose = true) { ShouldNotReachHere(); return Flag::ERR_OTHER; }; + virtual Flag::Error apply_bool(bool value, bool verbose = true) { ShouldNotReachHere(); return Flag::ERR_OTHER; }; + virtual Flag::Error apply_int(int value, bool verbose = true) { ShouldNotReachHere(); return Flag::ERR_OTHER; }; + virtual Flag::Error apply_intx(intx value, bool verbose = true) { ShouldNotReachHere(); return Flag::ERR_OTHER; }; + virtual Flag::Error apply_uint(uint value, bool verbose = true) { ShouldNotReachHere(); return Flag::ERR_OTHER; }; + virtual Flag::Error apply_uintx(uintx value, bool verbose = true) { ShouldNotReachHere(); return Flag::ERR_OTHER; }; + virtual Flag::Error apply_uint64_t(uint64_t value, bool verbose = true) { ShouldNotReachHere(); return Flag::ERR_OTHER; }; + virtual Flag::Error apply_size_t(size_t value, bool verbose = true) { ShouldNotReachHere(); return Flag::ERR_OTHER; }; + virtual Flag::Error apply_double(double value, bool verbose = true) { ShouldNotReachHere(); return Flag::ERR_OTHER; }; }; class 
CommandLineFlagConstraintList : public AllStatic { diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/runtime/commandLineFlagConstraintsCompiler.cpp --- a/hotspot/src/share/vm/runtime/commandLineFlagConstraintsCompiler.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/runtime/commandLineFlagConstraintsCompiler.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -25,17 +25,16 @@ #include "precompiled.hpp" #include "runtime/arguments.hpp" #include "runtime/commandLineFlagConstraintsCompiler.hpp" +#include "runtime/commandLineFlagRangeList.hpp" #include "runtime/globals.hpp" #include "utilities/defaultStream.hpp" -Flag::Error AliasLevelConstraintFunc(bool verbose, intx* value) { - if ((*value <= 1) && (Arguments::mode() == Arguments::_comp)) { - if (verbose == true) { - jio_fprintf(defaultStream::error_stream(), - "AliasLevel (" INTX_FORMAT ") is not compatible " - "with -Xcomp \n", - *value); - } +Flag::Error AliasLevelConstraintFunc(intx value, bool verbose) { + if ((value <= 1) && (Arguments::mode() == Arguments::_comp)) { + CommandLineError::print(verbose, + "AliasLevel (" INTX_FORMAT ") is not " + "compatible with -Xcomp \n", + value); return Flag::VIOLATES_CONSTRAINT; } else { return Flag::SUCCESS; @@ -57,7 +56,7 @@ * 'TieredStopAtLevel = CompLevel_full_optimization' (the default value). As a result, * the minimum number of compiler threads is 2. */ -Flag::Error CICompilerCountConstraintFunc(bool verbose, intx* value) { +Flag::Error CICompilerCountConstraintFunc(intx value, bool verbose) { int min_number_of_compiler_threads = 0; #if !defined(COMPILER1) && !defined(COMPILER2) && !defined(SHARK) // case 1 @@ -75,12 +74,11 @@ // min_number_of_compiler_threads to exceed CI_COMPILER_COUNT. min_number_of_compiler_threads = MIN2(min_number_of_compiler_threads, CI_COMPILER_COUNT); - if (*value < (intx)min_number_of_compiler_threads) { - if (verbose == true) { - jio_fprintf(defaultStream::error_stream(), - "CICompilerCount=" INTX_FORMAT " must be at least %d \n", - *value, min_number_of_compiler_threads); - } + if (value < (intx)min_number_of_compiler_threads) { + CommandLineError::print(verbose, + "CICompilerCount (" INTX_FORMAT ") must be " + "at least %d \n", + value, min_number_of_compiler_threads); return Flag::VIOLATES_CONSTRAINT; } else { return Flag::SUCCESS; diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/runtime/commandLineFlagConstraintsCompiler.hpp --- a/hotspot/src/share/vm/runtime/commandLineFlagConstraintsCompiler.hpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/runtime/commandLineFlagConstraintsCompiler.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -34,8 +34,8 @@ * an appropriate error value. 
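// Illustrative sketch, not part of the changeset: the mechanical edit applied to every
// constraint function in this patch replaces the open-coded verbosity guard
//
//   if (verbose == true) {
//     jio_fprintf(defaultStream::error_stream(),
//                 "AliasLevel (" INTX_FORMAT ") is not compatible with -Xcomp \n", *value);
//   }
//
// with a single call that performs the same check internally and forwards the varargs:
//
//   CommandLineError::print(verbose,
//                           "AliasLevel (" INTX_FORMAT ") is not compatible with -Xcomp \n",
//                           value);
//
// CommandLineError::print is introduced later in this changeset, in
// commandLineFlagRangeList.cpp, as a thin wrapper around jio_vfprintf on the error stream.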
*/ -Flag::Error AliasLevelConstraintFunc(bool verbose, intx* value); +Flag::Error AliasLevelConstraintFunc(intx value, bool verbose); -Flag::Error CICompilerCountConstraintFunc(bool verbose, intx* value); +Flag::Error CICompilerCountConstraintFunc(intx value, bool verbose); #endif /* SHARE_VM_RUNTIME_COMMANDLINEFLAGCONSTRAINTSCOMPILER_HPP */ diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/runtime/commandLineFlagConstraintsGC.cpp --- a/hotspot/src/share/vm/runtime/commandLineFlagConstraintsGC.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/runtime/commandLineFlagConstraintsGC.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -25,6 +25,7 @@ #include "precompiled.hpp" #include "runtime/arguments.hpp" #include "runtime/commandLineFlagConstraintsGC.hpp" +#include "runtime/commandLineFlagRangeList.hpp" #include "runtime/globals.hpp" #include "utilities/defaultStream.hpp" @@ -41,97 +42,85 @@ #include "opto/c2_globals.hpp" #endif // COMPILER2 -static Flag::Error MinPLABSizeBounds(const char* name, bool verbose, size_t* value) { +static Flag::Error MinPLABSizeBounds(const char* name, size_t value, bool verbose) { #if INCLUDE_ALL_GCS - if ((UseConcMarkSweepGC || UseG1GC) && (*value < PLAB::min_size())) { - if (verbose == true) { - jio_fprintf(defaultStream::error_stream(), - "%s (" SIZE_FORMAT ") must be greater than " - "ergonomic PLAB minimum size (" SIZE_FORMAT ")\n", - name, *value, PLAB::min_size()); - } + if ((UseConcMarkSweepGC || UseG1GC) && (value < PLAB::min_size())) { + CommandLineError::print(verbose, + "%s (" SIZE_FORMAT ") must be " + "greater than or equal to ergonomic PLAB minimum size (" SIZE_FORMAT ")\n", + name, value, PLAB::min_size()); return Flag::VIOLATES_CONSTRAINT; } #endif // INCLUDE_ALL_GCS return Flag::SUCCESS; } -static Flag::Error MaxPLABSizeBounds(const char* name, bool verbose, size_t* value) { +static Flag::Error MaxPLABSizeBounds(const char* name, size_t value, bool verbose) { #if INCLUDE_ALL_GCS - if ((UseConcMarkSweepGC || UseG1GC) && (*value > PLAB::max_size())) { - if (verbose == true) { - jio_fprintf(defaultStream::error_stream(), - "%s (" SIZE_FORMAT ") must be less than " - "ergonomic PLAB maximum size (" SIZE_FORMAT ")\n", - name, *value, PLAB::max_size()); - } + if ((UseConcMarkSweepGC || UseG1GC) && (value > PLAB::max_size())) { + CommandLineError::print(verbose, + "%s (" SIZE_FORMAT ") must be " + "less than ergonomic PLAB maximum size (" SIZE_FORMAT ")\n", + name, value, PLAB::min_size()); return Flag::VIOLATES_CONSTRAINT; } #endif // INCLUDE_ALL_GCS return Flag::SUCCESS; } -static Flag::Error MinMaxPLABSizeBounds(const char* name, bool verbose, size_t* value) { - if (MinPLABSizeBounds(name, verbose, value) == Flag::SUCCESS) { - return MaxPLABSizeBounds(name, verbose, value); +static Flag::Error MinMaxPLABSizeBounds(const char* name, size_t value, bool verbose) { + if (MinPLABSizeBounds(name, value, verbose) == Flag::SUCCESS) { + return MaxPLABSizeBounds(name, value, verbose); } return Flag::VIOLATES_CONSTRAINT; } -Flag::Error YoungPLABSizeConstraintFunc(bool verbose, size_t* value) { - return MinMaxPLABSizeBounds("YoungPLABSize", verbose, value); +Flag::Error YoungPLABSizeConstraintFunc(size_t value, bool verbose) { + return MinMaxPLABSizeBounds("YoungPLABSize", value, verbose); } -Flag::Error MinHeapFreeRatioConstraintFunc(bool verbose, uintx* value) { - if (*value > MaxHeapFreeRatio) { - if (verbose == true) { - jio_fprintf(defaultStream::error_stream(), - "MinHeapFreeRatio (" UINTX_FORMAT ") must be less than or " - "equal to 
MaxHeapFreeRatio (" UINTX_FORMAT ")\n", - *value, MaxHeapFreeRatio); - } +Flag::Error MinHeapFreeRatioConstraintFunc(uintx value, bool verbose) { + if (value > MaxHeapFreeRatio) { + CommandLineError::print(verbose, + "MinHeapFreeRatio (" UINTX_FORMAT ") must be " + "less than or equal to MaxHeapFreeRatio (" UINTX_FORMAT ")\n", + value, MaxHeapFreeRatio); return Flag::VIOLATES_CONSTRAINT; } else { return Flag::SUCCESS; } } -Flag::Error MaxHeapFreeRatioConstraintFunc(bool verbose, uintx* value) { - if (*value < MinHeapFreeRatio) { - if (verbose == true) { - jio_fprintf(defaultStream::error_stream(), - "MaxHeapFreeRatio (" UINTX_FORMAT ") must be greater than or " - "equal to MinHeapFreeRatio (" UINTX_FORMAT ")\n", - *value, MinHeapFreeRatio); - } +Flag::Error MaxHeapFreeRatioConstraintFunc(uintx value, bool verbose) { + if (value < MinHeapFreeRatio) { + CommandLineError::print(verbose, + "MaxHeapFreeRatio (" UINTX_FORMAT ") must be " + "greater than or equal to MinHeapFreeRatio (" UINTX_FORMAT ")\n", + value, MinHeapFreeRatio); return Flag::VIOLATES_CONSTRAINT; } else { return Flag::SUCCESS; } } -Flag::Error MinMetaspaceFreeRatioConstraintFunc(bool verbose, uintx* value) { - if (*value > MaxMetaspaceFreeRatio) { - if (verbose == true) { - jio_fprintf(defaultStream::error_stream(), - "MinMetaspaceFreeRatio (" UINTX_FORMAT ") must be less than or " - "equal to MaxMetaspaceFreeRatio (" UINTX_FORMAT ")\n", - *value, MaxMetaspaceFreeRatio); - } +Flag::Error MinMetaspaceFreeRatioConstraintFunc(uintx value, bool verbose) { + if (value > MaxMetaspaceFreeRatio) { + CommandLineError::print(verbose, + "MinMetaspaceFreeRatio (" UINTX_FORMAT ") must be " + "less than or equal to MaxMetaspaceFreeRatio (" UINTX_FORMAT ")\n", + value, MaxMetaspaceFreeRatio); return Flag::VIOLATES_CONSTRAINT; } else { return Flag::SUCCESS; } } -Flag::Error MaxMetaspaceFreeRatioConstraintFunc(bool verbose, uintx* value) { - if (*value < MinMetaspaceFreeRatio) { - if (verbose == true) { - jio_fprintf(defaultStream::error_stream(), - "MaxMetaspaceFreeRatio (" UINTX_FORMAT ") must be greater than or " - "equal to MinMetaspaceFreeRatio (" UINTX_FORMAT ")\n", - *value, MinMetaspaceFreeRatio); - } +Flag::Error MaxMetaspaceFreeRatioConstraintFunc(uintx value, bool verbose) { + if (value < MinMetaspaceFreeRatio) { + CommandLineError::print(verbose, + "MaxMetaspaceFreeRatio (" UINTX_FORMAT ") must be " + "greater than or equal to MinMetaspaceFreeRatio (" UINTX_FORMAT ")\n", + value, MinMetaspaceFreeRatio); return Flag::VIOLATES_CONSTRAINT; } else { return Flag::SUCCESS; @@ -147,32 +136,28 @@ } \ } -Flag::Error InitialTenuringThresholdConstraintFunc(bool verbose, uintx* value) { - UseConcMarkSweepGCWorkaroundIfNeeded(*value, MaxTenuringThreshold); +Flag::Error InitialTenuringThresholdConstraintFunc(uintx value, bool verbose) { + UseConcMarkSweepGCWorkaroundIfNeeded(value, MaxTenuringThreshold); - if (*value > MaxTenuringThreshold) { - if (verbose == true) { - jio_fprintf(defaultStream::error_stream(), - "InitialTenuringThreshold (" UINTX_FORMAT ") must be less than or " - "equal to MaxTenuringThreshold (" UINTX_FORMAT ")\n", - *value, MaxTenuringThreshold); - } + if (value > MaxTenuringThreshold) { + CommandLineError::print(verbose, + "InitialTenuringThreshold (" UINTX_FORMAT ") must be " + "less than or equal to MaxTenuringThreshold (" UINTX_FORMAT ")\n", + value, MaxTenuringThreshold); return Flag::VIOLATES_CONSTRAINT; } else { return Flag::SUCCESS; } } -Flag::Error MaxTenuringThresholdConstraintFunc(bool verbose, uintx* value) { - 
UseConcMarkSweepGCWorkaroundIfNeeded(InitialTenuringThreshold, *value); +Flag::Error MaxTenuringThresholdConstraintFunc(uintx value, bool verbose) { + UseConcMarkSweepGCWorkaroundIfNeeded(InitialTenuringThreshold, value); - if (*value < InitialTenuringThreshold) { - if (verbose == true) { - jio_fprintf(defaultStream::error_stream(), - "MaxTenuringThreshold (" UINTX_FORMAT ") must be greater than or " - "equal to InitialTenuringThreshold (" UINTX_FORMAT ")\n", - *value, InitialTenuringThreshold); - } + if (value < InitialTenuringThreshold) { + CommandLineError::print(verbose, + "MaxTenuringThreshold (" UINTX_FORMAT ") must be " + "greater than or equal to InitialTenuringThreshold (" UINTX_FORMAT ")\n", + value, InitialTenuringThreshold); return Flag::VIOLATES_CONSTRAINT; } else { return Flag::SUCCESS; @@ -180,28 +165,24 @@ } #if INCLUDE_ALL_GCS -Flag::Error G1NewSizePercentConstraintFunc(bool verbose, uintx* value) { - if (*value > G1MaxNewSizePercent) { - if (verbose == true) { - jio_fprintf(defaultStream::error_stream(), - "G1NewSizePercent (" UINTX_FORMAT ") must be less than or " - "equal to G1MaxNewSizePercent (" UINTX_FORMAT ")\n", - *value, G1MaxNewSizePercent); - } +Flag::Error G1NewSizePercentConstraintFunc(uintx value, bool verbose) { + if (value > G1MaxNewSizePercent) { + CommandLineError::print(verbose, + "G1NewSizePercent (" UINTX_FORMAT ") must be " + "less than or equal to G1MaxNewSizePercent (" UINTX_FORMAT ")\n", + value, G1MaxNewSizePercent); return Flag::VIOLATES_CONSTRAINT; } else { return Flag::SUCCESS; } } -Flag::Error G1MaxNewSizePercentConstraintFunc(bool verbose, uintx* value) { - if (*value < G1NewSizePercent) { - if (verbose == true) { - jio_fprintf(defaultStream::error_stream(), - "G1MaxNewSizePercent (" UINTX_FORMAT ") must be greater than or " - "equal to G1NewSizePercent (" UINTX_FORMAT ")\n", - *value, G1NewSizePercent); - } +Flag::Error G1MaxNewSizePercentConstraintFunc(uintx value, bool verbose) { + if (value < G1NewSizePercent) { + CommandLineError::print(verbose, + "G1MaxNewSizePercent (" UINTX_FORMAT ") must be " + "greater than or equal to G1NewSizePercent (" UINTX_FORMAT ")\n", + value, G1NewSizePercent); return Flag::VIOLATES_CONSTRAINT; } else { return Flag::SUCCESS; @@ -210,65 +191,56 @@ #endif // INCLUDE_ALL_GCS -Flag::Error CMSOldPLABMinConstraintFunc(bool verbose, size_t* value) { - if (*value > CMSOldPLABMax) { - if (verbose == true) { - jio_fprintf(defaultStream::error_stream(), - "CMSOldPLABMin (" SIZE_FORMAT ") must be less than or " - "equal to CMSOldPLABMax (" SIZE_FORMAT ")\n", - *value, CMSOldPLABMax); - } +Flag::Error CMSOldPLABMinConstraintFunc(size_t value, bool verbose) { + if (value > CMSOldPLABMax) { + CommandLineError::print(verbose, + "CMSOldPLABMin (" SIZE_FORMAT ") must be " + "less than or equal to CMSOldPLABMax (" SIZE_FORMAT ")\n", + value, CMSOldPLABMax); return Flag::VIOLATES_CONSTRAINT; } else { return Flag::SUCCESS; } } -Flag::Error CMSPrecleanDenominatorConstraintFunc(bool verbose, uintx* value) { - if (*value <= CMSPrecleanNumerator) { - if (verbose == true) { - jio_fprintf(defaultStream::error_stream(), - "CMSPrecleanDenominator (" UINTX_FORMAT ") must be strickly greater than " - "CMSPrecleanNumerator (" UINTX_FORMAT ")\n", - *value, CMSPrecleanNumerator); - } +Flag::Error CMSPrecleanDenominatorConstraintFunc(uintx value, bool verbose) { + if (value <= CMSPrecleanNumerator) { + CommandLineError::print(verbose, + "CMSPrecleanDenominator (" UINTX_FORMAT ") must be " + "strickly greater than CMSPrecleanNumerator (" 
UINTX_FORMAT ")\n", + value, CMSPrecleanNumerator); return Flag::VIOLATES_CONSTRAINT; } else { return Flag::SUCCESS; } } -Flag::Error CMSPrecleanNumeratorConstraintFunc(bool verbose, uintx* value) { - if (*value > (CMSPrecleanDenominator - 1)) { - if (verbose == true) { - jio_fprintf(defaultStream::error_stream(), - "CMSPrecleanNumerator (" UINTX_FORMAT ") must be less than or " - "equal to CMSPrecleanDenominator - 1 (" UINTX_FORMAT ")\n", *value, - CMSPrecleanDenominator - 1); - } +Flag::Error CMSPrecleanNumeratorConstraintFunc(uintx value, bool verbose) { + if (value > (CMSPrecleanDenominator - 1)) { + CommandLineError::print(verbose, + "CMSPrecleanNumerator (" UINTX_FORMAT ") must be " + "less than or equal to CMSPrecleanDenominator - 1 (" UINTX_FORMAT ")\n", + value, CMSPrecleanDenominator - 1); return Flag::VIOLATES_CONSTRAINT; } else { return Flag::SUCCESS; } } -Flag::Error SurvivorAlignmentInBytesConstraintFunc(bool verbose, intx* value) { - if (*value != 0) { - if (!is_power_of_2(*value)) { - if (verbose == true) { - jio_fprintf(defaultStream::error_stream(), - "SurvivorAlignmentInBytes (" INTX_FORMAT ") must be power of 2\n", - *value); - } +Flag::Error SurvivorAlignmentInBytesConstraintFunc(intx value, bool verbose) { + if (value != 0) { + if (!is_power_of_2(value)) { + CommandLineError::print(verbose, + "SurvivorAlignmentInBytes (" INTX_FORMAT ") must be " + "power of 2\n", + value); return Flag::VIOLATES_CONSTRAINT; } - if (*value < ObjectAlignmentInBytes) { - if (verbose == true) { - jio_fprintf(defaultStream::error_stream(), - "SurvivorAlignmentInBytes (" INTX_FORMAT ") must be greater than or " - "equal to ObjectAlignmentInBytes (" INTX_FORMAT ")\n", - *value, ObjectAlignmentInBytes); - } + if (value < ObjectAlignmentInBytes) { + CommandLineError::print(verbose, + "SurvivorAlignmentInBytes (" INTX_FORMAT ") must be " + "greater than or equal to ObjectAlignmentInBytes (" INTX_FORMAT ")\n", + value, ObjectAlignmentInBytes); return Flag::VIOLATES_CONSTRAINT; } } diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/runtime/commandLineFlagConstraintsGC.hpp --- a/hotspot/src/share/vm/runtime/commandLineFlagConstraintsGC.hpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/runtime/commandLineFlagConstraintsGC.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -34,27 +34,27 @@ * an appropriate error value. 
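// Illustrative sketch, not part of the changeset: most of the GC constraints above come in
// symmetric pairs (MinHeapFreeRatio/MaxHeapFreeRatio, InitialTenuringThreshold/
// MaxTenuringThreshold, and so on). Each side re-checks the inequality against the other
// flag's current value because either flag may be the one being set on the command line.
// A hypothetical pair "MinFoo"/"MaxFoo" would follow the same shape (error reporting via
// CommandLineError::print omitted for brevity):
//
//   Flag::Error MinFooConstraintFunc(uintx value, bool verbose) {
//     return (value > MaxFoo) ? Flag::VIOLATES_CONSTRAINT : Flag::SUCCESS;
//   }
//   Flag::Error MaxFooConstraintFunc(uintx value, bool verbose) {
//     return (value < MinFoo) ? Flag::VIOLATES_CONSTRAINT : Flag::SUCCESS;
//   }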
*/ -Flag::Error YoungPLABSizeConstraintFunc(bool verbose, size_t* value); +Flag::Error YoungPLABSizeConstraintFunc(size_t value, bool verbose); -Flag::Error MinHeapFreeRatioConstraintFunc(bool verbose, uintx* value); -Flag::Error MaxHeapFreeRatioConstraintFunc(bool verbose, uintx* value); +Flag::Error MinHeapFreeRatioConstraintFunc(uintx value, bool verbose); +Flag::Error MaxHeapFreeRatioConstraintFunc(uintx value, bool verbose); -Flag::Error MinMetaspaceFreeRatioConstraintFunc(bool verbose, uintx* value); -Flag::Error MaxMetaspaceFreeRatioConstraintFunc(bool verbose, uintx* value); +Flag::Error MinMetaspaceFreeRatioConstraintFunc(uintx value, bool verbose); +Flag::Error MaxMetaspaceFreeRatioConstraintFunc(uintx value, bool verbose); -Flag::Error InitialTenuringThresholdConstraintFunc(bool verbose, uintx* value); -Flag::Error MaxTenuringThresholdConstraintFunc(bool verbose, uintx* value); +Flag::Error InitialTenuringThresholdConstraintFunc(uintx value, bool verbose); +Flag::Error MaxTenuringThresholdConstraintFunc(uintx value, bool verbose); #if INCLUDE_ALL_GCS -Flag::Error G1NewSizePercentConstraintFunc(bool verbose, uintx* value); -Flag::Error G1MaxNewSizePercentConstraintFunc(bool verbose, uintx* value); +Flag::Error G1NewSizePercentConstraintFunc(uintx value, bool verbose); +Flag::Error G1MaxNewSizePercentConstraintFunc(uintx value, bool verbose); #endif // INCLUDE_ALL_GCS -Flag::Error CMSOldPLABMinConstraintFunc(bool verbose, size_t* value); +Flag::Error CMSOldPLABMinConstraintFunc(size_t value, bool verbose); -Flag::Error CMSPrecleanDenominatorConstraintFunc(bool verbose, uintx* value); -Flag::Error CMSPrecleanNumeratorConstraintFunc(bool verbose, uintx* value); +Flag::Error CMSPrecleanDenominatorConstraintFunc(uintx value, bool verbose); +Flag::Error CMSPrecleanNumeratorConstraintFunc(uintx value, bool verbose); -Flag::Error SurvivorAlignmentInBytesConstraintFunc(bool verbose, intx* value); +Flag::Error SurvivorAlignmentInBytesConstraintFunc(intx value, bool verbose); #endif /* SHARE_VM_RUNTIME_COMMANDLINEFLAGCONSTRAINTSGC_HPP */ diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/runtime/commandLineFlagConstraintsRuntime.cpp --- a/hotspot/src/share/vm/runtime/commandLineFlagConstraintsRuntime.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/runtime/commandLineFlagConstraintsRuntime.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -25,25 +25,24 @@ #include "precompiled.hpp" #include "runtime/arguments.hpp" #include "runtime/commandLineFlagConstraintsRuntime.hpp" +#include "runtime/commandLineFlagRangeList.hpp" #include "runtime/globals.hpp" #include "utilities/defaultStream.hpp" -Flag::Error ObjectAlignmentInBytesConstraintFunc(bool verbose, intx* value) { - if (!is_power_of_2(*value)) { - if (verbose == true) { - jio_fprintf(defaultStream::error_stream(), - "ObjectAlignmentInBytes=" INTX_FORMAT " must be power of 2\n", - *value); - } +Flag::Error ObjectAlignmentInBytesConstraintFunc(intx value, bool verbose) { + if (!is_power_of_2(value)) { + CommandLineError::print(verbose, + "ObjectAlignmentInBytes (" INTX_FORMAT ") must be " + "power of 2\n", + value); return Flag::VIOLATES_CONSTRAINT; } // In case page size is very small. 
- if (*value >= (intx)os::vm_page_size()) { - if (verbose == true) { - jio_fprintf(defaultStream::error_stream(), - "ObjectAlignmentInBytes=" INTX_FORMAT " must be less than page size " INTX_FORMAT "\n", - *value, (intx)os::vm_page_size()); - } + if (value >= (intx)os::vm_page_size()) { + CommandLineError::print(verbose, + "ObjectAlignmentInBytes (" INTX_FORMAT ") must be " + "less than page size " INTX_FORMAT "\n", + value, (intx)os::vm_page_size()); return Flag::VIOLATES_CONSTRAINT; } return Flag::SUCCESS; @@ -51,13 +50,12 @@ // Need to enforce the padding not to break the existing field alignments. // It is sufficient to check against the largest type size. -Flag::Error ContendedPaddingWidthConstraintFunc(bool verbose, intx* value) { - if ((*value != 0) && ((*value % BytesPerLong) != 0)) { - if (verbose == true) { - jio_fprintf(defaultStream::error_stream(), - "ContendedPaddingWidth=" INTX_FORMAT " must be a multiple of %d\n", - *value, BytesPerLong); - } +Flag::Error ContendedPaddingWidthConstraintFunc(intx value, bool verbose) { + if ((value != 0) && ((value % BytesPerLong) != 0)) { + CommandLineError::print(verbose, + "ContendedPaddingWidth (" INTX_FORMAT ") must be " + "a multiple of %d\n", + value, BytesPerLong); return Flag::VIOLATES_CONSTRAINT; } else { return Flag::SUCCESS; diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/runtime/commandLineFlagConstraintsRuntime.hpp --- a/hotspot/src/share/vm/runtime/commandLineFlagConstraintsRuntime.hpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/runtime/commandLineFlagConstraintsRuntime.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -34,8 +34,8 @@ * an appropriate error value. */ -Flag::Error ObjectAlignmentInBytesConstraintFunc(bool verbose, intx* value); +Flag::Error ObjectAlignmentInBytesConstraintFunc(intx value, bool verbose); -Flag::Error ContendedPaddingWidthConstraintFunc(bool verbose, intx* value); +Flag::Error ContendedPaddingWidthConstraintFunc(intx value, bool verbose); #endif /* SHARE_VM_RUNTIME_COMMANDLINEFLAGCONSTRAINTSRUNTIME_HPP */ diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/runtime/commandLineFlagRangeList.cpp --- a/hotspot/src/share/vm/runtime/commandLineFlagRangeList.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/runtime/commandLineFlagRangeList.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -32,6 +32,15 @@ #include "utilities/defaultStream.hpp" #include "utilities/macros.hpp" +void CommandLineError::print(bool verbose, const char* msg, ...) { + if (verbose) { + va_list listPointer; + va_start(listPointer, msg); + jio_vfprintf(defaultStream::error_stream(), msg, listPointer); + va_end(listPointer); + } +} + class CommandLineFlagRange_int : public CommandLineFlagRange { int _min; int _max; @@ -44,11 +53,10 @@ Flag::Error check_int(int value, bool verbose = true) { if ((value < _min) || (value > _max)) { - if (verbose == true) { - jio_fprintf(defaultStream::error_stream(), - "int %s=%d is outside the allowed range [ %d ... %d ]\n", - name(), value, _min, _max); - } + CommandLineError::print(verbose, + "int %s=%d is outside the allowed range " + "[ %d ... %d ]\n", + name(), value, _min, _max); return Flag::OUT_OF_BOUNDS; } else { return Flag::SUCCESS; @@ -72,11 +80,10 @@ Flag::Error check_intx(intx value, bool verbose = true) { if ((value < _min) || (value > _max)) { - if (verbose == true) { - jio_fprintf(defaultStream::error_stream(), - "intx %s=" INTX_FORMAT " is outside the allowed range [ " INTX_FORMAT " ... 
" INTX_FORMAT " ]\n", - name(), value, _min, _max); - } + CommandLineError::print(verbose, + "intx %s=" INTX_FORMAT " is outside the allowed range " + "[ " INTX_FORMAT " ... " INTX_FORMAT " ]\n", + name(), value, _min, _max); return Flag::OUT_OF_BOUNDS; } else { return Flag::SUCCESS; @@ -100,11 +107,10 @@ Flag::Error check_uint(uint value, bool verbose = true) { if ((value < _min) || (value > _max)) { - if (verbose == true) { - jio_fprintf(defaultStream::error_stream(), - "uintx %s=%u is outside the allowed range [ %u ... %u ]\n", - name(), value, _min, _max); - } + CommandLineError::print(verbose, + "uint %s=%u is outside the allowed range " + "[ %u ... %u ]\n", + name(), value, _min, _max); return Flag::OUT_OF_BOUNDS; } else { return Flag::SUCCESS; @@ -128,11 +134,10 @@ Flag::Error check_uintx(uintx value, bool verbose = true) { if ((value < _min) || (value > _max)) { - if (verbose == true) { - jio_fprintf(defaultStream::error_stream(), - "uintx %s=" UINTX_FORMAT " is outside the allowed range [ " UINTX_FORMAT " ... " UINTX_FORMAT " ]\n", - name(), value, _min, _max); - } + CommandLineError::print(verbose, + "uintx %s=" UINTX_FORMAT " is outside the allowed range " + "[ " UINTX_FORMAT " ... " UINTX_FORMAT " ]\n", + name(), value, _min, _max); return Flag::OUT_OF_BOUNDS; } else { return Flag::SUCCESS; @@ -156,11 +161,10 @@ Flag::Error check_uint64_t(uint64_t value, bool verbose = true) { if ((value < _min) || (value > _max)) { - if (verbose == true) { - jio_fprintf(defaultStream::error_stream(), - "uint64_t %s=" UINT64_FORMAT " is outside the allowed range [ " UINT64_FORMAT " ... " UINT64_FORMAT " ]\n", - name(), value, _min, _max); - } + CommandLineError::print(verbose, + "uint64_t %s=" UINT64_FORMAT " is outside the allowed range " + "[ " UINT64_FORMAT " ... " UINT64_FORMAT " ]\n", + name(), value, _min, _max); return Flag::OUT_OF_BOUNDS; } else { return Flag::SUCCESS; @@ -184,11 +188,10 @@ Flag::Error check_size_t(size_t value, bool verbose = true) { if ((value < _min) || (value > _max)) { - if (verbose == true) { - jio_fprintf(defaultStream::error_stream(), - "size_t %s=" SIZE_FORMAT " is outside the allowed range [ " SIZE_FORMAT " ... " SIZE_FORMAT " ]\n", - name(), value, _min, _max); - } + CommandLineError::print(verbose, + "size_t %s=" SIZE_FORMAT " is outside the allowed range " + "[ " SIZE_FORMAT " ... " SIZE_FORMAT " ]\n", + name(), value, _min, _max); return Flag::OUT_OF_BOUNDS; } else { return Flag::SUCCESS; @@ -212,11 +215,10 @@ Flag::Error check_double(double value, bool verbose = true) { if ((value < _min) || (value > _max)) { - if (verbose == true) { - jio_fprintf(defaultStream::error_stream(), - "double %s=%f is outside the allowed range [ %f ... %f ]\n", - name(), value, _min, _max); - } + CommandLineError::print(verbose, + "double %s=%f is outside the allowed range " + "[ %f ... 
%f ]\n", + name(), value, _min, _max); return Flag::OUT_OF_BOUNDS; } else { return Flag::SUCCESS; @@ -300,48 +302,48 @@ EMIT_RANGES_FOR_GLOBALS_EXT emit_range_no(NULL ARCH_FLAGS(EMIT_RANGE_DEVELOPER_FLAG, - EMIT_RANGE_PRODUCT_FLAG, - EMIT_RANGE_DIAGNOSTIC_FLAG, - EMIT_RANGE_EXPERIMENTAL_FLAG, - EMIT_RANGE_NOTPRODUCT_FLAG, - EMIT_RANGE_CHECK, - IGNORE_CONSTRAINT)); + EMIT_RANGE_PRODUCT_FLAG, + EMIT_RANGE_DIAGNOSTIC_FLAG, + EMIT_RANGE_EXPERIMENTAL_FLAG, + EMIT_RANGE_NOTPRODUCT_FLAG, + EMIT_RANGE_CHECK, + IGNORE_CONSTRAINT)); #ifdef COMPILER1 emit_range_no(NULL C1_FLAGS(EMIT_RANGE_DEVELOPER_FLAG, - EMIT_RANGE_PD_DEVELOPER_FLAG, - EMIT_RANGE_PRODUCT_FLAG, - EMIT_RANGE_PD_PRODUCT_FLAG, - EMIT_RANGE_DIAGNOSTIC_FLAG, - EMIT_RANGE_NOTPRODUCT_FLAG, - EMIT_RANGE_CHECK, - IGNORE_CONSTRAINT)); + EMIT_RANGE_PD_DEVELOPER_FLAG, + EMIT_RANGE_PRODUCT_FLAG, + EMIT_RANGE_PD_PRODUCT_FLAG, + EMIT_RANGE_DIAGNOSTIC_FLAG, + EMIT_RANGE_NOTPRODUCT_FLAG, + EMIT_RANGE_CHECK, + IGNORE_CONSTRAINT)); #endif // COMPILER1 #ifdef COMPILER2 emit_range_no(NULL C2_FLAGS(EMIT_RANGE_DEVELOPER_FLAG, - EMIT_RANGE_PD_DEVELOPER_FLAG, - EMIT_RANGE_PRODUCT_FLAG, - EMIT_RANGE_PD_PRODUCT_FLAG, - EMIT_RANGE_DIAGNOSTIC_FLAG, - EMIT_RANGE_EXPERIMENTAL_FLAG, - EMIT_RANGE_NOTPRODUCT_FLAG, - EMIT_RANGE_CHECK, - IGNORE_CONSTRAINT)); + EMIT_RANGE_PD_DEVELOPER_FLAG, + EMIT_RANGE_PRODUCT_FLAG, + EMIT_RANGE_PD_PRODUCT_FLAG, + EMIT_RANGE_DIAGNOSTIC_FLAG, + EMIT_RANGE_EXPERIMENTAL_FLAG, + EMIT_RANGE_NOTPRODUCT_FLAG, + EMIT_RANGE_CHECK, + IGNORE_CONSTRAINT)); #endif // COMPILER2 #if INCLUDE_ALL_GCS emit_range_no(NULL G1_FLAGS(EMIT_RANGE_DEVELOPER_FLAG, - EMIT_RANGE_PD_DEVELOPER_FLAG, - EMIT_RANGE_PRODUCT_FLAG, - EMIT_RANGE_PD_PRODUCT_FLAG, - EMIT_RANGE_DIAGNOSTIC_FLAG, - EMIT_RANGE_EXPERIMENTAL_FLAG, - EMIT_RANGE_NOTPRODUCT_FLAG, - EMIT_RANGE_MANAGEABLE_FLAG, - EMIT_RANGE_PRODUCT_RW_FLAG, - EMIT_RANGE_CHECK, - IGNORE_CONSTRAINT)); + EMIT_RANGE_PD_DEVELOPER_FLAG, + EMIT_RANGE_PRODUCT_FLAG, + EMIT_RANGE_PD_PRODUCT_FLAG, + EMIT_RANGE_DIAGNOSTIC_FLAG, + EMIT_RANGE_EXPERIMENTAL_FLAG, + EMIT_RANGE_NOTPRODUCT_FLAG, + EMIT_RANGE_MANAGEABLE_FLAG, + EMIT_RANGE_PRODUCT_RW_FLAG, + EMIT_RANGE_CHECK, + IGNORE_CONSTRAINT)); #endif // INCLUDE_ALL_GCS } @@ -367,45 +369,23 @@ } bool CommandLineFlagRangeList::check_ranges() { -//#define PRINT_RANGES_SIZES -#ifdef PRINT_RANGES_SIZES - { - size_t size_ranges = sizeof(CommandLineFlagRangeList); - for (int i=0; iname(); - Flag* flag = Flag::find_flag(name, strlen(name), true, true); - if (flag->is_intx()) { - size_ranges += 2*sizeof(intx); - size_ranges += sizeof(CommandLineFlagRange*); - } else if (flag->is_uintx()) { - size_ranges += 2*sizeof(uintx); - size_ranges += sizeof(CommandLineFlagRange*); - } else if (flag->is_uint64_t()) { - size_ranges += 2*sizeof(uint64_t); - size_ranges += sizeof(CommandLineFlagRange*); - } else if (flag->is_size_t()) { - size_ranges += 2*sizeof(size_t); - size_ranges += sizeof(CommandLineFlagRange*); - } else if (flag->is_double()) { - size_ranges += 2*sizeof(double); - size_ranges += sizeof(CommandLineFlagRange*); - } - } - fprintf(stderr, "Size of %d ranges: " SIZE_FORMAT " bytes\n", - length(), size_ranges); - } -#endif // PRINT_RANGES_SIZES - // Check ranges. 
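// Illustrative sketch, not part of the changeset: each CommandLineFlagRange_* subclass above
// carries its own typed [min, max] pair and reports Flag::OUT_OF_BOUNDS when a value falls
// outside it. Checking one flag through the registered list looks like this; the flag name
// "MyIntxFlag" and the local variables are hypothetical:
//
//   intx proposed_value = 42;  // value about to be written
//   CommandLineFlagRange* range = CommandLineFlagRangeList::find("MyIntxFlag");
//   if (range != NULL) {
//     Flag::Error err = range->check_intx(proposed_value, true /* verbose */);
//     // err == Flag::OUT_OF_BOUNDS means the value was rejected and a message was printed
//   }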
bool status = true; for (int i=0; i<length(); i++) { CommandLineFlagRange* range = at(i); const char* name = range->name(); Flag* flag = Flag::find_flag(name, strlen(name), true, true); + // We must check for NULL here as lp64_product flags on 32 bit architecture + // can generate range check (despite that they are declared as constants), + // but they will not be returned by Flag::find_flag() if (flag != NULL) { - if (flag->is_intx()) { + if (flag->is_int()) { + int value = flag->get_int(); + if (range->check_int(value, true) != Flag::SUCCESS) status = false; + } else if (flag->is_uint()) { + uint value = flag->get_uint(); + if (range->check_uint(value, true) != Flag::SUCCESS) status = false; + } else if (flag->is_intx()) { intx value = flag->get_intx(); if (range->check_intx(value, true) != Flag::SUCCESS) status = false; } else if (flag->is_uintx()) { diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/runtime/commandLineFlagRangeList.hpp --- a/hotspot/src/share/vm/runtime/commandLineFlagRangeList.hpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/runtime/commandLineFlagRangeList.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -38,6 +38,11 @@ * then we need to use constraint instead. */ +class CommandLineError : public AllStatic { +public: + static void print(bool verbose, const char* msg, ...); +}; + class CommandLineFlagRange : public CHeapObj<mtInternal> { private: const char* _name; diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/runtime/globals.cpp --- a/hotspot/src/share/vm/runtime/globals.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/runtime/globals.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -310,13 +310,17 @@ void Flag::get_locked_message(char* buf, int buflen) const { buf[0] = '\0'; if (is_diagnostic() && !is_unlocked()) { - jio_snprintf(buf, buflen, "Error: VM option '%s' is diagnostic and must be enabled via -XX:+UnlockDiagnosticVMOptions.\n", - _name); + jio_snprintf(buf, buflen, + "Error: VM option '%s' is diagnostic and must be enabled via -XX:+UnlockDiagnosticVMOptions.\n" + "Error: The unlock option must precede '%s'.\n", + _name, _name); return; } if (is_experimental() && !is_unlocked()) { - jio_snprintf(buf, buflen, "Error: VM option '%s' is experimental and must be enabled via -XX:+UnlockExperimentalVMOptions.\n", - _name); + jio_snprintf(buf, buflen, + "Error: VM option '%s' is experimental and must be enabled via -XX:+UnlockExperimentalVMOptions.\n" + "Error: The unlock option must precede '%s'.\n", + _name, _name); return; } if (is_develop() && is_product_build()) { @@ -515,6 +519,20 @@ } } +const char* Flag::flag_error_str(Flag::Error error) { + switch (error) { + case Flag::MISSING_NAME: return "MISSING_NAME"; + case Flag::MISSING_VALUE: return "MISSING_VALUE"; + case Flag::NON_WRITABLE: return "NON_WRITABLE"; + case Flag::OUT_OF_BOUNDS: return "OUT_OF_BOUNDS"; + case Flag::VIOLATES_CONSTRAINT: return "VIOLATES_CONSTRAINT"; + case Flag::INVALID_FLAG: return "INVALID_FLAG"; + case Flag::ERR_OTHER: return "ERR_OTHER"; + case Flag::SUCCESS: return "SUCCESS"; + default: ShouldNotReachHere(); return "NULL"; + } +} + // 4991491 do not "optimize out" the was_set false values: omitting them // tickles a Microsoft compiler bug causing flagTable to be malformed @@ -758,17 +776,7 @@ e.commit(); } -static Flag::Error get_status_error(Flag::Error status_range, Flag::Error status_constraint) { - if (status_range != Flag::SUCCESS) { - return status_range; - } else if (status_constraint != Flag::SUCCESS) { - return status_constraint; - } else { - return Flag::SUCCESS; - } -} - -static Flag::Error apply_constraint_and_check_range_bool(const
char* name, bool* new_value, bool verbose = true) { +static Flag::Error apply_constraint_and_check_range_bool(const char* name, bool new_value, bool verbose = true) { Flag::Error status = Flag::SUCCESS; CommandLineFlagConstraint* constraint = CommandLineFlagConstraintList::find_if_needs_check(name); if (constraint != NULL) { @@ -789,7 +797,7 @@ Flag* result = Flag::find_flag(name, len); if (result == NULL) return Flag::INVALID_FLAG; if (!result->is_bool()) return Flag::WRONG_FORMAT; - Flag::Error check = apply_constraint_and_check_range_bool(name, value, !CommandLineFlagConstraintList::validated_after_ergo()); + Flag::Error check = apply_constraint_and_check_range_bool(name, *value, !CommandLineFlagConstraintList::validated_after_ergo()); if (check != Flag::SUCCESS) return check; bool old_value = result->get_bool(); trace_flag_changed(name, old_value, *value, origin); @@ -802,7 +810,7 @@ Flag::Error CommandLineFlagsEx::boolAtPut(CommandLineFlagWithType flag, bool value, Flag::Flags origin) { Flag* faddr = address_of_flag(flag); guarantee(faddr != NULL && faddr->is_bool(), "wrong flag type"); - Flag::Error check = apply_constraint_and_check_range_bool(faddr->_name, &value); + Flag::Error check = apply_constraint_and_check_range_bool(faddr->_name, value); if (check != Flag::SUCCESS) return check; trace_flag_changed(faddr->_name, faddr->get_bool(), value, origin); faddr->set_bool(value); @@ -810,18 +818,19 @@ return Flag::SUCCESS; } -static Flag::Error apply_constraint_and_check_range_int(const char* name, int* new_value, bool verbose = true) { - Flag::Error range_status = Flag::SUCCESS; +static Flag::Error apply_constraint_and_check_range_int(const char* name, int new_value, bool verbose = true) { + Flag::Error status = Flag::SUCCESS; CommandLineFlagRange* range = CommandLineFlagRangeList::find(name); if (range != NULL) { - range_status = range->check_int(*new_value, verbose); + status = range->check_int(new_value, verbose); } - Flag::Error constraint_status = Flag::SUCCESS; - CommandLineFlagConstraint* constraint = CommandLineFlagConstraintList::find_if_needs_check(name); - if (constraint != NULL) { - constraint_status = constraint->apply_int(new_value, verbose); + if (status == Flag::SUCCESS) { + CommandLineFlagConstraint* constraint = CommandLineFlagConstraintList::find_if_needs_check(name); + if (constraint != NULL) { + status = constraint->apply_int(new_value, verbose); + } } - return get_status_error(range_status, constraint_status); + return status; } Flag::Error CommandLineFlags::intAt(const char* name, size_t len, int* value, bool allow_locked, bool return_flag) { @@ -836,7 +845,7 @@ Flag* result = Flag::find_flag(name, len); if (result == NULL) return Flag::INVALID_FLAG; if (!result->is_int()) return Flag::WRONG_FORMAT; - Flag::Error check = apply_constraint_and_check_range_int(name, value, !CommandLineFlagConstraintList::validated_after_ergo()); + Flag::Error check = apply_constraint_and_check_range_int(name, *value, !CommandLineFlagConstraintList::validated_after_ergo()); if (check != Flag::SUCCESS) return check; int old_value = result->get_int(); trace_flag_changed(name, old_value, *value, origin); @@ -849,24 +858,27 @@ Flag::Error CommandLineFlagsEx::intAtPut(CommandLineFlagWithType flag, int value, Flag::Flags origin) { Flag* faddr = address_of_flag(flag); guarantee(faddr != NULL && faddr->is_int(), "wrong flag type"); + Flag::Error check = apply_constraint_and_check_range_int(faddr->_name, value, !CommandLineFlagConstraintList::validated_after_ergo()); + if (check != 
Flag::SUCCESS) return check; trace_flag_changed(faddr->_name, faddr->get_int(), value, origin); faddr->set_int(value); faddr->set_origin(origin); return Flag::SUCCESS; } -static Flag::Error apply_constraint_and_check_range_uint(const char* name, uint* new_value, bool verbose = true) { - Flag::Error range_status = Flag::SUCCESS; +static Flag::Error apply_constraint_and_check_range_uint(const char* name, uint new_value, bool verbose = true) { + Flag::Error status = Flag::SUCCESS; CommandLineFlagRange* range = CommandLineFlagRangeList::find(name); if (range != NULL) { - range_status = range->check_uint(*new_value, verbose); + status = range->check_uint(new_value, verbose); } - Flag::Error constraint_status = Flag::SUCCESS; - CommandLineFlagConstraint* constraint = CommandLineFlagConstraintList::find_if_needs_check(name); - if (constraint != NULL) { - constraint_status = constraint->apply_uint(new_value, verbose); + if (status == Flag::SUCCESS) { + CommandLineFlagConstraint* constraint = CommandLineFlagConstraintList::find_if_needs_check(name); + if (constraint != NULL) { + status = constraint->apply_uint(new_value, verbose); + } } - return get_status_error(range_status, constraint_status); + return status; } Flag::Error CommandLineFlags::uintAt(const char* name, size_t len, uint* value, bool allow_locked, bool return_flag) { @@ -881,7 +893,7 @@ Flag* result = Flag::find_flag(name, len); if (result == NULL) return Flag::INVALID_FLAG; if (!result->is_uint()) return Flag::WRONG_FORMAT; - Flag::Error check = apply_constraint_and_check_range_uint(name, value, !CommandLineFlagConstraintList::validated_after_ergo()); + Flag::Error check = apply_constraint_and_check_range_uint(name, *value, !CommandLineFlagConstraintList::validated_after_ergo()); if (check != Flag::SUCCESS) return check; uint old_value = result->get_uint(); trace_flag_changed(name, old_value, *value, origin); @@ -894,6 +906,8 @@ Flag::Error CommandLineFlagsEx::uintAtPut(CommandLineFlagWithType flag, uint value, Flag::Flags origin) { Flag* faddr = address_of_flag(flag); guarantee(faddr != NULL && faddr->is_uint(), "wrong flag type"); + Flag::Error check = apply_constraint_and_check_range_uint(faddr->_name, value, !CommandLineFlagConstraintList::validated_after_ergo()); + if (check != Flag::SUCCESS) return check; trace_flag_changed(faddr->_name, faddr->get_uint(), value, origin); faddr->set_uint(value); faddr->set_origin(origin); @@ -908,25 +922,26 @@ return Flag::SUCCESS; } -static Flag::Error apply_constraint_and_check_range_intx(const char* name, intx* new_value, bool verbose = true) { - Flag::Error range_status = Flag::SUCCESS; +static Flag::Error apply_constraint_and_check_range_intx(const char* name, intx new_value, bool verbose = true) { + Flag::Error status = Flag::SUCCESS; CommandLineFlagRange* range = CommandLineFlagRangeList::find(name); if (range != NULL) { - range_status = range->check_intx(*new_value, verbose); + status = range->check_intx(new_value, verbose); } - Flag::Error constraint_status = Flag::SUCCESS; - CommandLineFlagConstraint* constraint = CommandLineFlagConstraintList::find_if_needs_check(name); - if (constraint != NULL) { - constraint_status = constraint->apply_intx(new_value, verbose); + if (status == Flag::SUCCESS) { + CommandLineFlagConstraint* constraint = CommandLineFlagConstraintList::find_if_needs_check(name); + if (constraint != NULL) { + status = constraint->apply_intx(new_value, verbose); + } } - return get_status_error(range_status, constraint_status); + return status; } Flag::Error 
CommandLineFlags::intxAtPut(const char* name, size_t len, intx* value, Flag::Flags origin) { Flag* result = Flag::find_flag(name, len); if (result == NULL) return Flag::INVALID_FLAG; if (!result->is_intx()) return Flag::WRONG_FORMAT; - Flag::Error check = apply_constraint_and_check_range_intx(name, value, !CommandLineFlagConstraintList::validated_after_ergo()); + Flag::Error check = apply_constraint_and_check_range_intx(name, *value, !CommandLineFlagConstraintList::validated_after_ergo()); if (check != Flag::SUCCESS) return check; intx old_value = result->get_intx(); trace_flag_changed(name, old_value, *value, origin); @@ -939,7 +954,7 @@ Flag::Error CommandLineFlagsEx::intxAtPut(CommandLineFlagWithType flag, intx value, Flag::Flags origin) { Flag* faddr = address_of_flag(flag); guarantee(faddr != NULL && faddr->is_intx(), "wrong flag type"); - Flag::Error check = apply_constraint_and_check_range_intx(faddr->_name, &value); + Flag::Error check = apply_constraint_and_check_range_intx(faddr->_name, value); if (check != Flag::SUCCESS) return check; trace_flag_changed(faddr->_name, faddr->get_intx(), value, origin); faddr->set_intx(value); @@ -955,25 +970,26 @@ return Flag::SUCCESS; } -static Flag::Error apply_constraint_and_check_range_uintx(const char* name, uintx* new_value, bool verbose = true) { - Flag::Error range_status = Flag::SUCCESS; +static Flag::Error apply_constraint_and_check_range_uintx(const char* name, uintx new_value, bool verbose = true) { + Flag::Error status = Flag::SUCCESS; CommandLineFlagRange* range = CommandLineFlagRangeList::find(name); if (range != NULL) { - range_status = range->check_uintx(*new_value, verbose); + status = range->check_uintx(new_value, verbose); } - Flag::Error constraint_status = Flag::SUCCESS; - CommandLineFlagConstraint* constraint = CommandLineFlagConstraintList::find_if_needs_check(name); - if (constraint != NULL) { - constraint_status = constraint->apply_uintx(new_value, verbose); + if (status == Flag::SUCCESS) { + CommandLineFlagConstraint* constraint = CommandLineFlagConstraintList::find_if_needs_check(name); + if (constraint != NULL) { + status = constraint->apply_uintx(new_value, verbose); + } } - return get_status_error(range_status, constraint_status); + return status; } Flag::Error CommandLineFlags::uintxAtPut(const char* name, size_t len, uintx* value, Flag::Flags origin) { Flag* result = Flag::find_flag(name, len); if (result == NULL) return Flag::INVALID_FLAG; if (!result->is_uintx()) return Flag::WRONG_FORMAT; - Flag::Error check = apply_constraint_and_check_range_uintx(name, value, !CommandLineFlagConstraintList::validated_after_ergo()); + Flag::Error check = apply_constraint_and_check_range_uintx(name, *value, !CommandLineFlagConstraintList::validated_after_ergo()); if (check != Flag::SUCCESS) return check; uintx old_value = result->get_uintx(); trace_flag_changed(name, old_value, *value, origin); @@ -986,7 +1002,7 @@ Flag::Error CommandLineFlagsEx::uintxAtPut(CommandLineFlagWithType flag, uintx value, Flag::Flags origin) { Flag* faddr = address_of_flag(flag); guarantee(faddr != NULL && faddr->is_uintx(), "wrong flag type"); - Flag::Error check = apply_constraint_and_check_range_uintx(faddr->_name, &value); + Flag::Error check = apply_constraint_and_check_range_uintx(faddr->_name, value); if (check != Flag::SUCCESS) return check; trace_flag_changed(faddr->_name, faddr->get_uintx(), value, origin); faddr->set_uintx(value); @@ -1002,25 +1018,26 @@ return Flag::SUCCESS; } -static Flag::Error 
apply_constraint_and_check_range_uint64_t(const char* name, uint64_t* new_value, bool verbose = true) { - Flag::Error range_status = Flag::SUCCESS; +static Flag::Error apply_constraint_and_check_range_uint64_t(const char* name, uint64_t new_value, bool verbose = true) { + Flag::Error status = Flag::SUCCESS; CommandLineFlagRange* range = CommandLineFlagRangeList::find(name); if (range != NULL) { - range_status = range->check_uint64_t(*new_value, verbose); + status = range->check_uint64_t(new_value, verbose); } - Flag::Error constraint_status = Flag::SUCCESS; - CommandLineFlagConstraint* constraint = CommandLineFlagConstraintList::find_if_needs_check(name); - if (constraint != NULL) { - constraint_status = constraint->apply_uint64_t(new_value, verbose); + if (status == Flag::SUCCESS) { + CommandLineFlagConstraint* constraint = CommandLineFlagConstraintList::find_if_needs_check(name); + if (constraint != NULL) { + status = constraint->apply_uint64_t(new_value, verbose); + } } - return get_status_error(range_status, constraint_status); + return status; } Flag::Error CommandLineFlags::uint64_tAtPut(const char* name, size_t len, uint64_t* value, Flag::Flags origin) { Flag* result = Flag::find_flag(name, len); if (result == NULL) return Flag::INVALID_FLAG; if (!result->is_uint64_t()) return Flag::WRONG_FORMAT; - Flag::Error check = apply_constraint_and_check_range_uint64_t(name, value, !CommandLineFlagConstraintList::validated_after_ergo()); + Flag::Error check = apply_constraint_and_check_range_uint64_t(name, *value, !CommandLineFlagConstraintList::validated_after_ergo()); if (check != Flag::SUCCESS) return check; uint64_t old_value = result->get_uint64_t(); trace_flag_changed(name, old_value, *value, origin); @@ -1033,7 +1050,7 @@ Flag::Error CommandLineFlagsEx::uint64_tAtPut(CommandLineFlagWithType flag, uint64_t value, Flag::Flags origin) { Flag* faddr = address_of_flag(flag); guarantee(faddr != NULL && faddr->is_uint64_t(), "wrong flag type"); - Flag::Error check = apply_constraint_and_check_range_uint64_t(faddr->_name, &value); + Flag::Error check = apply_constraint_and_check_range_uint64_t(faddr->_name, value); if (check != Flag::SUCCESS) return check; trace_flag_changed(faddr->_name, faddr->get_uint64_t(), value, origin); faddr->set_uint64_t(value); @@ -1049,25 +1066,26 @@ return Flag::SUCCESS; } -static Flag::Error apply_constraint_and_check_range_size_t(const char* name, size_t* new_value, bool verbose = true) { - Flag::Error range_status = Flag::SUCCESS; +static Flag::Error apply_constraint_and_check_range_size_t(const char* name, size_t new_value, bool verbose = true) { + Flag::Error status = Flag::SUCCESS; CommandLineFlagRange* range = CommandLineFlagRangeList::find(name); if (range != NULL) { - range_status = range->check_size_t(*new_value, verbose); + status = range->check_size_t(new_value, verbose); } - Flag::Error constraint_status = Flag::SUCCESS; - CommandLineFlagConstraint* constraint = CommandLineFlagConstraintList::find_if_needs_check(name); - if (constraint != NULL) { - constraint_status = constraint->apply_size_t(new_value, verbose); + if (status == Flag::SUCCESS) { + CommandLineFlagConstraint* constraint = CommandLineFlagConstraintList::find_if_needs_check(name); + if (constraint != NULL) { + status = constraint->apply_size_t(new_value, verbose); + } } - return get_status_error(range_status, constraint_status); + return status; } Flag::Error CommandLineFlags::size_tAtPut(const char* name, size_t len, size_t* value, Flag::Flags origin) { Flag* result = 
Flag::find_flag(name, len); if (result == NULL) return Flag::INVALID_FLAG; if (!result->is_size_t()) return Flag::WRONG_FORMAT; - Flag::Error check = apply_constraint_and_check_range_size_t(name, value, !CommandLineFlagConstraintList::validated_after_ergo()); + Flag::Error check = apply_constraint_and_check_range_size_t(name, *value, !CommandLineFlagConstraintList::validated_after_ergo()); if (check != Flag::SUCCESS) return check; size_t old_value = result->get_size_t(); trace_flag_changed(name, old_value, *value, origin); @@ -1080,7 +1098,7 @@ Flag::Error CommandLineFlagsEx::size_tAtPut(CommandLineFlagWithType flag, size_t value, Flag::Flags origin) { Flag* faddr = address_of_flag(flag); guarantee(faddr != NULL && faddr->is_size_t(), "wrong flag type"); - Flag::Error check = apply_constraint_and_check_range_size_t(faddr->_name, &value); + Flag::Error check = apply_constraint_and_check_range_size_t(faddr->_name, value); if (check != Flag::SUCCESS) return check; trace_flag_changed(faddr->_name, faddr->get_size_t(), value, origin); faddr->set_size_t(value); @@ -1096,25 +1114,26 @@ return Flag::SUCCESS; } -static Flag::Error apply_constraint_and_check_range_double(const char* name, double* new_value, bool verbose = true) { - Flag::Error range_status = Flag::SUCCESS; +static Flag::Error apply_constraint_and_check_range_double(const char* name, double new_value, bool verbose = true) { + Flag::Error status = Flag::SUCCESS; CommandLineFlagRange* range = CommandLineFlagRangeList::find(name); if (range != NULL) { - range_status = range->check_double(*new_value, verbose); + status = range->check_double(new_value, verbose); } - Flag::Error constraint_status = Flag::SUCCESS; - CommandLineFlagConstraint* constraint = CommandLineFlagConstraintList::find_if_needs_check(name); - if (constraint != NULL) { - constraint_status = constraint->apply_double(new_value, verbose); + if (status == Flag::SUCCESS) { + CommandLineFlagConstraint* constraint = CommandLineFlagConstraintList::find_if_needs_check(name); + if (constraint != NULL) { + status = constraint->apply_double(new_value, verbose); + } } - return get_status_error(range_status, constraint_status); + return status; } Flag::Error CommandLineFlags::doubleAtPut(const char* name, size_t len, double* value, Flag::Flags origin) { Flag* result = Flag::find_flag(name, len); if (result == NULL) return Flag::INVALID_FLAG; if (!result->is_double()) return Flag::WRONG_FORMAT; - Flag::Error check = apply_constraint_and_check_range_double(name, value, !CommandLineFlagConstraintList::validated_after_ergo()); + Flag::Error check = apply_constraint_and_check_range_double(name, *value, !CommandLineFlagConstraintList::validated_after_ergo()); if (check != Flag::SUCCESS) return check; double old_value = result->get_double(); trace_flag_changed(name, old_value, *value, origin); @@ -1127,7 +1146,7 @@ Flag::Error CommandLineFlagsEx::doubleAtPut(CommandLineFlagWithType flag, double value, Flag::Flags origin) { Flag* faddr = address_of_flag(flag); guarantee(faddr != NULL && faddr->is_double(), "wrong flag type"); - Flag::Error check = apply_constraint_and_check_range_double(faddr->_name, &value); + Flag::Error check = apply_constraint_and_check_range_double(faddr->_name, value); if (check != Flag::SUCCESS) return check; trace_flag_changed(faddr->_name, faddr->get_double(), value, origin); faddr->set_double(value); diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/runtime/globals.hpp --- a/hotspot/src/share/vm/runtime/globals.hpp Thu Sep 03 14:24:41 2015 -0700 +++ 
b/hotspot/src/share/vm/runtime/globals.hpp Thu Sep 03 16:14:02 2015 -0700 @@ -372,19 +372,7 @@ void print_kind(outputStream* st); void print_as_flag(outputStream* st); - static const char* flag_error_str(Flag::Error error) { - switch (error) { - case Flag::MISSING_NAME: return "MISSING_NAME"; - case Flag::MISSING_VALUE: return "MISSING_VALUE"; - case Flag::NON_WRITABLE: return "NON_WRITABLE"; - case Flag::OUT_OF_BOUNDS: return "OUT_OF_BOUNDS"; - case Flag::VIOLATES_CONSTRAINT: return "VIOLATES_CONSTRAINT"; - case Flag::INVALID_FLAG: return "INVALID_FLAG"; - case Flag::ERR_OTHER: return "ERR_OTHER"; - case Flag::SUCCESS: return "SUCCESS"; - default: return "NULL"; - } - } + static const char* flag_error_str(Flag::Error error); }; // debug flags control various aspects of the VM and are global accessible @@ -1564,6 +1552,10 @@ product(uint, ParallelGCThreads, 0, \ "Number of parallel threads parallel gc will use") \ \ + diagnostic(bool, UseSemaphoreGCThreadsSynchronization, true, \ + "Use semaphore synchronization for the GC Threads, " \ + "instead of synchronization based on mutexes") \ + \ product(bool, UseDynamicNumberOfGCThreads, false, \ "Dynamically choose the number of parallel threads " \ "parallel gc will use") \ @@ -1575,7 +1567,7 @@ product(size_t, HeapSizePerGCThread, ScaleForWordSize(64*M), \ "Size of heap (bytes) per GC thread used in calculating the " \ "number of GC threads") \ - range((uintx)os::vm_page_size(), max_uintx) \ + range((size_t)os::vm_page_size(), (size_t)max_uintx) \ \ product(bool, TraceDynamicGCThreads, false, \ "Trace the dynamic GC thread usage") \ @@ -1856,6 +1848,7 @@ product(size_t, MarkStackSize, NOT_LP64(32*K) LP64_ONLY(4*M), \ "Size of marking stack") \ \ + /* where does the range max value of (max_jint - 1) come from? */ \ product(size_t, MarkStackSizeMax, NOT_LP64(4*M) LP64_ONLY(512*M), \ "Maximum size of marking stack") \ range(1, (max_jint - 1)) \ @@ -2920,12 +2913,6 @@ notproduct(bool, ICMissHistogram, false, \ "Produce histogram of IC misses") \ \ - notproduct(bool, PrintClassStatistics, false, \ - "Print class statistics at end of run") \ - \ - notproduct(bool, PrintMethodStatistics, false, \ - "Print method statistics at end of run") \ - \ /* interpreter */ \ develop(bool, ClearInterpreterLocals, false, \ "Always clear local variables of interpreter activations upon " \ diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/runtime/java.cpp --- a/hotspot/src/share/vm/runtime/java.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/runtime/java.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -304,13 +304,6 @@ CodeCache::print_internals(); } - if (PrintClassStatistics) { - SystemDictionary::print_class_statistics(); - } - if (PrintMethodStatistics) { - SystemDictionary::print_method_statistics(); - } - if (PrintVtableStats) { klassVtable::print_statistics(); klassItable::print_statistics(); diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/runtime/stubRoutines.cpp --- a/hotspot/src/share/vm/runtime/stubRoutines.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/runtime/stubRoutines.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -181,7 +181,7 @@ StubGenerator_generate(&buffer, false); // When new stubs added we need to make sure there is some space left // to catch situation when we should increase size again. 
- assert(buffer.insts_remaining() > 200, "increase code_size1"); + assert(code_size1 == 0 || buffer.insts_remaining() > 200, "increase code_size1"); } } @@ -274,7 +274,7 @@ StubGenerator_generate(&buffer, true); // When new stubs added we need to make sure there is some space left // to catch situation when we should increase size again. - assert(buffer.insts_remaining() > 200, "increase code_size2"); + assert(code_size2 == 0 || buffer.insts_remaining() > 200, "increase code_size2"); } #ifdef ASSERT diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/runtime/sweeper.cpp --- a/hotspot/src/share/vm/runtime/sweeper.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/runtime/sweeper.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -618,19 +618,14 @@ MutexLocker cl(CompiledIC_lock); nm->clear_ic_stubs(); } - // Acquiring the CompiledIC_lock may block for a safepoint and set the - // nmethod to zombie (see 'CodeCache::make_marked_nmethods_zombies'). - // Check if nmethod is still non-entrant at this point. - if (nm->is_not_entrant()) { - if (PrintMethodFlushing && Verbose) { - tty->print_cr("### Nmethod %3d/" PTR_FORMAT " (not entrant) being made zombie", nm->compile_id(), nm); - } - // Code cache state change is tracked in make_zombie() - nm->make_zombie(); - SWEEP(nm); - assert(result == None, "sanity"); - result = MadeZombie; + if (PrintMethodFlushing && Verbose) { + tty->print_cr("### Nmethod %3d/" PTR_FORMAT " (not entrant) being made zombie", nm->compile_id(), nm); } + // Code cache state change is tracked in make_zombie() + nm->make_zombie(); + SWEEP(nm); + assert(result == None, "sanity"); + result = MadeZombie; assert(nm->is_zombie(), "nmethod must be zombie"); } else { // Still alive, clean up its inline caches diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/runtime/thread.cpp --- a/hotspot/src/share/vm/runtime/thread.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/runtime/thread.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -3331,7 +3331,6 @@ // Final check of all 'AfterErgo' constraints after ergonomics which may change values. 
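// Illustrative sketch, not part of the changeset: the constraint machinery is wired in at two
// points. Every typed *AtPut helper in globals.cpp (above) validates the proposed value before
// storing it, passing !CommandLineFlagConstraintList::validated_after_ergo() as the verbose
// argument, and the VM startup path in thread.cpp (below) runs one last
// CommandLineFlagConstraint::AfterErgo pass once ergonomics has adjusted dependent flags,
// returning JNI_EINVAL if any constraint is still violated. A hypothetical caller reacting to
// a violation could look like this (the call site and values are illustrative only):
//
//   uintx new_ratio = 120;
//   Flag::Error err = CommandLineFlags::uintxAtPut("MinHeapFreeRatio",
//                                                  strlen("MinHeapFreeRatio"),
//                                                  &new_ratio, Flag::COMMAND_LINE);
//   if (err != Flag::SUCCESS) {
//     // e.g. Flag::flag_error_str(err) yields "VIOLATES_CONSTRAINT"
//   }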
bool constraint_result = CommandLineFlagConstraintList::check_constraints(CommandLineFlagConstraint::AfterErgo); - Arguments::post_after_ergo_constraint_check(constraint_result); if (!constraint_result) { return JNI_EINVAL; } diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/runtime/vmStructs.cpp --- a/hotspot/src/share/vm/runtime/vmStructs.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/runtime/vmStructs.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -405,7 +405,7 @@ nonstatic_field(ObjArrayKlass, _element_klass, Klass*) \ nonstatic_field(ObjArrayKlass, _bottom_klass, Klass*) \ volatile_nonstatic_field(Symbol, _refcount, short) \ - nonstatic_field(Symbol, _identity_hash, int) \ + nonstatic_field(Symbol, _identity_hash, short) \ nonstatic_field(Symbol, _length, unsigned short) \ unchecked_nonstatic_field(Symbol, _body, sizeof(jbyte)) /* NOTE: no type */ \ nonstatic_field(TypeArrayKlass, _max_length, int) \ @@ -1565,6 +1565,7 @@ declare_toplevel_type(Generation*) \ declare_toplevel_type(GenerationSpec**) \ declare_toplevel_type(HeapWord*) \ + declare_toplevel_type(HeapWord* volatile) \ declare_toplevel_type(MemRegion*) \ declare_toplevel_type(OffsetTableContigSpace*) \ declare_toplevel_type(Space*) \ diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/runtime/vm_operations.cpp --- a/hotspot/src/share/vm/runtime/vm_operations.cpp Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/runtime/vm_operations.cpp Thu Sep 03 16:14:02 2015 -0700 @@ -109,8 +109,8 @@ // Deoptimize all activations depending on marked nmethods Deoptimization::deoptimize_dependents(); - // Make the dependent methods zombies - CodeCache::make_marked_nmethods_zombies(); + // Make the dependent methods not entrant + CodeCache::make_marked_nmethods_not_entrant(); } void VM_MarkActiveNMethods::doit() { diff -r eb1661ea942c -r 6675700073c1 hotspot/src/share/vm/trace/trace.xml --- a/hotspot/src/share/vm/trace/trace.xml Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/src/share/vm/trace/trace.xml Thu Sep 03 16:14:02 2015 -0700 @@ -346,6 +346,29 @@ + + + + + + + + + + + + + + + + + + + + + = dst_off && i < dst_off + len) { + if (dst[i] != src[i - dst_off + src_off]) { + return false; + } + } else { + if (dst[i] != 0x42-i) { + return false; + } + } + } + } + return true; + } + + // Array copy with Phi + static boolean m5(int[] src, boolean flag1, boolean flag2) { + int[] dst = new int[10]; + if (flag1) { + System.arraycopy(src, 0, dst, 0, 10); + } + if (flag2) { + for (int i = 0; i < dst.length; i++) { + if (dst[i] != src[i]) { + return false; + } + } + } + return true; + } + + static public void main(String[] args) throws Exception { + boolean success = true; + A a = new A(); + a.f0 = 0x42; + for (int i = 0; i < 20000; i++) { + m1(a, false); + } + if (!m1(a, true)) { + System.out.println("m1 failed"); + success = false; + } + + for (int i = 0; i < 20000; i++) { + m2(false); + } + if (!m2(true)) { + System.out.println("m2 failed"); + success = false; + } + + int[] src = new int[10]; + for (int i = 0; i < src.length; i++) { + src[i] = 0x42+i; + } + + for (int i = 0; i < 20000; i++) { + m3(src, false); + } + if (!m3(src, true)) { + System.out.println("m3 failed"); + success = false; + } + + for (int i = 0; i < 20000; i++) { + m4(src, false); + } + if (!m4(src, true)) { + System.out.println("m4 failed"); + success = false; + } + + for (int i = 0; i < 20000; i++) { + m5(src, i%2 == 0, false); + } + if (!m5(src, true, true)) { + System.out.println("m4 failed"); + success = false; + } + + if (!success) { + throw new 
RuntimeException("Test failed"); + } + } +} diff -r eb1661ea942c -r 6675700073c1 hotspot/test/compiler/arraycopy/TestEliminatedArrayCopyPhi.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hotspot/test/compiler/arraycopy/TestEliminatedArrayCopyPhi.java Thu Sep 03 16:14:02 2015 -0700 @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + * @test + * @bug 8134321 + * @summary Code that capture field values of eliminated allocation at a safepoint when there's an arraycopy behind a Phi is broken + * @run main/othervm -XX:-BackgroundCompilation -XX:-UseOnStackReplacement TestEliminatedArrayCopyPhi + * + */ + +public class TestEliminatedArrayCopyPhi { + + static int[] escaped; + + static void test(int[] src, boolean flag1, boolean flag2) { + int[] array = new int[10]; + if (flag1) { + System.arraycopy(src, 0, array, 0, src.length); + } else { + } + + if (flag2) { + // never taken + escaped = array; + } + } + + public static void main(String[] args) { + int[] src = new int[10]; + for (int i = 0; i < 20000; i++) { + test(src, (i % 2) == 0, false); + } + } +} diff -r eb1661ea942c -r 6675700073c1 hotspot/test/compiler/floatingpoint/NaNTest.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hotspot/test/compiler/floatingpoint/NaNTest.java Thu Sep 03 16:14:02 2015 -0700 @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ +/** + * @test + * @bug 8076373 + * @summary Verify if signaling NaNs are preserved. 
+ * @run main NaNTest + */ +public class NaNTest { + static void testFloat() { + int originalValue = 0x7f800001; + int readBackValue = Float.floatToRawIntBits(Float.intBitsToFloat(originalValue)); + if (originalValue != readBackValue) { + String errorMessage = String.format("Original and read back float values mismatch\n0x%X 0x%X\n", + originalValue, + readBackValue); + throw new RuntimeException(errorMessage); + } else { + System.out.printf("Written and read back float values match\n0x%X 0x%X\n", + originalValue, + readBackValue); + } + } + + static void testDouble() { + long originalValue = 0xFFF0000000000001L; + long readBackValue = Double.doubleToRawLongBits(Double.longBitsToDouble(originalValue)); + if (originalValue != readBackValue) { + String errorMessage = String.format("Original and read back double values mismatch\n0x%X 0x%X\n", + originalValue, + readBackValue); + throw new RuntimeException(errorMessage); + } else { + System.out.printf("Written and read back double values match\n0x%X 0x%X\n", + originalValue, + readBackValue); + } + + } + + public static void main(String args[]) { + System.out.println("### NanTest started"); + + testFloat(); + testDouble(); + + System.out.println("### NanTest ended"); + } +} diff -r eb1661ea942c -r 6675700073c1 hotspot/test/compiler/loopopts/TestMoveStoresOutOfLoops.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hotspot/test/compiler/loopopts/TestMoveStoresOutOfLoops.java Thu Sep 03 16:14:02 2015 -0700 @@ -0,0 +1,310 @@ +/* + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ * + */ + +/** + * @test + * @bug 8080289 + * @summary Sink stores out of loops if possible + * @run main/othervm -XX:-UseOnStackReplacement -XX:-BackgroundCompilation -XX:+PrintCompilation -XX:CompileCommand=dontinline,TestMoveStoresOutOfLoops::test* TestMoveStoresOutOfLoops + * + */ + +import java.lang.reflect.*; +import java.util.*; +import java.util.function.*; + +public class TestMoveStoresOutOfLoops { + + private static long[] array = new long[10]; + private static long[] array2 = new long[10]; + private static boolean[] array3 = new boolean[1000]; + private static byte[] byte_array = new byte[10]; + + // Array store should be moved out of the loop, value stored + // should be 999, the loop should be eliminated + static void test_after_1(int idx) { + for (int i = 0; i < 1000; i++) { + array[idx] = i; + } + } + + // Array store can't be moved out of loop because of following + // non loop invariant array access + static void test_after_2(int idx) { + for (int i = 0; i < 1000; i++) { + array[idx] = i; + array2[i%10] = i; + } + } + + // Array store can't be moved out of loop because of following + // use + static void test_after_3(int idx) { + for (int i = 0; i < 1000; i++) { + array[idx] = i; + if (array[0] == -1) { + break; + } + } + } + + // Array store can't be moved out of loop because of preceding + // use + static void test_after_4(int idx) { + for (int i = 0; i < 1000; i++) { + if (array[0] == -2) { + break; + } + array[idx] = i; + } + } + + // All array stores should be moved out of the loop, one after + // the other + static void test_after_5(int idx) { + for (int i = 0; i < 1000; i++) { + array[idx] = i; + array[idx+1] = i; + array[idx+2] = i; + array[idx+3] = i; + array[idx+4] = i; + array[idx+5] = i; + } + } + + // Array store can be moved after the loop but needs to be + // cloned on both exit paths + static void test_after_6(int idx) { + for (int i = 0; i < 1000; i++) { + array[idx] = i; + if (array3[i]) { + return; + } + } + } + + // Optimize out redundant stores + static void test_stores_1(int ignored) { + array[0] = 0; + array[1] = 1; + array[2] = 2; + array[0] = 0; + array[1] = 1; + array[2] = 2; + } + + static void test_stores_2(int idx) { + array[idx+0] = 0; + array[idx+1] = 1; + array[idx+2] = 2; + array[idx+0] = 0; + array[idx+1] = 1; + array[idx+2] = 2; + } + + static void test_stores_3(int idx) { + byte_array[idx+0] = 0; + byte_array[idx+1] = 1; + byte_array[idx+2] = 2; + byte_array[idx+0] = 0; + byte_array[idx+1] = 1; + byte_array[idx+2] = 2; + } + + // Array store can be moved out of the loop before the loop header + static void test_before_1(int idx) { + for (int i = 0; i < 1000; i++) { + array[idx] = 999; + } + } + + // Array store can't be moved out of the loop before the loop + // header because there's more than one store on this slice + static void test_before_2(int idx) { + for (int i = 0; i < 1000; i++) { + array[idx] = 999; + array[i%2] = 0; + } + } + + // Array store can't be moved out of the loop before the loop + // header because of use before store + static int test_before_3(int idx) { + int res = 0; + for (int i = 0; i < 1000; i++) { + res += array[i%10]; + array[idx] = 999; + } + return res; + } + + // Array store can't be moved out of the loop before the loop + // header because of possible early exit + static void test_before_4(int idx) { + for (int i = 0; i < 1000; i++) { + if (idx / (i+1) > 0) { + return; + } + array[idx] = 999; + } + } + + // Array store can't be moved out of the loop before the loop + // header because it doesn't 
postdominate the loop head + static void test_before_5(int idx) { + for (int i = 0; i < 1000; i++) { + if (i % 2 == 0) { + array[idx] = 999; + } + } + } + + // Array store can be moved out of the loop before the loop header + static int test_before_6(int idx) { + int res = 0; + for (int i = 0; i < 1000; i++) { + if (i%2 == 1) { + res *= 2; + } else { + res++; + } + array[idx] = 999; + } + return res; + } + + final HashMap<String, Method> tests = new HashMap<>(); + { + for (Method m : this.getClass().getDeclaredMethods()) { + if (m.getName().matches("test_(before|after|stores)_[0-9]+")) { + assert(Modifier.isStatic(m.getModifiers())) : m; + tests.put(m.getName(), m); + } + } + } + + boolean success = true; + void doTest(String name, Runnable init, Function<String, Boolean> check) throws Exception { + Method m = tests.get(name); + for (int i = 0; i < 20000; i++) { + init.run(); + m.invoke(null, 0); + success = success && check.apply(name); + if (!success) { + break; + } + } + } + + static void array_init() { + array[0] = -1; + } + + static boolean array_check(String name) { + boolean success = true; + if (array[0] != 999) { + success = false; + System.out.println(name + " failed: array[0] = " + array[0]); + } + return success; + } + + static void array_init2() { + for (int i = 0; i < 6; i++) { + array[i] = -1; + } + } + + static boolean array_check2(String name) { + boolean success = true; + for (int i = 0; i < 6; i++) { + if (array[i] != 999) { + success = false; + System.out.println(name + " failed: array[" + i + "] = " + array[i]); + } + } + return success; + } + + static void array_init3() { + for (int i = 0; i < 3; i++) { + array[i] = -1; + } + } + + static boolean array_check3(String name) { + boolean success = true; + for (int i = 0; i < 3; i++) { + if (array[i] != i) { + success = false; + System.out.println(name + " failed: array[" + i + "] = " + array[i]); + } + } + return success; + } + + static void array_init4() { + for (int i = 0; i < 3; i++) { + byte_array[i] = -1; + } + } + + static boolean array_check4(String name) { + boolean success = true; + for (int i = 0; i < 3; i++) { + if (byte_array[i] != i) { + success = false; + System.out.println(name + " failed: byte_array[" + i + "] = " + byte_array[i]); + } + } + return success; + } + + static public void main(String[] args) throws Exception { + TestMoveStoresOutOfLoops test = new TestMoveStoresOutOfLoops(); + test.doTest("test_after_1", TestMoveStoresOutOfLoops::array_init, TestMoveStoresOutOfLoops::array_check); + test.doTest("test_after_2", TestMoveStoresOutOfLoops::array_init, TestMoveStoresOutOfLoops::array_check); + test.doTest("test_after_3", TestMoveStoresOutOfLoops::array_init, TestMoveStoresOutOfLoops::array_check); + test.doTest("test_after_4", TestMoveStoresOutOfLoops::array_init, TestMoveStoresOutOfLoops::array_check); + test.doTest("test_after_5", TestMoveStoresOutOfLoops::array_init2, TestMoveStoresOutOfLoops::array_check2); + test.doTest("test_after_6", TestMoveStoresOutOfLoops::array_init, TestMoveStoresOutOfLoops::array_check); + array3[999] = true; + test.doTest("test_after_6", TestMoveStoresOutOfLoops::array_init, TestMoveStoresOutOfLoops::array_check); + + test.doTest("test_stores_1", TestMoveStoresOutOfLoops::array_init3, TestMoveStoresOutOfLoops::array_check3); + test.doTest("test_stores_2", TestMoveStoresOutOfLoops::array_init3, TestMoveStoresOutOfLoops::array_check3); + test.doTest("test_stores_3", TestMoveStoresOutOfLoops::array_init4, TestMoveStoresOutOfLoops::array_check4); + + test.doTest("test_before_1",
TestMoveStoresOutOfLoops::array_init, TestMoveStoresOutOfLoops::array_check); + test.doTest("test_before_2", TestMoveStoresOutOfLoops::array_init, TestMoveStoresOutOfLoops::array_check); + test.doTest("test_before_3", TestMoveStoresOutOfLoops::array_init, TestMoveStoresOutOfLoops::array_check); + test.doTest("test_before_4", TestMoveStoresOutOfLoops::array_init, TestMoveStoresOutOfLoops::array_check); + test.doTest("test_before_5", TestMoveStoresOutOfLoops::array_init, TestMoveStoresOutOfLoops::array_check); + test.doTest("test_before_6", TestMoveStoresOutOfLoops::array_init, TestMoveStoresOutOfLoops::array_check); + + if (!test.success) { + throw new RuntimeException("Some tests failed"); + } + } +} diff -r eb1661ea942c -r 6675700073c1 hotspot/test/compiler/regalloc/TestVectorRegAlloc.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hotspot/test/compiler/regalloc/TestVectorRegAlloc.java Thu Sep 03 16:14:02 2015 -0700 @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +/** + * @test + * @bug 8131969 + * @summary assert in register allocation code when vector Phi for a loop is processed because code assumes all inputs already processed + * @run main/othervm -Xbatch TestVectorRegAlloc + * + */ + +public class TestVectorRegAlloc { + + static int test_helper_i; + static boolean test_helper() { + test_helper_i++; + return (test_helper_i & 7) != 0; + } + + static void test(double[] src, double[] dst, boolean flag) { + double j = 0.0; + while(test_helper()) { + for (int i = 0; i < src.length; i++) { + dst[i] = src[i] + j; + } + // Loop will be unswitched and ReplicateD of zero will be + // split through the Phi of outer loop + for (int i = 0; i < src.length; i++) { + double k; + if (flag) { + k = j; + } else { + k = 0; + } + dst[i] = src[i] + k; + } + j++; + } + } + + static public void main(String[] args) { + double[] src = new double[10]; + double[] dst = new double[10]; + for (int i = 0; i < 20000; i++) { + test(src, dst, (i % 2) == 0); + } + } +} diff -r eb1661ea942c -r 6675700073c1 hotspot/test/gc/survivorAlignment/TestPromotionFromSurvivorToTenuredAfterMinorGC.java --- a/hotspot/test/gc/survivorAlignment/TestPromotionFromSurvivorToTenuredAfterMinorGC.java Thu Sep 03 14:24:41 2015 -0700 +++ b/hotspot/test/gc/survivorAlignment/TestPromotionFromSurvivorToTenuredAfterMinorGC.java Thu Sep 03 16:14:02 2015 -0700 @@ -28,7 +28,6 @@ * when their age exceeded tenuring threshold are not aligned to * SurvivorAlignmentInBytes value. 
* @library /testlibrary /../../test/lib - * @ignore 8130308 * @modules java.base/sun.misc * java.management * @build TestPromotionFromSurvivorToTenuredAfterMinorGC @@ -99,11 +98,18 @@ .getActualMemoryUsage(); test.allocate(); - for (int i = 0; i <= SurvivorAlignmentTestMain.MAX_TENURING_THRESHOLD; - i++) { + for (int i = 0; i <= SurvivorAlignmentTestMain.MAX_TENURING_THRESHOLD; i++) { SurvivorAlignmentTestMain.WHITE_BOX.youngGC(); } + // Sometimes we see that data unrelated to the test has been allocated during + // the loop. This data is included in the expectedMemoryUsage since we look + // through all threads to see what they allocated. If this data is still in + // the survivor area however, it should not be included in expectedMemoryUsage + // since the verification below only looks at what's in tenured space. + expectedMemoryUsage -= SurvivorAlignmentTestMain.getAlignmentHelper( + SurvivorAlignmentTestMain.HeapSpace.SURVIVOR) + .getActualMemoryUsage(); test.verifyMemoryUsage(expectedMemoryUsage); } }