--- a/hotspot/src/cpu/aarch64/vm/aarch64.ad Thu Aug 20 09:31:28 2015 +0200
+++ b/hotspot/src/cpu/aarch64/vm/aarch64.ad Fri Aug 21 09:12:42 2015 +0200
@@ -1033,27 +1033,39 @@
};
// graph traversal helpers
- MemBarNode *has_parent_membar(const Node *n,
- ProjNode *&ctl, ProjNode *&mem);
- MemBarNode *has_child_membar(const MemBarNode *n,
- ProjNode *&ctl, ProjNode *&mem);
+
+ MemBarNode *parent_membar(const Node *n);
+ MemBarNode *child_membar(const MemBarNode *n);
+ bool leading_membar(const MemBarNode *barrier);
+
+ bool is_card_mark_membar(const MemBarNode *barrier);
+
+ MemBarNode *leading_to_normal(MemBarNode *leading);
+ MemBarNode *normal_to_leading(const MemBarNode *barrier);
+ MemBarNode *card_mark_to_trailing(const MemBarNode *barrier);
+ MemBarNode *trailing_to_card_mark(const MemBarNode *trailing);
+ MemBarNode *trailing_to_leading(const MemBarNode *trailing);
// predicates controlling emit of ldr<x>/ldar<x> and associated dmb
+
bool unnecessary_acquire(const Node *barrier);
bool needs_acquiring_load(const Node *load);
// predicates controlling emit of str<x>/stlr<x> and associated dmbs
+
bool unnecessary_release(const Node *barrier);
bool unnecessary_volatile(const Node *barrier);
bool needs_releasing_store(const Node *store);
- // Use barrier instructions for unsafe volatile gets rather than
- // trying to identify an exact signature for them
- const bool UseBarriersForUnsafeVolatileGet = false;
+ // predicate controlling translation of StoreCM
+ bool unnecessary_storestore(const Node *storecm);
%}
source %{
+ // Optimization of volatile gets and puts
+ // --------------------------------------
+ //
// AArch64 has ldar<x> and stlr<x> instructions which we can safely
// use to implement volatile reads and writes. For a volatile read
// we simply need
@@ -1102,15 +1114,19 @@
// A volatile write is translated to the node sequence
//
// MemBarRelease
- // StoreX[mo_release]
+ // StoreX[mo_release] {CardMark}-optional
// MemBarVolatile
//
// n.b. the above node patterns are generated with a strict
// 'signature' configuration of input and output dependencies (see
- // the predicates below for exact details). The two signatures are
- // unique to translated volatile reads/stores -- they will not
- // appear as a result of any other bytecode translation or inlining
- // nor as a consequence of optimizing transforms.
+ // the predicates below for exact details). The card mark may be as
+ // simple as a few extra nodes or, in a few GC configurations, may
+ // include more complex control flow between the leading and
+ // trailing memory barriers. However, whatever the card mark
+ // configuration these signatures are unique to translated volatile
+ // reads/stores -- they will not appear as a result of any other
+ // bytecode translation or inlining nor as a consequence of
+ // optimizing transforms.
//
// We also want to catch inlined unsafe volatile gets and puts and
// be able to implement them using either ldar<x>/stlr<x> or some
@@ -1122,7 +1138,7 @@
//
// MemBarRelease
// MemBarCPUOrder
- // StoreX[mo_release]
+ // StoreX[mo_release] {CardMark}-optional
// MemBarVolatile
//
// n.b. as an aside, the cpuorder membar is not itself subject to
@@ -1130,7 +1146,7 @@
// predicates need to detect its presence in order to correctly
// select the desired adlc rules.
//
- // Inlined unsafe volatiles gets manifest as a somewhat different
+ // Inlined unsafe volatile gets manifest as a somewhat different
// node sequence to a normal volatile get
//
// MemBarCPUOrder
@@ -1173,33 +1189,22 @@
// n.b. the translation rules below which rely on detection of the
// volatile signatures and insert ldar<x> or stlr<x> are failsafe.
// If we see anything other than the signature configurations we
- // always just translate the loads and stors to ldr<x> and str<x>
+ // always just translate the loads and stores to ldr<x> and str<x>
// and translate acquire, release and volatile membars to the
// relevant dmb instructions.
//
- // n.b.b as a case in point for the above comment, the current
- // predicates don't detect the precise signature for certain types
- // of volatile object stores (where the heap_base input type is not
- // known at compile-time to be non-NULL). In those cases the
- // MemBarRelease and MemBarVolatile bracket an if-then-else sequence
- // with a store in each branch (we need a different store depending
- // on whether heap_base is actually NULL). In such a case we will
- // just plant a dmb both before and after the branch/merge. The
- // predicate could (and probably should) be fixed later to also
- // detect this case.
-
- // graph traversal helpers
+
+ // graph traversal helpers used for volatile put/get optimization
+
+ // 1) general purpose helpers
// if node n is linked to a parent MemBarNode by an intervening
- // Control or Memory ProjNode return the MemBarNode otherwise return
+ // Control and Memory ProjNode return the MemBarNode otherwise return
// NULL.
//
// n may only be a Load or a MemBar.
- //
- // The ProjNode* references c and m are used to return the relevant
- // nodes.
-
- MemBarNode *has_parent_membar(const Node *n, ProjNode *&c, ProjNode *&m)
+
+ MemBarNode *parent_membar(const Node *n)
{
Node *ctl = NULL;
Node *mem = NULL;
@@ -1218,15 +1223,11 @@
if (!ctl || !mem || !ctl->is_Proj() || !mem->is_Proj())
return NULL;
- c = ctl->as_Proj();
-
membar = ctl->lookup(0);
if (!membar || !membar->is_MemBar())
return NULL;
- m = mem->as_Proj();
-
if (mem->lookup(0) != membar)
return NULL;
@@ -1235,12 +1236,8 @@
// if n is linked to a child MemBarNode by intervening Control and
// Memory ProjNodes return the MemBarNode otherwise return NULL.
- //
- // The ProjNode** arguments c and m are used to return pointers to
- // the relevant nodes. A null argument means don't don't return a
- // value.
-
- MemBarNode *has_child_membar(const MemBarNode *n, ProjNode *&c, ProjNode *&m)
+
+ MemBarNode *child_membar(const MemBarNode *n)
{
ProjNode *ctl = n->proj_out(TypeFunc::Control);
ProjNode *mem = n->proj_out(TypeFunc::Memory);
@@ -1249,9 +1246,6 @@
if (! ctl || ! mem)
return NULL;
- c = ctl;
- m = mem;
-
MemBarNode *child = NULL;
Node *x;
@@ -1279,9 +1273,838 @@
return NULL;
}
+ // helper predicate used to filter candidates for a leading memory
+ // barrier
+ //
+ // returns true if barrier is a MemBarRelease or a MemBarCPUOrder
+ // whose Ctl and Mem feeds come from a MemBarRelease otherwise false
+
+ bool leading_membar(const MemBarNode *barrier)
+ {
+   // a release membar qualifies as a leading membar outright; a
+   // cpuorder membar qualifies only when parent_membar() finds a
+   // release membar above it; anything else is rejected
+   switch (barrier->Opcode()) {
+   case Op_MemBarRelease:
+     return true;
+   case Op_MemBarCPUOrder: {
+     MemBarNode *feeder = parent_membar(barrier);
+     return feeder != NULL && feeder->Opcode() == Op_MemBarRelease;
+   }
+   default:
+     return false;
+   }
+ }
+
+ // 2) card mark detection helper
+
+ // helper predicate which can be used to detect a volatile membar
+ // introduced as part of a conditional card mark sequence either by
+ // G1 or by CMS when UseCondCardMark is true.
+ //
+ // membar can be definitively determined to be part of a card mark
+ // sequence if and only if all the following hold
+ //
+ // i) it is a MemBarVolatile
+ //
+ // ii) either UseG1GC or (UseConcMarkSweepGC && UseCondCardMark) is
+ // true
+ //
+ // iii) the node's Mem projection feeds a StoreCM node.
+
+ bool is_card_mark_membar(const MemBarNode *barrier)
+ {
+   // card mark membars only arise with G1 or CMS + UseCondCardMark
+   if (!UseG1GC && !(UseConcMarkSweepGC && UseCondCardMark))
+     return false;
+   if (barrier->Opcode() != Op_MemBarVolatile)
+     return false;
+
+   // a membar with no Mem projection cannot feed a StoreCM; guard
+   // against a NULL projection as leading_to_normal does
+   ProjNode *mem = barrier->proj_out(TypeFunc::Memory);
+   if (!mem)
+     return false;
+
+   for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) {
+     if (mem->fast_out(i)->Opcode() == Op_StoreCM)
+       return true;
+   }
+   return false;
+ }
+
+
+ // 3) helper predicates to traverse volatile put graphs which may
+ // contain GC barrier subgraphs
+
+ // Preamble
+ // --------
+ //
+ // for volatile writes we can omit generating barriers and employ a
+ // releasing store when we see a node sequence with a
+ // leading MemBarRelease and a trailing MemBarVolatile as follows
+ //
+ // MemBarRelease
+ // { || } -- optional
+ // {MemBarCPUOrder}
+ // || \\
+ // || StoreX[mo_release]
+ // | \ /
+ // | MergeMem
+ // | /
+ // MemBarVolatile
+ //
+ // where
+ // || and \\ represent Ctl and Mem feeds via Proj nodes
+ // | \ and / indicate further routing of the Ctl and Mem feeds
+ //
+ // this is the graph we see for non-object stores. however, for a
+ // volatile Object store (StoreN/P) we may see other nodes below the
+ // leading membar because of the need for a GC pre- or post-write
+ // barrier.
+ //
+ // with most GC configurations we will see this simple variant which
+ // includes a post-write barrier card mark.
+ //
+ // MemBarRelease______________________________
+ // || \\ Ctl \ \\
+ // || StoreN/P[mo_release] CastP2X StoreB/CM
+ // | \ / . . . /
+ // | MergeMem
+ // | /
+ // || /
+ // MemBarVolatile
+ //
+ // i.e. the leading membar feeds Ctl to a CastP2X (which converts
+ // the object address to an int used to compute the card offset) and
+ // Ctl+Mem to a StoreB node (which does the actual card mark).
+ //
+ // n.b. a StoreCM node will only appear in this configuration when
+ // using CMS. StoreCM differs from a normal card mark write (StoreB)
+ // because it implies a requirement to order visibility of the card
+ // mark (StoreCM) relative to the object put (StoreP/N) using a
+ // StoreStore memory barrier (arguably this ought to be represented
+ // explicitly in the ideal graph but that is not how it works). This
+ // ordering is required for both non-volatile and volatile
+ // puts. Normally that means we need to translate a StoreCM using
+ // the sequence
+ //
+ // dmb ishst
+ // strb
+ //
+ // However, in the case of a volatile put if we can recognise this
+ // configuration and plant an stlr for the object write then we can
+ // omit the dmb and just plant an strb since visibility of the stlr
+ // is ordered before visibility of subsequent stores. StoreCM nodes
+ // also arise when using G1 or using CMS with conditional card
+ // marking. In these cases (as we shall see) we don't need to insert
+ // the dmb when translating StoreCM because there is already an
+ // intervening StoreLoad barrier between it and the StoreP/N.
+ //
+ // It is also possible to perform the card mark conditionally on it
+ // currently being unmarked in which case the volatile put graph
+ // will look slightly different
+ //
+ // MemBarRelease
+ // MemBarCPUOrder___________________________________________
+ // || \\ Ctl \ Ctl \ \\ Mem \
+ // || StoreN/P[mo_release] CastP2X If LoadB |
+ // | \ / \ |
+ // | MergeMem . . . StoreB
+ // | / /
+ // || /
+ // MemBarVolatile
+ //
+ // It is worth noting at this stage that both the above
+ // configurations can be uniquely identified by checking that the
+ // memory flow includes the following subgraph:
+ //
+ // MemBarRelease
+ // MemBarCPUOrder
+ // | \ . . .
+ // | StoreX[mo_release] . . .
+ // | /
+ // MergeMem
+ // |
+ // MemBarVolatile
+ //
+ // This is referred to as a *normal* subgraph. It can easily be
+ // detected starting from any candidate MemBarRelease,
+ // StoreX[mo_release] or MemBarVolatile.
+ //
+ // the code below uses two helper predicates, leading_to_normal and
+ // normal_to_leading to identify this configuration, one validating
+ // the layout starting from the top membar and searching down and
+ // the other validating the layout starting from the lower membar
+ // and searching up.
+ //
+ // There are two special case GC configurations when a normal graph
+ // may not be generated: when using G1 (which always employs a
+ // conditional card mark); and when using CMS with conditional card
+ // marking configured. These GCs are both concurrent rather than
+ // stop-the-world GCs. So they introduce extra Ctl+Mem flow into the
+ // graph between the leading and trailing membar nodes, in
+ // particular enforcing stronger memory serialisation between the
+ // object put and the corresponding conditional card mark. CMS
+ // employs a post-write GC barrier while G1 employs both a pre- and
+ // post-write GC barrier. Of course the extra nodes may be absent --
+ // they are only inserted for object puts. This significantly
+ // complicates the task of identifying whether a MemBarRelease,
+ // StoreX[mo_release] or MemBarVolatile forms part of a volatile put
+ // when using these GC configurations (see below).
+ //
+ // In both cases the post-write subtree includes an auxiliary
+ // MemBarVolatile (StoreLoad barrier) separating the object put and
+ // the read of the corresponding card. This poses two additional
+ // problems.
+ //
+ // Firstly, a card mark MemBarVolatile needs to be distinguished
+ // from a normal trailing MemBarVolatile. Resolving this first
+ // problem is straightforward: a card mark MemBarVolatile always
+ // projects a Mem feed to a StoreCM node and that is a unique marker
+ //
+ // MemBarVolatile (card mark)
+ // C | \ . . .
+ // | StoreCM . . .
+ // . . .
+ //
+ // The second problem is how the code generator is to translate the
+ // card mark barrier? It always needs to be translated to a "dmb
+ // ish" instruction whether or not it occurs as part of a volatile
+ // put. A StoreLoad barrier is needed after the object put to ensure
+ // i) visibility to GC threads of the object put and ii) visibility
+ // to the mutator thread of any card clearing write by a GC
+ // thread. Clearly a normal store (str) will not guarantee this
+ // ordering but neither will a releasing store (stlr). The latter
+ // guarantees that the object put is visible but does not guarantee
+ // that writes by other threads have also been observed.
+ //
+ // So, returning to the task of translating the object put and the
+ // leading/trailing membar nodes: what do the non-normal node graphs
+ // look like for these 2 special cases? and how can we determine the
+ // status of a MemBarRelease, StoreX[mo_release] or MemBarVolatile
+ // in both normal and non-normal cases?
+ //
+ // A CMS GC post-barrier wraps its card write (StoreCM) inside an If
+ // which selects conditional execution based on the value loaded
+ // (LoadB) from the card. Ctl and Mem are fed to the If via an
+ // intervening StoreLoad barrier (MemBarVolatile).
+ //
+ // So, with CMS we may see a node graph which looks like this
+ //
+ // MemBarRelease
+ // MemBarCPUOrder_(leading)__________________
+ // C | M \ \\ C \
+ // | \ StoreN/P[mo_release] CastP2X
+ // | Bot \ /
+ // | MergeMem
+ // | /
+ // MemBarVolatile (card mark)
+ // C | || M |
+ // | LoadB |
+ // | | |
+ // | Cmp |\
+ // | / | \
+ // If | \
+ // | \ | \
+ // IfFalse IfTrue | \
+ // \ / \ | \
+ // \ / StoreCM |
+ // \ / | |
+ // Region . . . |
+ // | \ /
+ // | . . . \ / Bot
+ // | MergeMem
+ // | |
+ // MemBarVolatile (trailing)
+ //
+ // The first MergeMem merges the AliasIdxBot Mem slice from the
+ // leading membar and the oopptr Mem slice from the Store into the
+ // card mark membar. The trailing MergeMem merges the AliasIdxBot
+ // Mem slice from the card mark membar and the AliasIdxRaw slice
+ // from the StoreCM into the trailing membar (n.b. the latter
+ // proceeds via a Phi associated with the If region).
+ //
+ // G1 is quite a lot more complicated. The nodes inserted on behalf
+ // of G1 may comprise: a pre-write graph which adds the old value to
+ // the SATB queue; the releasing store itself; and, finally, a
+ // post-write graph which performs a card mark.
+ //
+ // The pre-write graph may be omitted, but only when the put is
+ // writing to a newly allocated (young gen) object and then only if
+ // there is a direct memory chain to the Initialize node for the
+ // object allocation. This will not happen for a volatile put since
+ // any memory chain passes through the leading membar.
+ //
+ // The pre-write graph includes a series of 3 If tests. The outermost
+ // If tests whether SATB is enabled (no else case). The next If tests
+ // whether the old value is non-NULL (no else case). The third tests
+ // whether the SATB queue index is > 0, if so updating the queue. The
+ // else case for this third If calls out to the runtime to allocate a
+ // new queue buffer.
+ //
+ // So with G1 the pre-write and releasing store subgraph looks like
+ // this (the nested Ifs are omitted).
+ //
+ // MemBarRelease (leading)____________
+ // C | || M \ M \ M \ M \ . . .
+ // | LoadB \ LoadL LoadN \
+ // | / \ \
+ // If |\ \
+ // | \ | \ \
+ // IfFalse IfTrue | \ \
+ // | | | \ |
+ // | If | /\ |
+ // | | \ |
+ // | \ |
+ // | . . . \ |
+ // | / | / | |
+ // Region Phi[M] | |
+ // | \ | | |
+ // | \_____ | ___ | |
+ // C | C \ | C \ M | |
+ // | CastP2X | StoreN/P[mo_release] |
+ // | | | |
+ // C | M | M | M |
+ // \ | | /
+ // . . .
+ // (post write subtree elided)
+ // . . .
+ // C \ M /
+ // MemBarVolatile (trailing)
+ //
+ // n.b. the LoadB in this subgraph is not the card read -- it's a
+ // read of the SATB queue active flag.
+ //
+ // The G1 post-write subtree is also optional, this time when the
+ // new value being written is either null or can be identified as a
+ // newly allocated (young gen) object with no intervening control
+ // flow. The latter cannot happen but the former may, in which case
+ // the card mark membar is omitted and the memory feeds from the
+ // leading membar and the StoreN/P are merged direct into the
+ // trailing membar as per the normal subgraph. So, the only special
+ // case which arises is when the post-write subgraph is generated.
+ //
+ // The kernel of the post-write G1 subgraph is the card mark itself
+ // which includes a card mark memory barrier (MemBarVolatile), a
+ // card test (LoadB), and a conditional update (If feeding a
+ // StoreCM). These nodes are surrounded by a series of nested Ifs
+ // which try to avoid doing the card mark. The top level If skips if
+ // the object reference does not cross regions (i.e. it tests if
+ // (adr ^ val) >> log2(regsize) != 0) -- intra-region references
+ // need not be recorded. The next If, which skips on a NULL value,
+ // may be absent (it is not generated if the type of value is >=
+ // OopPtr::NotNull). The 3rd If skips writes to young regions (by
+ // checking if card_val != young). n.b. although this test requires
+ // a pre-read of the card it can safely be done before the StoreLoad
+ // barrier. However that does not bypass the need to reread the card
+ // after the barrier.
+ //
+ // (pre-write subtree elided)
+ // . . . . . . . . . . . .
+ // C | M | M | M |
+ // Region Phi[M] StoreN |
+ // | / \ | |
+ // / \_______ / \ | |
+ // C / C \ . . . \ | |
+ // If CastP2X . . . | | |
+ // / \ | | |
+ // / \ | | |
+ // IfFalse IfTrue | | |
+ // | | | | /|
+ // | If | | / |
+ // | / \ | | / |
+ // | / \ \ | / |
+ // | IfFalse IfTrue MergeMem |
+ // | . . . / \ / |
+ // | / \ / |
+ // | IfFalse IfTrue / |
+ // | . . . | / |
+ // | If / |
+ // | / \ / |
+ // | / \ / |
+ // | IfFalse IfTrue / |
+ // | . . . | / |
+ // | \ / |
+ // | \ / |
+ // | MemBarVolatile__(card mark) |
+ // | || C | M \ M \ |
+ // | LoadB If | | |
+ // | / \ | | |
+ // | . . . | | |
+ // | \ | | /
+ // | StoreCM | /
+ // | . . . | /
+ // | _________/ /
+ // | / _____________/
+ // | . . . . . . | / /
+ // | | | / _________/
+ // | | Phi[M] / /
+ // | | | / /
+ // | | | / /
+ // | Region . . . Phi[M] _____/
+ // | / | /
+ // | | /
+ // | . . . . . . | /
+ // | / | /
+ // Region | | Phi[M]
+ // | | | / Bot
+ // \ MergeMem
+ // \ /
+ // MemBarVolatile
+ //
+ // As with CMS the initial MergeMem merges the AliasIdxBot Mem slice
+ // from the leading membar and the oopptr Mem slice from the Store
+ // into the card mark membar i.e. the memory flow to the card mark
+ // membar still looks like a normal graph.
+ //
+ // The trailing MergeMem merges an AliasIdxBot Mem slice with other
+ // Mem slices (from the StoreCM and other card mark queue stores).
+ // However in this case the AliasIdxBot Mem slice does not come
+ // direct from the card mark membar. It is merged through a series
+ // of Phi nodes. These are needed to merge the AliasIdxBot Mem flow
+ // from the leading membar with the Mem feed from the card mark
+ // membar. Each Phi corresponds to one of the Ifs which may skip
+ // around the card mark membar. So when the If implementing the NULL
+ // value check has been elided the total number of Phis is 2
+ // otherwise it is 3.
+ //
+ // So, the upshot is that in all cases the volatile put graph will
+ // include a *normal* memory subgraph between the leading membar and
+ // its child membar. When that child is not a card mark membar then
+ // it marks the end of a volatile put subgraph. If the child is a
+ // card mark membar then the normal subgraph will form part of a
+ // volatile put subgraph if and only if the child feeds an
+ // AliasIdxBot Mem feed to a trailing barrier via a MergeMem. That
+ // feed is either direct (for CMS) or via 2 or 3 Phi nodes merging
+ // the leading barrier memory flow (for G1).
+ //
+ // The predicates controlling generation of instructions for store
+ // and barrier nodes employ a few simple helper functions (described
+ // below) which identify the presence or absence of these subgraph
+ // configurations and provide a means of traversing from one node in
+ // the subgraph to another.
+
+ // leading_to_normal
+ //
+ // graph traversal helper which detects the normal case Mem feed
+ // from a release membar (or, optionally, its cpuorder child) to a
+ // dependent volatile membar i.e. it ensures that the following Mem
+ // flow subgraph is present.
+ //
+ // MemBarRelease
+ // MemBarCPUOrder
+ // | \ . . .
+ // | StoreN/P[mo_release] . . .
+ // | /
+ // MergeMem
+ // |
+ // MemBarVolatile
+ //
+ // if the correct configuration is present returns the volatile
+ // membar otherwise NULL.
+ //
+ // the input membar is expected to be either a cpuorder membar or a
+ // release membar. in the latter case it should not have a cpuorder membar
+ // child.
+ //
+ // the returned membar may be a card mark membar rather than a
+ // trailing membar.
+
+ MemBarNode *leading_to_normal(MemBarNode *leading)
+ {
+   assert((leading->Opcode() == Op_MemBarRelease ||
+           leading->Opcode() == Op_MemBarCPUOrder),
+          "expecting a release or cpuorder membar!");
+
+   // check the mem flow
+   ProjNode *mem = leading->proj_out(TypeFunc::Memory);
+
+   if (!mem)
+     return NULL;
+
+   Node *x = NULL;
+   StoreNode *st = NULL;
+   MergeMemNode *mm = NULL;
+
+   for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) {
+     x = mem->fast_out(i);
+     if (x->is_MergeMem()) {
+       // two merge mems is one too many
+       if (mm != NULL)
+         return NULL;
+       mm = x->as_MergeMem();
+     } else if (x->is_Store() && x->as_Store()->is_release() && x->Opcode() != Op_StoreCM) {
+       // two releasing stores is one too many
+       if (st != NULL)
+         return NULL;
+       st = x->as_Store();
+     }
+   }
+
+   if (!mm || !st)
+     return NULL;
+
+   bool found = false;
+   // ensure the store feeds the merge
+   for (DUIterator_Fast imax, i = st->fast_outs(imax); i < imax; i++) {
+     if (st->fast_out(i) == mm) {
+       found = true;
+       break;
+     }
+   }
+
+   if (!found)
+     return NULL;
+
+   MemBarNode *mbvol = NULL;
+   // ensure the merge feeds a volatile membar
+   for (DUIterator_Fast imax, i = mm->fast_outs(imax); i < imax; i++) {
+     x = mm->fast_out(i);
+     if (x->is_MemBar() && x->Opcode() == Op_MemBarVolatile) {
+       mbvol = x->as_MemBar();
+       break;
+     }
+   }
+
+   return mbvol;
+ }
+
+ // normal_to_leading
+ //
+ // graph traversal helper which detects the normal case Mem feed
+ // from either a card mark or a trailing membar to a preceding
+ // release membar (optionally its cpuorder child) i.e. it ensures
+ // that the following Mem flow subgraph is present.
+ //
+ // MemBarRelease
+ // MemBarCPUOrder {leading}
+ // | \ . . .
+ // | StoreN/P[mo_release] . . .
+ // | /
+ // MergeMem
+ // |
+ // MemBarVolatile
+ //
+ // this predicate checks for the same flow as the previous predicate
+ // but starting from the bottom rather than the top.
+ //
+ // if the configuration is present returns the cpuorder membar for
+ // preference or when absent the release membar otherwise NULL.
+ //
+ // n.b. the input membar is expected to be a MemBarVolatile but
+ // need not be a card mark membar.
+
+ MemBarNode *normal_to_leading(const MemBarNode *barrier)
+ {
+   // input must be a volatile membar
+   assert(barrier->Opcode() == Op_MemBarVolatile, "expecting a volatile membar");
+   Node *x;
+
+   // the Mem feed to the membar should be a merge
+   x = barrier->in(TypeFunc::Memory);
+   if (!x->is_MergeMem())
+     return NULL;
+
+   MergeMemNode *mm = x->as_MergeMem();
+
+   // the AliasIdxBot slice should be another MemBar projection
+   x = mm->in(Compile::AliasIdxBot);
+   // ensure this is a non control projection
+   if (!x->is_Proj() || x->is_CFG())
+     return NULL;
+   // if it is fed by a membar that's the one we want
+   x = x->in(0);
+   if (!x->is_MemBar())
+     return NULL;
+
+   MemBarNode *leading = x->as_MemBar();
+   // reject invalid candidates
+   if (!leading_membar(leading))
+     return NULL;
+
+   // ok, we have a leading membar, now for the sanity clauses
+   // the leading membar must feed Mem to a releasing store
+   ProjNode *mem = leading->proj_out(TypeFunc::Memory);
+   if (!mem)
+     return NULL;
+   StoreNode *st = NULL;
+   for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) {
+     x = mem->fast_out(i);
+     if (x->is_Store() && x->as_Store()->is_release() && x->Opcode() != Op_StoreCM) {
+       st = x->as_Store();
+       break;
+     }
+   }
+   if (st == NULL)
+     return NULL;
+
+   // the releasing store has to feed the same merge
+   for (DUIterator_Fast imax, i = st->fast_outs(imax); i < imax; i++) {
+     if (st->fast_out(i) == mm)
+       return leading;
+   }
+
+   return NULL;
+ }
+
+ // card_mark_to_trailing
+ //
+ // graph traversal helper which detects extra, non-normal Mem feed
+ // from a card mark volatile membar to a trailing membar i.e. it
+ // ensures that one of the following three GC post-write Mem flow
+ // subgraphs is present.
+ //
+ // 1)
+ // . . .
+ // |
+ // MemBarVolatile (card mark)
+ // | |
+ // | StoreCM
+ // | |
+ // | . . .
+ // Bot | /
+ // MergeMem
+ // |
+ // MemBarVolatile (trailing)
+ //
+ //
+ // 2)
+ // MemBarRelease/CPUOrder (leading)
+ // |
+ // |
+ // |\ . . .
+ // | \ |
+ // | \ MemBarVolatile (card mark)
+ // | \ | |
+ // \ \ | StoreCM . . .
+ // \ \ |
+ // \ Phi
+ // \ /
+ // Phi . . .
+ // Bot | /
+ // MergeMem
+ // |
+ // MemBarVolatile (trailing)
+ //
+ // 3)
+ // MemBarRelease/CPUOrder (leading)
+ // |
+ // |\
+ // | \
+ // | \ . . .
+ // | \ |
+ // |\ \ MemBarVolatile (card mark)
+ // | \ \ | |
+ // | \ \ | StoreCM . . .
+ // | \ \ |
+ // \ \ Phi
+ // \ \ /
+ // \ Phi
+ // \ /
+ // Phi . . .
+ // Bot | /
+ // MergeMem
+ // |
+ // MemBarVolatile (trailing)
+ //
+ // configuration 1 is only valid if UseConcMarkSweepGC &&
+ // UseCondCardMark
+ //
+ // configurations 2 and 3 are only valid if UseG1GC.
+ //
+ // if a valid configuration is present returns the trailing membar
+ // otherwise NULL.
+ //
+ // n.b. the supplied membar is expected to be a card mark
+ // MemBarVolatile i.e. the caller must ensure the input node has the
+ // correct opcode and feeds Mem to a StoreCM node
+
+ MemBarNode *card_mark_to_trailing(const MemBarNode *barrier)
+ {
+   // input must be a card mark volatile membar
+   assert(is_card_mark_membar(barrier), "expecting a card mark membar");
+
+   Node *feed = barrier->proj_out(TypeFunc::Memory);
+   Node *x;
+   MergeMemNode *mm = NULL;
+
+   const int MAX_PHIS = 3; // max phis we will search through
+   int phicount = 0;       // current search count
+
+   bool retry_feed = true;
+   while (retry_feed) {
+     // see if we have a direct MergeMem feed
+     for (DUIterator_Fast imax, i = feed->fast_outs(imax); i < imax; i++) {
+       x = feed->fast_out(i);
+       // the first MergeMem user of the current feed ends the search
+       if (x->is_MergeMem()) {
+         mm = x->as_MergeMem();
+         break;
+       }
+     }
+     if (mm) {
+       retry_feed = false;
+     } else if (UseG1GC && phicount++ < MAX_PHIS) {
+       // with G1 the barrier may feed indirectly via up to MAX_PHIS Phis
+       PhiNode *phi = NULL;
+       for (DUIterator_Fast imax, i = feed->fast_outs(imax); i < imax; i++) {
+         x = feed->fast_out(i);
+         // the correct Phi will be merging a Bot memory slice
+         if (x->is_Phi() && x->adr_type() == TypePtr::BOTTOM) {
+           phi = x->as_Phi();
+           break;
+         }
+       }
+       if (!phi)
+         return NULL;
+       // look for another merge below this phi
+       feed = phi;
+     } else {
+       // couldn't find a merge
+       return NULL;
+     }
+   }
+
+   // sanity check this feed turns up as the expected slice
+   assert(mm->in(Compile::AliasIdxBot) == feed, "expecting membar to feed AliasIdxBot slice to Merge");
+
+   MemBarNode *trailing = NULL;
+   // be sure we have a volatile membar below the merge
+   for (DUIterator_Fast imax, i = mm->fast_outs(imax); i < imax; i++) {
+     x = mm->fast_out(i);
+     if (x->is_MemBar() && x->Opcode() == Op_MemBarVolatile) {
+       trailing = x->as_MemBar();
+       break;
+     }
+   }
+
+   return trailing;
+ }
+
+ // trailing_to_card_mark
+ //
+ // graph traversal helper which detects extra, non-normal Mem feed
+ // from a trailing membar to a preceding card mark volatile membar
+ // i.e. it identifies whether one of the three possible extra GC
+ // post-write Mem flow subgraphs is present
+ //
+ // this predicate checks for the same flow as the previous predicate
+ // but starting from the bottom rather than the top.
+ //
+ // if the configuration is present returns the card mark membar
+ // otherwise NULL
+
+ MemBarNode *trailing_to_card_mark(const MemBarNode *trailing)
+ {
+ assert(!is_card_mark_membar(trailing), "not expecting a card mark membar");
+
+ Node *x = trailing->in(TypeFunc::Memory);
+ // the Mem feed to the membar should be a merge
+ if (!x->is_MergeMem())
+ return NULL;
+
+ MergeMemNode *mm = x->as_MergeMem();
+
+ x = mm->in(Compile::AliasIdxBot);
+ // with G1 the Bot slice may pass through up to MAX_PHIS Phis before
+ // we reach the Memory Proj from the card mark membar
+
+ const int MAX_PHIS = 3; // max phis we will search through
+ int phicount = 0; // current search count
+
+ bool retry_feed = !x->is_Proj();
+
+ while (retry_feed) {
+ if (UseG1GC && x->is_Phi() && phicount++ < MAX_PHIS) {
+ PhiNode *phi = x->as_Phi();
+ ProjNode *proj = NULL;
+ PhiNode *nextphi = NULL;
+ bool found_leading = false;
+ for (uint i = 1; i < phi->req(); i++) {
+ x = phi->in(i);
+ if (x->is_Phi()) {
+ nextphi = x->as_Phi();
+ } else if (x->is_Proj()) {
+ int opcode = x->in(0)->Opcode();
+ if (opcode == Op_MemBarVolatile) {
+ proj = x->as_Proj();
+ } else if (opcode == Op_MemBarRelease ||
+ opcode == Op_MemBarCPUOrder) {
+ // probably a leading membar
+ found_leading = true;
+ }
+ }
+ }
+ // if we found a correct looking proj then stop searching and use
+ // it; otherwise we must see both a leading membar proj and a
+ // further phi or this is the wrong config
+ if (proj != NULL) {
+ x = proj;
+ retry_feed = false;
+ } else if (found_leading && nextphi != NULL) {
+ // no volatile membar proj yet: continue the search from the next phi
+ x = nextphi;
+ } else {
+ // not what we were looking for
+ return NULL;
+ }
+ } else {
+ return NULL;
+ }
+ }
+ // the proj has to come from the card mark membar
+ x = x->in(0);
+ if (!x->is_MemBar())
+ return NULL;
+
+ MemBarNode *card_mark_membar = x->as_MemBar();
+
+ if (!is_card_mark_membar(card_mark_membar))
+ return NULL;
+
+ return card_mark_membar;
+ }
+
+ // trailing_to_leading
+ //
+ // graph traversal helper which checks the Mem flow up the graph
+ // from a (non-card mark) volatile membar attempting to locate and
+ // return an associated leading membar. it first looks for a
+ // subgraph in the normal configuration (relying on helper
+ // normal_to_leading). failing that it then looks for one of the
+ // possible post-write card mark subgraphs linking the trailing node
+ // to the card mark membar (relying on helper
+ // trailing_to_card_mark), and then checks that the card mark membar
+ // is fed by a leading membar (once again relying on auxiliary
+ // predicate normal_to_leading).
+ //
+ // if the configuration is valid returns the cpuorder membar for
+ // preference or when absent the release membar otherwise NULL.
+ //
+ // n.b. the input membar is expected to be a volatile membar but
+ // must *not* be a card mark membar.
+
+ MemBarNode *trailing_to_leading(const MemBarNode *trailing)
+ {
+   assert(!is_card_mark_membar(trailing), "not expecting a card mark membar");
+
+   // first try the normal configuration directly above the trailing membar
+   MemBarNode *result = normal_to_leading(trailing);
+
+   if (result == NULL) {
+     // otherwise hop up through a card mark membar, if there is one,
+     // and look for the normal configuration above that instead
+     MemBarNode *via = trailing_to_card_mark(trailing);
+     if (via != NULL)
+       result = normal_to_leading(via);
+   }
+   return result;
+ }
+
// predicates controlling emit of ldr<x>/ldar<x> and associated dmb
-bool unnecessary_acquire(const Node *barrier) {
+bool unnecessary_acquire(const Node *barrier)
+{
// assert barrier->is_MemBar();
if (UseBarriersForVolatile)
// we need to plant a dmb
@@ -1323,13 +2146,11 @@
return (x->is_Load() && x->as_Load()->is_acquire());
}
- // only continue if we want to try to match unsafe volatile gets
- if (UseBarriersForUnsafeVolatileGet)
- return false;
+ // now check for an unsafe volatile get
// need to check for
//
- // MemBarCPUOrder
+ // MemBarCPUOrder
// || \\
// MemBarAcquire* LoadX[mo_acquire]
// ||
@@ -1341,9 +2162,13 @@
// check for a parent MemBarCPUOrder
ProjNode *ctl;
ProjNode *mem;
- MemBarNode *parent = has_parent_membar(barrier, ctl, mem);
+ MemBarNode *parent = parent_membar(barrier);
if (!parent || parent->Opcode() != Op_MemBarCPUOrder)
return false;
+ ctl = parent->proj_out(TypeFunc::Control);
+ mem = parent->proj_out(TypeFunc::Memory);
+ if (!ctl || !mem)
+ return false;
// ensure the proj nodes both feed a LoadX[mo_acquire]
LoadNode *ld = NULL;
for (DUIterator_Fast imax, i = ctl->fast_outs(imax); i < imax; i++) {
@@ -1369,7 +2194,7 @@
if (ld)
return false;
// check for a child cpuorder membar
- MemBarNode *child = has_child_membar(barrier->as_MemBar(), ctl, mem);
+ MemBarNode *child = child_membar(barrier->as_MemBar());
if (!child || child->Opcode() != Op_MemBarCPUOrder)
return false;
@@ -1422,9 +2247,7 @@
return true;
}
- // only continue if we want to try to match unsafe volatile gets
- if (UseBarriersForUnsafeVolatileGet)
- return false;
+ // now check for an unsafe volatile get
// check if Ctl and Proj feed comes from a MemBarCPUOrder
//
@@ -1435,22 +2258,20 @@
// MemBarCPUOrder
MemBarNode *membar;
- ProjNode *ctl;
- ProjNode *mem;
-
- membar = has_parent_membar(ld, ctl, mem);
+
+ membar = parent_membar(ld);
if (!membar || !membar->Opcode() == Op_MemBarCPUOrder)
return false;
// ensure that there is a CPUOrder->Acquire->CPUOrder membar chain
- membar = has_child_membar(membar, ctl, mem);
+ membar = child_membar(membar);
if (!membar || !membar->Opcode() == Op_MemBarAcquire)
return false;
- membar = has_child_membar(membar, ctl, mem);
+ membar = child_membar(membar);
if (!membar || !membar->Opcode() == Op_MemBarCPUOrder)
return false;
@@ -1458,194 +2279,81 @@
return true;
}
-bool unnecessary_release(const Node *n) {
+bool unnecessary_release(const Node *n)
+{
+ assert((n->is_MemBar() &&
+ n->Opcode() == Op_MemBarRelease),
+ "expecting a release membar");
+
+ if (UseBarriersForVolatile)
+ // we need to plant a dmb
+ return false;
+
+ // if there is a dependent CPUOrder barrier then use that as the
+ // leading membar
+
+ MemBarNode *barrier = n->as_MemBar();
+ // check for an intervening cpuorder membar
+ MemBarNode *b = child_membar(barrier);
+ if (b && b->Opcode() == Op_MemBarCPUOrder) {
+ // ok, so start the check from the dependent cpuorder barrier
+ barrier = b;
+ }
+
+ // must start with a normal feed
+ MemBarNode *child_barrier = leading_to_normal(barrier);
+
+ if (!child_barrier)
+ return false;
+
+ if (!is_card_mark_membar(child_barrier))
+ // this is the trailing membar and we are done
+ return true;
+
+ // must be sure this card mark feeds a trailing membar
+ MemBarNode *trailing = card_mark_to_trailing(child_barrier);
+ return (trailing != NULL);
+}
+
+bool unnecessary_volatile(const Node *n)
+{
// assert n->is_MemBar();
if (UseBarriersForVolatile)
// we need to plant a dmb
return false;
- // ok, so we can omit this release barrier if it has been inserted
- // as part of a volatile store sequence
- //
- // MemBarRelease
- // { || }
- // {MemBarCPUOrder} -- optional
- // || \\
- // || StoreX[mo_release]
- // | \ /
- // | MergeMem
- // | /
- // MemBarVolatile
- //
- // where
- // || and \\ represent Ctl and Mem feeds via Proj nodes
- // | \ and / indicate further routing of the Ctl and Mem feeds
- //
- // so we need to check that
- //
- // ia) the release membar (or its dependent cpuorder membar) feeds
- // control to a store node (via a Control project node)
- //
- // ii) the store is ordered release
- //
- // iii) the release membar (or its dependent cpuorder membar) feeds
- // control to a volatile membar (via the same Control project node)
- //
- // iv) the release membar feeds memory to a merge mem and to the
- // same store (both via a single Memory proj node)
- //
- // v) the store outputs to the merge mem
- //
- // vi) the merge mem outputs to the same volatile membar
- //
- // n.b. if this is an inlined unsafe node then the release membar
- // may feed its control and memory links via an intervening cpuorder
- // membar. this case can be dealt with when we check the release
- // membar projections. if they both feed a single cpuorder membar
- // node continue to make the same checks as above but with the
- // cpuorder membar substituted for the release membar. if they don't
- // both feed a cpuorder membar then the check fails.
- //
- // n.b.b. for an inlined unsafe store of an object in the case where
- // !TypePtr::NULL_PTR->higher_equal(type(heap_base_oop)) we may see
- // an embedded if then else where we expect the store. this is
- // needed to do the right type of store depending on whether
- // heap_base is NULL. We could check for that but for now we can
- // just take the hit of on inserting a redundant dmb for this
- // redundant volatile membar
-
- MemBarNode *barrier = n->as_MemBar();
- ProjNode *ctl;
- ProjNode *mem;
- // check for an intervening cpuorder membar
- MemBarNode *b = has_child_membar(barrier, ctl, mem);
- if (b && b->Opcode() == Op_MemBarCPUOrder) {
- // ok, so start form the dependent cpuorder barrier
- barrier = b;
- }
- // check the ctl and mem flow
- ctl = barrier->proj_out(TypeFunc::Control);
- mem = barrier->proj_out(TypeFunc::Memory);
-
- // the barrier needs to have both a Ctl and Mem projection
- if (! ctl || ! mem)
+ MemBarNode *mbvol = n->as_MemBar();
+
+ // first we check if this is part of a card mark. if so then we have
+ // to generate a StoreLoad barrier
+
+ if (is_card_mark_membar(mbvol))
+ return false;
+
+ // ok, if it's not a card mark then we still need to check if it is
+ // a trailing membar of a volatile put graph.
+
+ return (trailing_to_leading(mbvol) != NULL);
+}
+
+// predicates controlling emit of str<x>/stlr<x> and associated dmbs
+
+bool needs_releasing_store(const Node *n)
+{
+ // assert n->is_Store();
+ if (UseBarriersForVolatile)
+ // we use a normal store and dmb combination
return false;
- Node *x = NULL;
- Node *mbvol = NULL;
- StoreNode * st = NULL;
-
- // For a normal volatile write the Ctl ProjNode should have output
- // to a MemBarVolatile and a Store marked as releasing
- //
- // n.b. for an inlined unsafe store of an object in the case where
- // !TypePtr::NULL_PTR->higher_equal(type(heap_base_oop)) we may see
- // an embedded if then else where we expect the store. this is
- // needed to do the right type of store depending on whether
- // heap_base is NULL. We could check for that case too but for now
- // we can just take the hit of inserting a dmb and a non-volatile
- // store to implement the volatile store
-
- for (DUIterator_Fast imax, i = ctl->fast_outs(imax); i < imax; i++) {
- x = ctl->fast_out(i);
- if (x->is_MemBar() && x->Opcode() == Op_MemBarVolatile) {
- if (mbvol) {
- return false;
- }
- mbvol = x;
- } else if (x->is_Store()) {
- st = x->as_Store();
- if (! st->is_release()) {
- return false;
- }
- } else if (!x->is_Mach()) {
- // we may see mach nodes added during matching but nothing else
- return false;
- }
- }
-
- if (!mbvol || !st)
- return false;
-
- // the Mem ProjNode should output to a MergeMem and the same Store
- Node *mm = NULL;
- for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) {
- x = mem->fast_out(i);
- if (!mm && x->is_MergeMem()) {
- mm = x;
- } else if (x != st && !x->is_Mach()) {
- // we may see mach nodes added during matching but nothing else
- return false;
- }
- }
-
- if (!mm)
+ StoreNode *st = n->as_Store();
+
+ // the store must be marked as releasing
+ if (!st->is_release())
return false;
- // the MergeMem should output to the MemBarVolatile
- for (DUIterator_Fast imax, i = mm->fast_outs(imax); i < imax; i++) {
- x = mm->fast_out(i);
- if (x != mbvol && !x->is_Mach()) {
- // we may see mach nodes added during matching but nothing else
- return false;
- }
- }
-
- return true;
-}
-
-bool unnecessary_volatile(const Node *n) {
- // assert n->is_MemBar();
- if (UseBarriersForVolatile)
- // we need to plant a dmb
- return false;
-
- // ok, so we can omit this volatile barrier if it has been inserted
- // as part of a volatile store sequence
- //
- // MemBarRelease
- // { || }
- // {MemBarCPUOrder} -- optional
- // || \\
- // || StoreX[mo_release]
- // | \ /
- // | MergeMem
- // | /
- // MemBarVolatile
- //
- // where
- // || and \\ represent Ctl and Mem feeds via Proj nodes
- // | \ and / indicate further routing of the Ctl and Mem feeds
- //
- // we need to check that
- //
- // i) the volatile membar gets its control feed from a release
- // membar (or its dependent cpuorder membar) via a Control project
- // node
- //
- // ii) the release membar (or its dependent cpuorder membar) also
- // feeds control to a store node via the same proj node
- //
- // iii) the store is ordered release
- //
- // iv) the release membar (or its dependent cpuorder membar) feeds
- // memory to a merge mem and to the same store (both via a single
- // Memory proj node)
- //
- // v) the store outputs to the merge mem
- //
- // vi) the merge mem outputs to the volatile membar
- //
- // n.b. for an inlined unsafe store of an object in the case where
- // !TypePtr::NULL_PTR->higher_equal(type(heap_base_oop)) we may see
- // an embedded if then else where we expect the store. this is
- // needed to do the right type of store depending on whether
- // heap_base is NULL. We could check for that but for now we can
- // just take the hit of on inserting a redundant dmb for this
- // redundant volatile membar
-
- MemBarNode *mbvol = n->as_MemBar();
- Node *x = n->lookup(TypeFunc::Control);
+ // the store must be fed by a membar
+
+ Node *x = st->lookup(StoreNode::Memory);
if (! x || !x->is_Proj())
return false;
@@ -1659,200 +2367,78 @@
MemBarNode *barrier = x->as_MemBar();
- // if the barrier is a release membar we have what we want. if it is
- // a cpuorder membar then we need to ensure that it is fed by a
- // release membar in which case we proceed to check the graph below
- // this cpuorder membar as the feed
-
- if (x->Opcode() != Op_MemBarRelease) {
- if (x->Opcode() != Op_MemBarCPUOrder)
- return false;
- ProjNode *ctl;
- ProjNode *mem;
- MemBarNode *b = has_parent_membar(x, ctl, mem);
- if (!b || !b->Opcode() == Op_MemBarRelease)
- return false;
- }
-
- ProjNode *ctl = barrier->proj_out(TypeFunc::Control);
- ProjNode *mem = barrier->proj_out(TypeFunc::Memory);
-
- // barrier needs to have both a Ctl and Mem projection
- // and we need to have reached it via the Ctl projection
- if (! ctl || ! mem || ctl != proj)
- return false;
-
- StoreNode * st = NULL;
-
- // The Ctl ProjNode should have output to a MemBarVolatile and
- // a Store marked as releasing
- for (DUIterator_Fast imax, i = ctl->fast_outs(imax); i < imax; i++) {
- x = ctl->fast_out(i);
- if (x->is_MemBar() && x->Opcode() == Op_MemBarVolatile) {
- if (x != mbvol) {
- return false;
- }
- } else if (x->is_Store()) {
- st = x->as_Store();
- if (! st->is_release()) {
- return false;
- }
- } else if (!x->is_Mach()){
- // we may see mach nodes added during matching but nothing else
- return false;
- }
- }
-
- if (!st)
- return false;
-
- // the Mem ProjNode should output to a MergeMem and the same Store
- Node *mm = NULL;
- for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) {
- x = mem->fast_out(i);
- if (!mm && x->is_MergeMem()) {
- mm = x;
- } else if (x != st && !x->is_Mach()) {
- // we may see mach nodes added during matching but nothing else
- return false;
- }
- }
-
- if (!mm)
- return false;
-
- // the MergeMem should output to the MemBarVolatile
- for (DUIterator_Fast imax, i = mm->fast_outs(imax); i < imax; i++) {
- x = mm->fast_out(i);
- if (x != mbvol && !x->is_Mach()) {
- // we may see mach nodes added during matching but nothing else
- return false;
- }
- }
-
- return true;
-}
-
-
-
-bool needs_releasing_store(const Node *n)
-{
- // assert n->is_Store();
- if (UseBarriersForVolatile)
- // we use a normal store and dmb combination
+ // if the barrier is a release membar or a cpuorder membar fed by a
+ // release membar then we need to check whether that forms part of a
+ // volatile put graph.
+
+ // reject invalid candidates
+ if (!leading_membar(barrier))
return false;
- StoreNode *st = n->as_Store();
-
- if (!st->is_release())
- return false;
-
- // check if this store is bracketed by a release (or its dependent
- // cpuorder membar) and a volatile membar
- //
- // MemBarRelease
- // { || }
- // {MemBarCPUOrder} -- optional
- // || \\
- // || StoreX[mo_release]
- // | \ /
- // | MergeMem
- // | /
- // MemBarVolatile
- //
- // where
- // || and \\ represent Ctl and Mem feeds via Proj nodes
- // | \ and / indicate further routing of the Ctl and Mem feeds
- //
-
-
- Node *x = st->lookup(TypeFunc::Control);
-
- if (! x || !x->is_Proj())
- return false;
-
- ProjNode *proj = x->as_Proj();
-
- x = proj->lookup(0);
-
- if (!x || !x->is_MemBar())
- return false;
-
- MemBarNode *barrier = x->as_MemBar();
-
- // if the barrier is a release membar we have what we want. if it is
- // a cpuorder membar then we need to ensure that it is fed by a
- // release membar in which case we proceed to check the graph below
- // this cpuorder membar as the feed
-
- if (x->Opcode() != Op_MemBarRelease) {
- if (x->Opcode() != Op_MemBarCPUOrder)
- return false;
- Node *ctl = x->lookup(TypeFunc::Control);
- Node *mem = x->lookup(TypeFunc::Memory);
- if (!ctl || !ctl->is_Proj() || !mem || !mem->is_Proj())
- return false;
- x = ctl->lookup(0);
- if (!x || !x->is_MemBar() || !x->Opcode() == Op_MemBarRelease)
- return false;
- Node *y = mem->lookup(0);
- if (!y || y != x)
- return false;
- }
-
- ProjNode *ctl = barrier->proj_out(TypeFunc::Control);
- ProjNode *mem = barrier->proj_out(TypeFunc::Memory);
-
- // MemBarRelease needs to have both a Ctl and Mem projection
- // and we need to have reached it via the Ctl projection
- if (! ctl || ! mem || ctl != proj)
- return false;
-
- MemBarNode *mbvol = NULL;
-
- // The Ctl ProjNode should have output to a MemBarVolatile and
- // a Store marked as releasing
- for (DUIterator_Fast imax, i = ctl->fast_outs(imax); i < imax; i++) {
- x = ctl->fast_out(i);
- if (x->is_MemBar() && x->Opcode() == Op_MemBarVolatile) {
- mbvol = x->as_MemBar();
- } else if (x->is_Store()) {
- if (x != st) {
- return false;
- }
- } else if (!x->is_Mach()){
- return false;
- }
- }
+ // does this lead a normal subgraph?
+ MemBarNode *mbvol = leading_to_normal(barrier);
if (!mbvol)
return false;
- // the Mem ProjNode should output to a MergeMem and the same Store
- Node *mm = NULL;
- for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) {
- x = mem->fast_out(i);
- if (!mm && x->is_MergeMem()) {
- mm = x;
- } else if (x != st && !x->is_Mach()) {
- return false;
- }
- }
-
- if (!mm)
+ // all done unless this is a card mark
+ if (!is_card_mark_membar(mbvol))
+ return true;
+
+ // we found a card mark -- just make sure we have a trailing barrier
+
+ return (card_mark_to_trailing(mbvol) != NULL);
+}
+
+// predicate controlling translation of StoreCM
+//
+// returns true if the StoreStore that would otherwise precede the
+// card write can be omitted, otherwise false
+
+bool unnecessary_storestore(const Node *storecm)
+{
+ assert(storecm->Opcode() == Op_StoreCM, "expecting a StoreCM");
+
+ // we only ever need to generate a dmb ishst between an object put
+ // and the associated card mark when we are using CMS without
+ // conditional card marking
+
+ if (!UseConcMarkSweepGC || UseCondCardMark)
+ return true;
+
+ // if we are implementing volatile puts using barriers then the
+ // object put is an str so we must insert the dmb ishst
+
+ if (UseBarriersForVolatile)
return false;
- // the MergeMem should output to the MemBarVolatile
- for (DUIterator_Fast imax, i = mm->fast_outs(imax); i < imax; i++) {
- x = mm->fast_out(i);
- if (x != mbvol && !x->is_Mach()) {
- return false;
- }
- }
-
- return true;
-}
-
+ // we can omit the dmb ishst if this StoreCM is part of a volatile
+ // put because in that case the put will be implemented by stlr
+ //
+ // we need to check for a normal subgraph feeding this StoreCM.
+ // that means the StoreCM must be fed Memory from a leading membar,
+ // either a MemBarRelease or its dependent MemBarCPUOrder, and the
+ // leading membar must be part of a normal subgraph
+
+ Node *x = storecm->in(StoreNode::Memory);
+
+ if (!x->is_Proj())
+ return false;
+
+ x = x->in(0);
+
+ if (!x->is_MemBar())
+ return false;
+
+ MemBarNode *leading = x->as_MemBar();
+
+ // reject invalid candidates
+ if (!leading_membar(leading))
+ return false;
+
+ // we can omit the StoreStore if it is the head of a normal subgraph
+ return (leading_to_normal(leading) != NULL);
+}
#define __ _masm.
@@ -2944,6 +3530,13 @@
as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp);
%}
+ enc_class aarch64_enc_strb0_ordered(memory mem) %{
+ MacroAssembler _masm(&cbuf);
+ __ membar(Assembler::StoreStore);
+ loadStore(_masm, &MacroAssembler::strb, zr, $mem->opcode(),
+ as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp);
+ %}
+
enc_class aarch64_enc_strh(iRegI src, memory mem) %{
Register src_reg = as_Register($src$$reg);
loadStore(MacroAssembler(&cbuf), &MacroAssembler::strh, src_reg, $mem->opcode(),
@@ -6613,6 +7206,7 @@
instruct storeimmCM0(immI0 zero, memory mem)
%{
match(Set mem (StoreCM mem zero));
+ predicate(unnecessary_storestore(n));
ins_cost(INSN_COST);
format %{ "strb zr, $mem\t# byte" %}
@@ -6622,6 +7216,21 @@
ins_pipe(istore_mem);
%}
+// Store CMS card-mark Immediate with intervening StoreStore
+// needed when using CMS with no conditional card marking
+instruct storeimmCM0_ordered(immI0 zero, memory mem)
+%{
+ match(Set mem (StoreCM mem zero));
+
+ ins_cost(INSN_COST * 2);
+ format %{ "dmb ishst"
+ "\n\tstrb zr, $mem\t# byte" %}
+
+ ins_encode(aarch64_enc_strb0_ordered(mem));
+
+ ins_pipe(istore_mem);
+%}
+
// Store Byte
instruct storeB(iRegIorL2I src, memory mem)
%{
@@ -6643,7 +7252,7 @@
predicate(!needs_releasing_store(n));
ins_cost(INSN_COST);
- format %{ "strb zr, $mem\t# byte" %}
+ format %{ "strb rscractch2, $mem\t# byte" %}
ins_encode(aarch64_enc_strb0(mem));
@@ -7396,6 +8005,7 @@
format %{ "membar_acquire" %}
ins_encode %{
+ __ block_comment("membar_acquire");
__ membar(Assembler::LoadLoad|Assembler::LoadStore);
%}
@@ -7448,6 +8058,7 @@
format %{ "membar_release" %}
ins_encode %{
+ __ block_comment("membar_release");
__ membar(Assembler::LoadStore|Assembler::StoreStore);
%}
ins_pipe(pipe_serial);
@@ -7499,6 +8110,7 @@
format %{ "membar_volatile" %}
ins_encode %{
+ __ block_comment("membar_volatile");
__ membar(Assembler::StoreLoad);
%}
@@ -9429,7 +10041,7 @@
as_Register($src1$$reg),
as_Register($src2$$reg),
Assembler::LSR,
- $src3$$constant & 0x3f);
+ $src3$$constant & 0x1f);
%}
ins_pipe(ialu_reg_reg_shift);
@@ -9465,7 +10077,7 @@
as_Register($src1$$reg),
as_Register($src2$$reg),
Assembler::ASR,
- $src3$$constant & 0x3f);
+ $src3$$constant & 0x1f);
%}
ins_pipe(ialu_reg_reg_shift);
@@ -9501,7 +10113,7 @@
as_Register($src1$$reg),
as_Register($src2$$reg),
Assembler::LSL,
- $src3$$constant & 0x3f);
+ $src3$$constant & 0x1f);
%}
ins_pipe(ialu_reg_reg_shift);
@@ -9537,7 +10149,7 @@
as_Register($src1$$reg),
as_Register($src2$$reg),
Assembler::LSR,
- $src3$$constant & 0x3f);
+ $src3$$constant & 0x1f);
%}
ins_pipe(ialu_reg_reg_shift);
@@ -9573,7 +10185,7 @@
as_Register($src1$$reg),
as_Register($src2$$reg),
Assembler::ASR,
- $src3$$constant & 0x3f);
+ $src3$$constant & 0x1f);
%}
ins_pipe(ialu_reg_reg_shift);
@@ -9609,7 +10221,7 @@
as_Register($src1$$reg),
as_Register($src2$$reg),
Assembler::LSL,
- $src3$$constant & 0x3f);
+ $src3$$constant & 0x1f);
%}
ins_pipe(ialu_reg_reg_shift);
@@ -9645,7 +10257,7 @@
as_Register($src1$$reg),
as_Register($src2$$reg),
Assembler::LSR,
- $src3$$constant & 0x3f);
+ $src3$$constant & 0x1f);
%}
ins_pipe(ialu_reg_reg_shift);
@@ -9681,7 +10293,7 @@
as_Register($src1$$reg),
as_Register($src2$$reg),
Assembler::ASR,
- $src3$$constant & 0x3f);
+ $src3$$constant & 0x1f);
%}
ins_pipe(ialu_reg_reg_shift);
@@ -9717,7 +10329,7 @@
as_Register($src1$$reg),
as_Register($src2$$reg),
Assembler::LSL,
- $src3$$constant & 0x3f);
+ $src3$$constant & 0x1f);
%}
ins_pipe(ialu_reg_reg_shift);
@@ -9754,7 +10366,7 @@
as_Register($src1$$reg),
as_Register($src2$$reg),
Assembler::LSR,
- $src3$$constant & 0x3f);
+ $src3$$constant & 0x1f);
%}
ins_pipe(ialu_reg_reg_shift);
@@ -9792,7 +10404,7 @@
as_Register($src1$$reg),
as_Register($src2$$reg),
Assembler::ASR,
- $src3$$constant & 0x3f);
+ $src3$$constant & 0x1f);
%}
ins_pipe(ialu_reg_reg_shift);
@@ -9830,7 +10442,7 @@
as_Register($src1$$reg),
as_Register($src2$$reg),
Assembler::LSL,
- $src3$$constant & 0x3f);
+ $src3$$constant & 0x1f);
%}
ins_pipe(ialu_reg_reg_shift);
@@ -9868,7 +10480,7 @@
as_Register($src1$$reg),
as_Register($src2$$reg),
Assembler::LSR,
- $src3$$constant & 0x3f);
+ $src3$$constant & 0x1f);
%}
ins_pipe(ialu_reg_reg_shift);
@@ -9906,7 +10518,7 @@
as_Register($src1$$reg),
as_Register($src2$$reg),
Assembler::ASR,
- $src3$$constant & 0x3f);
+ $src3$$constant & 0x1f);
%}
ins_pipe(ialu_reg_reg_shift);
@@ -9944,7 +10556,7 @@
as_Register($src1$$reg),
as_Register($src2$$reg),
Assembler::LSL,
- $src3$$constant & 0x3f);
+ $src3$$constant & 0x1f);
%}
ins_pipe(ialu_reg_reg_shift);
@@ -9982,7 +10594,7 @@
as_Register($src1$$reg),
as_Register($src2$$reg),
Assembler::LSR,
- $src3$$constant & 0x3f);
+ $src3$$constant & 0x1f);
%}
ins_pipe(ialu_reg_reg_shift);
@@ -10020,7 +10632,7 @@
as_Register($src1$$reg),
as_Register($src2$$reg),
Assembler::ASR,
- $src3$$constant & 0x3f);
+ $src3$$constant & 0x1f);
%}
ins_pipe(ialu_reg_reg_shift);
@@ -10058,7 +10670,7 @@
as_Register($src1$$reg),
as_Register($src2$$reg),
Assembler::LSL,
- $src3$$constant & 0x3f);
+ $src3$$constant & 0x1f);
%}
ins_pipe(ialu_reg_reg_shift);
@@ -10096,7 +10708,7 @@
as_Register($src1$$reg),
as_Register($src2$$reg),
Assembler::LSR,
- $src3$$constant & 0x3f);
+ $src3$$constant & 0x1f);
%}
ins_pipe(ialu_reg_reg_shift);
@@ -10134,7 +10746,7 @@
as_Register($src1$$reg),
as_Register($src2$$reg),
Assembler::ASR,
- $src3$$constant & 0x3f);
+ $src3$$constant & 0x1f);
%}
ins_pipe(ialu_reg_reg_shift);
@@ -10172,7 +10784,7 @@
as_Register($src1$$reg),
as_Register($src2$$reg),
Assembler::LSL,
- $src3$$constant & 0x3f);
+ $src3$$constant & 0x1f);
%}
ins_pipe(ialu_reg_reg_shift);
@@ -10210,7 +10822,7 @@
as_Register($src1$$reg),
as_Register($src2$$reg),
Assembler::LSR,
- $src3$$constant & 0x3f);
+ $src3$$constant & 0x1f);
%}
ins_pipe(ialu_reg_reg_shift);
@@ -10248,7 +10860,7 @@
as_Register($src1$$reg),
as_Register($src2$$reg),
Assembler::ASR,
- $src3$$constant & 0x3f);
+ $src3$$constant & 0x1f);
%}
ins_pipe(ialu_reg_reg_shift);
@@ -10286,7 +10898,7 @@
as_Register($src1$$reg),
as_Register($src2$$reg),
Assembler::LSL,
- $src3$$constant & 0x3f);
+ $src3$$constant & 0x1f);
%}
ins_pipe(ialu_reg_reg_shift);
--- a/hotspot/src/cpu/aarch64/vm/aarch64_ad.m4 Thu Aug 20 09:31:28 2015 +0200
+++ b/hotspot/src/cpu/aarch64/vm/aarch64_ad.m4 Fri Aug 21 09:12:42 2015 +0200
@@ -42,7 +42,7 @@
as_Register($src1$$reg),
as_Register($src2$$reg),
Assembler::$5,
- $src3$$constant & 0x3f);
+ $src3$$constant & ifelse($1,I,0x1f,0x3f));
%}
ins_pipe(ialu_reg_reg_shift);
@@ -87,7 +87,7 @@
as_Register($src1$$reg),
as_Register($src2$$reg),
Assembler::$5,
- $src3$$constant & 0x3f);
+ $src3$$constant & ifelse($1,I,0x1f,0x3f));
%}
ins_pipe(ialu_reg_reg_shift);
--- a/hotspot/src/cpu/aarch64/vm/assembler_aarch64.cpp Thu Aug 20 09:31:28 2015 +0200
+++ b/hotspot/src/cpu/aarch64/vm/assembler_aarch64.cpp Fri Aug 21 09:12:42 2015 +0200
@@ -268,7 +268,7 @@
__ ldar(r21, r28); // ldar x21, [x28]
// LoadStoreExclusiveOp
- __ stxrw(r24, r24, r7); // stxr w24, w24, [x7]
+ __ stxrw(r21, r24, r7); // stxr w21, w24, [x7]
__ stlxrw(r21, r26, r28); // stlxr w21, w26, [x28]
__ ldxrw(r21, r6); // ldxr w21, [x6]
__ ldaxrw(r15, r30); // ldaxr w15, [x30]
@@ -299,7 +299,7 @@
// LoadStoreExclusiveOp
__ ldxpw(r25, r4, r22); // ldxp w25, w4, [x22]
- __ ldaxpw(r14, r14, r15); // ldaxp w14, w14, [x15]
+ __ ldaxpw(r13, r14, r15); // ldaxp w13, w14, [x15]
__ stxpw(r20, r26, r8, r10); // stxp w20, w26, w8, [x10]
__ stlxpw(r23, r18, r18, r18); // stlxp w23, w18, w18, [x18]
@@ -773,7 +773,7 @@
260: c85fffbb ldaxr x27, [x29]
264: c89fffa0 stlr x0, [x29]
268: c8dfff95 ldar x21, [x28]
- 26c: 88187cf8 stxr w24, w24, [x7]
+ 26c: 88157cf8 stxr w21, w24, [x7]
270: 8815ff9a stlxr w21, w26, [x28]
274: 885f7cd5 ldxr w21, [x6]
278: 885fffcf ldaxr w15, [x30]
@@ -796,7 +796,7 @@
2bc: c82870bb stxp w8, x27, x28, [x5]
2c0: c825b8c8 stlxp w5, x8, x14, [x6]
2c4: 887f12d9 ldxp w25, w4, [x22]
- 2c8: 887fb9ee ldaxp w14, w14, [x15]
+ 2c8: 887fb9ed ldaxp w13, w14, [x15]
2cc: 8834215a stxp w20, w26, w8, [x10]
2d0: 8837ca52 stlxp w23, w18, w18, [x18]
2d4: f806317e str x30, [x11,#99]
@@ -1085,13 +1085,13 @@
0xd444c320, 0xd503201f, 0xd69f03e0, 0xd6bf03e0,
0xd5033fdf, 0xd5033f9f, 0xd5033abf, 0xd61f0040,
0xd63f00a0, 0xc8147c55, 0xc805fcfd, 0xc85f7e05,
- 0xc85fffbb, 0xc89fffa0, 0xc8dfff95, 0x88187cf8,
+ 0xc85fffbb, 0xc89fffa0, 0xc8dfff95, 0x88157cf8,
0x8815ff9a, 0x885f7cd5, 0x885fffcf, 0x889ffc73,
0x88dffc56, 0x48127c0f, 0x480bff85, 0x485f7cdd,
0x485ffcf2, 0x489fff99, 0x48dffe62, 0x080a7c3e,
0x0814fed5, 0x085f7c59, 0x085ffcb8, 0x089ffc70,
0x08dfffb6, 0xc87f0a68, 0xc87fcdc7, 0xc82870bb,
- 0xc825b8c8, 0x887f12d9, 0x887fb9ee, 0x8834215a,
+ 0xc825b8c8, 0x887f12d9, 0x887fb9ed, 0x8834215a,
0x8837ca52, 0xf806317e, 0xb81b3337, 0x39000dc2,
0x78005149, 0xf84391f4, 0xb85b220c, 0x385fd356,
0x785d127e, 0x389f4149, 0x79801e3c, 0x79c014a3,
--- a/hotspot/src/cpu/aarch64/vm/assembler_aarch64.hpp Thu Aug 20 09:31:28 2015 +0200
+++ b/hotspot/src/cpu/aarch64/vm/assembler_aarch64.hpp Fri Aug 21 09:12:42 2015 +0200
@@ -1106,13 +1106,13 @@
#define INSN4(NAME, sz, op, o0) /* Four registers */ \
void NAME(Register Rs, Register Rt1, Register Rt2, Register Rn) { \
- assert(Rs != Rn, "unpredictable instruction"); \
+ guarantee(Rs != Rn && Rs != Rt1 && Rs != Rt2, "unpredictable instruction"); \
load_store_exclusive(Rs, Rt1, Rt2, Rn, sz, op, o0); \
}
#define INSN3(NAME, sz, op, o0) /* Three registers */ \
void NAME(Register Rs, Register Rt, Register Rn) { \
- assert(Rs != Rn, "unpredictable instruction"); \
+ guarantee(Rs != Rn && Rs != Rt, "unpredictable instruction"); \
load_store_exclusive(Rs, Rt, (Register)0b11111, Rn, sz, op, o0); \
}
@@ -1124,6 +1124,7 @@
#define INSN_FOO(NAME, sz, op, o0) /* Three registers, encoded differently */ \
void NAME(Register Rt1, Register Rt2, Register Rn) { \
+ guarantee(Rt1 != Rt2, "unpredictable instruction"); \
load_store_exclusive((Register)0b11111, Rt1, Rt2, Rn, sz, op, o0); \
}
--- a/hotspot/src/cpu/aarch64/vm/interp_masm_aarch64.cpp Thu Aug 20 09:31:28 2015 +0200
+++ b/hotspot/src/cpu/aarch64/vm/interp_masm_aarch64.cpp Fri Aug 21 09:12:42 2015 +0200
@@ -611,6 +611,7 @@
Label done;
const Register swap_reg = r0;
+ const Register tmp = c_rarg2;
const Register obj_reg = c_rarg3; // Will contain the oop
const int obj_offset = BasicObjectLock::obj_offset_in_bytes();
@@ -624,7 +625,7 @@
ldr(obj_reg, Address(lock_reg, obj_offset));
if (UseBiasedLocking) {
- biased_locking_enter(lock_reg, obj_reg, swap_reg, rscratch2, false, done, &slow_case);
+ biased_locking_enter(lock_reg, obj_reg, swap_reg, tmp, false, done, &slow_case);
}
// Load (object->mark() | 1) into swap_reg
@@ -643,7 +644,7 @@
cmpxchgptr(swap_reg, lock_reg, obj_reg, rscratch1, fast, &fail);
bind(fast);
atomic_incw(Address((address)BiasedLocking::fast_path_entry_count_addr()),
- rscratch2, rscratch1);
+ rscratch2, rscratch1, tmp);
b(done);
bind(fail);
} else {
@@ -671,7 +672,7 @@
if (PrintBiasedLockingStatistics) {
br(Assembler::NE, slow_case);
atomic_incw(Address((address)BiasedLocking::fast_path_entry_count_addr()),
- rscratch2, rscratch1);
+ rscratch2, rscratch1, tmp);
}
br(Assembler::EQ, done);
--- a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp Thu Aug 20 09:31:28 2015 +0200
+++ b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp Fri Aug 21 09:12:42 2015 +0200
@@ -34,6 +34,7 @@
#include "memory/resourceArea.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/klass.inline.hpp"
+#include "oops/oop.inline.hpp"
#include "opto/compile.hpp"
#include "opto/node.hpp"
#include "runtime/biasedLocking.hpp"
@@ -398,11 +399,7 @@
if (PrintBiasedLockingStatistics && counters == NULL)
counters = BiasedLocking::counters();
- bool need_tmp_reg = false;
- if (tmp_reg == noreg) {
- tmp_reg = rscratch2;
- }
- assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1);
+ assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
Address mark_addr (obj_reg, oopDesc::mark_offset_in_bytes());
Address klass_addr (obj_reg, oopDesc::klass_offset_in_bytes());
@@ -432,7 +429,7 @@
if (counters != NULL) {
Label around;
cbnz(tmp_reg, around);
- atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1);
+ atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
b(done);
bind(around);
} else {
@@ -485,7 +482,7 @@
bind(here);
if (counters != NULL) {
atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
- tmp_reg, rscratch1);
+ tmp_reg, rscratch1, rscratch2);
}
}
b(done);
@@ -511,7 +508,7 @@
bind(here);
if (counters != NULL) {
atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
- tmp_reg, rscratch1);
+ tmp_reg, rscratch1, rscratch2);
}
}
b(done);
@@ -539,7 +536,7 @@
// removing the bias bit from the object's header.
if (counters != NULL) {
atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
- rscratch1);
+ rscratch1, rscratch2);
}
bind(nope);
}
@@ -1640,15 +1637,15 @@
return Address(Rd);
}
-void MacroAssembler::atomic_incw(Register counter_addr, Register tmp) {
+void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
Label retry_load;
bind(retry_load);
// flush and load exclusive from the memory location
ldxrw(tmp, counter_addr);
addw(tmp, tmp, 1);
// if we store+flush with no intervening write tmp wil be zero
- stxrw(tmp, tmp, counter_addr);
- cbnzw(tmp, retry_load);
+ stxrw(tmp2, tmp, counter_addr);
+ cbnzw(tmp2, retry_load);
}
@@ -2021,6 +2018,14 @@
}
}
+void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
+ if (decrement.is_register()) {
+ subw(Rd, Rn, decrement.as_register());
+ } else {
+ subw(Rd, Rn, decrement.as_constant());
+ }
+}
+
void MacroAssembler::reinit_heapbase()
{
if (UseCompressedOops) {
@@ -2110,7 +2115,7 @@
return a != b.as_register() && a != c && b.as_register() != c;
}
-#define ATOMIC_OP(LDXR, OP, STXR) \
+#define ATOMIC_OP(LDXR, OP, IOP, STXR) \
void MacroAssembler::atomic_##OP(Register prev, RegisterOrConstant incr, Register addr) { \
Register result = rscratch2; \
if (prev->is_valid()) \
@@ -2120,14 +2125,15 @@
bind(retry_load); \
LDXR(result, addr); \
OP(rscratch1, result, incr); \
- STXR(rscratch1, rscratch1, addr); \
- cbnzw(rscratch1, retry_load); \
- if (prev->is_valid() && prev != result) \
- mov(prev, result); \
+ STXR(rscratch2, rscratch1, addr); \
+ cbnzw(rscratch2, retry_load); \
+ if (prev->is_valid() && prev != result) { \
+ IOP(prev, rscratch1, incr); \
+ } \
}
-ATOMIC_OP(ldxr, add, stxr)
-ATOMIC_OP(ldxrw, addw, stxrw)
+ATOMIC_OP(ldxr, add, sub, stxr)
+ATOMIC_OP(ldxrw, addw, subw, stxrw)
#undef ATOMIC_OP
--- a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp Thu Aug 20 09:31:28 2015 +0200
+++ b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp Fri Aug 21 09:12:42 2015 +0200
@@ -107,9 +107,7 @@
// Biased locking support
// lock_reg and obj_reg must be loaded up with the appropriate values.
// swap_reg is killed.
- // tmp_reg is optional. If it is supplied (i.e., != noreg) it will
- // be killed; if not supplied, push/pop will be used internally to
- // allocate a temporary (inefficient, avoid if possible).
+ // tmp_reg must be supplied and must not be rscratch1 or rscratch2
// Optional slow case is for implementations (interpreter and C1) which branch to
// slow case directly. Leaves condition codes set for C2's Fast_Lock node.
// Returns offset of first potentially-faulting instruction for null
@@ -126,10 +124,10 @@
// Helper functions for statistics gathering.
// Unconditional atomic increment.
- void atomic_incw(Register counter_addr, Register tmp);
- void atomic_incw(Address counter_addr, Register tmp1, Register tmp2) {
+ void atomic_incw(Register counter_addr, Register tmp, Register tmp2);
+ void atomic_incw(Address counter_addr, Register tmp1, Register tmp2, Register tmp3) {
lea(tmp1, counter_addr);
- atomic_incw(tmp1, tmp2);
+ atomic_incw(tmp1, tmp2, tmp3);
}
// Load Effective Address
void lea(Register r, const Address &a) {
@@ -1057,6 +1055,7 @@
void add(Register Rd, Register Rn, RegisterOrConstant increment);
void addw(Register Rd, Register Rn, RegisterOrConstant increment);
void sub(Register Rd, Register Rn, RegisterOrConstant decrement);
+ void subw(Register Rd, Register Rn, RegisterOrConstant decrement);
void adrp(Register reg1, const Address &dest, unsigned long &byte_offset);
--- a/hotspot/src/cpu/aarch64/vm/sharedRuntime_aarch64.cpp Thu Aug 20 09:31:28 2015 +0200
+++ b/hotspot/src/cpu/aarch64/vm/sharedRuntime_aarch64.cpp Fri Aug 21 09:12:42 2015 +0200
@@ -1774,6 +1774,7 @@
const Register obj_reg = r19; // Will contain the oop
const Register lock_reg = r13; // Address of compiler lock object (BasicLock)
const Register old_hdr = r13; // value of old header at unlock time
+ const Register tmp = c_rarg3;
Label slow_path_lock;
Label lock_done;
@@ -1795,7 +1796,7 @@
__ ldr(obj_reg, Address(oop_handle_reg, 0));
if (UseBiasedLocking) {
- __ biased_locking_enter(lock_reg, obj_reg, swap_reg, rscratch2, false, lock_done, &slow_path_lock);
+ __ biased_locking_enter(lock_reg, obj_reg, swap_reg, tmp, false, lock_done, &slow_path_lock);
}
// Load (object->mark() | 1) into swap_reg %r0
--- a/hotspot/src/cpu/aarch64/vm/templateInterpreter_aarch64.cpp Thu Aug 20 09:31:28 2015 +0200
+++ b/hotspot/src/cpu/aarch64/vm/templateInterpreter_aarch64.cpp Fri Aug 21 09:12:42 2015 +0200
@@ -1913,15 +1913,18 @@
}
void TemplateInterpreterGenerator::count_bytecode() {
+ Register rscratch3 = r0;
__ push(rscratch1);
__ push(rscratch2);
+ __ push(rscratch3);
Label L;
__ mov(rscratch2, (address) &BytecodeCounter::_counter_value);
__ bind(L);
__ ldxr(rscratch1, rscratch2);
__ add(rscratch1, rscratch1, 1);
- __ stxr(rscratch1, rscratch1, rscratch2);
- __ cbnzw(rscratch1, L);
+ __ stxr(rscratch3, rscratch1, rscratch2);
+ __ cbnzw(rscratch3, L);
+ __ pop(rscratch3);
__ pop(rscratch2);
__ pop(rscratch1);
}
--- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp Thu Aug 20 09:31:28 2015 +0200
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp Fri Aug 21 09:12:42 2015 +0200
@@ -1674,6 +1674,13 @@
emit_simd_arith(0x2A, dst, src, VEX_SIMD_F3, true);
}
+void Assembler::cvtsi2ssq(XMMRegister dst, Register src) {
+ NOT_LP64(assert(VM_Version::supports_sse(), ""));
+ int encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F3, true);
+ emit_int8(0x2A);
+ emit_int8((unsigned char)(0xC0 | encode));
+}
+
void Assembler::cvtss2sd(XMMRegister dst, XMMRegister src) {
NOT_LP64(assert(VM_Version::supports_sse2(), ""));
emit_simd_arith(0x5A, dst, src, VEX_SIMD_F3);
@@ -6604,13 +6611,6 @@
emit_operand(dst, src);
}
-void Assembler::cvtsi2ssq(XMMRegister dst, Register src) {
- NOT_LP64(assert(VM_Version::supports_sse(), ""));
- int encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F3, true);
- emit_int8(0x2A);
- emit_int8((unsigned char)(0xC0 | encode));
-}
-
void Assembler::cvtsi2ssq(XMMRegister dst, Address src) {
NOT_LP64(assert(VM_Version::supports_sse(), ""));
if (VM_Version::supports_evex()) {
--- a/hotspot/src/cpu/x86/vm/interp_masm_x86.cpp Thu Aug 20 09:31:28 2015 +0200
+++ b/hotspot/src/cpu/x86/vm/interp_masm_x86.cpp Fri Aug 21 09:12:42 2015 +0200
@@ -355,8 +355,8 @@
case ctos: // fall through
case stos: // fall through
case itos: movl(rax, val_addr); break;
- case ftos: movflt(xmm0, val_addr); break;
- case dtos: movdbl(xmm0, val_addr); break;
+ case ftos: load_float(val_addr); break;
+ case dtos: load_double(val_addr); break;
case vtos: /* nothing to do */ break;
default : ShouldNotReachHere();
}
@@ -376,8 +376,8 @@
case ctos: // fall through
case stos: // fall through
case itos: movl(rax, val_addr); break;
- case ftos: fld_s(val_addr); break;
- case dtos: fld_d(val_addr); break;
+ case ftos: load_float(val_addr); break;
+ case dtos: load_double(val_addr); break;
case vtos: /* nothing to do */ break;
default : ShouldNotReachHere();
}
@@ -578,6 +578,26 @@
push(r);
}
+void InterpreterMacroAssembler::push_f(XMMRegister r) {
+ subptr(rsp, wordSize);
+ movflt(Address(rsp, 0), r);
+}
+
+void InterpreterMacroAssembler::pop_f(XMMRegister r) {
+ movflt(r, Address(rsp, 0));
+ addptr(rsp, wordSize);
+}
+
+void InterpreterMacroAssembler::push_d(XMMRegister r) {
+ subptr(rsp, 2 * wordSize);
+ movdbl(Address(rsp, 0), r);
+}
+
+void InterpreterMacroAssembler::pop_d(XMMRegister r) {
+ movdbl(r, Address(rsp, 0));
+ addptr(rsp, 2 * Interpreter::stackElementSize);
+}
+
#ifdef _LP64
void InterpreterMacroAssembler::pop_i(Register r) {
// XXX can't use pop currently, upper half non clean
@@ -590,31 +610,11 @@
addptr(rsp, 2 * Interpreter::stackElementSize);
}
-void InterpreterMacroAssembler::pop_f(XMMRegister r) {
- movflt(r, Address(rsp, 0));
- addptr(rsp, wordSize);
-}
-
-void InterpreterMacroAssembler::pop_d(XMMRegister r) {
- movdbl(r, Address(rsp, 0));
- addptr(rsp, 2 * Interpreter::stackElementSize);
-}
-
void InterpreterMacroAssembler::push_l(Register r) {
subptr(rsp, 2 * wordSize);
movq(Address(rsp, 0), r);
}
-void InterpreterMacroAssembler::push_f(XMMRegister r) {
- subptr(rsp, wordSize);
- movflt(Address(rsp, 0), r);
-}
-
-void InterpreterMacroAssembler::push_d(XMMRegister r) {
- subptr(rsp, 2 * wordSize);
- movdbl(Address(rsp, 0), r);
-}
-
void InterpreterMacroAssembler::pop(TosState state) {
switch (state) {
case atos: pop_ptr(); break;
@@ -623,8 +623,8 @@
case stos:
case itos: pop_i(); break;
case ltos: pop_l(); break;
- case ftos: pop_f(); break;
- case dtos: pop_d(); break;
+ case ftos: pop_f(xmm0); break;
+ case dtos: pop_d(xmm0); break;
case vtos: /* nothing to do */ break;
default: ShouldNotReachHere();
}
@@ -640,8 +640,8 @@
case stos:
case itos: push_i(); break;
case ltos: push_l(); break;
- case ftos: push_f(); break;
- case dtos: push_d(); break;
+ case ftos: push_f(xmm0); break;
+ case dtos: push_d(xmm0); break;
case vtos: /* nothing to do */ break;
default : ShouldNotReachHere();
}
@@ -675,8 +675,20 @@
case stos: // fall through
case itos: pop_i(rax); break;
case ltos: pop_l(rax, rdx); break;
- case ftos: pop_f(); break;
- case dtos: pop_d(); break;
+ case ftos:
+ if (UseSSE >= 1) {
+ pop_f(xmm0);
+ } else {
+ pop_f();
+ }
+ break;
+ case dtos:
+ if (UseSSE >= 2) {
+ pop_d(xmm0);
+ } else {
+ pop_d();
+ }
+ break;
case vtos: /* nothing to do */ break;
default : ShouldNotReachHere();
}
@@ -695,7 +707,7 @@
fstp_s(Address(rsp, 0));
}
-void InterpreterMacroAssembler::push_d(Register r) {
+void InterpreterMacroAssembler::push_d() {
// Do not schedule for no AGI! Never write beyond rsp!
subptr(rsp, 2 * wordSize);
fstp_d(Address(rsp, 0));
@@ -711,8 +723,20 @@
case stos: // fall through
case itos: push_i(rax); break;
case ltos: push_l(rax, rdx); break;
- case ftos: push_f(); break;
- case dtos: push_d(rax); break;
+ case ftos:
+ if (UseSSE >= 1) {
+ push_f(xmm0);
+ } else {
+ push_f();
+ }
+ break;
+ case dtos:
+ if (UseSSE >= 2) {
+ push_d(xmm0);
+ } else {
+ push_d();
+ }
+ break;
case vtos: /* nothing to do */ break;
default : ShouldNotReachHere();
}
@@ -995,22 +1019,6 @@
leave(); // remove frame anchor
pop(ret_addr); // get return address
mov(rsp, rbx); // set sp to sender sp
-#ifndef _LP64
- if (UseSSE) {
- // float and double are returned in xmm register in SSE-mode
- if (state == ftos && UseSSE >= 1) {
- subptr(rsp, wordSize);
- fstp_s(Address(rsp, 0));
- movflt(xmm0, Address(rsp, 0));
- addptr(rsp, wordSize);
- } else if (state == dtos && UseSSE >= 2) {
- subptr(rsp, 2*wordSize);
- fstp_d(Address(rsp, 0));
- movdbl(xmm0, Address(rsp, 0));
- addptr(rsp, 2*wordSize);
- }
- }
-#endif // _LP64
}
#endif // !CC_INTERP
@@ -1783,7 +1791,10 @@
void InterpreterMacroAssembler::verify_FPU(int stack_depth, TosState state) {
#ifndef _LP64
- if (state == ftos || state == dtos) MacroAssembler::verify_FPU(stack_depth);
+ if ((state == ftos && UseSSE < 1) ||
+ (state == dtos && UseSSE < 2)) {
+ MacroAssembler::verify_FPU(stack_depth);
+ }
#endif
}
--- a/hotspot/src/cpu/x86/vm/interp_masm_x86.hpp Thu Aug 20 09:31:28 2015 +0200
+++ b/hotspot/src/cpu/x86/vm/interp_masm_x86.hpp Fri Aug 21 09:12:42 2015 +0200
@@ -140,20 +140,20 @@
void push_ptr(Register r = rax);
void push_i(Register r = rax);
+ void push_f(XMMRegister r);
+ void pop_f(XMMRegister r);
+ void pop_d(XMMRegister r);
+ void push_d(XMMRegister r);
#ifdef _LP64
void pop_l(Register r = rax);
- void pop_f(XMMRegister r = xmm0);
- void pop_d(XMMRegister r = xmm0);
void push_l(Register r = rax);
- void push_f(XMMRegister r = xmm0);
- void push_d(XMMRegister r = xmm0);
#else
void pop_l(Register lo = rax, Register hi = rdx);
void pop_f();
void pop_d();
void push_l(Register lo = rax, Register hi = rdx);
- void push_d(Register r = rax);
+ void push_d();
void push_f();
#endif // _LP64
--- a/hotspot/src/cpu/x86/vm/interpreterGenerator_x86.hpp Thu Aug 20 09:31:28 2015 +0200
+++ b/hotspot/src/cpu/x86/vm/interpreterGenerator_x86.hpp Fri Aug 21 09:12:42 2015 +0200
@@ -42,6 +42,12 @@
address generate_Reference_get_entry();
address generate_CRC32_update_entry();
address generate_CRC32_updateBytes_entry(AbstractInterpreter::MethodKind kind);
+#ifndef _LP64
+ address generate_Float_intBitsToFloat_entry();
+ address generate_Float_floatToRawIntBits_entry();
+ address generate_Double_longBitsToDouble_entry();
+ address generate_Double_doubleToRawLongBits_entry();
+#endif
void lock_method(void);
void generate_stack_overflow_check(void);
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp Thu Aug 20 09:31:28 2015 +0200
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp Fri Aug 21 09:12:42 2015 +0200
@@ -3314,6 +3314,42 @@
fincstp();
}
+void MacroAssembler::load_float(Address src) {
+ if (UseSSE >= 1) {
+ movflt(xmm0, src);
+ } else {
+ LP64_ONLY(ShouldNotReachHere());
+ NOT_LP64(fld_s(src));
+ }
+}
+
+void MacroAssembler::store_float(Address dst) {
+ if (UseSSE >= 1) {
+ movflt(dst, xmm0);
+ } else {
+ LP64_ONLY(ShouldNotReachHere());
+ NOT_LP64(fstp_s(dst));
+ }
+}
+
+void MacroAssembler::load_double(Address src) {
+ if (UseSSE >= 2) {
+ movdbl(xmm0, src);
+ } else {
+ LP64_ONLY(ShouldNotReachHere());
+ NOT_LP64(fld_d(src));
+ }
+}
+
+void MacroAssembler::store_double(Address dst) {
+ if (UseSSE >= 2) {
+ movdbl(dst, xmm0);
+ } else {
+ LP64_ONLY(ShouldNotReachHere());
+ NOT_LP64(fstp_d(dst));
+ }
+}
+
void MacroAssembler::fremr(Register tmp) {
save_rax(tmp);
{ Label L;
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp Thu Aug 20 09:31:28 2015 +0200
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp Fri Aug 21 09:12:42 2015 +0200
@@ -471,6 +471,22 @@
// Pop ST (ffree & fincstp combined)
void fpop();
+ // Load float value from 'src'. If UseSSE >= 1, the value is loaded into
+ // register xmm0. Otherwise, the value is loaded onto the FPU stack.
+ void load_float(Address src);
+
+ // Store float value to 'dst'. If UseSSE >= 1, the value is stored
+ // from register xmm0. Otherwise, the value is stored from the FPU stack.
+ void store_float(Address dst);
+
+ // Load double value from 'src'. If UseSSE >= 2, the value is loaded into
+ // register xmm0. Otherwise, the value is loaded onto the FPU stack.
+ void load_double(Address src);
+
+ // Store double value to 'dst'. If UseSSE >= 2, the value is stored
+ // from register xmm0. Otherwise, the value is stored from the FPU stack.
+ void store_double(Address dst);
+
// pushes double TOS element of FPU stack on CPU stack; pops from FPU stack
void push_fTOS();
--- a/hotspot/src/cpu/x86/vm/templateInterpreter_x86_32.cpp Thu Aug 20 09:31:28 2015 +0200
+++ b/hotspot/src/cpu/x86/vm/templateInterpreter_x86_32.cpp Fri Aug 21 09:12:42 2015 +0200
@@ -170,22 +170,12 @@
__ MacroAssembler::verify_FPU(0, "generate_return_entry_for compiled");
}
- // In SSE mode, interpreter returns FP results in xmm0 but they need
- // to end up back on the FPU so it can operate on them.
- if (state == ftos && UseSSE >= 1) {
- __ subptr(rsp, wordSize);
- __ movflt(Address(rsp, 0), xmm0);
- __ fld_s(Address(rsp, 0));
- __ addptr(rsp, wordSize);
- } else if (state == dtos && UseSSE >= 2) {
- __ subptr(rsp, 2*wordSize);
- __ movdbl(Address(rsp, 0), xmm0);
- __ fld_d(Address(rsp, 0));
- __ addptr(rsp, 2*wordSize);
+ if (state == ftos) {
+ __ MacroAssembler::verify_FPU(UseSSE >= 1 ? 0 : 1, "generate_return_entry_for in interpreter");
+ } else if (state == dtos) {
+ __ MacroAssembler::verify_FPU(UseSSE >= 2 ? 0 : 1, "generate_return_entry_for in interpreter");
}
- __ MacroAssembler::verify_FPU(state == ftos || state == dtos ? 1 : 0, "generate_return_entry_for in interpreter");
-
// Restore stack bottom in case i2c adjusted stack
__ movptr(rsp, Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize));
// and NULL it as marker that rsp is now tos until next java call
@@ -217,21 +207,12 @@
address TemplateInterpreterGenerator::generate_deopt_entry_for(TosState state, int step) {
address entry = __ pc();
- // In SSE mode, FP results are in xmm0
- if (state == ftos && UseSSE > 0) {
- __ subptr(rsp, wordSize);
- __ movflt(Address(rsp, 0), xmm0);
- __ fld_s(Address(rsp, 0));
- __ addptr(rsp, wordSize);
- } else if (state == dtos && UseSSE >= 2) {
- __ subptr(rsp, 2*wordSize);
- __ movdbl(Address(rsp, 0), xmm0);
- __ fld_d(Address(rsp, 0));
- __ addptr(rsp, 2*wordSize);
+ if (state == ftos) {
+ __ MacroAssembler::verify_FPU(UseSSE >= 1 ? 0 : 1, "generate_deopt_entry_for in interpreter");
+ } else if (state == dtos) {
+ __ MacroAssembler::verify_FPU(UseSSE >= 2 ? 0 : 1, "generate_deopt_entry_for in interpreter");
}
- __ MacroAssembler::verify_FPU(state == ftos || state == dtos ? 1 : 0, "generate_deopt_entry_for in interpreter");
-
// The stack is not extended by deopt but we must NULL last_sp as this
// entry is like a "return".
__ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
@@ -735,7 +716,7 @@
if (UseCRC32Intrinsics) {
address entry = __ pc();
- // rbx,: Method*
+ // rbx: Method*
// rsi: senderSP must preserved for slow path, set SP to it on fast path
// rdx: scratch
// rdi: scratch
@@ -841,6 +822,124 @@
return generate_native_entry(false);
}
+/**
+ * Method entry for static native method:
+ * java.lang.Float.intBitsToFloat(int bits)
+ */
+address InterpreterGenerator::generate_Float_intBitsToFloat_entry() {
+ address entry;
+
+ if (UseSSE >= 1) {
+ entry = __ pc();
+
+ // rsi: the sender's SP
+
+ // Skip safepoint check (compiler intrinsic versions of this method
+ // do not perform safepoint checks either).
+
+ // Load 'bits' into xmm0 (interpreter returns results in xmm0)
+ __ movflt(xmm0, Address(rsp, wordSize));
+
+ // Return
+ __ pop(rdi); // get return address
+ __ mov(rsp, rsi); // set rsp to the sender's SP
+ __ jmp(rdi);
+ } else {
+ entry = generate_native_entry(false);
+ }
+
+ return entry;
+}
+
+/**
+ * Method entry for static native method:
+ * java.lang.Float.floatToRawIntBits(float value)
+ */
+address InterpreterGenerator::generate_Float_floatToRawIntBits_entry() {
+ address entry;
+
+ if (UseSSE >= 1) {
+ entry = __ pc();
+
+ // rsi: the sender's SP
+
+ // Skip safepoint check (compiler intrinsic versions of this method
+ // do not perform safepoint checks either).
+
+ // Load the parameter (a floating-point value) into rax.
+ __ movl(rax, Address(rsp, wordSize));
+
+ // Return
+ __ pop(rdi); // get return address
+ __ mov(rsp, rsi); // set rsp to the sender's SP
+ __ jmp(rdi);
+ } else {
+ entry = generate_native_entry(false);
+ }
+
+ return entry;
+}
+
+
+/**
+ * Method entry for static native method:
+ * java.lang.Double.longBitsToDouble(long bits)
+ */
+address InterpreterGenerator::generate_Double_longBitsToDouble_entry() {
+ address entry;
+
+ if (UseSSE >= 2) {
+ entry = __ pc();
+
+ // rsi: the sender's SP
+
+ // Skip safepoint check (compiler intrinsic versions of this method
+ // do not perform safepoint checks either).
+
+ // Load 'bits' into xmm0 (interpreter returns results in xmm0)
+ __ movdbl(xmm0, Address(rsp, wordSize));
+
+ // Return
+ __ pop(rdi); // get return address
+ __ mov(rsp, rsi); // set rsp to the sender's SP
+ __ jmp(rdi);
+ } else {
+ entry = generate_native_entry(false);
+ }
+
+ return entry;
+}
+
+/**
+ * Method entry for static native method:
+ * java.lang.Double.doubleToRawLongBits(double value)
+ */
+address InterpreterGenerator::generate_Double_doubleToRawLongBits_entry() {
+ address entry;
+
+ if (UseSSE >= 2) {
+ entry = __ pc();
+
+ // rsi: the sender's SP
+
+ // Skip safepoint check (compiler intrinsic versions of this method
+ // do not perform safepoint checks either).
+
+ // Load the parameter (a double value occupying two stack slots) into rdx:rax.
+ __ movl(rdx, Address(rsp, 2*wordSize));
+ __ movl(rax, Address(rsp, wordSize));
+
+ // Return
+ __ pop(rdi); // get return address
+ __ mov(rsp, rsi); // set rsp to the sender's SP
+ __ jmp(rdi);
+ } else {
+ entry = generate_native_entry(false);
+ }
+
+ return entry;
+}
+
//
// Interpreter stub for calling a native method. (asm interpreter)
// This sets up a somewhat different looking stack for calling the native method
@@ -1090,7 +1189,7 @@
double_handler.addr());
__ jcc(Assembler::notEqual, L);
__ bind(push_double);
- __ push(dtos);
+ __ push_d(); // FP values are returned using the FPU, so push FPU contents (even if UseSSE > 0).
__ bind(L);
}
__ push(ltos);
--- a/hotspot/src/cpu/x86/vm/templateInterpreter_x86_64.cpp Thu Aug 20 09:31:28 2015 +0200
+++ b/hotspot/src/cpu/x86/vm/templateInterpreter_x86_64.cpp Fri Aug 21 09:12:42 2015 +0200
@@ -1707,10 +1707,10 @@
address& vep) {
assert(t->is_valid() && t->tos_in() == vtos, "illegal template");
Label L;
- aep = __ pc(); __ push_ptr(); __ jmp(L);
- fep = __ pc(); __ push_f(); __ jmp(L);
- dep = __ pc(); __ push_d(); __ jmp(L);
- lep = __ pc(); __ push_l(); __ jmp(L);
+ aep = __ pc(); __ push_ptr(); __ jmp(L);
+ fep = __ pc(); __ push_f(xmm0); __ jmp(L);
+ dep = __ pc(); __ push_d(xmm0); __ jmp(L);
+ lep = __ pc(); __ push_l(); __ jmp(L);
bep = cep = sep =
iep = __ pc(); __ push_i();
vep = __ pc();
--- a/hotspot/src/cpu/x86/vm/templateTable_x86.cpp Thu Aug 20 09:31:28 2015 +0200
+++ b/hotspot/src/cpu/x86/vm/templateTable_x86.cpp Fri Aug 21 09:12:42 2015 +0200
@@ -349,53 +349,60 @@
void TemplateTable::fconst(int value) {
transition(vtos, ftos);
+ if (UseSSE >= 1) {
+ static float one = 1.0f, two = 2.0f;
+ switch (value) {
+ case 0:
+ __ xorps(xmm0, xmm0);
+ break;
+ case 1:
+ __ movflt(xmm0, ExternalAddress((address) &one));
+ break;
+ case 2:
+ __ movflt(xmm0, ExternalAddress((address) &two));
+ break;
+ default:
+ ShouldNotReachHere();
+ break;
+ }
+ } else {
#ifdef _LP64
- static float one = 1.0f, two = 2.0f;
- switch (value) {
- case 0:
- __ xorps(xmm0, xmm0);
- break;
- case 1:
- __ movflt(xmm0, ExternalAddress((address) &one));
- break;
- case 2:
- __ movflt(xmm0, ExternalAddress((address) &two));
- break;
- default:
ShouldNotReachHere();
- break;
+#else
+ if (value == 0) { __ fldz();
+ } else if (value == 1) { __ fld1();
+ } else if (value == 2) { __ fld1(); __ fld1(); __ faddp(); // should do a better solution here
+ } else { ShouldNotReachHere();
+ }
+#endif // _LP64
}
-#else
- if (value == 0) { __ fldz();
- } else if (value == 1) { __ fld1();
- } else if (value == 2) { __ fld1(); __ fld1(); __ faddp(); // should do a better solution here
- } else { ShouldNotReachHere();
- }
-#endif
}
void TemplateTable::dconst(int value) {
transition(vtos, dtos);
+ if (UseSSE >= 2) {
+ static double one = 1.0;
+ switch (value) {
+ case 0:
+ __ xorpd(xmm0, xmm0);
+ break;
+ case 1:
+ __ movdbl(xmm0, ExternalAddress((address) &one));
+ break;
+ default:
+ ShouldNotReachHere();
+ break;
+ }
+ } else {
#ifdef _LP64
- static double one = 1.0;
- switch (value) {
- case 0:
- __ xorpd(xmm0, xmm0);
- break;
- case 1:
- __ movdbl(xmm0, ExternalAddress((address) &one));
- break;
- default:
ShouldNotReachHere();
- break;
+#else
+ if (value == 0) { __ fldz();
+ } else if (value == 1) { __ fld1();
+ } else { ShouldNotReachHere();
+ }
+#endif
}
-
-#else
- if (value == 0) { __ fldz();
- } else if (value == 1) { __ fld1();
- } else { ShouldNotReachHere();
- }
-#endif
}
void TemplateTable::bipush() {
@@ -454,8 +461,7 @@
__ jccb(Assembler::notEqual, notFloat);
// ftos
- LP64_ONLY(__ movflt(xmm0, Address(rcx, rbx, Address::times_8, base_offset)));
- NOT_LP64(__ fld_s( Address(rcx, rbx, Address::times_ptr, base_offset)));
+ __ load_float(Address(rcx, rbx, Address::times_ptr, base_offset));
__ push(ftos);
__ jmp(Done);
@@ -522,8 +528,7 @@
__ jccb(Assembler::notEqual, Long);
// dtos
- LP64_ONLY(__ movdbl(xmm0, Address(rcx, rbx, Address::times_8, base_offset)));
- NOT_LP64(__ fld_d( Address(rcx, rbx, Address::times_ptr, base_offset)));
+ __ load_double(Address(rcx, rbx, Address::times_ptr, base_offset));
__ push(dtos);
__ jmpb(Done);
@@ -617,15 +622,13 @@
void TemplateTable::fload() {
transition(vtos, ftos);
locals_index(rbx);
- LP64_ONLY(__ movflt(xmm0, faddress(rbx)));
- NOT_LP64(__ fld_s(faddress(rbx)));
+ __ load_float(faddress(rbx));
}
void TemplateTable::dload() {
transition(vtos, dtos);
locals_index(rbx);
- LP64_ONLY(__ movdbl(xmm0, daddress(rbx)));
- NOT_LP64(__ fld_d(daddress(rbx)));
+ __ load_double(daddress(rbx));
}
void TemplateTable::aload() {
@@ -657,15 +660,13 @@
void TemplateTable::wide_fload() {
transition(vtos, ftos);
locals_index_wide(rbx);
- LP64_ONLY(__ movflt(xmm0, faddress(rbx)));
- NOT_LP64(__ fld_s(faddress(rbx)));
+ __ load_float(faddress(rbx));
}
void TemplateTable::wide_dload() {
transition(vtos, dtos);
locals_index_wide(rbx);
- LP64_ONLY(__ movdbl(xmm0, daddress(rbx)));
- NOT_LP64(__ fld_d(daddress(rbx)));
+ __ load_double(daddress(rbx));
}
void TemplateTable::wide_aload() {
@@ -726,10 +727,9 @@
// rax: index
// rdx: array
index_check(rdx, rax); // kills rbx
- LP64_ONLY(__ movflt(xmm0, Address(rdx, rax,
- Address::times_4,
- arrayOopDesc::base_offset_in_bytes(T_FLOAT))));
- NOT_LP64(__ fld_s(Address(rdx, rax, Address::times_4, arrayOopDesc::base_offset_in_bytes(T_FLOAT))));
+ __ load_float(Address(rdx, rax,
+ Address::times_4,
+ arrayOopDesc::base_offset_in_bytes(T_FLOAT)));
}
void TemplateTable::daload() {
@@ -737,10 +737,9 @@
// rax: index
// rdx: array
index_check(rdx, rax); // kills rbx
- LP64_ONLY(__ movdbl(xmm0, Address(rdx, rax,
- Address::times_8,
- arrayOopDesc::base_offset_in_bytes(T_DOUBLE))));
- NOT_LP64(__ fld_d(Address(rdx, rax, Address::times_8, arrayOopDesc::base_offset_in_bytes(T_DOUBLE))));
+ __ load_double(Address(rdx, rax,
+ Address::times_8,
+ arrayOopDesc::base_offset_in_bytes(T_DOUBLE)));
}
void TemplateTable::aaload() {
@@ -807,14 +806,12 @@
void TemplateTable::fload(int n) {
transition(vtos, ftos);
- LP64_ONLY(__ movflt(xmm0, faddress(n)));
- NOT_LP64(__ fld_s(faddress(n)));
+ __ load_float(faddress(n));
}
void TemplateTable::dload(int n) {
transition(vtos, dtos);
- LP64_ONLY(__ movdbl(xmm0, daddress(n)));
- NOT_LP64(__ fld_d(daddress(n)));
+ __ load_double(daddress(n));
}
void TemplateTable::aload(int n) {
@@ -919,15 +916,13 @@
void TemplateTable::fstore() {
transition(ftos, vtos);
locals_index(rbx);
- LP64_ONLY(__ movflt(faddress(rbx), xmm0));
- NOT_LP64(__ fstp_s(faddress(rbx)));
+ __ store_float(faddress(rbx));
}
void TemplateTable::dstore() {
transition(dtos, vtos);
locals_index(rbx);
- LP64_ONLY(__ movdbl(daddress(rbx), xmm0));
- NOT_LP64(__ fstp_d(daddress(rbx)));
+ __ store_double(daddress(rbx));
}
void TemplateTable::astore() {
@@ -956,7 +951,7 @@
void TemplateTable::wide_fstore() {
#ifdef _LP64
transition(vtos, vtos);
- __ pop_f();
+ __ pop_f(xmm0);
locals_index_wide(rbx);
__ movflt(faddress(rbx), xmm0);
#else
@@ -967,7 +962,7 @@
void TemplateTable::wide_dstore() {
#ifdef _LP64
transition(vtos, vtos);
- __ pop_d();
+ __ pop_d(xmm0);
locals_index_wide(rbx);
__ movdbl(daddress(rbx), xmm0);
#else
@@ -1011,29 +1006,21 @@
void TemplateTable::fastore() {
transition(ftos, vtos);
__ pop_i(rbx);
- // xmm0: value
+ // value is in UseSSE >= 1 ? xmm0 : ST(0)
// rbx: index
// rdx: array
index_check(rdx, rbx); // prefer index in rbx
- LP64_ONLY(__ movflt(Address(rdx, rbx,
- Address::times_4,
- arrayOopDesc::base_offset_in_bytes(T_FLOAT)),
- xmm0));
- NOT_LP64(__ fstp_s(Address(rdx, rbx, Address::times_4, arrayOopDesc::base_offset_in_bytes(T_FLOAT))));
+ __ store_float(Address(rdx, rbx, Address::times_4, arrayOopDesc::base_offset_in_bytes(T_FLOAT)));
}
void TemplateTable::dastore() {
transition(dtos, vtos);
__ pop_i(rbx);
- // xmm0: value
+ // value is in UseSSE >= 2 ? xmm0 : ST(0)
// rbx: index
// rdx: array
index_check(rdx, rbx); // prefer index in rbx
- LP64_ONLY(__ movdbl(Address(rdx, rbx,
- Address::times_8,
- arrayOopDesc::base_offset_in_bytes(T_DOUBLE)),
- xmm0));
- NOT_LP64(__ fstp_d(Address(rdx, rbx, Address::times_8, arrayOopDesc::base_offset_in_bytes(T_DOUBLE))));
+ __ store_double(Address(rdx, rbx, Address::times_8, arrayOopDesc::base_offset_in_bytes(T_DOUBLE)));
}
void TemplateTable::aastore() {
@@ -1134,14 +1121,12 @@
void TemplateTable::fstore(int n) {
transition(ftos, vtos);
- LP64_ONLY(__ movflt(faddress(n), xmm0));
- NOT_LP64(__ fstp_s(faddress(n)));
+ __ store_float(faddress(n));
}
void TemplateTable::dstore(int n) {
transition(dtos, vtos);
- LP64_ONLY(__ movdbl(daddress(n), xmm0));
- NOT_LP64(__ fstp_d(daddress(n)));
+ __ store_double(daddress(n));
}
@@ -1425,82 +1410,127 @@
void TemplateTable::fop2(Operation op) {
transition(ftos, ftos);
+
+ if (UseSSE >= 1) {
+ switch (op) {
+ case add:
+ __ addss(xmm0, at_rsp());
+ __ addptr(rsp, Interpreter::stackElementSize);
+ break;
+ case sub:
+ __ movflt(xmm1, xmm0);
+ __ pop_f(xmm0);
+ __ subss(xmm0, xmm1);
+ break;
+ case mul:
+ __ mulss(xmm0, at_rsp());
+ __ addptr(rsp, Interpreter::stackElementSize);
+ break;
+ case div:
+ __ movflt(xmm1, xmm0);
+ __ pop_f(xmm0);
+ __ divss(xmm0, xmm1);
+ break;
+ case rem:
+ // On x86_64 platforms the SharedRuntime::frem method is called to perform the
+ // modulo operation. The frem method calls the function
+ // double fmod(double x, double y) in math.h. The documentation of fmod states:
+ // "If x or y is a NaN, a NaN is returned." without specifying what type of NaN
+ // (signalling or quiet) is returned.
+ //
+ // On x86_32 platforms the FPU is used to perform the modulo operation. The
+ // reason is that on 32-bit Windows the sign of modulo operations diverges from
+ // what is considered the standard (e.g., -0.0f % -3.14f is 0.0f and not -0.0f).
+ // The fprem instruction used on x86_32 is functionally equivalent to
+ // SharedRuntime::frem in that it returns a NaN.
#ifdef _LP64
- switch (op) {
- case add:
- __ addss(xmm0, at_rsp());
- __ addptr(rsp, Interpreter::stackElementSize);
- break;
- case sub:
- __ movflt(xmm1, xmm0);
- __ pop_f(xmm0);
- __ subss(xmm0, xmm1);
- break;
- case mul:
- __ mulss(xmm0, at_rsp());
- __ addptr(rsp, Interpreter::stackElementSize);
- break;
- case div:
- __ movflt(xmm1, xmm0);
- __ pop_f(xmm0);
- __ divss(xmm0, xmm1);
- break;
- case rem:
- __ movflt(xmm1, xmm0);
- __ pop_f(xmm0);
- __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::frem), 2);
- break;
- default:
+ __ movflt(xmm1, xmm0);
+ __ pop_f(xmm0);
+ __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::frem), 2);
+#else
+ __ push_f(xmm0);
+ __ pop_f();
+ __ fld_s(at_rsp());
+ __ fremr(rax);
+ __ f2ieee();
+ __ pop(rax); // pop second operand off the stack
+ __ push_f();
+ __ pop_f(xmm0);
+#endif
+ break;
+ default:
+ ShouldNotReachHere();
+ break;
+ }
+ } else {
+#ifdef _LP64
ShouldNotReachHere();
- break;
- }
#else
- switch (op) {
+ switch (op) {
case add: __ fadd_s (at_rsp()); break;
case sub: __ fsubr_s(at_rsp()); break;
case mul: __ fmul_s (at_rsp()); break;
case div: __ fdivr_s(at_rsp()); break;
case rem: __ fld_s (at_rsp()); __ fremr(rax); break;
default : ShouldNotReachHere();
+ }
+ __ f2ieee();
+ __ pop(rax); // pop second operand off the stack
+#endif // _LP64
}
- __ f2ieee();
- __ pop(rax); // pop float thing off
-#endif
}
void TemplateTable::dop2(Operation op) {
transition(dtos, dtos);
+ if (UseSSE >= 2) {
+ switch (op) {
+ case add:
+ __ addsd(xmm0, at_rsp());
+ __ addptr(rsp, 2 * Interpreter::stackElementSize);
+ break;
+ case sub:
+ __ movdbl(xmm1, xmm0);
+ __ pop_d(xmm0);
+ __ subsd(xmm0, xmm1);
+ break;
+ case mul:
+ __ mulsd(xmm0, at_rsp());
+ __ addptr(rsp, 2 * Interpreter::stackElementSize);
+ break;
+ case div:
+ __ movdbl(xmm1, xmm0);
+ __ pop_d(xmm0);
+ __ divsd(xmm0, xmm1);
+ break;
+ case rem:
+ // Similar to fop2(), the modulo operation is performed using the
+ // SharedRuntime::drem method (on x86_64 platforms) or using the
+ // FPU (on x86_32 platforms) for the same reasons as mentioned in fop2().
#ifdef _LP64
- switch (op) {
- case add:
- __ addsd(xmm0, at_rsp());
- __ addptr(rsp, 2 * Interpreter::stackElementSize);
- break;
- case sub:
- __ movdbl(xmm1, xmm0);
- __ pop_d(xmm0);
- __ subsd(xmm0, xmm1);
- break;
- case mul:
- __ mulsd(xmm0, at_rsp());
- __ addptr(rsp, 2 * Interpreter::stackElementSize);
- break;
- case div:
- __ movdbl(xmm1, xmm0);
- __ pop_d(xmm0);
- __ divsd(xmm0, xmm1);
- break;
- case rem:
- __ movdbl(xmm1, xmm0);
- __ pop_d(xmm0);
- __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::drem), 2);
- break;
- default:
+ __ movdbl(xmm1, xmm0);
+ __ pop_d(xmm0);
+ __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::drem), 2);
+#else
+ __ push_d(xmm0);
+ __ pop_d();
+ __ fld_d(at_rsp());
+ __ fremr(rax);
+ __ d2ieee();
+ __ pop(rax);
+ __ pop(rdx);
+ __ push_d();
+ __ pop_d(xmm0);
+#endif
+ break;
+ default:
+ ShouldNotReachHere();
+ break;
+ }
+ } else {
+#ifdef _LP64
ShouldNotReachHere();
- break;
- }
#else
- switch (op) {
+ switch (op) {
case add: __ fadd_d (at_rsp()); break;
case sub: __ fsubr_d(at_rsp()); break;
case mul: {
@@ -1543,12 +1573,13 @@
}
case rem: __ fld_d (at_rsp()); __ fremr(rax); break;
default : ShouldNotReachHere();
+ }
+ __ d2ieee();
+ // Pop double precision number from rsp.
+ __ pop(rax);
+ __ pop(rdx);
+#endif
}
- __ d2ieee();
- // Pop double precision number from rsp.
- __ pop(rax);
- __ pop(rdx);
-#endif
}
void TemplateTable::ineg() {
@@ -1562,7 +1593,6 @@
NOT_LP64(__ lneg(rdx, rax));
}
-#ifdef _LP64
// Note: 'double' and 'long long' have 32-bits alignment on x86.
static jlong* double_quadword(jlong *adr, jlong lo, jlong hi) {
// Use the expression (adr)&(~0xF) to provide 128-bits aligned address
@@ -1577,26 +1607,30 @@
// Buffer for 128-bits masks used by SSE instructions.
static jlong float_signflip_pool[2*2];
static jlong double_signflip_pool[2*2];
-#endif
void TemplateTable::fneg() {
transition(ftos, ftos);
-#ifdef _LP64
- static jlong *float_signflip = double_quadword(&float_signflip_pool[1], 0x8000000080000000, 0x8000000080000000);
- __ xorps(xmm0, ExternalAddress((address) float_signflip));
-#else
- __ fchs();
-#endif
+ if (UseSSE >= 1) {
+ static jlong *float_signflip = double_quadword(&float_signflip_pool[1], 0x8000000080000000, 0x8000000080000000);
+ __ xorps(xmm0, ExternalAddress((address) float_signflip));
+ } else {
+ LP64_ONLY(ShouldNotReachHere());
+ NOT_LP64(__ fchs());
+ }
}
void TemplateTable::dneg() {
transition(dtos, dtos);
+ if (UseSSE >= 2) {
+ static jlong *double_signflip = double_quadword(&double_signflip_pool[1], 0x8000000000000000, 0x8000000000000000);
+ __ xorpd(xmm0, ExternalAddress((address) double_signflip));
+ } else {
#ifdef _LP64
- static jlong *double_signflip = double_quadword(&double_signflip_pool[1], 0x8000000000000000, 0x8000000000000000);
- __ xorpd(xmm0, ExternalAddress((address) double_signflip));
+ ShouldNotReachHere();
#else
- __ fchs();
+ __ fchs();
#endif
+ }
}
void TemplateTable::iinc() {
@@ -1798,18 +1832,26 @@
__ extend_sign(rdx, rax);
break;
case Bytecodes::_i2f:
- __ push(rax); // store int on tos
- __ fild_s(at_rsp()); // load int to ST0
- __ f2ieee(); // truncate to float size
- __ pop(rcx); // adjust rsp
+ if (UseSSE >= 1) {
+ __ cvtsi2ssl(xmm0, rax);
+ } else {
+ __ push(rax); // store int on tos
+ __ fild_s(at_rsp()); // load int to ST0
+ __ f2ieee(); // truncate to float size
+ __ pop(rcx); // adjust rsp
+ }
break;
case Bytecodes::_i2d:
+ if (UseSSE >= 2) {
+ __ cvtsi2sdl(xmm0, rax);
+ } else {
__ push(rax); // add one slot for d2ieee()
__ push(rax); // store int on tos
__ fild_s(at_rsp()); // load int to ST0
__ d2ieee(); // truncate to double size
__ pop(rcx); // adjust rsp
__ pop(rcx);
+ }
break;
case Bytecodes::_i2b:
__ shll(rax, 24); // truncate upper 24 bits
@@ -1829,50 +1871,102 @@
/* nothing to do */
break;
case Bytecodes::_l2f:
+ // On 64-bit platforms, the cvtsi2ssq instruction is used to convert
+ // 64-bit long values to floats. On 32-bit platforms it is not possible
+ // to use that instruction with 64-bit operands, therefore the FPU is
+ // used to perform the conversion.
__ push(rdx); // store long on tos
__ push(rax);
__ fild_d(at_rsp()); // load long to ST0
__ f2ieee(); // truncate to float size
__ pop(rcx); // adjust rsp
__ pop(rcx);
+ if (UseSSE >= 1) {
+ __ push_f();
+ __ pop_f(xmm0);
+ }
break;
case Bytecodes::_l2d:
+ // On 32-bit platforms the FPU is used for conversion because on
+ // 32-bit platforms it is not possible to use the cvtsi2sdq
+ // instruction with 64-bit operands.
__ push(rdx); // store long on tos
__ push(rax);
__ fild_d(at_rsp()); // load long to ST0
__ d2ieee(); // truncate to double size
__ pop(rcx); // adjust rsp
__ pop(rcx);
+ if (UseSSE >= 2) {
+ __ push_d();
+ __ pop_d(xmm0);
+ }
break;
case Bytecodes::_f2i:
- __ push(rcx); // reserve space for argument
- __ fstp_s(at_rsp()); // pass float argument on stack
+ // SharedRuntime::f2i does not differentiate between sNaNs and qNaNs
+ // as it returns 0 for any NaN.
+ if (UseSSE >= 1) {
+ __ push_f(xmm0);
+ } else {
+ __ push(rcx); // reserve space for argument
+ __ fstp_s(at_rsp()); // pass float argument on stack
+ }
__ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::f2i), 1);
break;
case Bytecodes::_f2l:
- __ push(rcx); // reserve space for argument
- __ fstp_s(at_rsp()); // pass float argument on stack
+ // SharedRuntime::f2l does not differentiate between sNaNs and qNaNs
+ // as it returns 0 for any NaN.
+ if (UseSSE >= 1) {
+ __ push_f(xmm0);
+ } else {
+ __ push(rcx); // reserve space for argument
+ __ fstp_s(at_rsp()); // pass float argument on stack
+ }
__ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::f2l), 1);
break;
case Bytecodes::_f2d:
- /* nothing to do */
+ if (UseSSE < 1) {
+ /* nothing to do */
+ } else if (UseSSE == 1) {
+ __ push_f(xmm0);
+ __ pop_f();
+ } else { // UseSSE >= 2
+ __ cvtss2sd(xmm0, xmm0);
+ }
break;
case Bytecodes::_d2i:
- __ push(rcx); // reserve space for argument
- __ push(rcx);
- __ fstp_d(at_rsp()); // pass double argument on stack
+ if (UseSSE >= 2) {
+ __ push_d(xmm0);
+ } else {
+ __ push(rcx); // reserve space for argument
+ __ push(rcx);
+ __ fstp_d(at_rsp()); // pass double argument on stack
+ }
__ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::d2i), 2);
break;
case Bytecodes::_d2l:
- __ push(rcx); // reserve space for argument
- __ push(rcx);
- __ fstp_d(at_rsp()); // pass double argument on stack
+ if (UseSSE >= 2) {
+ __ push_d(xmm0);
+ } else {
+ __ push(rcx); // reserve space for argument
+ __ push(rcx);
+ __ fstp_d(at_rsp()); // pass double argument on stack
+ }
__ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::d2l), 2);
break;
case Bytecodes::_d2f:
- __ push(rcx); // reserve space for f2ieee()
- __ f2ieee(); // truncate to float size
- __ pop(rcx); // adjust rsp
+ if (UseSSE <= 1) {
+ __ push(rcx); // reserve space for f2ieee()
+ __ f2ieee(); // truncate to float size
+ __ pop(rcx); // adjust rsp
+ if (UseSSE == 1) {
+ // The cvtsd2ss instruction is not available if UseSSE==1, therefore
+ // the conversion is performed using the FPU in this case.
+ __ push_f();
+ __ pop_f(xmm0);
+ }
+ } else { // UseSSE >= 2
+ __ cvtsd2ss(xmm0, xmm0);
+ }
break;
default :
ShouldNotReachHere();
@@ -1901,42 +1995,47 @@
}
void TemplateTable::float_cmp(bool is_float, int unordered_result) {
-#ifdef _LP64
- Label done;
- if (is_float) {
- // XXX get rid of pop here, use ... reg, mem32
- __ pop_f(xmm1);
- __ ucomiss(xmm1, xmm0);
- } else {
- // XXX get rid of pop here, use ... reg, mem64
- __ pop_d(xmm1);
- __ ucomisd(xmm1, xmm0);
- }
- if (unordered_result < 0) {
- __ movl(rax, -1);
- __ jccb(Assembler::parity, done);
- __ jccb(Assembler::below, done);
- __ setb(Assembler::notEqual, rdx);
- __ movzbl(rax, rdx);
+ if ((is_float && UseSSE >= 1) ||
+ (!is_float && UseSSE >= 2)) {
+ Label done;
+ if (is_float) {
+ // XXX get rid of pop here, use ... reg, mem32
+ __ pop_f(xmm1);
+ __ ucomiss(xmm1, xmm0);
+ } else {
+ // XXX get rid of pop here, use ... reg, mem64
+ __ pop_d(xmm1);
+ __ ucomisd(xmm1, xmm0);
+ }
+ if (unordered_result < 0) {
+ __ movl(rax, -1);
+ __ jccb(Assembler::parity, done);
+ __ jccb(Assembler::below, done);
+ __ setb(Assembler::notEqual, rdx);
+ __ movzbl(rax, rdx);
+ } else {
+ __ movl(rax, 1);
+ __ jccb(Assembler::parity, done);
+ __ jccb(Assembler::above, done);
+ __ movl(rax, 0);
+ __ jccb(Assembler::equal, done);
+ __ decrementl(rax);
+ }
+ __ bind(done);
} else {
- __ movl(rax, 1);
- __ jccb(Assembler::parity, done);
- __ jccb(Assembler::above, done);
- __ movl(rax, 0);
- __ jccb(Assembler::equal, done);
- __ decrementl(rax);
- }
- __ bind(done);
+#ifdef _LP64
+ ShouldNotReachHere();
#else
- if (is_float) {
- __ fld_s(at_rsp());
- } else {
- __ fld_d(at_rsp());
- __ pop(rdx);
+ if (is_float) {
+ __ fld_s(at_rsp());
+ } else {
+ __ fld_d(at_rsp());
+ __ pop(rdx);
+ }
+ __ pop(rcx);
+ __ fcmp2int(rax, unordered_result < 0);
+#endif // _LP64
}
- __ pop(rcx);
- __ fcmp2int(rax, unordered_result < 0);
-#endif
}
void TemplateTable::branch(bool is_jsr, bool is_wide) {
@@ -2748,8 +2847,7 @@
__ jcc(Assembler::notEqual, notFloat);
// ftos
- LP64_ONLY(__ movflt(xmm0, field));
- NOT_LP64(__ fld_s(field));
+ __ load_float(field);
__ push(ftos);
// Rewrite bytecode to be faster
if (!is_static && rc == may_rewrite) {
@@ -2763,8 +2861,7 @@
__ jcc(Assembler::notEqual, notDouble);
#endif
// dtos
- LP64_ONLY(__ movdbl(xmm0, field));
- NOT_LP64(__ fld_d(field));
+ __ load_double(field);
__ push(dtos);
// Rewrite bytecode to be faster
if (!is_static && rc == may_rewrite) {
@@ -3046,8 +3143,7 @@
{
__ pop(ftos);
if (!is_static) pop_and_check_object(obj);
- NOT_LP64( __ fstp_s(field);)
- LP64_ONLY( __ movflt(field, xmm0);)
+ __ store_float(field);
if (!is_static && rc == may_rewrite) {
patch_bytecode(Bytecodes::_fast_fputfield, bc, rbx, true, byte_no);
}
@@ -3064,8 +3160,7 @@
{
__ pop(dtos);
if (!is_static) pop_and_check_object(obj);
- NOT_LP64( __ fstp_d(field);)
- LP64_ONLY( __ movdbl(field, xmm0);)
+ __ store_double(field);
if (!is_static && rc == may_rewrite) {
patch_bytecode(Bytecodes::_fast_dputfield, bc, rbx, true, byte_no);
}
@@ -3123,8 +3218,8 @@
case Bytecodes::_fast_sputfield: // fall through
case Bytecodes::_fast_cputfield: // fall through
case Bytecodes::_fast_iputfield: __ push_i(rax); break;
- case Bytecodes::_fast_dputfield: __ push_d(); break;
- case Bytecodes::_fast_fputfield: __ push_f(); break;
+ case Bytecodes::_fast_dputfield: __ push(dtos); break;
+ case Bytecodes::_fast_fputfield: __ push(ftos); break;
case Bytecodes::_fast_lputfield: __ push_l(rax); break;
default:
@@ -3147,8 +3242,8 @@
case Bytecodes::_fast_sputfield: // fall through
case Bytecodes::_fast_cputfield: // fall through
case Bytecodes::_fast_iputfield: __ pop_i(rax); break;
- case Bytecodes::_fast_dputfield: __ pop_d(); break;
- case Bytecodes::_fast_fputfield: __ pop_f(); break;
+ case Bytecodes::_fast_dputfield: __ pop(dtos); break;
+ case Bytecodes::_fast_fputfield: __ pop(ftos); break;
case Bytecodes::_fast_lputfield: __ pop_l(rax); break;
}
__ bind(L2);
@@ -3212,12 +3307,10 @@
__ movw(field, rax);
break;
case Bytecodes::_fast_fputfield:
- NOT_LP64( __ fstp_s(field); )
- LP64_ONLY( __ movflt(field, xmm0);)
+ __ store_float(field);
break;
case Bytecodes::_fast_dputfield:
- NOT_LP64( __ fstp_d(field); )
- LP64_ONLY( __ movdbl(field, xmm0);)
+ __ store_double(field);
break;
default:
ShouldNotReachHere();
@@ -3302,12 +3395,10 @@
__ load_unsigned_short(rax, field);
break;
case Bytecodes::_fast_fgetfield:
- LP64_ONLY(__ movflt(xmm0, field));
- NOT_LP64(__ fld_s(field));
+ __ load_float(field);
break;
case Bytecodes::_fast_dgetfield:
- LP64_ONLY(__ movdbl(xmm0, field));
- NOT_LP64(__ fld_d(field));
+ __ load_double(field);
break;
default:
ShouldNotReachHere();
@@ -3347,8 +3438,7 @@
__ verify_oop(rax);
break;
case ftos:
- LP64_ONLY(__ movflt(xmm0, field));
- NOT_LP64(__ fld_s(field));
+ __ load_float(field);
break;
default:
ShouldNotReachHere();
--- a/hotspot/src/share/vm/compiler/compileBroker.cpp Thu Aug 20 09:31:28 2015 +0200
+++ b/hotspot/src/share/vm/compiler/compileBroker.cpp Fri Aug 21 09:12:42 2015 +0200
@@ -1399,6 +1399,28 @@
// do the compilation
if (method->is_native()) {
if (!PreferInterpreterNativeStubs || method->is_method_handle_intrinsic()) {
+ // The following native methods:
+ //
+ // java.lang.Float.intBitsToFloat
+ // java.lang.Float.floatToRawIntBits
+ // java.lang.Double.longBitsToDouble
+ // java.lang.Double.doubleToRawLongBits
+ //
+ // are called through the interpreter even if interpreter native stubs
+ // are not preferred (i.e., calling through adapter handlers is preferred).
+ // The reason is that on x86_32 signaling NaNs (sNaNs) are not preserved
+ // if the version of the methods from the native libraries is called.
+ // As the interpreter and the C2-intrinsified versions of the methods preserve
+ // sNaNs, that would result in inconsistent handling of sNaNs.
+ if ((UseSSE >= 1 &&
+ (method->intrinsic_id() == vmIntrinsics::_intBitsToFloat ||
+ method->intrinsic_id() == vmIntrinsics::_floatToRawIntBits)) ||
+ (UseSSE >= 2 &&
+ (method->intrinsic_id() == vmIntrinsics::_longBitsToDouble ||
+ method->intrinsic_id() == vmIntrinsics::_doubleToRawLongBits))) {
+ return NULL;
+ }
+
// To properly handle the appendix argument for out-of-line calls we are using a small trampoline that
// pops off the appendix argument and jumps to the target (see gen_special_dispatch in SharedRuntime).
//
--- a/hotspot/src/share/vm/interpreter/abstractInterpreter.hpp Thu Aug 20 09:31:28 2015 +0200
+++ b/hotspot/src/share/vm/interpreter/abstractInterpreter.hpp Fri Aug 21 09:12:42 2015 +0200
@@ -90,6 +90,10 @@
java_util_zip_CRC32_update, // implementation of java.util.zip.CRC32.update()
java_util_zip_CRC32_updateBytes, // implementation of java.util.zip.CRC32.updateBytes()
java_util_zip_CRC32_updateByteBuffer, // implementation of java.util.zip.CRC32.updateByteBuffer()
+ java_lang_Float_intBitsToFloat, // implementation of java.lang.Float.intBitsToFloat()
+ java_lang_Float_floatToRawIntBits, // implementation of java.lang.Float.floatToRawIntBits()
+ java_lang_Double_longBitsToDouble, // implementation of java.lang.Double.longBitsToDouble()
+ java_lang_Double_doubleToRawLongBits, // implementation of java.lang.Double.doubleToRawLongBits()
number_of_method_entries,
invalid = -1
};
--- a/hotspot/src/share/vm/interpreter/interpreter.cpp Thu Aug 20 09:31:28 2015 +0200
+++ b/hotspot/src/share/vm/interpreter/interpreter.cpp Fri Aug 21 09:12:42 2015 +0200
@@ -234,7 +234,15 @@
case vmIntrinsics::_updateByteBufferCRC32 : return java_util_zip_CRC32_updateByteBuffer;
}
}
-#endif
+
+ switch(m->intrinsic_id()) {
+ case vmIntrinsics::_intBitsToFloat: return java_lang_Float_intBitsToFloat;
+ case vmIntrinsics::_floatToRawIntBits: return java_lang_Float_floatToRawIntBits;
+ case vmIntrinsics::_longBitsToDouble: return java_lang_Double_longBitsToDouble;
+ case vmIntrinsics::_doubleToRawLongBits: return java_lang_Double_doubleToRawLongBits;
+ }
+
+#endif // CC_INTERP
// Native method?
// Note: This test must come _before_ the test for intrinsic
@@ -559,6 +567,25 @@
: // fall thru
case Interpreter::java_util_zip_CRC32_updateByteBuffer
: entry_point = generate_CRC32_updateBytes_entry(kind); break;
+#if defined(TARGET_ARCH_x86) && !defined(_LP64)
+ // On x86_32 platforms, a special entry is generated for the following four methods.
+ // On other platforms the normal entry is used to enter these methods.
+ case Interpreter::java_lang_Float_intBitsToFloat
+ : entry_point = generate_Float_intBitsToFloat_entry(); break;
+ case Interpreter::java_lang_Float_floatToRawIntBits
+ : entry_point = generate_Float_floatToRawIntBits_entry(); break;
+ case Interpreter::java_lang_Double_longBitsToDouble
+ : entry_point = generate_Double_longBitsToDouble_entry(); break;
+ case Interpreter::java_lang_Double_doubleToRawLongBits
+ : entry_point = generate_Double_doubleToRawLongBits_entry(); break;
+#else
+ case Interpreter::java_lang_Float_intBitsToFloat:
+ case Interpreter::java_lang_Float_floatToRawIntBits:
+ case Interpreter::java_lang_Double_longBitsToDouble:
+ case Interpreter::java_lang_Double_doubleToRawLongBits:
+ entry_point = generate_native_entry(false);
+ break;
+#endif // defined(TARGET_ARCH_x86) && !defined(_LP64)
#endif // CC_INTERP
default:
fatal(err_msg("unexpected method kind: %d", kind));
--- a/hotspot/src/share/vm/interpreter/templateInterpreter.cpp Thu Aug 20 09:31:28 2015 +0200
+++ b/hotspot/src/share/vm/interpreter/templateInterpreter.cpp Fri Aug 21 09:12:42 2015 +0200
@@ -397,34 +397,39 @@
// all non-native method kinds
method_entry(zerolocals)
- method_entry(zerolocals_synchronized)
- method_entry(empty)
- method_entry(accessor)
- method_entry(abstract)
- method_entry(java_lang_math_sin )
- method_entry(java_lang_math_cos )
- method_entry(java_lang_math_tan )
- method_entry(java_lang_math_abs )
- method_entry(java_lang_math_sqrt )
- method_entry(java_lang_math_log )
- method_entry(java_lang_math_log10)
- method_entry(java_lang_math_exp )
- method_entry(java_lang_math_pow )
- method_entry(java_lang_ref_reference_get)
+ method_entry(zerolocals_synchronized)
+ method_entry(empty)
+ method_entry(accessor)
+ method_entry(abstract)
+ method_entry(java_lang_math_sin )
+ method_entry(java_lang_math_cos )
+ method_entry(java_lang_math_tan )
+ method_entry(java_lang_math_abs )
+ method_entry(java_lang_math_sqrt )
+ method_entry(java_lang_math_log )
+ method_entry(java_lang_math_log10)
+ method_entry(java_lang_math_exp )
+ method_entry(java_lang_math_pow )
+ method_entry(java_lang_ref_reference_get)
- if (UseCRC32Intrinsics) {
- method_entry(java_util_zip_CRC32_update)
- method_entry(java_util_zip_CRC32_updateBytes)
- method_entry(java_util_zip_CRC32_updateByteBuffer)
- }
+ if (UseCRC32Intrinsics) {
+ method_entry(java_util_zip_CRC32_update)
+ method_entry(java_util_zip_CRC32_updateBytes)
+ method_entry(java_util_zip_CRC32_updateByteBuffer)
+ }
+
+ method_entry(java_lang_Float_intBitsToFloat);
+ method_entry(java_lang_Float_floatToRawIntBits);
+ method_entry(java_lang_Double_longBitsToDouble);
+ method_entry(java_lang_Double_doubleToRawLongBits);
initialize_method_handle_entries();
// all native method kinds (must be one contiguous block)
Interpreter::_native_entry_begin = Interpreter::code()->code_end();
method_entry(native)
- method_entry(native_synchronized)
- Interpreter::_native_entry_end = Interpreter::code()->code_end();
+ method_entry(native_synchronized)
+ Interpreter::_native_entry_end = Interpreter::code()->code_end();
#undef method_entry
--- a/hotspot/src/share/vm/memory/metaspace.hpp Thu Aug 20 09:31:28 2015 +0200
+++ b/hotspot/src/share/vm/memory/metaspace.hpp Fri Aug 21 09:12:42 2015 +0200
@@ -254,7 +254,7 @@
// Debugging support
void verify();
- static void print_compressed_class_space(outputStream* st, const char* requested_addr = 0);
+ static void print_compressed_class_space(outputStream* st, const char* requested_addr = 0) NOT_LP64({});
class AllocRecordClosure : public StackObj {
public:
--- a/hotspot/src/share/vm/opto/chaitin.cpp Thu Aug 20 09:31:28 2015 +0200
+++ b/hotspot/src/share/vm/opto/chaitin.cpp Fri Aug 21 09:12:42 2015 +0200
@@ -990,9 +990,13 @@
// FOUR registers!
#ifdef ASSERT
if (is_vect) {
- assert(lrgmask.is_aligned_sets(lrg.num_regs()), "vector should be aligned");
- assert(!lrg._fat_proj, "sanity");
- assert(RegMask::num_registers(kreg) == lrg.num_regs(), "sanity");
+ if (lrg.num_regs() != 0) {
+ assert(lrgmask.is_aligned_sets(lrg.num_regs()), "vector should be aligned");
+ assert(!lrg._fat_proj, "sanity");
+ assert(RegMask::num_registers(kreg) == lrg.num_regs(), "sanity");
+ } else {
+ assert(n->is_Phi(), "not all inputs processed only if Phi");
+ }
}
#endif
if (!is_vect && lrg.num_regs() == 2 && !lrg._fat_proj && rm.is_misaligned_pair()) {
--- a/hotspot/src/share/vm/opto/compile.hpp Thu Aug 20 09:31:28 2015 +0200
+++ b/hotspot/src/share/vm/opto/compile.hpp Fri Aug 21 09:12:42 2015 +0200
@@ -93,7 +93,7 @@
public:
void set_idx(node_idx_t idx) {
- _idx_clone_orig = _idx_clone_orig & 0xFFFFFFFF00000000 | idx;
+ _idx_clone_orig = _idx_clone_orig & CONST64(0xFFFFFFFF00000000) | idx;
}
node_idx_t idx() const { return (node_idx_t)(_idx_clone_orig & 0xFFFFFFFF); }
--- a/hotspot/src/share/vm/utilities/globalDefinitions_gcc.hpp Thu Aug 20 09:31:28 2015 +0200
+++ b/hotspot/src/share/vm/utilities/globalDefinitions_gcc.hpp Fri Aug 21 09:12:42 2015 +0200
@@ -161,7 +161,7 @@
//----------------------------------------------------------------------------------------------------
-// Constant for jlong (specifying an long long canstant is C++ compiler specific)
+// Constant for jlong (specifying a long long constant is C++ compiler specific)
// Build a 64bit integer constant
#define CONST64(x) (x ## LL)
--- a/hotspot/src/share/vm/utilities/globalDefinitions_sparcWorks.hpp Thu Aug 20 09:31:28 2015 +0200
+++ b/hotspot/src/share/vm/utilities/globalDefinitions_sparcWorks.hpp Fri Aug 21 09:12:42 2015 +0200
@@ -178,7 +178,7 @@
//----------------------------------------------------------------------------------------------------
-// Constant for jlong (specifying an long long constant is C++ compiler specific)
+// Constant for jlong (specifying a long long constant is C++ compiler specific)
// Build a 64bit integer constant
#define CONST64(x) (x ## LL)
--- a/hotspot/src/share/vm/utilities/globalDefinitions_visCPP.hpp Thu Aug 20 09:31:28 2015 +0200
+++ b/hotspot/src/share/vm/utilities/globalDefinitions_visCPP.hpp Fri Aug 21 09:12:42 2015 +0200
@@ -148,9 +148,9 @@
inline int g_isfinite(jdouble f) { return _finite(f); }
//----------------------------------------------------------------------------------------------------
-// Constant for jlong (specifying an long long constant is C++ compiler specific)
+// Constant for jlong (specifying a long long constant is C++ compiler specific)
-// Build a 64bit integer constant on with Visual C++
+// Build a 64bit integer constant with Visual C++
#define CONST64(x) (x ## i64)
#define UCONST64(x) (x ## ui64)
--- a/hotspot/src/share/vm/utilities/globalDefinitions_xlc.hpp Thu Aug 20 09:31:28 2015 +0200
+++ b/hotspot/src/share/vm/utilities/globalDefinitions_xlc.hpp Fri Aug 21 09:12:42 2015 +0200
@@ -108,7 +108,7 @@
//----------------------------------------------------------------------------------------------------
-// Constant for jlong (specifying an long long canstant is C++ compiler specific)
+// Constant for jlong (specifying a long long constant is C++ compiler specific)
// Build a 64bit integer constant
#define CONST64(x) (x ## LL)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/test/compiler/floatingpoint/NaNTest.java Fri Aug 21 09:12:42 2015 +0200
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+/**
+ * @test
+ * @bug 8076373
+ * @summary Verify if signaling NaNs are preserved.
+ * @run main NaNTest
+ */
+public class NaNTest {
+ static void testFloat() {
+ int originalValue = 0x7f800001;
+ int readBackValue = Float.floatToRawIntBits(Float.intBitsToFloat(originalValue));
+ if (originalValue != readBackValue) {
+ String errorMessage = String.format("Original and read back float values mismatch\n0x%X 0x%X\n",
+ originalValue,
+ readBackValue);
+ throw new RuntimeException(errorMessage);
+ } else {
+ System.out.printf("Written and read back float values match\n0x%X 0x%X\n",
+ originalValue,
+ readBackValue);
+ }
+ }
+
+ static void testDouble() {
+ long originalValue = 0xFFF0000000000001L;
+ long readBackValue = Double.doubleToRawLongBits(Double.longBitsToDouble(originalValue));
+ if (originalValue != readBackValue) {
+ String errorMessage = String.format("Original and read back double values mismatch\n0x%X 0x%X\n",
+ originalValue,
+ readBackValue);
+ throw new RuntimeException(errorMessage);
+ } else {
+ System.out.printf("Written and read back double values match\n0x%X 0x%X\n",
+ originalValue,
+ readBackValue);
+ }
+
+ }
+
+ public static void main(String args[]) {
+ System.out.println("### NanTest started");
+
+ testFloat();
+ testDouble();
+
+ System.out.println("### NanTest ended");
+ }
+}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/test/compiler/regalloc/TestVectorRegAlloc.java Fri Aug 21 09:12:42 2015 +0200
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 8131969
+ * @summary assert in register allocation code when vector Phi for a loop is processed because code assumes all inputs already processed
+ * @run main/othervm -Xbatch TestVectorRegAlloc
+ *
+ */
+
+public class TestVectorRegAlloc {
+
+ static int test_helper_i;
+ static boolean test_helper() {
+ test_helper_i++;
+ return (test_helper_i & 7) != 0;
+ }
+
+ static void test(double[] src, double[] dst, boolean flag) {
+ double j = 0.0;
+ while(test_helper()) {
+ for (int i = 0; i < src.length; i++) {
+ dst[i] = src[i] + j;
+ }
+ // The loop will be unswitched and the ReplicateD of zero will be
+ // split through the Phi of the outer loop
+ for (int i = 0; i < src.length; i++) {
+ double k;
+ if (flag) {
+ k = j;
+ } else {
+ k = 0;
+ }
+ dst[i] = src[i] + k;
+ }
+ j++;
+ }
+ }
+
+ static public void main(String[] args) {
+ double[] src = new double[10];
+ double[] dst = new double[10];
+ for (int i = 0; i < 20000; i++) {
+ test(src, dst, (i % 2) == 0);
+ }
+ }
+}