8154826: AArch64: take advantage better of base + shifted offset addressing mode
author roland
Mon, 09 May 2016 11:34:09 +0200
changeset 38286 0ddb6f84e138
parent 38285 20b85a0ba796
child 38287 ab815717c073
child 38656 22c78787d80c
8154826: AArch64: take advantage better of base + shifted offset addressing mode. Summary: reshape address subtree to fit aarch64 addressing mode. Reviewed-by: kvn, aph
hotspot/src/cpu/aarch64/vm/aarch64.ad
hotspot/src/cpu/ppc/vm/ppc.ad
hotspot/src/cpu/sparc/vm/sparc.ad
hotspot/src/cpu/x86/vm/x86.ad
hotspot/src/cpu/x86/vm/x86_32.ad
hotspot/src/cpu/x86/vm/x86_64.ad
hotspot/src/share/vm/opto/compile.cpp
hotspot/src/share/vm/opto/compile.hpp
hotspot/src/share/vm/opto/matcher.cpp
hotspot/src/share/vm/opto/matcher.hpp
--- a/hotspot/src/cpu/aarch64/vm/aarch64.ad	Mon May 09 01:21:55 2016 -0700
+++ b/hotspot/src/cpu/aarch64/vm/aarch64.ad	Mon May 09 11:34:09 2016 +0200
@@ -996,6 +996,7 @@
 source_hpp %{
 
 #include "gc/shared/cardTableModRefBS.hpp"
+#include "opto/addnode.hpp"
 
 class CallStubImpl {
 
@@ -1061,6 +1062,9 @@
 
   // predicate controlling translation of StoreCM
   bool unnecessary_storestore(const Node *storecm);
+
+  // predicate controlling addressing modes: do all memory users of addp access exactly (1 << shift) bytes?
+  bool size_fits_all_mem_uses(AddPNode* addp, int shift);
 %}
 
 source %{
@@ -3449,11 +3453,6 @@
 // Does the CPU require late expand (see block.cpp for description of late expand)?
 const bool Matcher::require_postalloc_expand = false;
 
-// Should the Matcher clone shifts on addressing modes, expecting them
-// to be subsumed into complex addressing expressions or compute them
-// into registers?  True for Intel but false for most RISCs
-const bool Matcher::clone_shift_expressions = false;
-
 // Do we need to mask the count passed to shift instructions or does
 // the cpu only look at the lower 5/6 bits anyway?
 const bool Matcher::need_masked_shift_count = false;
@@ -3572,8 +3571,119 @@
   return FP_REG_mask();
 }
 
+bool size_fits_all_mem_uses(AddPNode* addp, int shift) { // true iff every Mem user of addp accesses exactly (1 << shift) bytes
+  for (DUIterator_Fast imax, i = addp->fast_outs(imax); i < imax; i++) {
+    Node* u = addp->fast_out(i);
+    if (u->is_Mem()) { // users that are not memory nodes are ignored here
+      int opsize = u->as_Mem()->memory_size();
+      assert(opsize > 0, "unexpected memory operand size");
+      if (opsize != (1<<shift)) { // compare the size already fetched instead of calling memory_size() again
+        return false;
+      }
+    }
+  }
+  return true; // also the answer when addp has no memory users at all
+}
+
 const bool Matcher::convi2l_type_required = false;
 
+// Decide whether the inputs of address AddP m are cloned into the addressing
+// mode of its memory users. Returns true when m's inputs were pushed on mstack
+// here (constant offset, constant-shifted index, or ConvI2L index); else false.
+bool Matcher::clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
+  if (clone_base_plus_offset_address(m, mstack, address_visited)) { // base + constant offset
+    return true;
+  }
+
+  Node *off = m->in(AddPNode::Offset);
+  if (off->Opcode() == Op_LShiftL && off->in(2)->is_Con() &&
+      size_fits_all_mem_uses(m, off->in(2)->get_int()) && // every memory user must access 1 << (shift count) bytes
+      // Are there other uses besides address expressions?
+      !is_visited(off)) {
+    address_visited.set(off->_idx); // Flag as address_visited
+    mstack.push(off->in(2), Visit); // the constant shift count
+    Node *conv = off->in(1);
+    if (conv->Opcode() == Op_ConvI2L &&
+        // Are there other uses besides address expressions?
+        !is_visited(conv)) {
+      address_visited.set(conv->_idx); // Flag as address_visited
+      mstack.push(conv->in(1), Pre_Visit); // continue with the ConvI2L's input, not the ConvI2L itself
+    } else {
+      mstack.push(conv, Pre_Visit); // shifted value is pushed as-is
+    }
+    address_visited.test_set(m->_idx); // Flag as address_visited
+    mstack.push(m->in(AddPNode::Address), Pre_Visit);
+    mstack.push(m->in(AddPNode::Base), Pre_Visit);
+    return true;
+  } else if (off->Opcode() == Op_ConvI2L && // unshifted index produced by a ConvI2L
+             // Are there other uses besides address expressions?
+             !is_visited(off)) {
+    address_visited.test_set(m->_idx); // Flag as address_visited
+    address_visited.set(off->_idx); // Flag as address_visited
+    mstack.push(off->in(1), Pre_Visit); // continue with the ConvI2L's input
+    mstack.push(m->in(AddPNode::Address), Pre_Visit);
+    mstack.push(m->in(AddPNode::Base), Pre_Visit);
+    return true;
+  }
+  return false; // caller pushes m itself
+}
+
+// Transform:
+// (AddP base (AddP base address (LShiftL index con)) offset)
+// into:
+// (AddP base (AddP base address offset) (LShiftL index con))
+// (likewise when the inner offset is a ConvI2L) to take full advantage of ARM's addressing modes
+void Compile::reshape_address(AddPNode* addp) {
+  Node *addr = addp->in(AddPNode::Address);
+  if (addr->is_AddP() && addr->in(AddPNode::Base) == addp->in(AddPNode::Base)) { // nested AddP on the same base
+    const AddPNode *addp2 = addr->as_AddP();
+    if ((addp2->in(AddPNode::Offset)->Opcode() == Op_LShiftL &&
+         addp2->in(AddPNode::Offset)->in(2)->is_Con() &&
+         size_fits_all_mem_uses(addp, addp2->in(AddPNode::Offset)->in(2)->get_int())) ||
+        addp2->in(AddPNode::Offset)->Opcode() == Op_ConvI2L) {
+
+      // Any use that can't embed the address computation?
+      for (DUIterator_Fast imax, i = addp->fast_outs(imax); i < imax; i++) {
+        Node* u = addp->fast_out(i);
+        if (!u->is_Mem() || u->is_LoadVector() || u->is_StoreVector() || u->Opcode() == Op_StoreCM) {
+          return; // leave the address expression untouched
+        }
+      }
+      // Pieces of the reshaped expression
+      Node* off = addp->in(AddPNode::Offset);
+      Node* addr2 = addp2->in(AddPNode::Address);
+      Node* base = addp->in(AddPNode::Base);
+      // new_addr becomes (AddP base addr2 off)
+      Node* new_addr = NULL;
+      // Check whether the graph already has the new AddP we need
+      // before we create one (no GVN available here).
+      for (DUIterator_Fast imax, i = addr2->fast_outs(imax); i < imax; i++) {
+        Node* u = addr2->fast_out(i);
+        if (u->is_AddP() &&
+            u->in(AddPNode::Base) == base &&
+            u->in(AddPNode::Address) == addr2 &&
+            u->in(AddPNode::Offset) == off) {
+          new_addr = u;
+          break;
+        }
+      }
+      // Nothing to reuse: create the node
+      if (new_addr == NULL) {
+        new_addr = new AddPNode(base, addr2, off);
+      }
+      Node* new_off = addp2->in(AddPNode::Offset); // the LShiftL / ConvI2L moves to the outer AddP
+      addp->set_req(AddPNode::Address, new_addr);
+      if (addr->outcnt() == 0) { // old inner AddP has no users left
+        addr->disconnect_inputs(NULL, this);
+      }
+      addp->set_req(AddPNode::Offset, new_off);
+      if (off->outcnt() == 0) { // old outer offset has no users left
+        off->disconnect_inputs(NULL, this);
+      }
+    }
+  }
+}
+
 // helper for encoding java_to_runtime calls on sim
 //
 // this is needed to compute the extra arguments required when
@@ -3643,12 +3753,10 @@
     // encoder that the index needs to be sign extended, so we have to
     // enumerate all the cases.
     switch (opcode) {
-    case INDINDEXSCALEDOFFSETI2L:
     case INDINDEXSCALEDI2L:
-    case INDINDEXSCALEDOFFSETI2LN:
     case INDINDEXSCALEDI2LN:
-    case INDINDEXOFFSETI2L:
-    case INDINDEXOFFSETI2LN:
+    case INDINDEXI2L:
+    case INDINDEXI2LN:
       scale = Address::sxtw(size);
       break;
     default:
@@ -3658,12 +3766,8 @@
     if (index == -1) {
       (masm.*insn)(reg, Address(base, disp));
     } else {
-      if (disp == 0) {
-        (masm.*insn)(reg, Address(base, as_Register(index), scale));
-      } else {
-        masm.lea(rscratch1, Address(base, disp));
-        (masm.*insn)(reg, Address(rscratch1, as_Register(index), scale));
-      }
+      assert(disp == 0, "unsupported address mode: disp = %d", disp);
+      (masm.*insn)(reg, Address(base, as_Register(index), scale));
     }
   }
 
@@ -3674,9 +3778,7 @@
     Address::extend scale;
 
     switch (opcode) {
-    case INDINDEXSCALEDOFFSETI2L:
     case INDINDEXSCALEDI2L:
-    case INDINDEXSCALEDOFFSETI2LN:
     case INDINDEXSCALEDI2LN:
       scale = Address::sxtw(size);
       break;
@@ -3687,12 +3789,8 @@
      if (index == -1) {
       (masm.*insn)(reg, Address(base, disp));
     } else {
-      if (disp == 0) {
-        (masm.*insn)(reg, Address(base, as_Register(index), scale));
-      } else {
-        masm.lea(rscratch1, Address(base, disp));
-        (masm.*insn)(reg, Address(rscratch1, as_Register(index), scale));
-      }
+      assert(disp == 0, "unsupported address mode: disp = %d", disp);
+      (masm.*insn)(reg, Address(base, as_Register(index), scale));
     }
   }
 
@@ -6106,65 +6204,10 @@
   %}
 %}
 
-operand indIndexScaledOffsetI(iRegP reg, iRegL lreg, immIScale scale, immIU12 off)
-%{
-  constraint(ALLOC_IN_RC(ptr_reg));
-  match(AddP (AddP reg (LShiftL lreg scale)) off);
-  op_cost(INSN_COST);
-  format %{ "$reg, $lreg lsl($scale), $off" %}
-  interface(MEMORY_INTER) %{
-    base($reg);
-    index($lreg);
-    scale($scale);
-    disp($off);
-  %}
-%}
-
-operand indIndexScaledOffsetL(iRegP reg, iRegL lreg, immIScale scale, immLU12 off)
-%{
-  constraint(ALLOC_IN_RC(ptr_reg));
-  match(AddP (AddP reg (LShiftL lreg scale)) off);
-  op_cost(INSN_COST);
-  format %{ "$reg, $lreg lsl($scale), $off" %}
-  interface(MEMORY_INTER) %{
-    base($reg);
-    index($lreg);
-    scale($scale);
-    disp($off);
-  %}
-%}
-
-operand indIndexOffsetI2L(iRegP reg, iRegI ireg, immLU12 off)
-%{
-  constraint(ALLOC_IN_RC(ptr_reg));
-  match(AddP (AddP reg (ConvI2L ireg)) off);
-  op_cost(INSN_COST);
-  format %{ "$reg, $ireg, $off I2L" %}
-  interface(MEMORY_INTER) %{
-    base($reg);
-    index($ireg);
-    scale(0x0);
-    disp($off);
-  %}
-%}
-
-operand indIndexScaledOffsetI2L(iRegP reg, iRegI ireg, immIScale scale, immLU12 off)
-%{
-  constraint(ALLOC_IN_RC(ptr_reg));
-  match(AddP (AddP reg (LShiftL (ConvI2L ireg) scale)) off);
-  op_cost(INSN_COST);
-  format %{ "$reg, $ireg sxtw($scale), $off I2L" %}
-  interface(MEMORY_INTER) %{
-    base($reg);
-    index($ireg);
-    scale($scale);
-    disp($off);
-  %}
-%}
-
 operand indIndexScaledI2L(iRegP reg, iRegI ireg, immIScale scale)
 %{
   constraint(ALLOC_IN_RC(ptr_reg));
+  predicate(size_fits_all_mem_uses(n->as_AddP(), n->in(AddPNode::Offset)->in(2)->get_int()));
   match(AddP reg (LShiftL (ConvI2L ireg) scale));
   op_cost(0);
   format %{ "$reg, $ireg sxtw($scale), 0, I2L" %}
@@ -6179,6 +6222,7 @@
 operand indIndexScaled(iRegP reg, iRegL lreg, immIScale scale)
 %{
   constraint(ALLOC_IN_RC(ptr_reg));
+  predicate(size_fits_all_mem_uses(n->as_AddP(), n->in(AddPNode::Offset)->in(2)->get_int()));
   match(AddP reg (LShiftL lreg scale));
   op_cost(0);
   format %{ "$reg, $lreg lsl($scale)" %}
@@ -6190,6 +6234,20 @@
   %}
 %}
 
+operand indIndexI2L(iRegP reg, iRegI ireg)
+%{
+  constraint(ALLOC_IN_RC(ptr_reg));
+  match(AddP reg (ConvI2L ireg)); // pointer base + int index converted to long
+  op_cost(0);
+  format %{ "$reg, $ireg, 0, I2L" %}
+  interface(MEMORY_INTER) %{
+    base($reg);
+    index($ireg);
+    scale(0x0); // index is not shifted
+    disp(0x0); // no displacement in this mode
+  %}
+%}
+
 operand indIndex(iRegP reg, iRegL lreg)
 %{
   constraint(ALLOC_IN_RC(ptr_reg));
@@ -6331,69 +6389,9 @@
   %}
 %}
 
-operand indIndexScaledOffsetIN(iRegN reg, iRegL lreg, immIScale scale, immIU12 off)
-%{
-  predicate(Universe::narrow_oop_shift() == 0);
-  constraint(ALLOC_IN_RC(ptr_reg));
-  match(AddP (AddP (DecodeN reg) (LShiftL lreg scale)) off);
-  op_cost(0);
-  format %{ "$reg, $lreg lsl($scale), $off\t# narrow" %}
-  interface(MEMORY_INTER) %{
-    base($reg);
-    index($lreg);
-    scale($scale);
-    disp($off);
-  %}
-%}
-
-operand indIndexScaledOffsetLN(iRegN reg, iRegL lreg, immIScale scale, immLU12 off)
-%{
-  predicate(Universe::narrow_oop_shift() == 0);
-  constraint(ALLOC_IN_RC(ptr_reg));
-  match(AddP (AddP (DecodeN reg) (LShiftL lreg scale)) off);
-  op_cost(INSN_COST);
-  format %{ "$reg, $lreg lsl($scale), $off\t# narrow" %}
-  interface(MEMORY_INTER) %{
-    base($reg);
-    index($lreg);
-    scale($scale);
-    disp($off);
-  %}
-%}
-
-operand indIndexOffsetI2LN(iRegN reg, iRegI ireg, immLU12 off)
-%{
-  predicate(Universe::narrow_oop_shift() == 0);
-  constraint(ALLOC_IN_RC(ptr_reg));
-  match(AddP (AddP (DecodeN reg) (ConvI2L ireg)) off);
-  op_cost(INSN_COST);
-  format %{ "$reg, $ireg, $off I2L\t# narrow" %}
-  interface(MEMORY_INTER) %{
-    base($reg);
-    index($ireg);
-    scale(0x0);
-    disp($off);
-  %}
-%}
-
-operand indIndexScaledOffsetI2LN(iRegN reg, iRegI ireg, immIScale scale, immLU12 off)
-%{
-  predicate(Universe::narrow_oop_shift() == 0);
-  constraint(ALLOC_IN_RC(ptr_reg));
-  match(AddP (AddP (DecodeN reg) (LShiftL (ConvI2L ireg) scale)) off);
-  op_cost(INSN_COST);
-  format %{ "$reg, $ireg sxtw($scale), $off I2L\t# narrow" %}
-  interface(MEMORY_INTER) %{
-    base($reg);
-    index($ireg);
-    scale($scale);
-    disp($off);
-  %}
-%}
-
 operand indIndexScaledI2LN(iRegN reg, iRegI ireg, immIScale scale)
 %{
-  predicate(Universe::narrow_oop_shift() == 0);
+  predicate(Universe::narrow_oop_shift() == 0 && size_fits_all_mem_uses(n->as_AddP(), n->in(AddPNode::Offset)->in(2)->get_int()));
   constraint(ALLOC_IN_RC(ptr_reg));
   match(AddP (DecodeN reg) (LShiftL (ConvI2L ireg) scale));
   op_cost(0);
@@ -6408,7 +6406,7 @@
 
 operand indIndexScaledN(iRegN reg, iRegL lreg, immIScale scale)
 %{
-  predicate(Universe::narrow_oop_shift() == 0);
+  predicate(Universe::narrow_oop_shift() == 0 && size_fits_all_mem_uses(n->as_AddP(), n->in(AddPNode::Offset)->in(2)->get_int()));
   constraint(ALLOC_IN_RC(ptr_reg));
   match(AddP (DecodeN reg) (LShiftL lreg scale));
   op_cost(0);
@@ -6421,6 +6419,21 @@
   %}
 %}
 
+operand indIndexI2LN(iRegN reg, iRegI ireg)
+%{
+  predicate(Universe::narrow_oop_shift() == 0); // only when narrow oops are not shifted
+  constraint(ALLOC_IN_RC(ptr_reg));
+  match(AddP (DecodeN reg) (ConvI2L ireg)); // decoded narrow base + int index converted to long
+  op_cost(0);
+  format %{ "$reg, $ireg, 0, I2L\t# narrow" %}
+  interface(MEMORY_INTER) %{
+    base($reg);
+    index($ireg);
+    scale(0x0); // index is not shifted
+    disp(0x0); // no displacement in this mode
+  %}
+%}
+
 operand indIndexN(iRegN reg, iRegL lreg)
 %{
   predicate(Universe::narrow_oop_shift() == 0);
@@ -6641,9 +6654,8 @@
 // memory is used to define read/write location for load/store
 // instruction defs. we can turn a memory op into an Address
 
-opclass memory(indirect, indIndexScaledOffsetI, indIndexScaledOffsetL, indIndexOffsetI2L, indIndexScaledOffsetI2L, indIndexScaled, indIndexScaledI2L, indIndex, indOffI, indOffL,
-               indirectN, indIndexScaledOffsetIN, indIndexScaledOffsetLN, indIndexOffsetI2LN, indIndexScaledOffsetI2LN, indIndexScaledN, indIndexScaledI2LN, indIndexN, indOffIN, indOffLN);
-
+opclass memory(indirect, indIndexScaled, indIndexScaledI2L, indIndexI2L, indIndex, indOffI, indOffL,
+               indirectN, indIndexScaledN, indIndexScaledI2LN, indIndexI2LN, indIndexN, indOffIN, indOffLN);
 
 // iRegIorL2I is used for src inputs in rules for 32 bit int (I)
 // operations. it allows the src to be either an iRegI or a (ConvL2I
--- a/hotspot/src/cpu/ppc/vm/ppc.ad	Mon May 09 01:21:55 2016 -0700
+++ b/hotspot/src/cpu/ppc/vm/ppc.ad	Mon May 09 11:34:09 2016 +0200
@@ -817,6 +817,16 @@
 
 source %{
 
+// Decide whether the inputs of address AddP m are cloned into the addressing
+// mode of its memory users. This port defers entirely to the shared
+// clone_base_plus_offset_address() helper and clones nothing else.
+bool Matcher::clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
+  return clone_base_plus_offset_address(m, mstack, address_visited);
+}
+
+void Compile::reshape_address(AddPNode* addp) { // intentionally empty: no address reshaping in this port
+}
+
 // Optimize load-acquire.
 //
 // Check if acquire is unnecessary due to following operation that does
@@ -2157,11 +2167,6 @@
 // Power6 requires postalloc expand (see block.cpp for description of postalloc expand).
 const bool Matcher::require_postalloc_expand = true;
 
-// Should the Matcher clone shifts on addressing modes, expecting them to
-// be subsumed into complex addressing expressions or compute them into
-// registers? True for Intel but false for most RISCs.
-const bool Matcher::clone_shift_expressions = false;
-
 // Do we need to mask the count passed to shift instructions or does
 // the cpu only look at the lower 5/6 bits anyway?
 // PowerPC requires masked shift counts.
--- a/hotspot/src/cpu/sparc/vm/sparc.ad	Mon May 09 01:21:55 2016 -0700
+++ b/hotspot/src/cpu/sparc/vm/sparc.ad	Mon May 09 11:34:09 2016 +0200
@@ -1995,11 +1995,6 @@
 // Does the CPU require late expand (see block.cpp for description of late expand)?
 const bool Matcher::require_postalloc_expand = false;
 
-// Should the Matcher clone shifts on addressing modes, expecting them to
-// be subsumed into complex addressing expressions or compute them into
-// registers?  True for Intel but false for most RISCs
-const bool Matcher::clone_shift_expressions = false;
-
 // Do we need to mask the count passed to shift instructions or does
 // the cpu only look at the lower 5/6 bits anyway?
 const bool Matcher::need_masked_shift_count = false;
@@ -2133,8 +2128,19 @@
   return L7_REGP_mask();
 }
 
+
 const bool Matcher::convi2l_type_required = true;
 
+// Decide whether the inputs of address AddP m are cloned into the addressing
+// mode of its memory users. This port defers entirely to the shared
+// clone_base_plus_offset_address() helper and clones nothing else.
+bool Matcher::clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
+  return clone_base_plus_offset_address(m, mstack, address_visited);
+}
+
+void Compile::reshape_address(AddPNode* addp) { // intentionally empty: no address reshaping in this port
+}
+
 %}
 
 
--- a/hotspot/src/cpu/x86/vm/x86.ad	Mon May 09 01:21:55 2016 -0700
+++ b/hotspot/src/cpu/x86/vm/x86.ad	Mon May 09 11:34:09 2016 +0200
@@ -1586,6 +1586,8 @@
 
 source %{
 
+#include "opto/addnode.hpp"
+
 // Emit exception handler code.
 // Stuff framesize into a register and call a VM stub routine.
 int HandlerImpl::emit_exception_handler(CodeBuffer& cbuf) {
@@ -1861,8 +1863,79 @@
   return false;
 }
 
+
 const bool Matcher::convi2l_type_required = true;
 
+// Check for shift by small constant (<= 3) as well; on success the shift's inputs are pushed on mstack and true is returned
+static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
+  if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
+      shift->in(2)->get_int() <= 3 &&
+      // Are there other uses besides address expressions?
+      !matcher->is_visited(shift)) {
+    address_visited.set(shift->_idx); // Flag as address_visited
+    mstack.push(shift->in(2), Matcher::Visit); // the constant shift count
+    Node *conv = shift->in(1);
+#ifdef _LP64
+    // Allow Matcher to match the rule which bypass
+    // ConvI2L operation for an array index on LP64
+    // if the index value is positive.
+    if (conv->Opcode() == Op_ConvI2L &&
+        conv->as_Type()->type()->is_long()->_lo >= 0 && // lower bound of the ConvI2L's long type is non-negative
+        // Are there other uses besides address expressions?
+        !matcher->is_visited(conv)) {
+      address_visited.set(conv->_idx); // Flag as address_visited
+      mstack.push(conv->in(1), Matcher::Pre_Visit); // continue with the ConvI2L's input
+    } else
+#endif
+      mstack.push(conv, Matcher::Pre_Visit); // shifted value is pushed as-is
+    return true;
+  }
+  return false; // nothing pushed; caller decides what to do with shift
+}
+
+// Decide whether the inputs of address AddP m are cloned into the addressing
+// mode of its memory users. Handles a constant offset (optionally on top of a
+// second AddP) and an offset that clone_shift() accepts; otherwise returns false.
+bool Matcher::clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
+  Node *off = m->in(AddPNode::Offset);
+  if (off->is_Con()) {
+    address_visited.test_set(m->_idx); // Flag as address_visited
+    Node *adr = m->in(AddPNode::Address);
+
+    // Intel can handle 2 adds in addressing mode
+    // AtomicAdd is not an addressing expression.
+    // Cheap to find it by looking for screwy base.
+    if (adr->is_AddP() &&
+        !adr->in(AddPNode::Base)->is_top() &&
+        // Are there other uses besides address expressions?
+        !is_visited(adr)) {
+      address_visited.set(adr->_idx); // Flag as address_visited
+      Node *shift = adr->in(AddPNode::Offset);
+      if (!clone_shift(shift, this, mstack, address_visited)) {
+        mstack.push(shift, Pre_Visit); // inner offset not accepted by clone_shift(): push it unchanged
+      }
+      mstack.push(adr->in(AddPNode::Address), Pre_Visit);
+      mstack.push(adr->in(AddPNode::Base), Pre_Visit);
+    } else {
+      mstack.push(adr, Pre_Visit); // single-level address: push the address input itself
+    }
+
+    // Clone X+offset as it also folds into most addressing expressions
+    mstack.push(off, Visit);
+    mstack.push(m->in(AddPNode::Base), Pre_Visit);
+    return true;
+  } else if (clone_shift(off, this, mstack, address_visited)) { // offset is a shift accepted by clone_shift()
+    address_visited.test_set(m->_idx); // Flag as address_visited
+    mstack.push(m->in(AddPNode::Address), Pre_Visit);
+    mstack.push(m->in(AddPNode::Base), Pre_Visit);
+    return true;
+  }
+  return false; // caller pushes m itself
+}
+
+void Compile::reshape_address(AddPNode* addp) { // intentionally empty: no address reshaping in this port
+}
+
 // Helper methods for MachSpillCopyNode::implementation().
 static int vec_mov_helper(CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo,
                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
--- a/hotspot/src/cpu/x86/vm/x86_32.ad	Mon May 09 01:21:55 2016 -0700
+++ b/hotspot/src/cpu/x86/vm/x86_32.ad	Mon May 09 11:34:09 2016 +0200
@@ -1438,11 +1438,6 @@
 // Does the CPU require late expand (see block.cpp for description of late expand)?
 const bool Matcher::require_postalloc_expand = false;
 
-// Should the Matcher clone shifts on addressing modes, expecting them to
-// be subsumed into complex addressing expressions or compute them into
-// registers?  True for Intel but false for most RISCs
-const bool Matcher::clone_shift_expressions = true;
-
 // Do we need to mask the count passed to shift instructions or does
 // the cpu only look at the lower 5/6 bits anyway?
 const bool Matcher::need_masked_shift_count = false;
--- a/hotspot/src/cpu/x86/vm/x86_64.ad	Mon May 09 01:21:55 2016 -0700
+++ b/hotspot/src/cpu/x86/vm/x86_64.ad	Mon May 09 11:34:09 2016 +0200
@@ -1646,11 +1646,6 @@
 // Does the CPU require late expand (see block.cpp for description of late expand)?
 const bool Matcher::require_postalloc_expand = false;
 
-// Should the Matcher clone shifts on addressing modes, expecting them
-// to be subsumed into complex addressing expressions or compute them
-// into registers?  True for Intel but false for most RISCs
-const bool Matcher::clone_shift_expressions = true;
-
 // Do we need to mask the count passed to shift instructions or does
 // the cpu only look at the lower 5/6 bits anyway?
 const bool Matcher::need_masked_shift_count = false;
--- a/hotspot/src/share/vm/opto/compile.cpp	Mon May 09 01:21:55 2016 -0700
+++ b/hotspot/src/share/vm/opto/compile.cpp	Mon May 09 11:34:09 2016 +0200
@@ -2905,6 +2905,8 @@
       }
     }
 #endif
+    // platform dependent reshaping of the address expression
+    reshape_address(n->as_AddP());
     break;
   }
 
--- a/hotspot/src/share/vm/opto/compile.hpp	Mon May 09 01:21:55 2016 -0700
+++ b/hotspot/src/share/vm/opto/compile.hpp	Mon May 09 11:34:09 2016 +0200
@@ -44,6 +44,7 @@
 #include "trace/tracing.hpp"
 #include "utilities/ticks.hpp"
 
+class AddPNode;
 class Block;
 class Bundle;
 class C2Compiler;
@@ -579,6 +580,8 @@
   int                   _scratch_const_size;    // For temporary code buffers.
   bool                  _in_scratch_emit_size;  // true when in scratch_emit_size.
 
+  void reshape_address(AddPNode* n);
+
  public:
   // Accessors
 
--- a/hotspot/src/share/vm/opto/matcher.cpp	Mon May 09 01:21:55 2016 -0700
+++ b/hotspot/src/share/vm/opto/matcher.cpp	Mon May 09 11:34:09 2016 +0200
@@ -963,44 +963,6 @@
 }
 #endif
 
-
-//------------------------------MStack-----------------------------------------
-// State and MStack class used in xform() and find_shared() iterative methods.
-enum Node_State { Pre_Visit,  // node has to be pre-visited
-                      Visit,  // visit node
-                 Post_Visit,  // post-visit node
-             Alt_Post_Visit   // alternative post-visit path
-                };
-
-class MStack: public Node_Stack {
-  public:
-    MStack(int size) : Node_Stack(size) { }
-
-    void push(Node *n, Node_State ns) {
-      Node_Stack::push(n, (uint)ns);
-    }
-    void push(Node *n, Node_State ns, Node *parent, int indx) {
-      ++_inode_top;
-      if ((_inode_top + 1) >= _inode_max) grow();
-      _inode_top->node = parent;
-      _inode_top->indx = (uint)indx;
-      ++_inode_top;
-      _inode_top->node = n;
-      _inode_top->indx = (uint)ns;
-    }
-    Node *parent() {
-      pop();
-      return node();
-    }
-    Node_State state() const {
-      return (Node_State)index();
-    }
-    void set_state(Node_State ns) {
-      set_index((uint)ns);
-    }
-};
-
-
 //------------------------------xform------------------------------------------
 // Given a Node in old-space, Match him (Label/Reduce) to produce a machine
 // Node in new-space.  Given a new-space Node, recursively walk his children.
@@ -2046,37 +2008,22 @@
 }
 #endif // X86
 
-// A method-klass-holder may be passed in the inline_cache_reg
-// and then expanded into the inline_cache_reg and a method_oop register
-//   defined in ad_<arch>.cpp
-
-// Check for shift by small constant as well
-static bool clone_shift(Node* shift, Matcher* matcher, MStack& mstack, VectorSet& address_visited) {
-  if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
-      shift->in(2)->get_int() <= 3 &&
-      // Are there other uses besides address expressions?
-      !matcher->is_visited(shift)) {
-    address_visited.set(shift->_idx); // Flag as address_visited
-    mstack.push(shift->in(2), Visit);
-    Node *conv = shift->in(1);
-#ifdef _LP64
-    // Allow Matcher to match the rule which bypass
-    // ConvI2L operation for an array index on LP64
-    // if the index value is positive.
-    if (conv->Opcode() == Op_ConvI2L &&
-        conv->as_Type()->type()->is_long()->_lo >= 0 &&
-        // Are there other uses besides address expressions?
-        !matcher->is_visited(conv)) {
-      address_visited.set(conv->_idx); // Flag as address_visited
-      mstack.push(conv->in(1), Pre_Visit);
-    } else
-#endif
-      mstack.push(conv, Pre_Visit);
+bool Matcher::clone_base_plus_offset_address(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) { // shared helper: handles only an AddP whose offset is a constant
+  Node *off = m->in(AddPNode::Offset);
+  if (off->is_Con()) {
+    address_visited.test_set(m->_idx); // Flag as address_visited
+    mstack.push(m->in(AddPNode::Address), Pre_Visit);
+    // Clone X+offset as it also folds into most addressing expressions
+    mstack.push(off, Visit);
+    mstack.push(m->in(AddPNode::Base), Pre_Visit);
+    return true; // m's inputs were pushed here; the caller must not push m
+  }
+  return false; // offset is not a constant: nothing was pushed
+}
 
+// A method-klass-holder may be passed in the inline_cache_reg
+// and then expanded into the inline_cache_reg and a method_oop register
+//   defined in ad_<arch>.cpp
 
 //------------------------------find_shared------------------------------------
 // Set bits if Node is shared or otherwise a root
@@ -2251,40 +2198,9 @@
           // But they should be marked as shared if there are other uses
           // besides address expressions.
 
-          Node *off = m->in(AddPNode::Offset);
-          if (off->is_Con()) {
-            address_visited.test_set(m->_idx); // Flag as address_visited
-            Node *adr = m->in(AddPNode::Address);
-
-            // Intel, ARM and friends can handle 2 adds in addressing mode
-            if( clone_shift_expressions && adr->is_AddP() &&
-                // AtomicAdd is not an addressing expression.
-                // Cheap to find it by looking for screwy base.
-                !adr->in(AddPNode::Base)->is_top() &&
-                // Are there other uses besides address expressions?
-                !is_visited(adr) ) {
-              address_visited.set(adr->_idx); // Flag as address_visited
-              Node *shift = adr->in(AddPNode::Offset);
-              if (!clone_shift(shift, this, mstack, address_visited)) {
-                mstack.push(shift, Pre_Visit);
-              }
-              mstack.push(adr->in(AddPNode::Address), Pre_Visit);
-              mstack.push(adr->in(AddPNode::Base), Pre_Visit);
-            } else {  // Sparc, Alpha, PPC and friends
-              mstack.push(adr, Pre_Visit);
-            }
-
-            // Clone X+offset as it also folds into most addressing expressions
-            mstack.push(off, Visit);
-            mstack.push(m->in(AddPNode::Base), Pre_Visit);
-            continue; // for(int i = ...)
-          } else if (clone_shift_expressions &&
-                     clone_shift(off, this, mstack, address_visited)) {
-              address_visited.test_set(m->_idx); // Flag as address_visited
-              mstack.push(m->in(AddPNode::Address), Pre_Visit);
-              mstack.push(m->in(AddPNode::Base), Pre_Visit);
-              continue;
-          } // if( off->is_Con() )
+          if (clone_address_expressions(m->as_AddP(), mstack, address_visited)) {
+            continue;
+          }
         }   // if( mem_op &&
         mstack.push(m, Pre_Visit);
       }     // for(int i = ...)
--- a/hotspot/src/share/vm/opto/matcher.hpp	Mon May 09 01:21:55 2016 -0700
+++ b/hotspot/src/share/vm/opto/matcher.hpp	Mon May 09 11:34:09 2016 +0200
@@ -40,6 +40,45 @@
 //---------------------------Matcher-------------------------------------------
 class Matcher : public PhaseTransform {
   friend class VMStructs;
+
+public:
+
+  // State and MStack class used in xform() and find_shared() iterative methods.
+  enum Node_State { Pre_Visit,  // node has to be pre-visited
+                    Visit,  // visit node
+                    Post_Visit,  // post-visit node
+                    Alt_Post_Visit   // alternative post-visit path
+  };
+
+  class MStack: public Node_Stack { // Node_Stack whose per-entry index slot stores a Node_State
+  public:
+    MStack(int size) : Node_Stack(size) { }
+
+    void push(Node *n, Node_State ns) { // push n with state ns
+      Node_Stack::push(n, (uint)ns);
+    }
+    void push(Node *n, Node_State ns, Node *parent, int indx) { // push two entries: (parent, indx), then (n, ns) on top
+      ++_inode_top;
+      if ((_inode_top + 1) >= _inode_max) grow(); // make sure there is room for both entries
+      _inode_top->node = parent;
+      _inode_top->indx = (uint)indx;
+      ++_inode_top;
+      _inode_top->node = n;
+      _inode_top->indx = (uint)ns;
+    }
+    Node *parent() { // pops one entry, then returns node(); presumably pairs with the four-argument push
+      pop();
+      return node();
+    }
+    Node_State state() const { // state stored in the current entry's index slot
+      return (Node_State)index();
+    }
+    void set_state(Node_State ns) { // overwrite the current entry's state
+      set_index((uint)ns);
+    }
+  };
+
+private:
   // Private arena of State objects
   ResourceArea _states_arena;
 
@@ -411,7 +450,9 @@
   // Should the Matcher clone shifts on addressing modes, expecting them to
   // be subsumed into complex addressing expressions or compute them into
   // registers?  True for Intel but false for most RISCs
-  static const bool clone_shift_expressions;
+  bool clone_address_expressions(AddPNode* m, MStack& mstack, VectorSet& address_visited);
+  // Clone base + offset address expression
+  bool clone_base_plus_offset_address(AddPNode* m, MStack& mstack, VectorSet& address_visited);
 
   static bool narrow_oop_use_complex_address();
   static bool narrow_klass_use_complex_address();