8139340: SuperWord enhancement to support vector conditional move (CMovVD) on Intel AVX cpu
authoriveresov
Mon, 26 Oct 2015 19:33:31 -0700
changeset 33469 30f4811eded0
parent 33468 b82c87453d2d
child 33470 0ce01b662ff2
8139340: SuperWord enhancement to support vector conditional move (CMovVD) on Intel AVX cpu Summary: Emit vector conditional moves Reviewed-by: kvn Contributed-by: jan.civlin@intel.com
hotspot/src/cpu/x86/vm/assembler_x86.cpp
hotspot/src/cpu/x86/vm/assembler_x86.hpp
hotspot/src/cpu/x86/vm/x86.ad
hotspot/src/share/vm/adlc/formssel.cpp
hotspot/src/share/vm/opto/c2_globals.hpp
hotspot/src/share/vm/opto/classes.hpp
hotspot/src/share/vm/opto/compile.cpp
hotspot/src/share/vm/opto/compile.hpp
hotspot/src/share/vm/opto/loopnode.hpp
hotspot/src/share/vm/opto/loopopts.cpp
hotspot/src/share/vm/opto/matcher.cpp
hotspot/src/share/vm/opto/superword.cpp
hotspot/src/share/vm/opto/superword.hpp
hotspot/src/share/vm/opto/vectornode.cpp
hotspot/src/share/vm/opto/vectornode.hpp
hotspot/src/share/vm/runtime/vmStructs.cpp
--- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp	Mon Oct 26 10:36:54 2015 +0100
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp	Mon Oct 26 19:33:31 2015 -0700
@@ -6349,6 +6349,26 @@
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
+void Assembler::cmppd(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop, int vector_len) {
+  assert(VM_Version::supports_avx(), "");
+  assert(!VM_Version::supports_evex(), "");
+  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector_len, VEX_OPCODE_0F, /* no_mask_reg */ false);
+  emit_int8((unsigned char)0xC2);
+  emit_int8((unsigned char)(0xC0 | encode));
+  emit_int8((unsigned char)(0xF & cop));
+}
+
+void Assembler::vpblendd(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len) {
+  assert(VM_Version::supports_avx(), "");
+  assert(!VM_Version::supports_evex(), "");
+  int encode = vex_prefix_and_encode(dst, nds, src1, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_3A, /* no_mask_reg */ false);
+  emit_int8((unsigned char)0x4B);
+  emit_int8((unsigned char)(0xC0 | encode));
+  int src2_enc = src2->encoding();
+  emit_int8((unsigned char)(0xF0 & src2_enc<<4));
+}
+
+
 #ifndef _LP64
 
 void Assembler::incl(Register dst) {
--- a/hotspot/src/cpu/x86/vm/assembler_x86.hpp	Mon Oct 26 10:36:54 2015 +0100
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.hpp	Mon Oct 26 19:33:31 2015 -0700
@@ -2147,6 +2147,11 @@
   // runtime code and native libraries.
   void vzeroupper();
 
+  // AVX support for vectorized conditional move (double). The following two instructions used only coupled.
+  void cmppd(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop, int vector_len);
+  void vpblendd(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len);
+
+
  protected:
   // Next instructions require address alignment 16 bytes SSE mode.
   // They should be called only from corresponding MacroAssembler instructions.
--- a/hotspot/src/cpu/x86/vm/x86.ad	Mon Oct 26 10:36:54 2015 +0100
+++ b/hotspot/src/cpu/x86/vm/x86.ad	Mon Oct 26 19:33:31 2015 -0700
@@ -1707,6 +1707,10 @@
       if (!VM_Version::supports_cx8())
         ret_value = false;
       break;
+    case Op_CMoveVD:
+      if (UseAVX < 1 || UseAVX > 2)
+        ret_value = false;
+      break;
   }
 
   return ret_value;  // Per default match rules are supported.
@@ -2089,6 +2093,29 @@
   interface(REG_INTER);
 %}
 
+// Comparison Code for FP conditional move
+operand cmpOp_vcmppd() %{
+  match(Bool);
+
+  predicate(n->as_Bool()->_test._test != BoolTest::overflow &&
+            n->as_Bool()->_test._test != BoolTest::no_overflow);
+  format %{ "" %}
+  interface(COND_INTER) %{
+    equal        (0x0, "eq");
+    less         (0x1, "lt");
+    less_equal   (0x2, "le");
+    not_equal    (0xC, "ne");
+    greater_equal(0xD, "ge");
+    greater      (0xE, "gt");
+    //TODO cannot compile (adlc breaks) without two next lines with error:
+    // x86_64.ad(13987) Syntax Error: :In operand cmpOp_vcmppd: Do not support this encode constant: ' %{
+    // equal' for overflow.
+    overflow     (0x20, "o");  // not really supported by the instruction
+    no_overflow  (0x21, "no"); // not really supported by the instruction
+  %}
+%}
+
+
 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
 
 // ============================================================================
@@ -7393,6 +7420,22 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct vcmov4D_reg(vecY dst, vecY src1, vecY src2, immI8 cop, cmpOp_vcmppd copnd) %{
+  predicate(UseAVX > 0 && UseAVX < 3 && n->as_Vector()->length() == 4);
+  match(Set dst (CMoveVD (Binary copnd cop) (Binary src1 src2)));
+  effect(TEMP dst, USE src1, USE src2);
+  format %{ "cmppd.$copnd  $dst, $src1, $src2  ! vcmovevd, cond=$cop\n\t"
+            "vpblendd $dst,$src1,$src2,$dst ! vcmovevd\n\t"
+         %}
+  ins_encode %{
+    int vector_len = 1;
+    int cond = (Assembler::Condition)($copnd$$cmpcode);
+    __ cmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vector_len);
+    __ vpblendd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 // --------------------------------- DIV --------------------------------------
 
 // Floats vector div
--- a/hotspot/src/share/vm/adlc/formssel.cpp	Mon Oct 26 10:36:54 2015 +0100
+++ b/hotspot/src/share/vm/adlc/formssel.cpp	Mon Oct 26 19:33:31 2015 -0700
@@ -4140,6 +4140,7 @@
     "AddVB","AddVS","AddVI","AddVL","AddVF","AddVD",
     "SubVB","SubVS","SubVI","SubVL","SubVF","SubVD",
     "MulVS","MulVI","MulVL","MulVF","MulVD",
+    "CMoveVD",
     "DivVF","DivVD",
     "AbsVF","AbsVD",
     "NegVF","NegVD",
--- a/hotspot/src/share/vm/opto/c2_globals.hpp	Mon Oct 26 10:36:54 2015 +0100
+++ b/hotspot/src/share/vm/opto/c2_globals.hpp	Mon Oct 26 19:33:31 2015 -0700
@@ -342,6 +342,9 @@
   product(bool, SuperWordReductions, true,                                  \
           "Enable reductions support in superword.")                        \
                                                                             \
+  product(bool, UseCMoveUnconditionally, false,                             \
+          "Use CMove (scalar and vector) ignoring profitability test.")     \
+                                                                            \
   product(bool, DoReserveCopyInSuperWord, true,                             \
           "Create reserve copy of graph in SuperWord.")                     \
                                                                             \
--- a/hotspot/src/share/vm/opto/classes.hpp	Mon Oct 26 10:36:54 2015 +0100
+++ b/hotspot/src/share/vm/opto/classes.hpp	Mon Oct 26 19:33:31 2015 -0700
@@ -64,6 +64,7 @@
 macro(ClearArray)
 macro(ConstraintCast)
 macro(CMoveD)
+macro(CMoveVD)
 macro(CMoveF)
 macro(CMoveI)
 macro(CMoveL)
--- a/hotspot/src/share/vm/opto/compile.cpp	Mon Oct 26 10:36:54 2015 +0100
+++ b/hotspot/src/share/vm/opto/compile.cpp	Mon Oct 26 19:33:31 2015 -0700
@@ -1108,20 +1108,17 @@
 
   set_do_vector_loop(false);
 
-  bool do_vector = false;
   if (AllowVectorizeOnDemand) {
     if (has_method() && (_directive->VectorizeOption || _directive->VectorizeDebugOption)) {
       set_do_vector_loop(true);
+      NOT_PRODUCT(if (do_vector_loop() && Verbose) {tty->print("Compile::Init: do vectorized loops (SIMD like) for method %s\n",  method()->name()->as_quoted_ascii());})
     } else if (has_method() && method()->name() != 0 &&
                method()->intrinsic_id() == vmIntrinsics::_forEachRemaining) {
       set_do_vector_loop(true);
     }
-#ifndef PRODUCT
-    if (do_vector_loop() && Verbose) {
-      tty->print("Compile::Init: do vectorized loops (SIMD like) for method %s\n",  method()->name()->as_quoted_ascii());
-    }
-#endif
   }
+  set_use_cmove(UseCMoveUnconditionally /* || do_vector_loop()*/); //TODO: consider do_vector_loop() mandate use_cmove unconditionally
+  NOT_PRODUCT(if (use_cmove() && Verbose && has_method()) {tty->print("Compile::Init: use CMove without profitability tests for method %s\n",  method()->name()->as_quoted_ascii());})
 
   set_age_code(has_method() && method()->profile_aging());
   set_rtm_state(NoRTM); // No RTM lock eliding by default
--- a/hotspot/src/share/vm/opto/compile.hpp	Mon Oct 26 10:36:54 2015 +0100
+++ b/hotspot/src/share/vm/opto/compile.hpp	Mon Oct 26 19:33:31 2015 -0700
@@ -374,6 +374,7 @@
   bool                  _do_count_invocations;  // True if we generate code to count invocations
   bool                  _do_method_data_update; // True if we generate code to update MethodData*s
   bool                  _do_vector_loop;        // True if allowed to execute loop in parallel iterations
+  bool                  _use_cmove;             // True if CMove should be used without profitability analysis
   bool                  _age_code;              // True if we need to profile code age (decrement the aging counter)
   int                   _AliasLevel;            // Locally-adjusted version of AliasLevel flag.
   bool                  _print_assembly;        // True if we should dump assembly code for this compilation
@@ -657,6 +658,8 @@
   void          set_do_method_data_update(bool z) { _do_method_data_update = z; }
   bool              do_vector_loop() const      { return _do_vector_loop; }
   void          set_do_vector_loop(bool z)      { _do_vector_loop = z; }
+  bool              use_cmove() const           { return _use_cmove; }
+  void          set_use_cmove(bool z)           { _use_cmove = z; }
   bool              age_code() const             { return _age_code; }
   void          set_age_code(bool z)             { _age_code = z; }
   int               AliasLevel() const           { return _AliasLevel; }
--- a/hotspot/src/share/vm/opto/loopnode.hpp	Mon Oct 26 10:36:54 2015 +0100
+++ b/hotspot/src/share/vm/opto/loopnode.hpp	Mon Oct 26 19:33:31 2015 -0700
@@ -1143,7 +1143,7 @@
 // dominated in the outer loop by this node chain:
 //   intcon(1)->If->IfFalse->reserved_copy.
 // The original loop is dominated by the the same node chain but IfTrue projection:
-//   intcon(1)->If->IfTrue->original_loop.
+//   intcon(0)->If->IfTrue->original_loop.
 //
 // In this implementation of CountedLoopReserveKit the ctor includes create_reserve()
 // and the dtor, checks _use_new value.
--- a/hotspot/src/share/vm/opto/loopopts.cpp	Mon Oct 26 10:36:54 2015 +0100
+++ b/hotspot/src/share/vm/opto/loopopts.cpp	Mon Oct 26 19:33:31 2015 -0700
@@ -512,8 +512,11 @@
     PhiNode* phi = out->as_Phi();
     BasicType bt = phi->type()->basic_type();
     switch (bt) {
-    case T_FLOAT:
-    case T_DOUBLE: {
+    case T_DOUBLE:
+      if (C->use_cmove()) {
+        continue; //TODO: maybe we want to add some cost
+      }
+    case T_FLOAT: {
       cost += Matcher::float_cmove_cost(); // Could be very expensive
       break;
     }
@@ -573,7 +576,7 @@
         }
       }
     }
-  }
+  }//for
   Node* bol = iff->in(1);
   assert(bol->Opcode() == Op_Bool, "");
   int cmp_op = bol->in(1)->Opcode();
@@ -595,7 +598,8 @@
   }
   // Check for highly predictable branch.  No point in CMOV'ing if
   // we are going to predict accurately all the time.
-  if (iff->_prob < infrequent_prob ||
+  if (C->use_cmove() && cmp_op == Op_CmpD) ;//keep going
+  else if (iff->_prob < infrequent_prob ||
       iff->_prob > (1.0f - infrequent_prob))
     return NULL;
 
--- a/hotspot/src/share/vm/opto/matcher.cpp	Mon Oct 26 10:36:54 2015 +0100
+++ b/hotspot/src/share/vm/opto/matcher.cpp	Mon Oct 26 19:33:31 2015 -0700
@@ -2316,7 +2316,8 @@
       case Op_CMoveI:
       case Op_CMoveL:
       case Op_CMoveN:
-      case Op_CMoveP: {
+      case Op_CMoveP:
+      case Op_CMoveVD:  {
         // Restructure into a binary tree for Matching.  It's possible that
         // we could move this code up next to the graph reshaping for IfNodes
         // or vice-versa, but I do not want to debug this for Ladybird.
--- a/hotspot/src/share/vm/opto/superword.cpp	Mon Oct 26 10:36:54 2015 +0100
+++ b/hotspot/src/share/vm/opto/superword.cpp	Mon Oct 26 19:33:31 2015 -0700
@@ -37,6 +37,7 @@
 #include "opto/opaquenode.hpp"
 #include "opto/superword.hpp"
 #include "opto/vectornode.hpp"
+#include "opto/movenode.hpp"
 
 //
 //                  S U P E R W O R D   T R A N S F O R M
@@ -55,6 +56,7 @@
   _mem_slice_tail(arena(), 8,  0, NULL),  // memory slice tails
   _node_info(arena(), 8,  0, SWNodeInfo::initial), // info needed per node
   _clone_map(phase->C->clone_map()),      // map of nodes created in cloning
+  _cmovev_kit(_arena, this),              // map to facilitate CMoveVD creation
   _align_to_ref(NULL),                    // memory reference to align vectors to
   _disjoint_ptrs(arena(), 8,  0, OrderedPair::initial), // runtime disambiguated pointer pairs
   _dg(_arena),                            // dependence graph
@@ -72,6 +74,7 @@
   _num_work_vecs(0),                      // amount of vector work we have
   _num_reductions(0),                     // amount of reduction work we have
   _do_vector_loop(phase->C->do_vector_loop()),  // whether to do vectorization/simd style
+  _do_reserve_copy(DoReserveCopyInSuperWord),
   _ii_first(-1),                          // first loop generation index - only if do_vector_loop()
   _ii_last(-1),                           // last loop generation index - only if do_vector_loop()
   _ii_order(arena(), 8, 0, 0)
@@ -82,10 +85,6 @@
     _vector_loop_debug = phase->C->directive()->VectorizeDebugOption;
   }
 
-  _CountedLoopReserveKit_debug = 0;
-  if (_phase->C->method() != NULL) {
-    _CountedLoopReserveKit_debug = phase->C->directive()->DoReserveCopyInSuperWordDebugOption;
-  }
 #endif
 }
 
@@ -103,7 +102,18 @@
   if (!cl->is_main_loop() ) return; // skip normal, pre, and post loops
   // Check for no control flow in body (other than exit)
   Node *cl_exit = cl->loopexit();
-  if (cl_exit->in(0) != lpt->_head) return;
+  if (cl_exit->in(0) != lpt->_head) {
+    #ifndef PRODUCT
+      if (TraceSuperWord) {
+        tty->print_cr("SuperWord::transform_loop: loop too complicated, cl_exit->in(0) != lpt->_head");
+        tty->print("cl_exit %d", cl_exit->_idx); cl_exit->dump();
+        tty->print("cl_exit->in(0) %d", cl_exit->in(0)->_idx); cl_exit->in(0)->dump();
+        tty->print("lpt->_head %d", lpt->_head->_idx); lpt->_head->dump();
+        lpt->dump_head();
+      }
+    #endif
+    return;
+  }
 
   // Make sure the are no extra control users of the loop backedge
   if (cl->back_control()->outcnt() != 1) {
@@ -392,6 +402,10 @@
 
   construct_my_pack_map();
 
+  if (_do_vector_loop) {
+    merge_packs_to_cmovd();
+  }
+
   filter_packs();
 
   schedule();
@@ -1072,6 +1086,17 @@
 
 //------------------------------data_size---------------------------
 int SuperWord::data_size(Node* s) {
+  Node* use = NULL; //test if the node is a candidate for CMoveVD optimization, then return the size of CMov
+  if (_do_vector_loop) {
+    use = _cmovev_kit.is_Bool_candidate(s);
+    if (use != NULL) {
+      return data_size(use);
+    }
+    use = _cmovev_kit.is_CmpD_candidate(s);
+    if (use != NULL) {
+      return data_size(use);
+    }
+  }
   int bsize = type2aelembytes(velt_basic_type(s));
   assert(bsize != 0, "valid size");
   return bsize;
@@ -1118,6 +1143,7 @@
   if (s1->is_Load()) return false;
 
   int align = alignment(s1);
+  NOT_PRODUCT(if(is_trace_alignment()) tty->print_cr("SuperWord::follow_use_defs: s1 %d, align %d", s1->_idx, align);)
   bool changed = false;
   int start = s1->is_Store() ? MemNode::ValueIn   : 1;
   int end   = s1->is_Store() ? MemNode::ValueIn+1 : s1->req();
@@ -1132,6 +1158,7 @@
         pair->push(t1);
         pair->push(t2);
         _packset.append(pair);
+        NOT_PRODUCT(if(is_trace_alignment()) tty->print_cr("SuperWord::follow_use_defs: set_alignment(%d, %d, %d)", t1->_idx, t2->_idx, align);)
         set_alignment(t1, t2, align);
         changed = true;
       }
@@ -1153,6 +1180,7 @@
   if (s1->is_Store()) return false;
 
   int align = alignment(s1);
+  NOT_PRODUCT(if(is_trace_alignment()) tty->print_cr("SuperWord::follow_def_uses: s1 %d, align %d", s1->_idx, align);)
   int savings = -1;
   int num_s1_uses = 0;
   Node* u1 = NULL;
@@ -1184,6 +1212,7 @@
     pair->push(u1);
     pair->push(u2);
     _packset.append(pair);
+    NOT_PRODUCT(if(is_trace_alignment()) tty->print_cr("SuperWord::follow_def_uses: set_alignment(%d, %d, %d)", u1->_idx, u2->_idx, align);)
     set_alignment(u1, u2, align);
     changed = true;
   }
@@ -1459,6 +1488,196 @@
 #endif
 }
 
+//------------------------------merge_packs_to_cmovd---------------------------
+// Merge CMoveD into new vector-nodes
+// We want to catch this pattern and subsume CmpD and Bool into CMoveD
+//
+//                   SubD             ConD
+//                  /  |               /
+//                 /   |           /   /
+//                /    |       /      /
+//               /     |   /         /
+//              /      /            /
+//             /    /  |           /
+//            v /      |          /
+//         CmpD        |         /
+//          |          |        /
+//          v          |       /
+//         Bool        |      /
+//           \         |     /
+//             \       |    /
+//               \     |   /
+//                 \   |  /
+//                   \ v /
+//                   CMoveD
+//
+
+void SuperWord::merge_packs_to_cmovd() {
+  for (int i = _packset.length() - 1; i >= 0; i--) {
+    _cmovev_kit.make_cmovevd_pack(_packset.at(i));
+  }
+  #ifndef PRODUCT
+    if (TraceSuperWord) {
+      tty->print_cr("\nSuperWord::merge_packs_to_cmovd(): After merge");
+      print_packset();
+      tty->cr();
+    }
+  #endif
+}
+
+Node* CMoveKit::is_Bool_candidate(Node* def) const {
+  Node* use = NULL;
+  if (!def->is_Bool() || def->in(0) != NULL || def->outcnt() != 1) {
+    return NULL;
+  }
+  for (DUIterator_Fast jmax, j = def->fast_outs(jmax); j < jmax; j++) {
+    use = def->fast_out(j);
+    if (!_sw->same_generation(def, use) || !use->is_CMove()) {
+      return NULL;
+    }
+  }
+  return use;
+}
+
+Node* CMoveKit::is_CmpD_candidate(Node* def) const {
+  Node* use = NULL;
+  if (!def->is_Cmp() || def->in(0) != NULL || def->outcnt() != 1) {
+    return NULL;
+  }
+  for (DUIterator_Fast jmax, j = def->fast_outs(jmax); j < jmax; j++) {
+    use = def->fast_out(j);
+    if (!_sw->same_generation(def, use) || (use = is_Bool_candidate(use)) == NULL || !_sw->same_generation(def, use)) {
+      return NULL;
+    }
+  }
+  return use;
+}
+
+Node_List* CMoveKit::make_cmovevd_pack(Node_List* cmovd_pk) {
+  Node *cmovd = cmovd_pk->at(0);
+  if (!cmovd->is_CMove()) {
+    return NULL;
+  }
+  if (pack(cmovd) != NULL) { // already in the cmov pack
+    return NULL;
+  }
+  if (cmovd->in(0) != NULL) {
+    NOT_PRODUCT(if(_sw->is_trace_cmov()) {tty->print("CMoveKit::make_cmovevd_pack: CMoveD %d has control flow, escaping...", cmovd->_idx); cmovd->dump();})
+    return NULL;
+  }
+
+  Node* bol = cmovd->as_CMove()->in(CMoveNode::Condition);
+  if (!bol->is_Bool()
+      || bol->outcnt() != 1
+      || !_sw->same_generation(bol, cmovd)
+      || bol->in(0) != NULL  // BoolNode has control flow!!
+      || _sw->my_pack(bol) == NULL) {
+      NOT_PRODUCT(if(_sw->is_trace_cmov()) {tty->print("CMoveKit::make_cmovevd_pack: Bool %d does not fit CMoveD %d for building vector, escaping...", bol->_idx, cmovd->_idx); bol->dump();})
+      return NULL;
+  }
+  Node_List* bool_pk = _sw->my_pack(bol);
+  if (bool_pk->size() != cmovd_pk->size() ) {
+    return NULL;
+  }
+
+  Node* cmpd = bol->in(1);
+  if (!cmpd->is_Cmp()
+      || cmpd->outcnt() != 1
+      || !_sw->same_generation(cmpd, cmovd)
+      || cmpd->in(0) != NULL  // CmpDNode has control flow!!
+      || _sw->my_pack(cmpd) == NULL) {
+      NOT_PRODUCT(if(_sw->is_trace_cmov()) {tty->print("CMoveKit::make_cmovevd_pack: CmpD %d does not fit CMoveD %d for building vector, escaping...", cmpd->_idx, cmovd->_idx); cmpd->dump();})
+      return NULL;
+  }
+  Node_List* cmpd_pk = _sw->my_pack(cmpd);
+  if (cmpd_pk->size() != cmovd_pk->size() ) {
+    return NULL;
+  }
+
+  if (!test_cmpd_pack(cmpd_pk, cmovd_pk)) {
+    NOT_PRODUCT(if(_sw->is_trace_cmov()) {tty->print("CMoveKit::make_cmovevd_pack: cmpd pack for CmpD %d failed vectorization test", cmpd->_idx); cmpd->dump();})
+    return NULL;
+  }
+
+  Node_List* new_cmpd_pk = new Node_List();
+  uint sz = cmovd_pk->size() - 1;
+  for (uint i = 0; i <= sz; ++i) {
+    Node* cmov = cmovd_pk->at(i);
+    Node* bol  = bool_pk->at(i);
+    Node* cmp  = cmpd_pk->at(i);
+
+    new_cmpd_pk->insert(i, cmov);
+
+    map(cmov, new_cmpd_pk);
+    map(bol, new_cmpd_pk);
+    map(cmp, new_cmpd_pk);
+
+    _sw->set_my_pack(cmov, new_cmpd_pk); // and keep old packs for cmp and bool
+  }
+  _sw->_packset.remove(cmovd_pk);
+  _sw->_packset.remove(bool_pk);
+  _sw->_packset.remove(cmpd_pk);
+  _sw->_packset.append(new_cmpd_pk);
+  NOT_PRODUCT(if(_sw->is_trace_cmov()) {tty->print_cr("CMoveKit::make_cmovevd_pack: added syntactic CMoveD pack"); _sw->print_pack(new_cmpd_pk);})
+  return new_cmpd_pk;
+}
+
+bool CMoveKit::test_cmpd_pack(Node_List* cmpd_pk, Node_List* cmovd_pk) {
+  Node* cmpd0 = cmpd_pk->at(0);
+  assert(cmpd0->is_Cmp(), "CMoveKit::test_cmpd_pack: should be CmpDNode");
+  assert(cmovd_pk->at(0)->is_CMove(), "CMoveKit::test_cmpd_pack: should be CMoveD");
+  assert(cmpd_pk->size() == cmovd_pk->size(), "CMoveKit::test_cmpd_pack: should be same size");
+  Node* in1 = cmpd0->in(1);
+  Node* in2 = cmpd0->in(2);
+  Node_List* in1_pk = _sw->my_pack(in1);
+  Node_List* in2_pk = _sw->my_pack(in2);
+
+  if (in1_pk != NULL && in1_pk->size() != cmpd_pk->size()
+    || in2_pk != NULL && in2_pk->size() != cmpd_pk->size() ) {
+    return false;
+  }
+
+  // test if "all" in1 are in the same pack or the same node
+  if (in1_pk == NULL) {
+    for (uint j = 1; j < cmpd_pk->size(); j++) {
+      if (cmpd_pk->at(j)->in(1) != in1) {
+        return false;
+      }
+    }//for: in1_pk is not pack but all CmpD nodes in the pack have the same in(1)
+  }
+  // test if "all" in2 are in the same pack or the same node
+  if (in2_pk == NULL) {
+    for (uint j = 1; j < cmpd_pk->size(); j++) {
+      if (cmpd_pk->at(j)->in(2) != in2) {
+        return false;
+      }
+    }//for: in2_pk is not pack but all CmpD nodes in the pack have the same in(2)
+  }
+  //now check if cmpd_pk may be subsumed in vector built for cmovd_pk
+  int cmovd_ind1, cmovd_ind2;
+  if (cmpd_pk->at(0)->in(1) == cmovd_pk->at(0)->as_CMove()->in(CMoveNode::IfFalse)
+   && cmpd_pk->at(0)->in(2) == cmovd_pk->at(0)->as_CMove()->in(CMoveNode::IfTrue)) {
+      cmovd_ind1 = CMoveNode::IfFalse;
+      cmovd_ind2 = CMoveNode::IfTrue;
+  } else if (cmpd_pk->at(0)->in(2) == cmovd_pk->at(0)->as_CMove()->in(CMoveNode::IfFalse)
+          && cmpd_pk->at(0)->in(1) == cmovd_pk->at(0)->as_CMove()->in(CMoveNode::IfTrue)) {
+      cmovd_ind2 = CMoveNode::IfFalse;
+      cmovd_ind1 = CMoveNode::IfTrue;
+  }
+  else {
+    return false;
+  }
+
+  for (uint j = 1; j < cmpd_pk->size(); j++) {
+    if (cmpd_pk->at(j)->in(1) != cmovd_pk->at(j)->as_CMove()->in(cmovd_ind1)
+        || cmpd_pk->at(j)->in(2) != cmovd_pk->at(j)->as_CMove()->in(cmovd_ind2)) {
+        return false;
+    }//if
+  }
+  NOT_PRODUCT(if(_sw->is_trace_cmov()) { tty->print("CMoveKit::test_cmpd_pack: cmpd pack for 1st CmpD %d is OK for vectorization: ", cmpd0->_idx); cmpd0->dump(); })
+  return true;
+}
+
 //------------------------------implemented---------------------------
 // Can code be generated for pack p?
 bool SuperWord::implemented(Node_List* p) {
@@ -1478,21 +1697,31 @@
     } else {
       retValue = VectorNode::implemented(opc, size, velt_basic_type(p0));
     }
+    if (!retValue) {
+      if (is_cmov_pack(p)) {
+        NOT_PRODUCT(if(is_trace_cmov()) {tty->print_cr("SWPointer::implemented: found cmpd pack"); print_pack(p);})
+        return true;
+      }
+    }
   }
   return retValue;
 }
 
+bool SuperWord::is_cmov_pack(Node_List* p) {
+  return _cmovev_kit.pack(p->at(0)) != NULL;
+}
 //------------------------------same_inputs--------------------------
 // For pack p, are all idx operands the same?
-static bool same_inputs(Node_List* p, int idx) {
+bool SuperWord::same_inputs(Node_List* p, int idx) {
   Node* p0 = p->at(0);
   uint vlen = p->size();
   Node* p0_def = p0->in(idx);
   for (uint i = 1; i < vlen; i++) {
     Node* pi = p->at(i);
     Node* pi_def = pi->in(idx);
-    if (p0_def != pi_def)
+    if (p0_def != pi_def) {
       return false;
+    }
   }
   return true;
 }
@@ -1510,8 +1739,9 @@
   // the same. Later, implement PackNode and allow differing, non-vector inputs
   // (maybe just the ones from outside the block.)
   for (uint i = start; i < end; i++) {
-    if (!is_vector_use(p0, i))
+    if (!is_vector_use(p0, i)) {
       return false;
+    }
   }
   // Check if reductions are connected
   if (p0->is_reduction()) {
@@ -1542,6 +1772,9 @@
     // just the ones outside the block.)
     for (uint i = 0; i < p->size(); i++) {
       Node* def = p->at(i);
+      if (is_cmov_pack_internal_node(p, def)) {
+        continue;
+      }
       for (DUIterator_Fast jmax, j = def->fast_outs(jmax); j < jmax; j++) {
         Node* use = def->fast_out(j);
         for (uint k = 0; k < use->req(); k++) {
@@ -1811,14 +2044,14 @@
   uint max_vlen_in_bytes = 0;
   uint max_vlen = 0;
 
-  NOT_PRODUCT(if(_CountedLoopReserveKit_debug > 0) {tty->print_cr("SWPointer::output: print loop before create_reserve_version_of_loop"); print_loop(true);})
-
-  CountedLoopReserveKit make_reversable(_phase, _lpt, DoReserveCopyInSuperWord);
-
-  NOT_PRODUCT(if(_CountedLoopReserveKit_debug > 0) {tty->print_cr("SWPointer::output: print loop after create_reserve_version_of_loop"); print_loop(true);})
-
-  if (DoReserveCopyInSuperWord && !make_reversable.has_reserved()) {
-    NOT_PRODUCT({tty->print_cr("SWPointer::output: loop was not reserved correctly, exiting SuperWord");})
+  NOT_PRODUCT(if(is_trace_loop_reverse()) {tty->print_cr("SWPointer::output: print loop before create_reserve_version_of_loop"); print_loop(true);})
+
+  CountedLoopReserveKit make_reversable(_phase, _lpt, do_reserve_copy());
+
+  NOT_PRODUCT(if(is_trace_loop_reverse()) {tty->print_cr("SWPointer::output: print loop after create_reserve_version_of_loop"); print_loop(true);})
+
+  if (do_reserve_copy() && !make_reversable.has_reserved()) {
+    NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SWPointer::output: loop was not reserved correctly, exiting SuperWord");})
     return;
   }
 
@@ -1831,6 +2064,7 @@
       Node* vn = NULL;
       Node* low_adr = p->at(0);
       Node* first   = executed_first(p);
+      NOT_PRODUCT(if(is_trace_cmov()) {tty->print_cr("SWPointer::output: %d executed first, %d executed last in pack", first->_idx, n->_idx); print_pack(p);})
       int   opc = n->Opcode();
       if (n->is_Load()) {
         Node* ctl = n->in(MemNode::Control);
@@ -1856,13 +2090,21 @@
       } else if (n->is_Store()) {
         // Promote value to be stored to vector
         Node* val = vector_opd(p, MemNode::ValueIn);
+        if (val == NULL) {
+          if (do_reserve_copy()) {
+            NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SWPointer::output: val should not be NULL, exiting SuperWord");})
+            return; //and reverse to backup IG
+          }
+          ShouldNotReachHere();
+        }
+
         Node* ctl = n->in(MemNode::Control);
         Node* mem = first->in(MemNode::Memory);
         Node* adr = low_adr->in(MemNode::Address);
         const TypePtr* atyp = n->adr_type();
         vn = StoreVectorNode::make(opc, ctl, mem, adr, atyp, val, vlen);
         vlen_in_bytes = vn->as_StoreVector()->memory_size();
-      } else if (n->req() == 3) {
+      } else if (n->req() == 3 && !is_cmov_pack(p)) {
         // Promote operands to vector
         Node* in1 = NULL;
         bool node_isa_reduction = n->is_reduction();
@@ -1871,8 +2113,22 @@
           in1 = low_adr->in(1);
         } else {
           in1 = vector_opd(p, 1);
+          if (in1 == NULL) {
+            if (do_reserve_copy()) {
+              NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SWPointer::output: in1 should not be NULL, exiting SuperWord");})
+              return; //and reverse to backup IG
+            }
+            ShouldNotReachHere();
+          }
         }
         Node* in2 = vector_opd(p, 2);
+        if (in2 == NULL) {
+          if (do_reserve_copy()) {
+            NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SWPointer::output: in2 should not be NULL, exiting SuperWord");})
+            return; //and reverse to backup IG
+          }
+          ShouldNotReachHere();
+        }
         if (VectorNode::is_invariant_vector(in1) && (node_isa_reduction == false) && (n->is_Add() || n->is_Mul())) {
           // Move invariant vector input into second position to avoid register spilling.
           Node* tmp = in1;
@@ -1896,10 +2152,71 @@
         Node* in = vector_opd(p, 1);
         vn = VectorNode::make(opc, in, NULL, vlen, velt_basic_type(n));
         vlen_in_bytes = vn->as_Vector()->length_in_bytes();
+      } else if (is_cmov_pack(p)) {
+        if (!n->is_CMove()) {
+          continue;
+        }
+        // place here CMoveVDNode
+        NOT_PRODUCT(if(is_trace_cmov()) {tty->print_cr("SWPointer::output: print before CMove vectorization"); print_loop(false);})
+        Node* bol = n->in(CMoveNode::Condition);
+        if (!bol->is_Bool() && bol->Opcode() == Op_ExtractI && bol->req() > 1 ) {
+          NOT_PRODUCT(if(is_trace_cmov()) {tty->print_cr("SWPointer::output: %d is not Bool node, trying its in(1) node %d", bol->_idx, bol->in(1)->_idx); bol->dump(); bol->in(1)->dump();})
+          bol = bol->in(1); //may be ExtractNode
+        }
+
+        assert(bol->is_Bool(), "should be BoolNode - too late to bail out!");
+        if (!bol->is_Bool()) {
+          if (do_reserve_copy()) {
+            NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SWPointer::output: expected %d bool node, exiting SuperWord", bol->_idx); bol->dump();})
+            return; //and reverse to backup IG
+          }
+          ShouldNotReachHere();
+        }
+
+        int cond = (int)bol->as_Bool()->_test._test;
+        Node* in_cc  = _igvn.intcon(cond);
+        NOT_PRODUCT(if(is_trace_cmov()) {tty->print("SWPointer::output: created intcon in_cc node %d", in_cc->_idx); in_cc->dump();})
+        Node* cc = bol->clone();
+        cc->set_req(1, in_cc);
+        NOT_PRODUCT(if(is_trace_cmov()) {tty->print("SWPointer::output: created bool cc node %d", cc->_idx); cc->dump();})
+
+        Node* src1 = vector_opd(p, 2); //2=CMoveNode::IfFalse
+        if (src1 == NULL) {
+          if (do_reserve_copy()) {
+            NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SWPointer::output: src1 should not be NULL, exiting SuperWord");})
+            return; //and reverse to backup IG
+          }
+          ShouldNotReachHere();
+        }
+        Node* src2 = vector_opd(p, 3); //3=CMoveNode::IfTrue
+        if (src2 == NULL) {
+          if (do_reserve_copy()) {
+            NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SWPointer::output: src2 should not be NULL, exiting SuperWord");})
+            return; //and reverse to backup IG
+          }
+          ShouldNotReachHere();
+        }
+        BasicType bt = velt_basic_type(n);
+        const TypeVect* vt = TypeVect::make(bt, vlen);
+        vn = new CMoveVDNode(cc, src1, src2, vt);
+        NOT_PRODUCT(if(is_trace_cmov()) {tty->print("SWPointer::output: created new CMove node %d: ", vn->_idx); vn->dump();})
       } else {
+        if (do_reserve_copy()) {
+          NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SWPointer::output: ShouldNotReachHere, exiting SuperWord");})
+          return; //and reverse to backup IG
+        }
         ShouldNotReachHere();
       }
+
       assert(vn != NULL, "sanity");
+      if (vn == NULL) {
+        if (do_reserve_copy()){
+          NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SWPointer::output: got NULL node, cannot proceed, exiting SuperWord");})
+          return; //and reverse to backup IG
+        }
+        ShouldNotReachHere();
+      }
+
       _igvn.register_new_node_with_optimizer(vn);
       _phase->set_ctrl(vn, _phase->get_ctrl(p->at(0)));
       for (uint j = 0; j < p->size(); j++) {
@@ -1919,7 +2236,8 @@
       }
 #endif
     }
-  }
+  }//for (int i = 0; i < _block.length(); i++)
+
   C->set_max_vector_size(max_vlen_in_bytes);
 
   if (SuperWordLoopUnrollAnalysis) {
@@ -1935,10 +2253,10 @@
     }
   }
 
-  if (DoReserveCopyInSuperWord) {
+  if (do_reserve_copy()) {
     make_reversable.use_new();
   }
-  NOT_PRODUCT(if(_CountedLoopReserveKit_debug > 0) {tty->print_cr("\n Final loop after SuperWord"); print_loop(true);})
+  NOT_PRODUCT(if(is_trace_loop_reverse()) {tty->print_cr("\n Final loop after SuperWord"); print_loop(true);})
   return;
 }
 
@@ -1952,6 +2270,10 @@
   if (same_inputs(p, opd_idx)) {
     if (opd->is_Vector() || opd->is_LoadVector()) {
       assert(((opd_idx != 2) || !VectorNode::is_shift(p0)), "shift's count can't be vector");
+      if (opd_idx == 2 && VectorNode::is_shift(p0)) {
+        NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("shift's count can't be vector");})
+        return NULL;
+      }
       return opd; // input is matching vector
     }
     if ((opd_idx == 2) && VectorNode::is_shift(p0)) {
@@ -1974,6 +2296,10 @@
           _phase->set_ctrl(cnt, _phase->get_ctrl(opd));
         }
         assert(opd->bottom_type()->isa_int(), "int type only");
+        if (!opd->bottom_type()->isa_int()) {
+          NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("Should be int type only");})
+          return NULL;
+        }
         // Move non constant shift count into vector register.
         cnt = VectorNode::shift_count(p0, cnt, vlen, velt_basic_type(p0));
       }
@@ -1984,6 +2310,10 @@
       return cnt;
     }
     assert(!opd->is_StoreVector(), "such vector is not expected here");
+    if (opd->is_StoreVector()) {
+      NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("StoreVector is not expected here");})
+      return NULL;
+    }
     // Convert scalar input to vector with the same number of elements as
     // p0's vector. Use p0's type because size of operand's container in
     // vector should match p0's size regardless operand's size.
@@ -2010,6 +2340,10 @@
     Node* pi = p->at(i);
     Node* in = pi->in(opd_idx);
     assert(my_pack(in) == NULL, "Should already have been unpacked");
+    if (my_pack(in) != NULL) {
+      NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("Should already have been unpacked");})
+      return NULL;
+    }
     assert(opd_bt == in->bottom_type()->basic_type(), "all same type");
     pk->add_opd(in);
   }
@@ -2041,8 +2375,9 @@
       for (uint k = 0; k < use->req(); k++) {
         Node* n = use->in(k);
         if (def == n) {
-          if (!is_vector_use(use, k)) {
-            _n_idx_list.push(use, k);
+          Node_List* u_pk = my_pack(use);
+          if ((u_pk == NULL || !is_cmov_pack(u_pk) || use->is_CMove()) && !is_vector_use(use, k)) {
+              _n_idx_list.push(use, k);
           }
         }
       }
--- a/hotspot/src/share/vm/opto/superword.hpp	Mon Oct 26 10:36:54 2015 +0100
+++ b/hotspot/src/share/vm/opto/superword.hpp	Mon Oct 26 19:33:31 2015 -0700
@@ -29,6 +29,7 @@
 #include "opto/phaseX.hpp"
 #include "opto/vectornode.hpp"
 #include "utilities/growableArray.hpp"
+#include "libadt/dict.hpp"
 
 //
 //                  S U P E R W O R D   T R A N S F O R M
@@ -200,6 +201,24 @@
   static const SWNodeInfo initial;
 };
 
+class SuperWord;
+class CMoveKit {
+ friend class SuperWord;
+ private:
+  SuperWord* _sw;
+  Dict* _dict;
+  CMoveKit(Arena* a, SuperWord* sw) : _sw(sw)  {_dict = new Dict(cmpkey, hashkey, a);}
+  void*     _2p(Node* key)        const  { return (void*)(intptr_t)key; } // 2 conversion functions to make gcc happy
+  Dict*     dict()                const  { return _dict; }
+  void map(Node* key, Node_List* val)    { assert(_dict->operator[](_2p(key)) == NULL, "key existed"); _dict->Insert(_2p(key), (void*)val); }
+  void unmap(Node* key)                  { _dict->Delete(_2p(key)); }
+  Node_List* pack(Node* key)      const  { return (Node_List*)_dict->operator[](_2p(key)); }
+  Node* is_Bool_candidate(Node* nd) const; // if it is the right candidate return corresponding CMove* ,
+  Node* is_CmpD_candidate(Node* nd) const; // otherwise return NULL
+  Node_List* make_cmovevd_pack(Node_List* cmovd_pk);
+  bool test_cmpd_pack(Node_List* cmpd_pk, Node_List* cmovd_pk);
+};//class CMoveKit
+
 // JVMCI: OrderedPair is moved up to deal with compilation issues on Windows
 //------------------------------OrderedPair---------------------------
 // Ordered pair of Node*.
@@ -229,6 +248,7 @@
 // Transforms scalar operations into packed (superword) operations.
 class SuperWord : public ResourceObj {
  friend class SWPointer;
+ friend class CMoveKit;
  private:
   PhaseIdealLoop* _phase;
   Arena*          _arena;
@@ -247,8 +267,8 @@
   GrowableArray<Node*> _iteration_first; // nodes in the generation that has deps from phi
   GrowableArray<Node*> _iteration_last;  // nodes in the generation that has deps to   phi
   GrowableArray<SWNodeInfo> _node_info;  // Info needed per node
-  CloneMap&                 _clone_map;  // map of nodes created in cloning
-
+  CloneMap&            _clone_map;       // map of nodes created in cloning
+  CMoveKit             _cmovev_kit;      // support for vectorization of CMov
   MemNode* _align_to_ref;                // Memory reference that pre-loop will align to
 
   GrowableArray<OrderedPair> _disjoint_ptrs; // runtime disambiguated pointer pairs
@@ -282,8 +302,11 @@
   bool     is_trace_mem_slice()    { return (_vector_loop_debug & 4) > 0; }
   bool     is_trace_loop()         { return (_vector_loop_debug & 8) > 0; }
   bool     is_trace_adjacent()     { return (_vector_loop_debug & 16) > 0; }
+  bool     is_trace_cmov()         { return (_vector_loop_debug & 32) > 0; }
+  bool     is_trace_loop_reverse() { return (_vector_loop_debug & 64) > 0; }
 #endif
   bool     do_vector_loop()        { return _do_vector_loop; }
+  bool     do_reserve_copy()       { return _do_reserve_copy; }
  private:
   IdealLoopTree* _lpt;             // Current loop tree node
   LoopNode*      _lp;              // Current LoopNode
@@ -292,6 +315,7 @@
   bool           _race_possible;   // In cases where SDMU is true
   bool           _early_return;    // True if we do not initialize
   bool           _do_vector_loop;  // whether to do vectorization/simd style
+  bool           _do_reserve_copy; // do reserve copy of the graph(loop) before final modification in output
   int            _num_work_vecs;   // Number of non memory vector operations
   int            _num_reductions;  // Number of reduction expressions applied
   int            _ii_first;        // generation with direct deps from mem phi
@@ -299,7 +323,6 @@
   GrowableArray<int> _ii_order;
 #ifndef PRODUCT
   uintx          _vector_loop_debug; // provide more printing in debug mode
-  uintx          _CountedLoopReserveKit_debug; // for debugging CountedLoopReserveKit
 #endif
 
   // Accessors
@@ -360,9 +383,13 @@
   bool same_velt_type(Node* n1, Node* n2);
 
   // my_pack
-  Node_List* my_pack(Node* n)                { return !in_bb(n) ? NULL : _node_info.adr_at(bb_idx(n))->_my_pack; }
-  void set_my_pack(Node* n, Node_List* p)    { int i = bb_idx(n); grow_node_info(i); _node_info.adr_at(i)->_my_pack = p; }
-
+  Node_List* my_pack(Node* n)                 { return !in_bb(n) ? NULL : _node_info.adr_at(bb_idx(n))->_my_pack; }
+  void set_my_pack(Node* n, Node_List* p)     { int i = bb_idx(n); grow_node_info(i); _node_info.adr_at(i)->_my_pack = p; }
+  // is pack good for converting into one vector node replacing 12 nodes of Cmp, Bool, CMov
+  bool is_cmov_pack(Node_List* p);
+  bool is_cmov_pack_internal_node(Node_List* p, Node* nd) { return is_cmov_pack(p) && !nd->is_CMove(); }
+  // For pack p, are all idx operands the same?
+  bool same_inputs(Node_List* p, int idx);
   // CloneMap utilities
   bool same_origin_idx(Node* a, Node* b) const;
   bool same_generation(Node* a, Node* b) const;
@@ -439,6 +466,8 @@
   void construct_my_pack_map();
   // Remove packs that are not implemented or not profitable.
   void filter_packs();
+  // Merge CMoveD into new vector-nodes
+  void merge_packs_to_cmovd();
   // Adjust the memory graph for the packed operations
   void schedule();
   // Remove "current" from its current position in the memory graph and insert
--- a/hotspot/src/share/vm/opto/vectornode.cpp	Mon Oct 26 10:36:54 2015 +0100
+++ b/hotspot/src/share/vm/opto/vectornode.cpp	Mon Oct 26 19:33:31 2015 -0700
@@ -86,6 +86,9 @@
   case Op_MulD:
     assert(bt == T_DOUBLE, "must be");
     return Op_MulVD;
+  case Op_CMoveD:
+    assert(bt == T_DOUBLE, "must be");
+    return Op_CMoveVD;
   case Op_DivF:
     assert(bt == T_FLOAT, "must be");
     return Op_DivVF;
@@ -185,7 +188,7 @@
       (vlen > 1) && is_power_of_2(vlen) &&
       Matcher::vector_size_supported(bt, vlen)) {
     int vopc = VectorNode::opcode(opc, bt);
-    return vopc > 0 && Matcher::match_rule_supported(vopc);
+    return vopc > 0 && Matcher::match_rule_supported(vopc) && (vopc != Op_CMoveD || vlen == 4);
   }
   return false;
 }
--- a/hotspot/src/share/vm/opto/vectornode.hpp	Mon Oct 26 10:36:54 2015 +0100
+++ b/hotspot/src/share/vm/opto/vectornode.hpp	Mon Oct 26 19:33:31 2015 -0700
@@ -44,6 +44,13 @@
     init_req(2, n2);
   }
 
+  VectorNode(Node* n1, Node* n2, Node* n3, const TypeVect* vt) : TypeNode(vt, 4) {
+    init_class_id(Class_Vector);
+    init_req(1, n1);
+    init_req(2, n2);
+    init_req(3, n3);
+  }
+
   const TypeVect* vect_type() const { return type()->is_vect(); }
   uint length() const { return vect_type()->length(); } // Vector length
   uint length_in_bytes() const { return vect_type()->length_in_bytes(); }
@@ -253,6 +260,14 @@
   virtual int Opcode() const;
 };
 
+//------------------------------CMoveVDNode--------------------------------------
+// Vector multiply double
+class CMoveVDNode : public VectorNode {
+public:
+  CMoveVDNode(Node* in1, Node* in2, Node* in3, const TypeVect* vt) : VectorNode(in1, in2, in3, vt) {}
+  virtual int Opcode() const;
+};
+
 //------------------------------MulReductionVINode--------------------------------------
 // Vector multiply int as a reduction
 class MulReductionVINode : public ReductionNode {
--- a/hotspot/src/share/vm/runtime/vmStructs.cpp	Mon Oct 26 10:36:54 2015 +0100
+++ b/hotspot/src/share/vm/runtime/vmStructs.cpp	Mon Oct 26 19:33:31 2015 -0700
@@ -2101,6 +2101,7 @@
   declare_c2_type(MulVFNode, VectorNode)                                  \
   declare_c2_type(MulReductionVFNode, ReductionNode)                      \
   declare_c2_type(MulVDNode, VectorNode)                                  \
+  declare_c2_type(CMoveVDNode, VectorNode)                                \
   declare_c2_type(MulReductionVDNode, ReductionNode)                      \
   declare_c2_type(DivVFNode, VectorNode)                                  \
   declare_c2_type(DivVDNode, VectorNode)                                  \