8192846: Support cmov vectorization for float
author kvn
Tue, 05 Dec 2017 09:49:23 -0800
changeset 48309 1a0499fd252e
parent 48308 00bd985f3dec
child 48310 0dc66cdf4720
8192846: Support cmov vectorization for float Reviewed-by: kvn Contributed-by: razvan.a.lupusoru@intel.com
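This change teaches C2's SuperWord pass to vectorize conditional moves on float, alongside the existing double (CMoveVD) support, using the AVX cmpps/blendvps pair. A minimal sketch of the kind of loop this targets, written as a C++ analogue of the Java source (the exact Java shape that triggers the transform is an assumption here, not taken from the patch):

    // A float compare feeding a select inside a counted loop. C2 first
    // converts the branch to a scalar CMoveF; with -XX:+UseVectorCmov,
    // SuperWord then packs eight of them into one CMoveVF covering a
    // 256-bit YMM register (8 x 32-bit lanes).
    void min_into(const float* b, const float* c, float* a, int n) {
      for (int i = 0; i < n; i++) {
        a[i] = (b[i] < c[i]) ? b[i] : c[i];
      }
    }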
src/hotspot/cpu/x86/assembler_x86.cpp
src/hotspot/cpu/x86/assembler_x86.hpp
src/hotspot/cpu/x86/x86.ad
src/hotspot/share/adlc/formssel.cpp
src/hotspot/share/opto/c2_globals.hpp
src/hotspot/share/opto/classes.hpp
src/hotspot/share/opto/loopopts.cpp
src/hotspot/share/opto/matcher.cpp
src/hotspot/share/opto/superword.cpp
src/hotspot/share/opto/vectornode.cpp
src/hotspot/share/opto/vectornode.hpp
src/hotspot/share/runtime/vmStructs.cpp
--- a/src/hotspot/cpu/x86/assembler_x86.cpp	Tue Dec 05 21:26:11 2017 +0530
+++ b/src/hotspot/cpu/x86/assembler_x86.cpp	Tue Dec 05 09:49:23 2017 -0800
@@ -7449,6 +7449,27 @@
   emit_int8((unsigned char)(0xF0 & src2_enc<<4));
 }
 
+void Assembler::cmpps(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop, int vector_len) {
+  assert(VM_Version::supports_avx(), "");
+  assert(!VM_Version::supports_evex(), "");
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
+  int encode = simd_prefix_and_encode(dst, nds, src, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
+  emit_int8((unsigned char)0xC2);
+  emit_int8((unsigned char)(0xC0 | encode));
+  emit_int8((unsigned char)(0xF & cop));
+}
+
+void Assembler::blendvps(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len) {
+  assert(VM_Version::supports_avx(), "");
+  assert(!VM_Version::supports_evex(), "");
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src1->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  emit_int8((unsigned char)0x4A);
+  emit_int8((unsigned char)(0xC0 | encode));
+  int src2_enc = src2->encoding();
+  emit_int8((unsigned char)(0xF0 & src2_enc<<4));
+}
+
 void Assembler::vpblendd(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8, int vector_len) {
   assert(VM_Version::supports_avx2(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
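A note on the new emitters: vcmpps writes an all-ones/all-zeros 32-bit mask per lane according to the imm8 predicate, and vblendvps then selects lane-wise between its two sources using the sign bit of the mask register named in imm8[7:4]. A scalar model of what the coupled pair computes, in the order the vcmov8F_reg rule below issues them (illustration only, not HotSpot code):

    #include <cstdint>
    // Model of: cmpps mask, src1, src2, LT ; blendvps dst, src1, src2, mask
    // Lanes where the predicate holds receive src2; the others keep src1.
    static void cmov8f_model(const float* src1, const float* src2, float* dst) {
      for (int lane = 0; lane < 8; lane++) {
        uint32_t mask = (src1[lane] < src2[lane]) ? 0xFFFFFFFFu : 0u;
        dst[lane] = (mask >> 31) ? src2[lane] : src1[lane];
      }
    }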
--- a/src/hotspot/cpu/x86/assembler_x86.hpp	Tue Dec 05 21:26:11 2017 +0530
+++ b/src/hotspot/cpu/x86/assembler_x86.hpp	Tue Dec 05 09:49:23 2017 -0800
@@ -2114,9 +2114,11 @@
   // runtime code and native libraries.
   void vzeroupper();
 
-  // AVX support for vectorized conditional move (double). The following two instructions used only coupled.
+  // AVX support for vectorized conditional move (float/double). The following two instructions are only ever used as a coupled pair.
   void cmppd(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop, int vector_len);
   void blendvpd(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len);
+  void cmpps(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop, int vector_len);
+  void blendvps(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len);
   void vpblendd(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8, int vector_len);
 
  protected:
--- a/src/hotspot/cpu/x86/x86.ad	Tue Dec 05 21:26:11 2017 +0530
+++ b/src/hotspot/cpu/x86/x86.ad	Tue Dec 05 09:49:23 2017 -0800
@@ -1263,6 +1263,7 @@
       if (!VM_Version::supports_cx8())
         ret_value = false;
       break;
+    case Op_CMoveVF:
     case Op_CMoveVD:
       if (UseAVX < 1 || UseAVX > 2)
         ret_value = false;
@@ -1304,6 +1305,10 @@
         if ((vlen == 32) && (VM_Version::supports_avx512bw() == false))
           ret_value = false;
         break;
+      case Op_CMoveVF:
+        if (vlen != 8)
+          ret_value  = false;
+        break;
       case Op_CMoveVD:
         if (vlen != 4)
           ret_value  = false;
@@ -8170,6 +8175,22 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct vcmov8F_reg(vecY dst, vecY src1, vecY src2, immI8 cop, cmpOp_vcmppd copnd) %{
+  predicate(UseAVX > 0 && UseAVX < 3 && n->as_Vector()->length() == 8);
+  match(Set dst (CMoveVF (Binary copnd cop) (Binary src1 src2)));
+  effect(TEMP dst, USE src1, USE src2);
+  format %{ "cmpps.$copnd  $dst, $src1, $src2  ! vcmovevf, cond=$cop\n\t"
+            "blendvps $dst,$src1,$src2,$dst ! vcmovevf\n\t"
+         %}
+  ins_encode %{
+    int vector_len = 1;
+    int cond = (Assembler::Condition)($copnd$$cmpcode);
+    __ cmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vector_len);
+    __ blendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct vcmov4D_reg(vecY dst, vecY src1, vecY src2, immI8 cop, cmpOp_vcmppd copnd) %{
   predicate(UseAVX > 0 && UseAVX < 3 && n->as_Vector()->length() == 4);
   match(Set dst (CMoveVD (Binary copnd cop) (Binary src1 src2)));
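Two details worth noting about vcmov8F_reg: it reuses the existing cmpOp_vcmppd operand because the AVX imm8 compare-predicate encodings are shared between cmpps and cmppd, and both the 8-float and the 4-double rule fill one 256-bit YMM register (hence vector_len = 1 in both). A sketch of the shared predicate values (per the Intel SDM; the enum name here is hypothetical, HotSpot passes the raw int):

    enum VCmpPredicate {           // imm8 for (v)cmpps and (v)cmppd
      VCMP_EQ  = 0, VCMP_LT  = 1, VCMP_LE  = 2, VCMP_UNORD = 3,
      VCMP_NEQ = 4, VCMP_NLT = 5, VCMP_NLE = 6, VCMP_ORD   = 7
    };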
--- a/src/hotspot/share/adlc/formssel.cpp	Tue Dec 05 21:26:11 2017 +0530
+++ b/src/hotspot/share/adlc/formssel.cpp	Tue Dec 05 09:49:23 2017 -0800
@@ -4164,7 +4164,7 @@
     "AddVB","AddVS","AddVI","AddVL","AddVF","AddVD",
     "SubVB","SubVS","SubVI","SubVL","SubVF","SubVD",
     "MulVS","MulVI","MulVL","MulVF","MulVD",
-    "CMoveVD",
+    "CMoveVD", "CMoveVF",
     "DivVF","DivVD",
     "AbsVF","AbsVD",
     "NegVF","NegVD",
--- a/src/hotspot/share/opto/c2_globals.hpp	Tue Dec 05 21:26:11 2017 +0530
+++ b/src/hotspot/share/opto/c2_globals.hpp	Tue Dec 05 09:49:23 2017 -0800
@@ -195,6 +195,9 @@
   product(bool, UseSubwordForMaxVector, true,                               \
           "Use Subword Analysis to set maximum vector size")                \
                                                                             \
+  product(bool, UseVectorCmov, false,                                       \
+          "Use Vectorized Cmov")                                            \
+                                                                            \
   develop(intx, UnrollLimitForProfileCheck, 1,                              \
           "Don't use profile_trip_cnt() to restrict unrolling until "       \
           "unrolling would push the number of unrolled iterations above "   \
--- a/src/hotspot/share/opto/classes.hpp	Tue Dec 05 21:26:11 2017 +0530
+++ b/src/hotspot/share/opto/classes.hpp	Tue Dec 05 09:49:23 2017 -0800
@@ -66,6 +66,7 @@
 macro(CMoveD)
 macro(CMoveVD)
 macro(CMoveF)
+macro(CMoveVF)
 macro(CMoveI)
 macro(CMoveL)
 macro(CMoveP)
--- a/src/hotspot/share/opto/loopopts.cpp	Tue Dec 05 21:26:11 2017 +0530
+++ b/src/hotspot/share/opto/loopopts.cpp	Tue Dec 05 09:49:23 2017 -0800
@@ -528,13 +528,12 @@
     BasicType bt = phi->type()->basic_type();
     switch (bt) {
     case T_DOUBLE:
+    case T_FLOAT:
       if (C->use_cmove()) {
         continue; //TODO: maybe we want to add some cost
       }
-    case T_FLOAT: {
       cost += Matcher::float_cmove_cost(); // Could be very expensive
       break;
-    }
     case T_LONG: {
       cost += Matcher::long_cmove_cost(); // May encodes as 2 CMOV's
     }
@@ -613,8 +612,9 @@
   }
   // Check for highly predictable branch.  No point in CMOV'ing if
   // we are going to predict accurately all the time.
-  if (C->use_cmove() && cmp_op == Op_CmpD) ;//keep going
-  else if (iff->_prob < infrequent_prob ||
+  if (C->use_cmove() && (cmp_op == Op_CmpF || cmp_op == Op_CmpD)) {
+    //keep going
+  } else if (iff->_prob < infrequent_prob ||
       iff->_prob > (1.0f - infrequent_prob))
     return NULL;
 
--- a/src/hotspot/share/opto/matcher.cpp	Tue Dec 05 21:26:11 2017 +0530
+++ b/src/hotspot/share/opto/matcher.cpp	Tue Dec 05 09:49:23 2017 -0800
@@ -2267,6 +2267,7 @@
       case Op_CMoveL:
       case Op_CMoveN:
       case Op_CMoveP:
+      case Op_CMoveVF:
       case Op_CMoveVD:  {
         // Restructure into a binary tree for Matching.  It's possible that
         // we could move this code up next to the graph reshaping for IfNodes
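The matcher already reshapes scalar CMoves and CMoveVD this way; the new case simply extends the list to CMoveVF. A sketch of the reshaping (illustrative):

    // Before (sketch): CMoveVF(bol, src1, src2)
    // After:           CMoveVF(Binary(bol, bol->in(1)), Binary(src1, src2))
    // which is exactly the shape the new x86.ad rule matches:
    //   match(Set dst (CMoveVF (Binary copnd cop) (Binary src1 src2)));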
--- a/src/hotspot/share/opto/superword.cpp	Tue Dec 05 21:26:11 2017 +0530
+++ b/src/hotspot/share/opto/superword.cpp	Tue Dec 05 09:49:23 2017 -0800
@@ -58,7 +58,7 @@
   _mem_slice_tail(arena(), 8,  0, NULL),  // memory slice tails
   _node_info(arena(), 8,  0, SWNodeInfo::initial), // info needed per node
   _clone_map(phase->C->clone_map()),      // map of nodes created in cloning
-  _cmovev_kit(_arena, this),              // map to facilitate CMoveVD creation
+  _cmovev_kit(_arena, this),              // map to facilitate CMoveV creation
   _align_to_ref(NULL),                    // memory reference to align vectors to
   _disjoint_ptrs(arena(), 8,  0, OrderedPair::initial), // runtime disambiguated pointer pairs
   _dg(_arena),                            // dependence graph
@@ -511,8 +511,7 @@
     combine_packs();
 
     construct_my_pack_map();
-
-    if (_do_vector_loop) {
+    if (UseVectorCmov) {
       merge_packs_to_cmovd();
     }
 
@@ -1249,8 +1248,8 @@
 
 //------------------------------data_size---------------------------
 int SuperWord::data_size(Node* s) {
-  Node* use = NULL; //test if the node is a candidate for CMoveVD optimization, then return the size of CMov
-  if (_do_vector_loop) {
+  Node* use = NULL; //test if the node is a candidate for CMoveV optimization, then return the size of CMov
+  if (UseVectorCmov) {
     use = _cmovev_kit.is_Bool_candidate(s);
     if (use != NULL) {
       return data_size(use);
@@ -1260,6 +1259,7 @@
       return data_size(use);
     }
   }
+
   int bsize = type2aelembytes(velt_basic_type(s));
   assert(bsize != 0, "valid size");
   return bsize;
@@ -1718,6 +1718,9 @@
   if (!cmovd->is_CMove()) {
     return NULL;
   }
+  if (cmovd->Opcode() != Op_CMoveF && cmovd->Opcode() != Op_CMoveD) {
+    return NULL;
+  }
   if (pack(cmovd) != NULL) { // already in the cmov pack
     return NULL;
   }
@@ -2377,7 +2380,13 @@
         }
         BasicType bt = velt_basic_type(n);
         const TypeVect* vt = TypeVect::make(bt, vlen);
-        vn = new CMoveVDNode(cc, src1, src2, vt);
+        assert(bt == T_FLOAT || bt == T_DOUBLE, "Only vectorization for FP cmovs is supported");
+        if (bt == T_FLOAT) {
+          vn = new CMoveVFNode(cc, src1, src2, vt);
+        } else {
+          assert(bt == T_DOUBLE, "Expected double");
+          vn = new CMoveVDNode(cc, src1, src2, vt);
+        }
         NOT_PRODUCT(if(is_trace_cmov()) {tty->print("SWPointer::output: created new CMove node %d: ", vn->_idx); vn->dump();})
       } else if (opc == Op_FmaD || opc == Op_FmaF) {
         // Promote operands to vector
--- a/src/hotspot/share/opto/vectornode.cpp	Tue Dec 05 21:26:11 2017 +0530
+++ b/src/hotspot/share/opto/vectornode.cpp	Tue Dec 05 09:49:23 2017 -0800
@@ -92,6 +92,9 @@
   case Op_FmaF:
     assert(bt == T_FLOAT, "must be");
     return Op_FmaVF;
+  case Op_CMoveF:
+    assert(bt == T_FLOAT, "must be");
+    return Op_CMoveVF;
   case Op_CMoveD:
     assert(bt == T_DOUBLE, "must be");
     return Op_CMoveVD;
--- a/src/hotspot/share/opto/vectornode.hpp	Tue Dec 05 21:26:11 2017 +0530
+++ b/src/hotspot/share/opto/vectornode.hpp	Tue Dec 05 09:49:23 2017 -0800
@@ -277,8 +277,16 @@
   virtual int Opcode() const;
 };
 
+//------------------------------CMoveVFNode--------------------------------------
+// Vector float conditional move
+class CMoveVFNode : public VectorNode {
+public:
+  CMoveVFNode(Node* in1, Node* in2, Node* in3, const TypeVect* vt) : VectorNode(in1, in2, in3, vt) {}
+  virtual int Opcode() const;
+};
+
 //------------------------------CMoveVDNode--------------------------------------
-// Vector multiply double
+// Vector double conditional move
 class CMoveVDNode : public VectorNode {
 public:
   CMoveVDNode(Node* in1, Node* in2, Node* in3, const TypeVect* vt) : VectorNode(in1, in2, in3, vt) {}
--- a/src/hotspot/share/runtime/vmStructs.cpp	Tue Dec 05 21:26:11 2017 +0530
+++ b/src/hotspot/share/runtime/vmStructs.cpp	Tue Dec 05 09:49:23 2017 -0800
@@ -1991,6 +1991,7 @@
   declare_c2_type(MulVDNode, VectorNode)                                  \
   declare_c2_type(FmaVDNode, VectorNode)                                  \
   declare_c2_type(FmaVFNode, VectorNode)                                  \
+  declare_c2_type(CMoveVFNode, VectorNode)                                \
   declare_c2_type(CMoveVDNode, VectorNode)                                \
   declare_c2_type(MulReductionVDNode, ReductionNode)                      \
   declare_c2_type(DivVFNode, VectorNode)                                  \