8181616: FMA Vectorization on x86
authorvdeshpande
Wed, 07 Jun 2017 13:09:46 -0700
changeset 46528 cf0da758e7b5
parent 46527 df19f7e4b9f7
child 46529 e473f49d42a7
8181616: FMA Vectorization on x86 Reviewed-by: kvn
hotspot/src/cpu/x86/vm/assembler_x86.cpp
hotspot/src/cpu/x86/vm/assembler_x86.hpp
hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp
hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp
hotspot/src/cpu/x86/vm/vm_version_x86.cpp
hotspot/src/cpu/x86/vm/vm_version_x86.hpp
hotspot/src/cpu/x86/vm/x86.ad
hotspot/src/share/vm/adlc/formssel.cpp
hotspot/src/share/vm/opto/classes.hpp
hotspot/src/share/vm/opto/matcher.cpp
hotspot/src/share/vm/opto/superword.cpp
hotspot/src/share/vm/opto/vectornode.cpp
hotspot/src/share/vm/opto/vectornode.hpp
hotspot/src/share/vm/runtime/vmStructs.cpp
--- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp	Wed Jun 07 08:56:35 2017 -0700
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp	Wed Jun 07 13:09:46 2017 -0700
@@ -5092,6 +5092,42 @@
   emit_operand(dst, src);
 }
 
+void Assembler::vfmadd231pd(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len) {
+  assert(VM_Version::supports_fma(), "");
+  InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  int encode = vex_prefix_and_encode(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  emit_int8((unsigned char)0xB8);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::vfmadd231ps(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len) {
+  assert(VM_Version::supports_fma(), "");
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  int encode = vex_prefix_and_encode(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  emit_int8((unsigned char)0xB8);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::vfmadd231pd(XMMRegister dst, XMMRegister src1, Address src2, int vector_len) {
+  assert(VM_Version::supports_fma(), "");
+  InstructionMark im(this);
+  InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit);
+  vex_prefix(src2, src1->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  emit_int8((unsigned char)0xB8);
+  emit_operand(dst, src2);
+}
+
+void Assembler::vfmadd231ps(XMMRegister dst, XMMRegister src1, Address src2, int vector_len) {
+  assert(VM_Version::supports_fma(), "");
+  InstructionMark im(this);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_32bit);
+  vex_prefix(src2, src1->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  emit_int8((unsigned char)0xB8);
+  emit_operand(dst, src2);
+}
+
 void Assembler::divpd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
--- a/hotspot/src/cpu/x86/vm/assembler_x86.hpp	Wed Jun 07 08:56:35 2017 -0700
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.hpp	Wed Jun 07 13:09:46 2017 -0700
@@ -1906,6 +1906,11 @@
   void vmulpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
   void vmulps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
 
+  void vfmadd231pd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+  void vfmadd231ps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+  void vfmadd231pd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
+  void vfmadd231ps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
+
   // Divide Packed Floating-Point Values
   void divpd(XMMRegister dst, XMMRegister src);
   void divps(XMMRegister dst, XMMRegister src);
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp	Wed Jun 07 08:56:35 2017 -0700
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp	Wed Jun 07 13:09:46 2017 -0700
@@ -3165,8 +3165,37 @@
   }
 }
 
-
-
+// dst = c = a * b + c
+void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
+  Assembler::vfmadd231pd(c, a, b, vector_len);
+  if (dst != c) {
+    vmovdqu(dst, c);
+  }
+}
+
+// dst = c = a * b + c
+void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
+  Assembler::vfmadd231ps(c, a, b, vector_len);
+  if (dst != c) {
+    vmovdqu(dst, c);
+  }
+}
+
+// dst = c = a * b + c
+void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
+  Assembler::vfmadd231pd(c, a, b, vector_len);
+  if (dst != c) {
+    vmovdqu(dst, c);
+  }
+}
+
+// dst = c = a * b + c
+void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
+  Assembler::vfmadd231ps(c, a, b, vector_len);
+  if (dst != c) {
+    vmovdqu(dst, c);
+  }
+}
 
 void MacroAssembler::incrementl(AddressLiteral dst) {
   if (reachable(dst)) {
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp	Wed Jun 07 08:56:35 2017 -0700
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp	Wed Jun 07 13:09:46 2017 -0700
@@ -456,6 +456,11 @@
   void fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c);
   void fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c);
 
+  void vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len);
+  void vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len);
+  void vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len);
+  void vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len);
+
 
   // same as fcmp2int, but using SSE2
   void cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less);
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.cpp	Wed Jun 07 08:56:35 2017 -0700
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.cpp	Wed Jun 07 13:09:46 2017 -0700
@@ -812,7 +812,7 @@
     FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
   }
 
-  if (supports_fma() && UseSSE >= 2) {
+  if (supports_fma()) {
     if (FLAG_IS_DEFAULT(UseFMA)) {
       UseFMA = true;
     }
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.hpp	Wed Jun 07 08:56:35 2017 -0700
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.hpp	Wed Jun 07 13:09:46 2017 -0700
@@ -732,7 +732,7 @@
   static bool supports_avx256only() { return (supports_avx2() && !supports_evex()); }
   static bool supports_avxonly()    { return ((supports_avx2() || supports_avx()) && !supports_evex()); }
   static bool supports_sha()        { return (_features & CPU_SHA) != 0; }
-  static bool supports_fma()        { return (_features & CPU_FMA) != 0; }
+  static bool supports_fma()        { return (_features & CPU_FMA) != 0 && supports_avx(); }
   static bool supports_vzeroupper() { return (_features & CPU_VZEROUPPER) != 0; }
 
   // Intel features
--- a/hotspot/src/cpu/x86/vm/x86.ad	Wed Jun 07 08:56:35 2017 -0700
+++ b/hotspot/src/cpu/x86/vm/x86.ad	Wed Jun 07 13:09:46 2017 -0700
@@ -10520,3 +10520,161 @@
   ins_pipe( pipe_slow );
 %}
 
+// --------------------------------- FMA --------------------------------------
+
+// a * b + c
+instruct vfma2D_reg(vecX a, vecX b, vecX c) %{
+  predicate(UseFMA && n->as_Vector()->length() == 2);
+  match(Set c (FmaVD  c (Binary a b)));
+  format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed2D" %}
+  ins_cost(150);
+  ins_encode %{
+    int vector_len = 0;
+    __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// a * b + c
+instruct vfma2D_mem(vecX a, memory b, vecX c) %{
+  predicate(UseFMA && n->as_Vector()->length() == 2);
+  match(Set c (FmaVD  c (Binary a (LoadVector b))));
+  format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed2D" %}
+  ins_cost(150);
+  ins_encode %{
+    int vector_len = 0;
+    __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+
+// a * b + c
+instruct vfma4D_reg(vecY a, vecY b, vecY c) %{
+  predicate(UseFMA && n->as_Vector()->length() == 4);
+  match(Set c (FmaVD  c (Binary a b)));
+  format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed4D" %}
+  ins_cost(150);
+  ins_encode %{
+    int vector_len = 1;
+    __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// a * b + c
+instruct vfma4D_mem(vecY a, memory b, vecY c) %{
+  predicate(UseFMA && n->as_Vector()->length() == 4);
+  match(Set c (FmaVD  c (Binary a (LoadVector b))));
+  format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed4D" %}
+  ins_cost(150);
+  ins_encode %{
+    int vector_len = 1;
+    __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// a * b + c
+instruct vfma8D_reg(vecZ a, vecZ b, vecZ c) %{
+  predicate(UseFMA && n->as_Vector()->length() == 8);
+  match(Set c (FmaVD  c (Binary a b)));
+  format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed8D" %}
+  ins_cost(150);
+  ins_encode %{
+    int vector_len = 2;
+    __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// a * b + c
+instruct vfma8D_mem(vecZ a, memory b, vecZ c) %{
+  predicate(UseFMA && n->as_Vector()->length() == 8);
+  match(Set c (FmaVD  c (Binary a (LoadVector b))));
+  format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed8D" %}
+  ins_cost(150);
+  ins_encode %{
+    int vector_len = 2;
+    __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// a * b + c
+instruct vfma4F_reg(vecX a, vecX b, vecX c) %{
+  predicate(UseFMA && n->as_Vector()->length() == 4);
+  match(Set c (FmaVF  c (Binary a b)));
+  format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed4F" %}
+  ins_cost(150);
+  ins_encode %{
+    int vector_len = 0;
+    __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// a * b + c
+instruct vfma4F_mem(vecX a, memory b, vecX c) %{
+  predicate(UseFMA && n->as_Vector()->length() == 4);
+  match(Set c (FmaVF  c (Binary a (LoadVector b))));
+  format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed4F" %}
+  ins_cost(150);
+  ins_encode %{
+    int vector_len = 0;
+    __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// a * b + c
+instruct vfma8F_reg(vecY a, vecY b, vecY c) %{
+  predicate(UseFMA && n->as_Vector()->length() == 8);
+  match(Set c (FmaVF  c (Binary a b)));
+  format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed8F" %}
+  ins_cost(150);
+  ins_encode %{
+    int vector_len = 1;
+    __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// a * b + c
+instruct vfma8F_mem(vecY a, memory b, vecY c) %{
+  predicate(UseFMA && n->as_Vector()->length() == 8);
+  match(Set c (FmaVF  c (Binary a (LoadVector b))));
+  format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed8F" %}
+  ins_cost(150);
+  ins_encode %{
+    int vector_len = 1;
+    __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// a * b + c
+instruct vfma16F_reg(vecZ a, vecZ b, vecZ c) %{
+  predicate(UseFMA && n->as_Vector()->length() == 16);
+  match(Set c (FmaVF  c (Binary a b)));
+  format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed16F" %}
+  ins_cost(150);
+  ins_encode %{
+    int vector_len = 2;
+    __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// a * b + c
+instruct vfma16F_mem(vecZ a, memory b, vecZ c) %{
+  predicate(UseFMA && n->as_Vector()->length() == 16);
+  match(Set c (FmaVF  c (Binary a (LoadVector b))));
+  format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed16F" %}
+  ins_cost(150);
+  ins_encode %{
+    int vector_len = 2;
+    __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
--- a/hotspot/src/share/vm/adlc/formssel.cpp	Wed Jun 07 08:56:35 2017 -0700
+++ b/hotspot/src/share/vm/adlc/formssel.cpp	Wed Jun 07 13:09:46 2017 -0700
@@ -4179,6 +4179,7 @@
     "URShiftVB","URShiftVS","URShiftVI","URShiftVL",
     "ReplicateB","ReplicateS","ReplicateI","ReplicateL","ReplicateF","ReplicateD",
     "LoadVector","StoreVector",
+    "FmaVD", "FmaVF",
     // Next are not supported currently.
     "PackB","PackS","PackI","PackL","PackF","PackD","Pack2L","Pack2D",
     "ExtractB","ExtractUB","ExtractC","ExtractS","ExtractI","ExtractL","ExtractF","ExtractD"
--- a/hotspot/src/share/vm/opto/classes.hpp	Wed Jun 07 08:56:35 2017 -0700
+++ b/hotspot/src/share/vm/opto/classes.hpp	Wed Jun 07 13:09:46 2017 -0700
@@ -310,6 +310,8 @@
 macro(MulReductionVF)
 macro(MulVD)
 macro(MulReductionVD)
+macro(FmaVD)
+macro(FmaVF)
 macro(DivVF)
 macro(DivVD)
 macro(AbsVF)
--- a/hotspot/src/share/vm/opto/matcher.cpp	Wed Jun 07 08:56:35 2017 -0700
+++ b/hotspot/src/share/vm/opto/matcher.cpp	Wed Jun 07 13:09:46 2017 -0700
@@ -977,7 +977,6 @@
   // Use one stack to keep both: child's node/state and parent's node/index
   MStack mstack(max_stack * 2 * 2); // usually: C->live_nodes() * 2 * 2
   mstack.push(n, Visit, NULL, -1);  // set NULL as parent to indicate root
-
   while (mstack.is_nonempty()) {
     C->check_node_count(NodeLimitFudgeFactor, "too many nodes matching instructions");
     if (C->failing()) return NULL;
@@ -2122,6 +2121,8 @@
       case Op_EncodeISOArray:
       case Op_FmaD:
       case Op_FmaF:
+      case Op_FmaVD:
+      case Op_FmaVF:
         set_shared(n); // Force result into register (it will be anyways)
         break;
       case Op_ConP: {  // Convert pointers above the centerline to NUL
@@ -2311,7 +2312,9 @@
         break;
       }
       case Op_FmaD:
-      case Op_FmaF: {
+      case Op_FmaF:
+      case Op_FmaVD:
+      case Op_FmaVF: {
         // Restructure into a binary tree for Matching.
         Node* pair = new BinaryNode(n->in(1), n->in(2));
         n->set_req(2, pair);
--- a/hotspot/src/share/vm/opto/superword.cpp	Wed Jun 07 08:56:35 2017 -0700
+++ b/hotspot/src/share/vm/opto/superword.cpp	Wed Jun 07 13:09:46 2017 -0700
@@ -2324,6 +2324,13 @@
         const TypeVect* vt = TypeVect::make(bt, vlen);
         vn = new CMoveVDNode(cc, src1, src2, vt);
         NOT_PRODUCT(if(is_trace_cmov()) {tty->print("SWPointer::output: created new CMove node %d: ", vn->_idx); vn->dump();})
+      } else if (opc == Op_FmaD || opc == Op_FmaF) {
+        // Promote operands to vector
+        Node* in1 = vector_opd(p, 1);
+        Node* in2 = vector_opd(p, 2);
+        Node* in3 = vector_opd(p, 3);
+        vn = VectorNode::make(opc, in1, in2, in3, vlen, velt_basic_type(n));
+        vlen_in_bytes = vn->as_Vector()->length_in_bytes();
       } else {
         if (do_reserve_copy()) {
           NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SWPointer::output: ShouldNotReachHere, exiting SuperWord");})
--- a/hotspot/src/share/vm/opto/vectornode.cpp	Wed Jun 07 08:56:35 2017 -0700
+++ b/hotspot/src/share/vm/opto/vectornode.cpp	Wed Jun 07 13:09:46 2017 -0700
@@ -86,6 +86,12 @@
   case Op_MulD:
     assert(bt == T_DOUBLE, "must be");
     return Op_MulVD;
+  case Op_FmaD:
+    assert(bt == T_DOUBLE, "must be");
+    return Op_FmaVD;
+  case Op_FmaF:
+    assert(bt == T_FLOAT, "must be");
+    return Op_FmaVF;
   case Op_CMoveD:
     assert(bt == T_DOUBLE, "must be");
     return Op_CMoveVD;
@@ -259,6 +265,11 @@
     *start = 2;
     *end   = n->req();
     break;
+  case Op_FmaD:
+  case Op_FmaF:
+    *start = 1;
+    *end   = 4; // 3 vector operands
+    break;
   default:
     *start = 1;
     *end   = n->req(); // default is all operands
@@ -328,6 +339,19 @@
 
 }
 
+VectorNode* VectorNode::make(int opc, Node* n1, Node* n2, Node* n3, uint vlen, BasicType bt) {
+  const TypeVect* vt = TypeVect::make(bt, vlen);
+  int vopc = VectorNode::opcode(opc, bt);
+  // This method should not be called for unimplemented vectors.
+  guarantee(vopc > 0, "Vector for '%s' is not implemented", NodeClassNames[opc]);
+  switch (vopc) {
+  case Op_FmaVD: return new FmaVDNode(n1, n2, n3, vt);
+  case Op_FmaVF: return new FmaVFNode(n1, n2, n3, vt);
+  }
+  fatal("Missed vector creation for '%s'", NodeClassNames[vopc]);
+  return NULL;
+}
+
 // Scalar promotion
 VectorNode* VectorNode::scalar2vector(Node* s, uint vlen, const Type* opd_t) {
   BasicType bt = opd_t->array_element_basic_type();
--- a/hotspot/src/share/vm/opto/vectornode.hpp	Wed Jun 07 08:56:35 2017 -0700
+++ b/hotspot/src/share/vm/opto/vectornode.hpp	Wed Jun 07 13:09:46 2017 -0700
@@ -62,6 +62,7 @@
   static VectorNode* scalar2vector(Node* s, uint vlen, const Type* opd_t);
   static VectorNode* shift_count(Node* shift, Node* cnt, uint vlen, BasicType bt);
   static VectorNode* make(int opc, Node* n1, Node* n2, uint vlen, BasicType bt);
+  static VectorNode* make(int opc, Node* n1, Node* n2, Node* n3, uint vlen, BasicType bt);
 
   static int  opcode(int opc, BasicType bt);
   static bool implemented(int opc, uint vlen, BasicType bt);
@@ -260,6 +261,22 @@
   virtual int Opcode() const;
 };
 
+//------------------------------FmaVDNode--------------------------------------
+// Vector multiply double
+class FmaVDNode : public VectorNode {
+public:
+  FmaVDNode(Node* in1, Node* in2, Node* in3, const TypeVect* vt) : VectorNode(in1, in2, in3, vt) {}
+  virtual int Opcode() const;
+};
+
+//------------------------------FmaVFNode--------------------------------------
+// Vector multiply float
+class FmaVFNode : public VectorNode {
+public:
+  FmaVFNode(Node* in1, Node* in2, Node* in3, const TypeVect* vt) : VectorNode(in1, in2, in3, vt) {}
+  virtual int Opcode() const;
+};
+
 //------------------------------CMoveVDNode--------------------------------------
 // Vector multiply double
 class CMoveVDNode : public VectorNode {
--- a/hotspot/src/share/vm/runtime/vmStructs.cpp	Wed Jun 07 08:56:35 2017 -0700
+++ b/hotspot/src/share/vm/runtime/vmStructs.cpp	Wed Jun 07 13:09:46 2017 -0700
@@ -2057,6 +2057,8 @@
   declare_c2_type(MulVFNode, VectorNode)                                  \
   declare_c2_type(MulReductionVFNode, ReductionNode)                      \
   declare_c2_type(MulVDNode, VectorNode)                                  \
+  declare_c2_type(FmaVDNode, VectorNode)                                  \
+  declare_c2_type(FmaVFNode, VectorNode)                                  \
   declare_c2_type(CMoveVDNode, VectorNode)                                \
   declare_c2_type(MulReductionVDNode, ReductionNode)                      \
   declare_c2_type(DivVFNode, VectorNode)                                  \