8154122: Intrinsify fused mac operations
author vdeshpande
Fri, 26 Aug 2016 12:17:50 -0700
changeset 41323 ddd5600d4762
parent 41321 463ff7705f2f
child 41324 58b801e2b380
8154122: Intrinsify fused mac operations Summary: added FMA intrinsics on x86 Reviewed-by: kvn, aph, darcy
hotspot/src/cpu/aarch64/vm/c1_LIRGenerator_aarch64.cpp
hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp
hotspot/src/cpu/ppc/vm/c1_LIRGenerator_ppc.cpp
hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp
hotspot/src/cpu/sparc/vm/c1_LIRGenerator_sparc.cpp
hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp
hotspot/src/cpu/x86/vm/abstractInterpreter_x86.cpp
hotspot/src/cpu/x86/vm/assembler_x86.cpp
hotspot/src/cpu/x86/vm/assembler_x86.hpp
hotspot/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp
hotspot/src/cpu/x86/vm/c1_LIRGenerator_x86.cpp
hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp
hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp
hotspot/src/cpu/x86/vm/templateInterpreterGenerator_x86_32.cpp
hotspot/src/cpu/x86/vm/templateInterpreterGenerator_x86_64.cpp
hotspot/src/cpu/x86/vm/vmStructs_x86.hpp
hotspot/src/cpu/x86/vm/vm_version_x86.cpp
hotspot/src/cpu/x86/vm/vm_version_x86.hpp
hotspot/src/cpu/x86/vm/x86.ad
hotspot/src/jdk.vm.ci/share/classes/jdk.vm.ci.amd64/src/jdk/vm/ci/amd64/AMD64.java
hotspot/src/jdk.vm.ci/share/classes/jdk.vm.ci.hotspot.amd64/src/jdk/vm/ci/hotspot/amd64/AMD64HotSpotJVMCIBackendFactory.java
hotspot/src/jdk.vm.ci/share/classes/jdk.vm.ci.hotspot.amd64/src/jdk/vm/ci/hotspot/amd64/AMD64HotSpotVMConfig.java
hotspot/src/share/vm/adlc/formssel.cpp
hotspot/src/share/vm/c1/c1_Compiler.cpp
hotspot/src/share/vm/c1/c1_LIR.cpp
hotspot/src/share/vm/c1/c1_LIR.hpp
hotspot/src/share/vm/c1/c1_LIRGenerator.cpp
hotspot/src/share/vm/c1/c1_LIRGenerator.hpp
hotspot/src/share/vm/classfile/vmSymbols.cpp
hotspot/src/share/vm/classfile/vmSymbols.hpp
hotspot/src/share/vm/interpreter/abstractInterpreter.cpp
hotspot/src/share/vm/interpreter/abstractInterpreter.hpp
hotspot/src/share/vm/interpreter/templateInterpreterGenerator.cpp
hotspot/src/share/vm/jvmci/vmStructs_jvmci.cpp
hotspot/src/share/vm/opto/c2compiler.cpp
hotspot/src/share/vm/opto/classes.hpp
hotspot/src/share/vm/opto/library_call.cpp
hotspot/src/share/vm/opto/matcher.cpp
hotspot/src/share/vm/opto/mulnode.cpp
hotspot/src/share/vm/opto/mulnode.hpp
hotspot/src/share/vm/runtime/globals.hpp
hotspot/src/share/vm/runtime/vmStructs.cpp
--- a/hotspot/src/cpu/aarch64/vm/c1_LIRGenerator_aarch64.cpp	Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/cpu/aarch64/vm/c1_LIRGenerator_aarch64.cpp	Fri Aug 26 12:17:50 2016 -0700
@@ -1032,6 +1032,10 @@
   Unimplemented();
 }
 
+void LIRGenerator::do_FmaIntrinsic(Intrinsic* x) {  // stub: this port has no C1 lowering for the FMA intrinsic
+  fatal("FMA intrinsic is not implemented on this platform");  // reaching this point is reported as a fatal error
+}
+
 void LIRGenerator::do_vectorizedMismatch(Intrinsic* x) {
   fatal("vectorizedMismatch intrinsic is not implemented on this platform");
 }
--- a/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp	Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp	Fri Aug 26 12:17:50 2016 -0700
@@ -262,6 +262,11 @@
     FLAG_SET_DEFAULT(UseCRC32CIntrinsics, false);
   }
 
+  if (UseFMA) {
+    warning("FMA instructions are not available on this CPU");
+    FLAG_SET_DEFAULT(UseFMA, false);
+  }
+
   if (auxv & (HWCAP_SHA1 | HWCAP_SHA2)) {
     if (FLAG_IS_DEFAULT(UseSHA)) {
       FLAG_SET_DEFAULT(UseSHA, true);
--- a/hotspot/src/cpu/ppc/vm/c1_LIRGenerator_ppc.cpp	Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/cpu/ppc/vm/c1_LIRGenerator_ppc.cpp	Fri Aug 26 12:17:50 2016 -0700
@@ -1433,6 +1433,10 @@
   }
 }
 
+void LIRGenerator::do_FmaIntrinsic(Intrinsic* x) {  // stub: this port has no C1 lowering for the FMA intrinsic
+  fatal("FMA intrinsic is not implemented on this platform");  // reaching this point is reported as a fatal error
+}
+
 void LIRGenerator::do_vectorizedMismatch(Intrinsic* x) {
   fatal("vectorizedMismatch intrinsic is not implemented on this platform");
 }
--- a/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp	Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp	Fri Aug 26 12:17:50 2016 -0700
@@ -230,6 +230,11 @@
     FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
   }
 
+  if (UseFMA) {
+    warning("FMA instructions are not available on this CPU");
+    FLAG_SET_DEFAULT(UseFMA, false);
+  }
+
   if (UseSHA) {
     warning("SHA instructions are not available on this CPU");
     FLAG_SET_DEFAULT(UseSHA, false);
--- a/hotspot/src/cpu/sparc/vm/c1_LIRGenerator_sparc.cpp	Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/cpu/sparc/vm/c1_LIRGenerator_sparc.cpp	Fri Aug 26 12:17:50 2016 -0700
@@ -953,6 +953,10 @@
   }
 }
 
+void LIRGenerator::do_FmaIntrinsic(Intrinsic* x) {  // stub: this port has no C1 lowering for the FMA intrinsic
+  fatal("FMA intrinsic is not implemented on this platform");  // reaching this point is reported as a fatal error
+}
+
 void LIRGenerator::do_vectorizedMismatch(Intrinsic* x) {
   fatal("vectorizedMismatch intrinsic is not implemented on this platform");
 }
--- a/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp	Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp	Fri Aug 26 12:17:50 2016 -0700
@@ -266,6 +266,11 @@
     FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
   }
 
+  if (UseFMA) {
+    warning("FMA instructions are not available on this CPU");
+    FLAG_SET_DEFAULT(UseFMA, false);
+  }
+
   // SHA1, SHA256, and SHA512 instructions were added to SPARC T-series at different times
   if (has_sha1() || has_sha256() || has_sha512()) {
     if (UseVIS > 0) { // SHA intrinsics use VIS1 instructions
--- a/hotspot/src/cpu/x86/vm/abstractInterpreter_x86.cpp	Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/cpu/x86/vm/abstractInterpreter_x86.cpp	Fri Aug 26 12:17:50 2016 -0700
@@ -172,7 +172,9 @@
     case Interpreter::java_lang_math_log10   : // fall thru
     case Interpreter::java_lang_math_sqrt    : // fall thru
     case Interpreter::java_lang_math_pow     : // fall thru
-    case Interpreter::java_lang_math_exp     :
+    case Interpreter::java_lang_math_exp     : // fall thru
+    case Interpreter::java_lang_math_fmaD    : // fall thru
+    case Interpreter::java_lang_math_fmaF    :
       return false;
     default:
       return true;
--- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp	Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp	Fri Aug 26 12:17:50 2016 -0700
@@ -4769,6 +4769,22 @@
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
+void Assembler::vfmadd231sd(XMMRegister dst, XMMRegister src1, XMMRegister src2) {  // emits the register-register form of VFMADD231SD
+  assert(VM_Version::supports_fma(), "");  // only valid when the CPU reports FMA support
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);  // vex_w is true here and false in vfmadd231ss; it is the only attribute that differs between the two
+  int encode = vex_prefix_and_encode(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);  // 66 prefix, 0F 38 opcode map
+  emit_int8((unsigned char)0xB9);  // opcode byte
+  emit_int8((unsigned char)(0xC0 | encode));  // 0xC0 sets both ModRM mod bits, i.e. register operands
+}
+
+void Assembler::vfmadd231ss(XMMRegister dst, XMMRegister src1, XMMRegister src2) {  // emits the register-register form of VFMADD231SS
+  assert(VM_Version::supports_fma(), "");  // only valid when the CPU reports FMA support
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);  // vex_w is false here and true in vfmadd231sd; it is the only attribute that differs between the two
+  int encode = vex_prefix_and_encode(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);  // 66 prefix, 0F 38 opcode map
+  emit_int8((unsigned char)0xB9);  // opcode byte (same as the sd form)
+  emit_int8((unsigned char)(0xC0 | encode));  // 0xC0 sets both ModRM mod bits, i.e. register operands
+}
+
 void Assembler::vmulsd(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
   InstructionMark im(this);
--- a/hotspot/src/cpu/x86/vm/assembler_x86.hpp	Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.hpp	Fri Aug 26 12:17:50 2016 -0700
@@ -1860,6 +1860,8 @@
   void vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
   void vdivss(XMMRegister dst, XMMRegister nds, Address src);
   void vdivss(XMMRegister dst, XMMRegister nds, XMMRegister src);
+  void vfmadd231sd(XMMRegister dst, XMMRegister nds, XMMRegister src);
+  void vfmadd231ss(XMMRegister dst, XMMRegister nds, XMMRegister src);
   void vmulsd(XMMRegister dst, XMMRegister nds, Address src);
   void vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
   void vmulss(XMMRegister dst, XMMRegister nds, Address src);
--- a/hotspot/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp	Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp	Fri Aug 26 12:17:50 2016 -0700
@@ -1345,6 +1345,18 @@
                       op->result_opr(),
                       op->info());
       break;
+    case lir_fmad:
+      __ fmad(op->result_opr()->as_xmm_double_reg(),
+              op->in_opr1()->as_xmm_double_reg(),
+              op->in_opr2()->as_xmm_double_reg(),
+              op->in_opr3()->as_xmm_double_reg());
+      break;
+    case lir_fmaf:
+      __ fmaf(op->result_opr()->as_xmm_float_reg(),
+              op->in_opr1()->as_xmm_float_reg(),
+              op->in_opr2()->as_xmm_float_reg(),
+              op->in_opr3()->as_xmm_float_reg());
+      break;
     default:      ShouldNotReachHere(); break;
   }
 }
--- a/hotspot/src/cpu/x86/vm/c1_LIRGenerator_x86.cpp	Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/cpu/x86/vm/c1_LIRGenerator_x86.cpp	Fri Aug 26 12:17:50 2016 -0700
@@ -806,6 +806,32 @@
   }
 }
 
+void LIRGenerator::do_FmaIntrinsic(Intrinsic* x) {  // C1 lowering of the _fmaD / _fmaF intrinsics to a lir_fmad / lir_fmaf op
+  assert(x->number_of_arguments() == 3, "wrong type");  // fma always has three operands
+  assert(UseFMA, "Needs FMA instructions support.");
+  LIRItem value(x->argument_at(0), this);   // first multiplicand
+  LIRItem value1(x->argument_at(1), this);  // second multiplicand
+  LIRItem value2(x->argument_at(2), this);  // addend
+
+  value2.set_destroys_register();  // the addend's register is overwritten by the emitted FMA (MacroAssembler::fmad/fmaf accumulate into it)
+
+  value.load_item();   // all three operands are loaded before the op is appended
+  value1.load_item();
+  value2.load_item();
+
+  LIR_Opr calc_input = value.result();
+  LIR_Opr calc_input1 = value1.result();
+  LIR_Opr calc_input2 = value2.result();
+  LIR_Opr calc_result = rlock_result(x);
+
+  switch (x->id()) {  // choose the double or float flavour of the LIR op
+  case vmIntrinsics::_fmaD:   __ fmad(calc_input, calc_input1, calc_input2, calc_result); break;
+  case vmIntrinsics::_fmaF:   __ fmaf(calc_input, calc_input1, calc_input2, calc_result); break;
+  default:                    ShouldNotReachHere();  // only the two fma intrinsics are dispatched here
+  }
+
+}
+
 
 void LIRGenerator::do_MathIntrinsic(Intrinsic* x) {
   assert(x->number_of_arguments() == 1 || (x->number_of_arguments() == 2 && x->id() == vmIntrinsics::_dpow), "wrong type");
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp	Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp	Fri Aug 26 12:17:50 2016 -0700
@@ -3147,6 +3147,24 @@
   fpop();
 }
 
+// Scalar double fused multiply-add: dst = c = a * b + c. Note that c is always overwritten, even when dst != c.
+void MacroAssembler::fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
+  Assembler::vfmadd231sd(c, a, b);  // accumulates the product into c
+  if (dst != c) {
+    movdbl(dst, c);  // copy the result out when a different destination was requested
+  }
+}
+
+// Scalar float fused multiply-add: dst = c = a * b + c. Note that c is always overwritten, even when dst != c.
+void MacroAssembler::fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
+  Assembler::vfmadd231ss(c, a, b);  // accumulates the product into c
+  if (dst != c) {
+    movflt(dst, c);  // copy the result out when a different destination was requested
+  }
+}
+
+
+
 
 void MacroAssembler::incrementl(AddressLiteral dst) {
   if (reachable(dst)) {
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp	Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp	Fri Aug 26 12:17:50 2016 -0700
@@ -449,6 +449,10 @@
   // tmp is a temporary register, if none is available use noreg
   void fremr(Register tmp);
 
+  // Fused multiply-add helpers (fmad: double, fmaf: float): dst = c = a * b + c, so c is overwritten as well as dst
+  void fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c);
+  void fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c);
+
 
   // same as fcmp2int, but using SSE2
   void cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less);
--- a/hotspot/src/cpu/x86/vm/templateInterpreterGenerator_x86_32.cpp	Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/cpu/x86/vm/templateInterpreterGenerator_x86_32.cpp	Fri Aug 26 12:17:50 2016 -0700
@@ -341,6 +341,27 @@
   //        [ lo(arg) ]
   //        [ hi(arg) ]
   //
+  if (kind == Interpreter::java_lang_math_fmaD) {
+    __ movdbl(xmm2, Address(rsp, 5 * wordSize));
+    __ movdbl(xmm1, Address(rsp, 3 * wordSize));
+    __ movdbl(xmm0, Address(rsp, 1 * wordSize));
+    __ fmad(xmm0, xmm1, xmm2, xmm0);
+    __ pop(rdi);                               // get return address
+    __ mov(rsp, rsi);                          // set sp to sender sp
+    __ jmp(rdi);
+
+    return entry_point;
+  } else if (kind == Interpreter::java_lang_math_fmaF) {
+    __ movflt(xmm2, Address(rsp, 3 * wordSize));
+    __ movflt(xmm1, Address(rsp, 2 * wordSize));
+    __ movflt(xmm0, Address(rsp, 1 * wordSize));
+    __ fmaf(xmm0, xmm1, xmm2, xmm0);
+    __ pop(rdi);                               // get return address
+    __ mov(rsp, rsi);                          // set sp to sender sp
+    __ jmp(rdi);
+
+    return entry_point;
+ }
 
   __ fld_d(Address(rsp, 1*wordSize));
   switch (kind) {
--- a/hotspot/src/cpu/x86/vm/templateInterpreterGenerator_x86_64.cpp	Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/cpu/x86/vm/templateInterpreterGenerator_x86_64.cpp	Fri Aug 26 12:17:50 2016 -0700
@@ -369,8 +369,17 @@
   //        [ hi(arg) ]
   //
 
-
-  if (kind == Interpreter::java_lang_math_sqrt) {
+  if (kind == Interpreter::java_lang_math_fmaD) {
+    __ movdbl(xmm0, Address(rsp, wordSize));
+    __ movdbl(xmm1, Address(rsp, 3 * wordSize));
+    __ movdbl(xmm2, Address(rsp, 5 * wordSize));
+    __ fmad(xmm0, xmm1, xmm2, xmm0);
+  } else if (kind == Interpreter::java_lang_math_fmaF) {
+    __ movflt(xmm0, Address(rsp, wordSize));
+    __ movflt(xmm1, Address(rsp, 2 * wordSize));
+    __ movflt(xmm2, Address(rsp, 3 * wordSize));
+    __ fmaf(xmm0, xmm1, xmm2, xmm0);
+  } else if (kind == Interpreter::java_lang_math_sqrt) {
     __ sqrtsd(xmm0, Address(rsp, wordSize));
   } else if (kind == Interpreter::java_lang_math_exp) {
     __ movdbl(xmm0, Address(rsp, wordSize));
--- a/hotspot/src/cpu/x86/vm/vmStructs_x86.hpp	Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/cpu/x86/vm/vmStructs_x86.hpp	Fri Aug 26 12:17:50 2016 -0700
@@ -73,6 +73,7 @@
 #define VM_LONG_CONSTANTS_CPU(declare_constant, declare_preprocessor_constant, declare_c1_constant, declare_c2_constant, declare_c2_preprocessor_constant) \
   declare_preprocessor_constant("VM_Version::CPU_AVX512BW", CPU_AVX512BW) \
   declare_preprocessor_constant("VM_Version::CPU_AVX512VL", CPU_AVX512VL) \
-  declare_preprocessor_constant("VM_Version::CPU_SHA", CPU_SHA)
+  declare_preprocessor_constant("VM_Version::CPU_SHA", CPU_SHA)           \
+  declare_preprocessor_constant("VM_Version::CPU_FMA", CPU_FMA)
 
 #endif // CPU_X86_VM_VMSTRUCTS_X86_HPP
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.cpp	Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.cpp	Fri Aug 26 12:17:50 2016 -0700
@@ -578,7 +578,7 @@
   }
 
   char buf[256];
-  jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
+  jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
                cores_per_cpu(), threads_per_core(),
                cpu_family(), _model, _stepping,
                (supports_cmov() ? ", cmov" : ""),
@@ -610,7 +610,8 @@
                (supports_bmi2() ? ", bmi2" : ""),
                (supports_adx() ? ", adx" : ""),
                (supports_evex() ? ", evex" : ""),
-               (supports_sha() ? ", sha" : ""));
+               (supports_sha() ? ", sha" : ""),
+               (supports_fma() ? ", fma" : ""));
   _features_string = os::strdup(buf);
 
   // UseSSE is set to the smaller of what hardware supports and what
@@ -732,6 +733,15 @@
     FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
   }
 
+  if (supports_fma() && UseSSE >= 2) {
+    if (FLAG_IS_DEFAULT(UseFMA)) {
+      UseFMA = true;
+    }
+  } else if (UseFMA) {
+    warning("FMA instructions are not available on this CPU");
+    FLAG_SET_DEFAULT(UseFMA, false);
+  }
+
   if (supports_sha() LP64_ONLY(|| supports_avx2() && supports_bmi2())) {
     if (FLAG_IS_DEFAULT(UseSHA)) {
       UseSHA = true;
@@ -773,7 +783,6 @@
     FLAG_SET_DEFAULT(UseAdler32Intrinsics, false);
   }
 
-  // Adjust RTM (Restricted Transactional Memory) flags
   if (!supports_rtm() && UseRTMLocking) {
     // Can't continue because UseRTMLocking affects UseBiasedLocking flag
     // setting during arguments processing. See use_biased_locking().
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.hpp	Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.hpp	Fri Aug 26 12:17:50 2016 -0700
@@ -74,7 +74,8 @@
                         : 1,
                ssse3    : 1,
                cid      : 1,
-                        : 2,
+                        : 1,
+               fma      : 1,
                cmpxchg16: 1,
                         : 4,
                dca      : 1,
@@ -289,6 +290,7 @@
 #define CPU_AVX512BW ((uint64_t)UCONST64(0x100000000)) // enums are limited to 31 bit
 #define CPU_AVX512VL ((uint64_t)UCONST64(0x200000000)) // EVEX instructions with smaller vector length
 #define CPU_SHA ((uint64_t)UCONST64(0x400000000))      // SHA instructions
+#define CPU_FMA ((uint64_t)UCONST64(0x800000000))      // FMA instructions
 
   enum Extended_Family {
     // AMD
@@ -522,6 +524,8 @@
         result |= CPU_SHA;
       if(_cpuid_info.ext_cpuid1_ecx.bits.lzcnt_intel != 0)
         result |= CPU_LZCNT;
+      if (_cpuid_info.std_cpuid1_ecx.bits.fma != 0)
+        result |= CPU_FMA;
       // for Intel, ecx.bits.misalignsse bit (bit 8) indicates support for prefetchw
       if (_cpuid_info.ext_cpuid1_ecx.bits.misalignsse != 0) {
         result |= CPU_3DNOW_PREFETCH;
@@ -726,6 +730,7 @@
   static bool supports_avx256only() { return (supports_avx2() && !supports_evex()); }
   static bool supports_avxonly()    { return ((supports_avx2() || supports_avx()) && !supports_evex()); }
   static bool supports_sha()        { return (_features & CPU_SHA) != 0; }
+  static bool supports_fma()        { return (_features & CPU_FMA) != 0; }  // CPU_FMA is set from std_cpuid1_ecx.bits.fma
   // Intel features
   static bool is_intel_family_core() { return is_intel() &&
                                        extended_cpu_family() == CPU_FAMILY_INTEL_CORE; }
--- a/hotspot/src/cpu/x86/vm/x86.ad	Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/cpu/x86/vm/x86.ad	Fri Aug 26 12:17:50 2016 -0700
@@ -3113,6 +3113,30 @@
   ins_pipe(pipe_slow);
 %}
 
+// Scalar double fused multiply-add, c = a * b + c: selected only when UseFMA is set; the result overwrites the addend register c.
+instruct fmaD_reg(regD a, regD b, regD c) %{
+  predicate(UseFMA);
+  match(Set c (FmaD  c (Binary a b)));
+  format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
+  ins_cost(150);
+  ins_encode %{
+    __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Scalar float fused multiply-add, c = a * b + c: selected only when UseFMA is set; the result overwrites the addend register c.
+instruct fmaF_reg(regF a, regF b, regF c) %{
+  predicate(UseFMA);
+  match(Set c (FmaF  c (Binary a b)));
+  format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
+  ins_cost(150);
+  ins_encode %{
+    __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 // ====================VECTOR INSTRUCTIONS=====================================
 
 // Load vectors (4 bytes long)
--- a/hotspot/src/jdk.vm.ci/share/classes/jdk.vm.ci.amd64/src/jdk/vm/ci/amd64/AMD64.java	Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/jdk.vm.ci/share/classes/jdk.vm.ci.amd64/src/jdk/vm/ci/amd64/AMD64.java	Fri Aug 26 12:17:50 2016 -0700
@@ -205,7 +205,8 @@
         AVX512CD,
         AVX512BW,
         AVX512VL,
-        SHA
+        SHA,
+        FMA
     }
 
     private final EnumSet<CPUFeature> features;
--- a/hotspot/src/jdk.vm.ci/share/classes/jdk.vm.ci.hotspot.amd64/src/jdk/vm/ci/hotspot/amd64/AMD64HotSpotJVMCIBackendFactory.java	Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/jdk.vm.ci/share/classes/jdk.vm.ci.hotspot.amd64/src/jdk/vm/ci/hotspot/amd64/AMD64HotSpotJVMCIBackendFactory.java	Fri Aug 26 12:17:50 2016 -0700
@@ -124,6 +124,9 @@
         if ((config.vmVersionFeatures & config.amd64SHA) != 0) {
             features.add(AMD64.CPUFeature.SHA);
         }
+        if ((config.vmVersionFeatures & config.amd64FMA) != 0) {
+            features.add(AMD64.CPUFeature.FMA);
+        }
         return features;
     }
 
--- a/hotspot/src/jdk.vm.ci/share/classes/jdk.vm.ci.hotspot.amd64/src/jdk/vm/ci/hotspot/amd64/AMD64HotSpotVMConfig.java	Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/jdk.vm.ci/share/classes/jdk.vm.ci.hotspot.amd64/src/jdk/vm/ci/hotspot/amd64/AMD64HotSpotVMConfig.java	Fri Aug 26 12:17:50 2016 -0700
@@ -78,4 +78,5 @@
     final long amd64AVX512BW = getConstant("VM_Version::CPU_AVX512BW", Long.class);
     final long amd64AVX512VL = getConstant("VM_Version::CPU_AVX512VL", Long.class);
     final long amd64SHA = getConstant("VM_Version::CPU_SHA", Long.class);
+    final long amd64FMA = getConstant("VM_Version::CPU_FMA", Long.class);
 }
--- a/hotspot/src/share/vm/adlc/formssel.cpp	Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/share/vm/adlc/formssel.cpp	Fri Aug 26 12:17:50 2016 -0700
@@ -4038,6 +4038,8 @@
         strcmp(opType,"EncodeP")==0 ||
         strcmp(opType,"EncodePKlass")==0 ||
         strcmp(opType,"DecodeNKlass")==0 ||
+        strcmp(opType,"FmaD") == 0 ||
+        strcmp(opType,"FmaF") == 0 ||
         strcmp(opType,"RoundDouble")==0 ||
         strcmp(opType,"RoundFloat")==0 ||
         strcmp(opType,"ReverseBytesI")==0 ||
--- a/hotspot/src/share/vm/c1/c1_Compiler.cpp	Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/share/vm/c1/c1_Compiler.cpp	Fri Aug 26 12:17:50 2016 -0700
@@ -162,6 +162,8 @@
   case vmIntrinsics::_dlog10:
   case vmIntrinsics::_dexp:
   case vmIntrinsics::_dpow:
+  case vmIntrinsics::_fmaD:
+  case vmIntrinsics::_fmaF:
   case vmIntrinsics::_getObject:
   case vmIntrinsics::_getBoolean:
   case vmIntrinsics::_getByte:
--- a/hotspot/src/share/vm/c1/c1_LIR.cpp	Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/share/vm/c1/c1_LIR.cpp	Fri Aug 26 12:17:50 2016 -0700
@@ -666,7 +666,9 @@
 
 // LIR_Op3
     case lir_idiv:
-    case lir_irem: {
+    case lir_irem:
+    case lir_fmad:
+    case lir_fmaf: {
       assert(op->as_Op3() != NULL, "must be");
       LIR_Op3* op3= (LIR_Op3*)op;
 
@@ -1663,6 +1665,8 @@
      // LIR_Op3
      case lir_idiv:                  s = "idiv";          break;
      case lir_irem:                  s = "irem";          break;
+     case lir_fmad:                  s = "fmad";          break;
+     case lir_fmaf:                  s = "fmaf";          break;
      // LIR_OpJavaCall
      case lir_static_call:           s = "static";        break;
      case lir_optvirtual_call:       s = "optvirtual";    break;
--- a/hotspot/src/share/vm/c1/c1_LIR.hpp	Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/share/vm/c1/c1_LIR.hpp	Fri Aug 26 12:17:50 2016 -0700
@@ -956,6 +956,8 @@
   , begin_op3
       , lir_idiv
       , lir_irem
+      , lir_fmad
+      , lir_fmaf
   , end_op3
   , begin_opJavaCall
       , lir_static_call
@@ -2149,6 +2151,8 @@
 
   void abs (LIR_Opr from, LIR_Opr to, LIR_Opr tmp)                { append(new LIR_Op2(lir_abs , from, tmp, to)); }
   void sqrt(LIR_Opr from, LIR_Opr to, LIR_Opr tmp)                { append(new LIR_Op2(lir_sqrt, from, tmp, to)); }
+  void fmad(LIR_Opr from, LIR_Opr from1, LIR_Opr from2, LIR_Opr to) { append(new LIR_Op3(lir_fmad, from, from1, from2, to)); }  // appends a lir_fmad op; the x86 backend emits to = from * from1 + from2 (double)
+  void fmaf(LIR_Opr from, LIR_Opr from1, LIR_Opr from2, LIR_Opr to) { append(new LIR_Op3(lir_fmaf, from, from1, from2, to)); }  // appends a lir_fmaf op; the x86 backend emits to = from * from1 + from2 (float)
   void log10 (LIR_Opr from, LIR_Opr to, LIR_Opr tmp)              { append(new LIR_Op2(lir_log10, from, LIR_OprFact::illegalOpr, to, tmp)); }
   void tan (LIR_Opr from, LIR_Opr to, LIR_Opr tmp1, LIR_Opr tmp2) { append(new LIR_Op2(lir_tan , from, tmp1, to, tmp2)); }
 
--- a/hotspot/src/share/vm/c1/c1_LIRGenerator.cpp	Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/share/vm/c1/c1_LIRGenerator.cpp	Fri Aug 26 12:17:50 2016 -0700
@@ -3147,6 +3147,9 @@
   case vmIntrinsics::_dpow :          do_MathIntrinsic(x); break;
   case vmIntrinsics::_arraycopy:      do_ArrayCopy(x);     break;
 
+  case vmIntrinsics::_fmaD:           do_FmaIntrinsic(x); break;
+  case vmIntrinsics::_fmaF:           do_FmaIntrinsic(x); break;
+
   // java.nio.Buffer.checkIndex
   case vmIntrinsics::_checkIndex:     do_NIOCheckIndex(x); break;
 
--- a/hotspot/src/share/vm/c1/c1_LIRGenerator.hpp	Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/share/vm/c1/c1_LIRGenerator.hpp	Fri Aug 26 12:17:50 2016 -0700
@@ -245,6 +245,7 @@
   void do_isPrimitive(Intrinsic* x);
   void do_getClass(Intrinsic* x);
   void do_currentThread(Intrinsic* x);
+  void do_FmaIntrinsic(Intrinsic* x);
   void do_MathIntrinsic(Intrinsic* x);
   void do_LibmIntrinsic(Intrinsic* x);
   void do_ArrayCopy(Intrinsic* x);
--- a/hotspot/src/share/vm/classfile/vmSymbols.cpp	Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/share/vm/classfile/vmSymbols.cpp	Fri Aug 26 12:17:50 2016 -0700
@@ -355,6 +355,8 @@
   case vmIntrinsics::_updateBytesCRC32:
   case vmIntrinsics::_updateByteBufferCRC32:
   case vmIntrinsics::_vectorizedMismatch:
+  case vmIntrinsics::_fmaD:
+  case vmIntrinsics::_fmaF:
     return true;
   default:
     return false;
@@ -387,6 +389,8 @@
   case vmIntrinsics::_updateBytesCRC32:
   case vmIntrinsics::_updateByteBufferCRC32:
   case vmIntrinsics::_vectorizedMismatch:
+  case vmIntrinsics::_fmaD:
+  case vmIntrinsics::_fmaF:
     return false;
   default:
     return true;
@@ -535,6 +539,10 @@
   case vmIntrinsics::_doubleToLongBits:
     if (!InlineMathNatives) return true;
     break;
+  case vmIntrinsics::_fmaD:
+  case vmIntrinsics::_fmaF:
+    if (!InlineMathNatives || !UseFMA) return true;
+    break;
   case vmIntrinsics::_arraycopy:
     if (!InlineArrayCopy) return true;
     break;
--- a/hotspot/src/share/vm/classfile/vmSymbols.hpp	Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/share/vm/classfile/vmSymbols.hpp	Fri Aug 26 12:17:50 2016 -0700
@@ -755,8 +755,10 @@
   do_class(java_lang_Math,                "java/lang/Math")                                                             \
   do_class(java_lang_StrictMath,          "java/lang/StrictMath")                                                       \
   do_signature(double2_double_signature,  "(DD)D")                                                                      \
+  do_signature(double3_double_signature,  "(DDD)D")                                                                     \
+  do_signature(float3_float_signature,    "(FFF)F")                                                                     \
   do_signature(int2_int_signature,        "(II)I")                                                                      \
-  do_signature(long2_long_signature,      "(JJ)J")                                                                         \
+  do_signature(long2_long_signature,      "(JJ)J")                                                                      \
                                                                                                                         \
   /* here are the math names, all together: */                                                                          \
   do_name(abs_name,"abs")       do_name(sin_name,"sin")         do_name(cos_name,"cos")                                 \
@@ -770,6 +772,7 @@
   do_name(multiplyExact_name,"multiplyExact")                                                                           \
   do_name(negateExact_name,"negateExact")                                                                               \
   do_name(subtractExact_name,"subtractExact")                                                                           \
+  do_name(fma_name, "fma")                                                                                              \
                                                                                                                         \
   do_intrinsic(_dabs,                     java_lang_Math,         abs_name,   double_double_signature,           F_S)   \
   do_intrinsic(_dsin,                     java_lang_Math,         sin_name,   double_double_signature,           F_S)   \
@@ -795,6 +798,8 @@
   do_intrinsic(_negateExactL,             java_lang_Math,         negateExact_name, long_long_signature,         F_S)   \
   do_intrinsic(_subtractExactI,           java_lang_Math,         subtractExact_name, int2_int_signature,        F_S)   \
   do_intrinsic(_subtractExactL,           java_lang_Math,         subtractExact_name, long2_long_signature,      F_S)   \
+  do_intrinsic(_fmaD,                     java_lang_Math,         fma_name,           double3_double_signature,  F_S)   \
+  do_intrinsic(_fmaF,                     java_lang_Math,         fma_name,           float3_float_signature,    F_S)   \
                                                                                                                         \
   do_intrinsic(_floatToRawIntBits,        java_lang_Float,        floatToRawIntBits_name,   float_int_signature, F_S)   \
    do_name(     floatToRawIntBits_name,                          "floatToRawIntBits")                                   \
--- a/hotspot/src/share/vm/interpreter/abstractInterpreter.cpp	Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/share/vm/interpreter/abstractInterpreter.cpp	Fri Aug 26 12:17:50 2016 -0700
@@ -194,6 +194,13 @@
                                 return java_lang_ref_reference_get;
   }
 
+  if (UseFMA) {
+    switch (m->intrinsic_id()) {
+      case vmIntrinsics::_fmaD: return java_lang_math_fmaD;
+      case vmIntrinsics::_fmaF: return java_lang_math_fmaF;
+    }
+  }
+
   // Accessor method?
   if (m->is_getter()) {
     // TODO: We should have used ::is_accessor above, but fast accessors in Zero expect only getters.
@@ -281,6 +288,8 @@
     case java_lang_math_sqrt    : tty->print("java_lang_math_sqrt"    ); break;
     case java_lang_math_log     : tty->print("java_lang_math_log"     ); break;
     case java_lang_math_log10   : tty->print("java_lang_math_log10"   ); break;
+    case java_lang_math_fmaD    : tty->print("java_lang_math_fmaD"    ); break;
+    case java_lang_math_fmaF    : tty->print("java_lang_math_fmaF"    ); break;
     case java_util_zip_CRC32_update           : tty->print("java_util_zip_CRC32_update"); break;
     case java_util_zip_CRC32_updateBytes      : tty->print("java_util_zip_CRC32_updateBytes"); break;
     case java_util_zip_CRC32_updateByteBuffer : tty->print("java_util_zip_CRC32_updateByteBuffer"); break;
--- a/hotspot/src/share/vm/interpreter/abstractInterpreter.hpp	Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/share/vm/interpreter/abstractInterpreter.hpp	Fri Aug 26 12:17:50 2016 -0700
@@ -76,6 +76,8 @@
     java_lang_math_log10,                                       // implementation of java.lang.Math.log10 (x)
     java_lang_math_pow,                                         // implementation of java.lang.Math.pow   (x,y)
     java_lang_math_exp,                                         // implementation of java.lang.Math.exp   (x)
+    java_lang_math_fmaF,                                        // implementation of java.lang.Math.fma   (x, y, z)
+    java_lang_math_fmaD,                                        // implementation of java.lang.Math.fma   (x, y, z)
     java_lang_ref_reference_get,                                // implementation of java.lang.ref.Reference.get()
     java_util_zip_CRC32_update,                                 // implementation of java.util.zip.CRC32.update()
     java_util_zip_CRC32_updateBytes,                            // implementation of java.util.zip.CRC32.updateBytes()
--- a/hotspot/src/share/vm/interpreter/templateInterpreterGenerator.cpp	Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/share/vm/interpreter/templateInterpreterGenerator.cpp	Fri Aug 26 12:17:50 2016 -0700
@@ -239,6 +239,10 @@
       method_entry(java_lang_math_log10)
       method_entry(java_lang_math_exp  )
       method_entry(java_lang_math_pow  )
+      if (UseFMA) {
+        method_entry(java_lang_math_fmaF)
+        method_entry(java_lang_math_fmaD)
+      }
       method_entry(java_lang_ref_reference_get)
 
       AbstractInterpreter::initialize_method_handle_entries();
@@ -445,7 +449,9 @@
   case Interpreter::java_lang_math_log10   : // fall thru
   case Interpreter::java_lang_math_sqrt    : // fall thru
   case Interpreter::java_lang_math_pow     : // fall thru
-  case Interpreter::java_lang_math_exp     : entry_point = generate_math_entry(kind);      break;
+  case Interpreter::java_lang_math_exp     : // fall thru
+  case Interpreter::java_lang_math_fmaD    : // fall thru
+  case Interpreter::java_lang_math_fmaF     : entry_point = generate_math_entry(kind);      break;
   case Interpreter::java_lang_ref_reference_get
                                            : entry_point = generate_Reference_get_entry(); break;
   case Interpreter::java_util_zip_CRC32_update
--- a/hotspot/src/share/vm/jvmci/vmStructs_jvmci.cpp	Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/share/vm/jvmci/vmStructs_jvmci.cpp	Fri Aug 26 12:17:50 2016 -0700
@@ -660,7 +660,8 @@
 #define VM_LONG_CONSTANTS_CPU(declare_constant, declare_preprocessor_constant, declare_c1_constant, declare_c2_constant, declare_c2_preprocessor_constant) \
   declare_preprocessor_constant("VM_Version::CPU_AVX512BW", CPU_AVX512BW) \
   declare_preprocessor_constant("VM_Version::CPU_AVX512VL", CPU_AVX512VL) \
-  declare_preprocessor_constant("VM_Version::CPU_SHA", CPU_SHA)
+  declare_preprocessor_constant("VM_Version::CPU_SHA", CPU_SHA)           \
+  declare_preprocessor_constant("VM_Version::CPU_FMA", CPU_FMA)
 
 #endif
 
--- a/hotspot/src/share/vm/opto/c2compiler.cpp	Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/share/vm/opto/c2compiler.cpp	Fri Aug 26 12:17:50 2016 -0700
@@ -416,6 +416,12 @@
   case vmIntrinsics::_onSpinWait:
     if (!Matcher::match_rule_supported(Op_OnSpinWait)) return false;
     break;
+  case vmIntrinsics::_fmaD:
+    if (!UseFMA || !Matcher::match_rule_supported(Op_FmaD)) return false;
+    break;
+  case vmIntrinsics::_fmaF:
+    if (!UseFMA || !Matcher::match_rule_supported(Op_FmaF)) return false;
+    break;
   case vmIntrinsics::_hashCode:
   case vmIntrinsics::_identityHashCode:
   case vmIntrinsics::_getClass:
--- a/hotspot/src/share/vm/opto/classes.hpp	Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/share/vm/opto/classes.hpp	Fri Aug 26 12:17:50 2016 -0700
@@ -151,6 +151,8 @@
 macro(EncodePKlass)
 macro(FastLock)
 macro(FastUnlock)
+macro(FmaD)
+macro(FmaF)
 macro(Goto)
 macro(Halt)
 macro(HasNegatives)
--- a/hotspot/src/share/vm/opto/library_call.cpp	Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/share/vm/opto/library_call.cpp	Fri Aug 26 12:17:50 2016 -0700
@@ -317,6 +317,7 @@
   bool inline_montgomeryMultiply();
   bool inline_montgomerySquare();
   bool inline_vectorizedMismatch();
+  bool inline_fma(vmIntrinsics::ID id);
 
   bool inline_profileBoolean();
   bool inline_isCompileConstant();
@@ -825,6 +826,10 @@
   case vmIntrinsics::_hasNegatives:
     return inline_hasNegatives();
 
+  case vmIntrinsics::_fmaD:
+  case vmIntrinsics::_fmaF:
+    return inline_fma(intrinsic_id());
+
   default:
     // If you get here, it may be that someone has added a new intrinsic
     // to the list in vmSymbols.hpp without implementing it here.
@@ -6657,6 +6662,35 @@
   return instof_false;  // even if it is NULL
 }
 
+//-------------inline_fma-----------------------------------
+bool LibraryCallKit::inline_fma(vmIntrinsics::ID id) {  // Replaces a call to the _fmaD/_fmaF intrinsic with one FmaDNode/FmaFNode built from its three arguments.
+  Node *a = NULL;  // first argument of the call
+  Node *b = NULL;  // second argument of the call
+  Node *c = NULL;  // third argument of the call
+  Node* result = NULL;  // the transformed Fma node, handed to set_result() below
+  switch (id) {
+  case vmIntrinsics::_fmaD:
+    assert(callee()->signature()->size() == 6, "fma has 3 parameters of size 2 each.");
+    // No receiver since it is a static method, so arguments start at index 0; each double takes two slots (0, 2, 4).
+    a = round_double_node(argument(0));  // NOTE(review): round_double_node is defined elsewhere; presumably forces strict double precision - confirm
+    b = round_double_node(argument(2));
+    c = round_double_node(argument(4));
+    result = _gvn.transform(new FmaDNode(control(), a, b, c));  // control() is passed first, then a, b, c in call order
+    break;
+  case vmIntrinsics::_fmaF:
+    assert(callee()->signature()->size() == 3, "fma has 3 parameters of size 1 each.");
+    a = argument(0);  // floats take one slot each and are used as-is (no round_double_node)
+    b = argument(1);
+    c = argument(2);
+    result = _gvn.transform(new FmaFNode(control(), a, b, c));  // same input order as the double case
+    break;
+  default:
+    fatal_unexpected_iid(id);  break;  // only _fmaD and _fmaF are dispatched to this function
+  }
+  set_result(result);  // publish the node as the value of the intrinsified call
+  return true;  // both handled IDs report success
+}
+
 bool LibraryCallKit::inline_profileBoolean() {
   Node* counts = argument(1);
   const TypeAryPtr* ary = NULL;
--- a/hotspot/src/share/vm/opto/matcher.cpp	Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/share/vm/opto/matcher.cpp	Fri Aug 26 12:17:50 2016 -0700
@@ -2117,6 +2117,8 @@
       case Op_StrInflatedCopy:
       case Op_StrCompressedCopy:
       case Op_EncodeISOArray:
+      case Op_FmaD:
+      case Op_FmaF:
         set_shared(n); // Force result into register (it will be anyways)
         break;
       case Op_ConP: {  // Convert pointers above the centerline to NUL
@@ -2305,6 +2307,15 @@
         n->del_req(4);
         break;
       }
+      case Op_FmaD:
+      case Op_FmaF: {
+        // Restructure into a binary tree for Matching.
+        Node* pair = new BinaryNode(n->in(1), n->in(2));
+        n->set_req(2, pair);
+        n->set_req(1, n->in(3));
+        n->del_req(3);
+        break;
+      }
       default:
         break;
       }
--- a/hotspot/src/share/vm/opto/mulnode.cpp	Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/share/vm/opto/mulnode.cpp	Fri Aug 26 12:17:50 2016 -0700
@@ -1343,3 +1343,47 @@
 
   return TypeLong::LONG;                // Give up
 }
+
+//=============================================================================
+//------------------------------Value------------------------------------------
+const Type* FmaDNode::Value(PhaseGVN* phase) const {  // Folds the node to a double constant when inputs 1..3 are all double constants; inputs are examined in order.
+  const Type *t1 = phase->type(in(1));
+  if (t1 == Type::TOP) return Type::TOP;  // a TOP input yields a TOP result
+  if (t1->base() != Type::DoubleCon) return Type::DOUBLE;  // non-constant input: answer the generic double type (later inputs are not inspected)
+  const Type *t2 = phase->type(in(2));
+  if (t2 == Type::TOP) return Type::TOP;
+  if (t2->base() != Type::DoubleCon) return Type::DOUBLE;
+  const Type *t3 = phase->type(in(3));
+  if (t3 == Type::TOP) return Type::TOP;
+  if (t3->base() != Type::DoubleCon) return Type::DOUBLE;
+#ifndef __STDC_IEC_559__  // fold only when the build environment advertises IEC 559 (IEEE 754) conformance
+  return Type::DOUBLE;  // no folding: keep the generic double type
+#else
+  double d1 = t1->getd();
+  double d2 = t2->getd();
+  double d3 = t3->getd();
+  return TypeD::make(fma(d1, d2, d3));  // evaluate with the host fma on (in1, in2, in3) and wrap the value as a constant type
+#endif
+}
+
+//=============================================================================
+//------------------------------Value------------------------------------------
+const Type* FmaFNode::Value(PhaseGVN* phase) const {  // Folds the node to a float constant when inputs 1..3 are all float constants; inputs are examined in order.
+  const Type *t1 = phase->type(in(1));
+  if (t1 == Type::TOP) return Type::TOP;  // a TOP input yields a TOP result
+  if (t1->base() != Type::FloatCon) return Type::FLOAT;  // non-constant input: answer the generic float type (later inputs are not inspected)
+  const Type *t2 = phase->type(in(2));
+  if (t2 == Type::TOP) return Type::TOP;
+  if (t2->base() != Type::FloatCon) return Type::FLOAT;
+  const Type *t3 = phase->type(in(3));
+  if (t3 == Type::TOP) return Type::TOP;
+  if (t3->base() != Type::FloatCon) return Type::FLOAT;
+#ifndef __STDC_IEC_559__  // fold only when the build environment advertises IEC 559 (IEEE 754) conformance
+  return Type::FLOAT;  // no folding: keep the generic float type
+#else
+  float f1 = t1->getf();
+  float f2 = t2->getf();
+  float f3 = t3->getf();
+  return TypeF::make(fmaf(f1, f2, f3));  // fmaf, not fma: the double overload would round to double and then to float (double rounding)
+#endif
+}
--- a/hotspot/src/share/vm/opto/mulnode.hpp	Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/share/vm/opto/mulnode.hpp	Fri Aug 26 12:17:50 2016 -0700
@@ -263,4 +263,26 @@
   virtual uint ideal_reg() const { return Op_RegL; }
 };
 
+//------------------------------FmaDNode--------------------------------------
+// Fused multiply-add on doubles: one node taking a control input plus three value inputs and producing a double.
+class FmaDNode : public Node {
+public:
+  FmaDNode(Node *c, Node *in1, Node *in2, Node *in3) : Node(c, in1, in2, in3) {}  // c is the control input; in1..in3 are forwarded to Node in that order
+  virtual int Opcode() const;
+  const Type *bottom_type() const { return Type::DOUBLE; }  // widest result type: any double
+  virtual uint ideal_reg() const { return Op_RegD; }  // result uses the double register kind
+  virtual const Type* Value(PhaseGVN* phase) const;  // constant folding of in(1..3); body is in mulnode.cpp
+};
+
+//------------------------------FmaFNode--------------------------------------
+// Fused multiply-add on floats: one node taking a control input plus three value inputs and producing a float.
+class FmaFNode : public Node {
+public:
+  FmaFNode(Node *c, Node *in1, Node *in2, Node *in3) : Node(c, in1, in2, in3) {}  // c is the control input; in1..in3 are forwarded to Node in that order
+  virtual int Opcode() const;
+  const Type *bottom_type() const { return Type::FLOAT; }  // widest result type: any float
+  virtual uint ideal_reg() const { return Op_RegF; }  // result uses the float register kind
+  virtual const Type* Value(PhaseGVN* phase) const;  // constant folding of in(1..3); body is in mulnode.cpp
+};
+
 #endif // SHARE_VM_OPTO_MULNODE_HPP
--- a/hotspot/src/share/vm/runtime/globals.hpp	Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/share/vm/runtime/globals.hpp	Fri Aug 26 12:17:50 2016 -0700
@@ -659,6 +659,9 @@
   product(bool, UseAES, false,                                              \
           "Control whether AES instructions can be used on x86/x64")        \
                                                                             \
+  product(bool, UseFMA, false,                                              \
+          "Control whether FMA instructions can be used")                   \
+                                                                            \
   product(bool, UseSHA, false,                                              \
           "Control whether SHA instructions can be used "                   \
           "on SPARC, on ARM and on x86")                                    \
--- a/hotspot/src/share/vm/runtime/vmStructs.cpp	Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/share/vm/runtime/vmStructs.cpp	Fri Aug 26 12:17:50 2016 -0700
@@ -2105,6 +2105,8 @@
   declare_c2_type(OverflowAddLNode, OverflowLNode)                        \
   declare_c2_type(OverflowSubLNode, OverflowLNode)                        \
   declare_c2_type(OverflowMulLNode, OverflowLNode)                        \
+  declare_c2_type(FmaDNode, Node)                                         \
+  declare_c2_type(FmaFNode, Node)                                         \
                                                                           \
   /*********************/                                                 \
   /* Adapter Blob Entries */                                              \