8154122: Intrinsify fused mac operations
Summary: added FMA intrinsics on x86
Reviewed-by: kvn, aph, darcy
--- a/hotspot/src/cpu/aarch64/vm/c1_LIRGenerator_aarch64.cpp Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/cpu/aarch64/vm/c1_LIRGenerator_aarch64.cpp Fri Aug 26 12:17:50 2016 -0700
@@ -1032,6 +1032,10 @@
Unimplemented();
}
+void LIRGenerator::do_FmaIntrinsic(Intrinsic* x) { // C1 lowering hook for vmIntrinsics::_fmaD / _fmaF
+ fatal("FMA intrinsic is not implemented on this platform"); // stub: this port emits no FMA code
+} // not expected to run: vm_version_aarch64.cpp resets UseFMA to false, which should keep the intrinsic disabled
+
void LIRGenerator::do_vectorizedMismatch(Intrinsic* x) {
fatal("vectorizedMismatch intrinsic is not implemented on this platform");
}
--- a/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp Fri Aug 26 12:17:50 2016 -0700
@@ -262,6 +262,11 @@
FLAG_SET_DEFAULT(UseCRC32CIntrinsics, false);
}
+ if (UseFMA) {
+ warning("FMA instructions are not available on this CPU");
+ FLAG_SET_DEFAULT(UseFMA, false);
+ }
+
if (auxv & (HWCAP_SHA1 | HWCAP_SHA2)) {
if (FLAG_IS_DEFAULT(UseSHA)) {
FLAG_SET_DEFAULT(UseSHA, true);
--- a/hotspot/src/cpu/ppc/vm/c1_LIRGenerator_ppc.cpp Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/cpu/ppc/vm/c1_LIRGenerator_ppc.cpp Fri Aug 26 12:17:50 2016 -0700
@@ -1433,6 +1433,10 @@
}
}
+void LIRGenerator::do_FmaIntrinsic(Intrinsic* x) { // C1 lowering hook for vmIntrinsics::_fmaD / _fmaF
+ fatal("FMA intrinsic is not implemented on this platform"); // stub: this port emits no FMA code
+} // not expected to run: vm_version_ppc.cpp resets UseFMA to false, which should keep the intrinsic disabled
+
void LIRGenerator::do_vectorizedMismatch(Intrinsic* x) {
fatal("vectorizedMismatch intrinsic is not implemented on this platform");
}
--- a/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp Fri Aug 26 12:17:50 2016 -0700
@@ -230,6 +230,11 @@
FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
}
+ if (UseFMA) {
+ warning("FMA instructions are not available on this CPU");
+ FLAG_SET_DEFAULT(UseFMA, false);
+ }
+
if (UseSHA) {
warning("SHA instructions are not available on this CPU");
FLAG_SET_DEFAULT(UseSHA, false);
--- a/hotspot/src/cpu/sparc/vm/c1_LIRGenerator_sparc.cpp Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/cpu/sparc/vm/c1_LIRGenerator_sparc.cpp Fri Aug 26 12:17:50 2016 -0700
@@ -953,6 +953,10 @@
}
}
+void LIRGenerator::do_FmaIntrinsic(Intrinsic* x) { // C1 lowering hook for vmIntrinsics::_fmaD / _fmaF
+ fatal("FMA intrinsic is not implemented on this platform"); // stub: this port emits no FMA code
+} // not expected to run: vm_version_sparc.cpp resets UseFMA to false, which should keep the intrinsic disabled
+
void LIRGenerator::do_vectorizedMismatch(Intrinsic* x) {
fatal("vectorizedMismatch intrinsic is not implemented on this platform");
}
--- a/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp Fri Aug 26 12:17:50 2016 -0700
@@ -266,6 +266,11 @@
FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
}
+ if (UseFMA) {
+ warning("FMA instructions are not available on this CPU");
+ FLAG_SET_DEFAULT(UseFMA, false);
+ }
+
// SHA1, SHA256, and SHA512 instructions were added to SPARC T-series at different times
if (has_sha1() || has_sha256() || has_sha512()) {
if (UseVIS > 0) { // SHA intrinsics use VIS1 instructions
--- a/hotspot/src/cpu/x86/vm/abstractInterpreter_x86.cpp Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/cpu/x86/vm/abstractInterpreter_x86.cpp Fri Aug 26 12:17:50 2016 -0700
@@ -172,7 +172,9 @@
case Interpreter::java_lang_math_log10 : // fall thru
case Interpreter::java_lang_math_sqrt : // fall thru
case Interpreter::java_lang_math_pow : // fall thru
- case Interpreter::java_lang_math_exp :
+ case Interpreter::java_lang_math_exp : // fall thru
+ case Interpreter::java_lang_math_fmaD : // fall thru
+ case Interpreter::java_lang_math_fmaF :
return false;
default:
return true;
--- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp Fri Aug 26 12:17:50 2016 -0700
@@ -4769,6 +4769,22 @@
emit_int8((unsigned char)(0xC0 | encode));
}
+void Assembler::vfmadd231sd(XMMRegister dst, XMMRegister src1, XMMRegister src2) { // scalar double FMA, register form: dst = src1 * src2 + dst
+ assert(VM_Version::supports_fma(), ""); // only legal when the CPU reports the FMA feature
+ InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false); // vex_w = true is the only difference from vfmadd231ss
+ int encode = vex_prefix_and_encode(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); // 66 prefix, 0F 38 opcode map
+ emit_int8((unsigned char)0xB9); // opcode byte
+ emit_int8((unsigned char)(0xC0 | encode)); // ModRM byte, register-direct form (mod = 11)
+}
+
+void Assembler::vfmadd231ss(XMMRegister dst, XMMRegister src1, XMMRegister src2) { // scalar float FMA, register form: dst = src1 * src2 + dst
+ assert(VM_Version::supports_fma(), ""); // only legal when the CPU reports the FMA feature
+ InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false); // vex_w = false is the only difference from vfmadd231sd
+ int encode = vex_prefix_and_encode(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); // 66 prefix, 0F 38 opcode map
+ emit_int8((unsigned char)0xB9); // opcode byte
+ emit_int8((unsigned char)(0xC0 | encode)); // ModRM byte, register-direct form (mod = 11)
+}
+
void Assembler::vmulsd(XMMRegister dst, XMMRegister nds, Address src) {
assert(VM_Version::supports_avx(), "");
InstructionMark im(this);
--- a/hotspot/src/cpu/x86/vm/assembler_x86.hpp Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.hpp Fri Aug 26 12:17:50 2016 -0700
@@ -1860,6 +1860,8 @@
void vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
void vdivss(XMMRegister dst, XMMRegister nds, Address src);
void vdivss(XMMRegister dst, XMMRegister nds, XMMRegister src);
+ void vfmadd231sd(XMMRegister dst, XMMRegister nds, XMMRegister src);
+ void vfmadd231ss(XMMRegister dst, XMMRegister nds, XMMRegister src);
void vmulsd(XMMRegister dst, XMMRegister nds, Address src);
void vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
void vmulss(XMMRegister dst, XMMRegister nds, Address src);
--- a/hotspot/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp Fri Aug 26 12:17:50 2016 -0700
@@ -1345,6 +1345,18 @@
op->result_opr(),
op->info());
break;
+ case lir_fmad:
+ __ fmad(op->result_opr()->as_xmm_double_reg(),
+ op->in_opr1()->as_xmm_double_reg(),
+ op->in_opr2()->as_xmm_double_reg(),
+ op->in_opr3()->as_xmm_double_reg());
+ break;
+ case lir_fmaf:
+ __ fmaf(op->result_opr()->as_xmm_float_reg(),
+ op->in_opr1()->as_xmm_float_reg(),
+ op->in_opr2()->as_xmm_float_reg(),
+ op->in_opr3()->as_xmm_float_reg());
+ break;
default: ShouldNotReachHere(); break;
}
}
--- a/hotspot/src/cpu/x86/vm/c1_LIRGenerator_x86.cpp Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/cpu/x86/vm/c1_LIRGenerator_x86.cpp Fri Aug 26 12:17:50 2016 -0700
@@ -806,6 +806,32 @@
}
}
+void LIRGenerator::do_FmaIntrinsic(Intrinsic* x) { // C1: lower Math.fma(a, b, c) to a single lir_fmad / lir_fmaf op
+ assert(x->number_of_arguments() == 3, "wrong type");
+ assert(UseFMA, "Needs FMA instructions support.");
+ LIRItem value(x->argument_at(0), this);
+ LIRItem value1(x->argument_at(1), this);
+ LIRItem value2(x->argument_at(2), this);
+ // The third argument is the addend; MacroAssembler::fmad/fmaf accumulate into its register, so mark it as destroyed.
+ value2.set_destroys_register();
+
+ value.load_item();
+ value1.load_item();
+ value2.load_item();
+ // The LIR assembler reads all three inputs and the result as XMM registers.
+ LIR_Opr calc_input = value.result();
+ LIR_Opr calc_input1 = value1.result();
+ LIR_Opr calc_input2 = value2.result();
+ LIR_Opr calc_result = rlock_result(x);
+ // Operand order is (a, b, c, result) for result = a * b + c.
+ switch (x->id()) {
+ case vmIntrinsics::_fmaD: __ fmad(calc_input, calc_input1, calc_input2, calc_result); break;
+ case vmIntrinsics::_fmaF: __ fmaf(calc_input, calc_input1, calc_input2, calc_result); break;
+ default: ShouldNotReachHere(); // only the two fma intrinsics are dispatched to this method
+ }
+
+}
+
void LIRGenerator::do_MathIntrinsic(Intrinsic* x) {
assert(x->number_of_arguments() == 1 || (x->number_of_arguments() == 2 && x->id() == vmIntrinsics::_dpow), "wrong type");
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp Fri Aug 26 12:17:50 2016 -0700
@@ -3147,6 +3147,24 @@
fpop();
}
+// Fused multiply-add (double): c = a * b + c; the result is also copied to dst when dst is a different register.
+void MacroAssembler::fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
+ Assembler::vfmadd231sd(c, a, b); // accumulates into c, clobbering the addend
+ if (dst == c) {
+ return; // result already lives in dst
+ }
+ movdbl(dst, c); }
+
+// Fused multiply-add (float): c = a * b + c; the result is also copied to dst when dst is a different register.
+void MacroAssembler::fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
+ Assembler::vfmadd231ss(c, a, b); // accumulates into c, clobbering the addend
+ if (dst == c) {
+ return; // result already lives in dst
+ }
+ movflt(dst, c); }
+
+
+
void MacroAssembler::incrementl(AddressLiteral dst) {
if (reachable(dst)) {
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp Fri Aug 26 12:17:50 2016 -0700
@@ -449,6 +449,10 @@
// tmp is a temporary register, if none is available use noreg
void fremr(Register tmp);
+ // dst = c = a * b + c
+ void fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c);
+ void fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c);
+
// same as fcmp2int, but using SSE2
void cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less);
--- a/hotspot/src/cpu/x86/vm/templateInterpreterGenerator_x86_32.cpp Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/cpu/x86/vm/templateInterpreterGenerator_x86_32.cpp Fri Aug 26 12:17:50 2016 -0700
@@ -341,6 +341,27 @@
// [ lo(arg) ]
// [ hi(arg) ]
//
+ if (kind == Interpreter::java_lang_math_fmaD) {
+ __ movdbl(xmm2, Address(rsp, 5 * wordSize));
+ __ movdbl(xmm1, Address(rsp, 3 * wordSize));
+ __ movdbl(xmm0, Address(rsp, 1 * wordSize));
+ __ fmad(xmm0, xmm1, xmm2, xmm0);
+ __ pop(rdi); // get return address
+ __ mov(rsp, rsi); // set sp to sender sp
+ __ jmp(rdi);
+
+ return entry_point;
+ } else if (kind == Interpreter::java_lang_math_fmaF) {
+ __ movflt(xmm2, Address(rsp, 3 * wordSize));
+ __ movflt(xmm1, Address(rsp, 2 * wordSize));
+ __ movflt(xmm0, Address(rsp, 1 * wordSize));
+ __ fmaf(xmm0, xmm1, xmm2, xmm0);
+ __ pop(rdi); // get return address
+ __ mov(rsp, rsi); // set sp to sender sp
+ __ jmp(rdi);
+
+ return entry_point;
+ }
__ fld_d(Address(rsp, 1*wordSize));
switch (kind) {
--- a/hotspot/src/cpu/x86/vm/templateInterpreterGenerator_x86_64.cpp Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/cpu/x86/vm/templateInterpreterGenerator_x86_64.cpp Fri Aug 26 12:17:50 2016 -0700
@@ -369,8 +369,17 @@
// [ hi(arg) ]
//
-
- if (kind == Interpreter::java_lang_math_sqrt) {
+ if (kind == Interpreter::java_lang_math_fmaD) {
+ __ movdbl(xmm0, Address(rsp, wordSize));
+ __ movdbl(xmm1, Address(rsp, 3 * wordSize));
+ __ movdbl(xmm2, Address(rsp, 5 * wordSize));
+ __ fmad(xmm0, xmm1, xmm2, xmm0);
+ } else if (kind == Interpreter::java_lang_math_fmaF) {
+ __ movflt(xmm0, Address(rsp, wordSize));
+ __ movflt(xmm1, Address(rsp, 2 * wordSize));
+ __ movflt(xmm2, Address(rsp, 3 * wordSize));
+ __ fmaf(xmm0, xmm1, xmm2, xmm0);
+ } else if (kind == Interpreter::java_lang_math_sqrt) {
__ sqrtsd(xmm0, Address(rsp, wordSize));
} else if (kind == Interpreter::java_lang_math_exp) {
__ movdbl(xmm0, Address(rsp, wordSize));
--- a/hotspot/src/cpu/x86/vm/vmStructs_x86.hpp Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/cpu/x86/vm/vmStructs_x86.hpp Fri Aug 26 12:17:50 2016 -0700
@@ -73,6 +73,7 @@
#define VM_LONG_CONSTANTS_CPU(declare_constant, declare_preprocessor_constant, declare_c1_constant, declare_c2_constant, declare_c2_preprocessor_constant) \
declare_preprocessor_constant("VM_Version::CPU_AVX512BW", CPU_AVX512BW) \
declare_preprocessor_constant("VM_Version::CPU_AVX512VL", CPU_AVX512VL) \
- declare_preprocessor_constant("VM_Version::CPU_SHA", CPU_SHA)
+ declare_preprocessor_constant("VM_Version::CPU_SHA", CPU_SHA) \
+ declare_preprocessor_constant("VM_Version::CPU_FMA", CPU_FMA)
#endif // CPU_X86_VM_VMSTRUCTS_X86_HPP
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.cpp Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.cpp Fri Aug 26 12:17:50 2016 -0700
@@ -578,7 +578,7 @@
}
char buf[256];
- jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
+ jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
cores_per_cpu(), threads_per_core(),
cpu_family(), _model, _stepping,
(supports_cmov() ? ", cmov" : ""),
@@ -610,7 +610,8 @@
(supports_bmi2() ? ", bmi2" : ""),
(supports_adx() ? ", adx" : ""),
(supports_evex() ? ", evex" : ""),
- (supports_sha() ? ", sha" : ""));
+ (supports_sha() ? ", sha" : ""),
+ (supports_fma() ? ", fma" : ""));
_features_string = os::strdup(buf);
// UseSSE is set to the smaller of what hardware supports and what
@@ -732,6 +733,15 @@
FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
}
+ if (supports_fma() && UseSSE >= 2) {
+ if (FLAG_IS_DEFAULT(UseFMA)) {
+ UseFMA = true;
+ }
+ } else if (UseFMA) {
+ warning("FMA instructions are not available on this CPU");
+ FLAG_SET_DEFAULT(UseFMA, false);
+ }
+
if (supports_sha() LP64_ONLY(|| supports_avx2() && supports_bmi2())) {
if (FLAG_IS_DEFAULT(UseSHA)) {
UseSHA = true;
@@ -773,7 +783,6 @@
FLAG_SET_DEFAULT(UseAdler32Intrinsics, false);
}
- // Adjust RTM (Restricted Transactional Memory) flags
if (!supports_rtm() && UseRTMLocking) {
// Can't continue because UseRTMLocking affects UseBiasedLocking flag
// setting during arguments processing. See use_biased_locking().
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.hpp Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.hpp Fri Aug 26 12:17:50 2016 -0700
@@ -74,7 +74,8 @@
: 1,
ssse3 : 1,
cid : 1,
- : 2,
+ : 1,
+ fma : 1,
cmpxchg16: 1,
: 4,
dca : 1,
@@ -289,6 +290,7 @@
#define CPU_AVX512BW ((uint64_t)UCONST64(0x100000000)) // enums are limited to 31 bit
#define CPU_AVX512VL ((uint64_t)UCONST64(0x200000000)) // EVEX instructions with smaller vector length
#define CPU_SHA ((uint64_t)UCONST64(0x400000000)) // SHA instructions
+#define CPU_FMA ((uint64_t)UCONST64(0x800000000)) // FMA instructions
enum Extended_Family {
// AMD
@@ -522,6 +524,8 @@
result |= CPU_SHA;
if(_cpuid_info.ext_cpuid1_ecx.bits.lzcnt_intel != 0)
result |= CPU_LZCNT;
+ if (_cpuid_info.std_cpuid1_ecx.bits.fma != 0)
+ result |= CPU_FMA;
// for Intel, ecx.bits.misalignsse bit (bit 8) indicates support for prefetchw
if (_cpuid_info.ext_cpuid1_ecx.bits.misalignsse != 0) {
result |= CPU_3DNOW_PREFETCH;
@@ -726,6 +730,7 @@
static bool supports_avx256only() { return (supports_avx2() && !supports_evex()); }
static bool supports_avxonly() { return ((supports_avx2() || supports_avx()) && !supports_evex()); }
static bool supports_sha() { return (_features & CPU_SHA) != 0; }
+ static bool supports_fma() { return (_features & CPU_FMA) != 0; }
// Intel features
static bool is_intel_family_core() { return is_intel() &&
extended_cpu_family() == CPU_FAMILY_INTEL_CORE; }
--- a/hotspot/src/cpu/x86/vm/x86.ad Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/cpu/x86/vm/x86.ad Fri Aug 26 12:17:50 2016 -0700
@@ -3113,6 +3113,30 @@
ins_pipe(pipe_slow);
%}
+// Fused multiply-add, double: c = a * b + c. c is both the addend and the result register; only selected when UseFMA is on.
+instruct fmaD_reg(regD a, regD b, regD c) %{
+ predicate(UseFMA);
+ match(Set c (FmaD c (Binary a b)));
+ format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
+ ins_cost(150);
+ ins_encode %{
+ __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+// Fused multiply-add, float: c = a * b + c. c is both the addend and the result register; only selected when UseFMA is on.
+instruct fmaF_reg(regF a, regF b, regF c) %{
+ predicate(UseFMA);
+ match(Set c (FmaF c (Binary a b)));
+ format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
+ ins_cost(150);
+ ins_encode %{
+ __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
// ====================VECTOR INSTRUCTIONS=====================================
// Load vectors (4 bytes long)
--- a/hotspot/src/jdk.vm.ci/share/classes/jdk.vm.ci.amd64/src/jdk/vm/ci/amd64/AMD64.java Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/jdk.vm.ci/share/classes/jdk.vm.ci.amd64/src/jdk/vm/ci/amd64/AMD64.java Fri Aug 26 12:17:50 2016 -0700
@@ -205,7 +205,8 @@
AVX512CD,
AVX512BW,
AVX512VL,
- SHA
+ SHA,
+ FMA
}
private final EnumSet<CPUFeature> features;
--- a/hotspot/src/jdk.vm.ci/share/classes/jdk.vm.ci.hotspot.amd64/src/jdk/vm/ci/hotspot/amd64/AMD64HotSpotJVMCIBackendFactory.java Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/jdk.vm.ci/share/classes/jdk.vm.ci.hotspot.amd64/src/jdk/vm/ci/hotspot/amd64/AMD64HotSpotJVMCIBackendFactory.java Fri Aug 26 12:17:50 2016 -0700
@@ -124,6 +124,9 @@
if ((config.vmVersionFeatures & config.amd64SHA) != 0) {
features.add(AMD64.CPUFeature.SHA);
}
+ if ((config.vmVersionFeatures & config.amd64FMA) != 0) {
+ features.add(AMD64.CPUFeature.FMA);
+ }
return features;
}
--- a/hotspot/src/jdk.vm.ci/share/classes/jdk.vm.ci.hotspot.amd64/src/jdk/vm/ci/hotspot/amd64/AMD64HotSpotVMConfig.java Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/jdk.vm.ci/share/classes/jdk.vm.ci.hotspot.amd64/src/jdk/vm/ci/hotspot/amd64/AMD64HotSpotVMConfig.java Fri Aug 26 12:17:50 2016 -0700
@@ -78,4 +78,5 @@
final long amd64AVX512BW = getConstant("VM_Version::CPU_AVX512BW", Long.class);
final long amd64AVX512VL = getConstant("VM_Version::CPU_AVX512VL", Long.class);
final long amd64SHA = getConstant("VM_Version::CPU_SHA", Long.class);
+ final long amd64FMA = getConstant("VM_Version::CPU_FMA", Long.class);
}
--- a/hotspot/src/share/vm/adlc/formssel.cpp Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/share/vm/adlc/formssel.cpp Fri Aug 26 12:17:50 2016 -0700
@@ -4038,6 +4038,8 @@
strcmp(opType,"EncodeP")==0 ||
strcmp(opType,"EncodePKlass")==0 ||
strcmp(opType,"DecodeNKlass")==0 ||
+ strcmp(opType,"FmaD") == 0 ||
+ strcmp(opType,"FmaF") == 0 ||
strcmp(opType,"RoundDouble")==0 ||
strcmp(opType,"RoundFloat")==0 ||
strcmp(opType,"ReverseBytesI")==0 ||
--- a/hotspot/src/share/vm/c1/c1_Compiler.cpp Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/share/vm/c1/c1_Compiler.cpp Fri Aug 26 12:17:50 2016 -0700
@@ -162,6 +162,8 @@
case vmIntrinsics::_dlog10:
case vmIntrinsics::_dexp:
case vmIntrinsics::_dpow:
+ case vmIntrinsics::_fmaD:
+ case vmIntrinsics::_fmaF:
case vmIntrinsics::_getObject:
case vmIntrinsics::_getBoolean:
case vmIntrinsics::_getByte:
--- a/hotspot/src/share/vm/c1/c1_LIR.cpp Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/share/vm/c1/c1_LIR.cpp Fri Aug 26 12:17:50 2016 -0700
@@ -666,7 +666,9 @@
// LIR_Op3
case lir_idiv:
- case lir_irem: {
+ case lir_irem:
+ case lir_fmad:
+ case lir_fmaf: {
assert(op->as_Op3() != NULL, "must be");
LIR_Op3* op3= (LIR_Op3*)op;
@@ -1663,6 +1665,8 @@
// LIR_Op3
case lir_idiv: s = "idiv"; break;
case lir_irem: s = "irem"; break;
+ case lir_fmad: s = "fmad"; break;
+ case lir_fmaf: s = "fmaf"; break;
// LIR_OpJavaCall
case lir_static_call: s = "static"; break;
case lir_optvirtual_call: s = "optvirtual"; break;
--- a/hotspot/src/share/vm/c1/c1_LIR.hpp Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/share/vm/c1/c1_LIR.hpp Fri Aug 26 12:17:50 2016 -0700
@@ -956,6 +956,8 @@
, begin_op3
, lir_idiv
, lir_irem
+ , lir_fmad
+ , lir_fmaf
, end_op3
, begin_opJavaCall
, lir_static_call
@@ -2149,6 +2151,8 @@
void abs (LIR_Opr from, LIR_Opr to, LIR_Opr tmp) { append(new LIR_Op2(lir_abs , from, tmp, to)); }
void sqrt(LIR_Opr from, LIR_Opr to, LIR_Opr tmp) { append(new LIR_Op2(lir_sqrt, from, tmp, to)); }
+ void fmad(LIR_Opr from, LIR_Opr from1, LIR_Opr from2, LIR_Opr to) { append(new LIR_Op3(lir_fmad, from, from1, from2, to)); }
+ void fmaf(LIR_Opr from, LIR_Opr from1, LIR_Opr from2, LIR_Opr to) { append(new LIR_Op3(lir_fmaf, from, from1, from2, to)); }
void log10 (LIR_Opr from, LIR_Opr to, LIR_Opr tmp) { append(new LIR_Op2(lir_log10, from, LIR_OprFact::illegalOpr, to, tmp)); }
void tan (LIR_Opr from, LIR_Opr to, LIR_Opr tmp1, LIR_Opr tmp2) { append(new LIR_Op2(lir_tan , from, tmp1, to, tmp2)); }
--- a/hotspot/src/share/vm/c1/c1_LIRGenerator.cpp Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/share/vm/c1/c1_LIRGenerator.cpp Fri Aug 26 12:17:50 2016 -0700
@@ -3147,6 +3147,9 @@
case vmIntrinsics::_dpow : do_MathIntrinsic(x); break;
case vmIntrinsics::_arraycopy: do_ArrayCopy(x); break;
+ case vmIntrinsics::_fmaD: do_FmaIntrinsic(x); break;
+ case vmIntrinsics::_fmaF: do_FmaIntrinsic(x); break;
+
// java.nio.Buffer.checkIndex
case vmIntrinsics::_checkIndex: do_NIOCheckIndex(x); break;
--- a/hotspot/src/share/vm/c1/c1_LIRGenerator.hpp Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/share/vm/c1/c1_LIRGenerator.hpp Fri Aug 26 12:17:50 2016 -0700
@@ -245,6 +245,7 @@
void do_isPrimitive(Intrinsic* x);
void do_getClass(Intrinsic* x);
void do_currentThread(Intrinsic* x);
+ void do_FmaIntrinsic(Intrinsic* x);
void do_MathIntrinsic(Intrinsic* x);
void do_LibmIntrinsic(Intrinsic* x);
void do_ArrayCopy(Intrinsic* x);
--- a/hotspot/src/share/vm/classfile/vmSymbols.cpp Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/share/vm/classfile/vmSymbols.cpp Fri Aug 26 12:17:50 2016 -0700
@@ -355,6 +355,8 @@
case vmIntrinsics::_updateBytesCRC32:
case vmIntrinsics::_updateByteBufferCRC32:
case vmIntrinsics::_vectorizedMismatch:
+ case vmIntrinsics::_fmaD:
+ case vmIntrinsics::_fmaF:
return true;
default:
return false;
@@ -387,6 +389,8 @@
case vmIntrinsics::_updateBytesCRC32:
case vmIntrinsics::_updateByteBufferCRC32:
case vmIntrinsics::_vectorizedMismatch:
+ case vmIntrinsics::_fmaD:
+ case vmIntrinsics::_fmaF:
return false;
default:
return true;
@@ -535,6 +539,10 @@
case vmIntrinsics::_doubleToLongBits:
if (!InlineMathNatives) return true;
break;
+ case vmIntrinsics::_fmaD:
+ case vmIntrinsics::_fmaF:
+ if (!InlineMathNatives || !UseFMA) return true;
+ break;
case vmIntrinsics::_arraycopy:
if (!InlineArrayCopy) return true;
break;
--- a/hotspot/src/share/vm/classfile/vmSymbols.hpp Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/share/vm/classfile/vmSymbols.hpp Fri Aug 26 12:17:50 2016 -0700
@@ -755,8 +755,10 @@
do_class(java_lang_Math, "java/lang/Math") \
do_class(java_lang_StrictMath, "java/lang/StrictMath") \
do_signature(double2_double_signature, "(DD)D") \
+ do_signature(double3_double_signature, "(DDD)D") \
+ do_signature(float3_float_signature, "(FFF)F") \
do_signature(int2_int_signature, "(II)I") \
- do_signature(long2_long_signature, "(JJ)J") \
+ do_signature(long2_long_signature, "(JJ)J") \
\
/* here are the math names, all together: */ \
do_name(abs_name,"abs") do_name(sin_name,"sin") do_name(cos_name,"cos") \
@@ -770,6 +772,7 @@
do_name(multiplyExact_name,"multiplyExact") \
do_name(negateExact_name,"negateExact") \
do_name(subtractExact_name,"subtractExact") \
+ do_name(fma_name, "fma") \
\
do_intrinsic(_dabs, java_lang_Math, abs_name, double_double_signature, F_S) \
do_intrinsic(_dsin, java_lang_Math, sin_name, double_double_signature, F_S) \
@@ -795,6 +798,8 @@
do_intrinsic(_negateExactL, java_lang_Math, negateExact_name, long_long_signature, F_S) \
do_intrinsic(_subtractExactI, java_lang_Math, subtractExact_name, int2_int_signature, F_S) \
do_intrinsic(_subtractExactL, java_lang_Math, subtractExact_name, long2_long_signature, F_S) \
+ do_intrinsic(_fmaD, java_lang_Math, fma_name, double3_double_signature, F_S) \
+ do_intrinsic(_fmaF, java_lang_Math, fma_name, float3_float_signature, F_S) \
\
do_intrinsic(_floatToRawIntBits, java_lang_Float, floatToRawIntBits_name, float_int_signature, F_S) \
do_name( floatToRawIntBits_name, "floatToRawIntBits") \
--- a/hotspot/src/share/vm/interpreter/abstractInterpreter.cpp Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/share/vm/interpreter/abstractInterpreter.cpp Fri Aug 26 12:17:50 2016 -0700
@@ -194,6 +194,13 @@
return java_lang_ref_reference_get;
}
+ if (UseFMA) {
+ switch (m->intrinsic_id()) {
+ case vmIntrinsics::_fmaD: return java_lang_math_fmaD;
+ case vmIntrinsics::_fmaF: return java_lang_math_fmaF;
+ }
+ }
+
// Accessor method?
if (m->is_getter()) {
// TODO: We should have used ::is_accessor above, but fast accessors in Zero expect only getters.
@@ -281,6 +288,8 @@
case java_lang_math_sqrt : tty->print("java_lang_math_sqrt" ); break;
case java_lang_math_log : tty->print("java_lang_math_log" ); break;
case java_lang_math_log10 : tty->print("java_lang_math_log10" ); break;
+ case java_lang_math_fmaD : tty->print("java_lang_math_fmaD" ); break;
+ case java_lang_math_fmaF : tty->print("java_lang_math_fmaF" ); break;
case java_util_zip_CRC32_update : tty->print("java_util_zip_CRC32_update"); break;
case java_util_zip_CRC32_updateBytes : tty->print("java_util_zip_CRC32_updateBytes"); break;
case java_util_zip_CRC32_updateByteBuffer : tty->print("java_util_zip_CRC32_updateByteBuffer"); break;
--- a/hotspot/src/share/vm/interpreter/abstractInterpreter.hpp Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/share/vm/interpreter/abstractInterpreter.hpp Fri Aug 26 12:17:50 2016 -0700
@@ -76,6 +76,8 @@
java_lang_math_log10, // implementation of java.lang.Math.log10 (x)
java_lang_math_pow, // implementation of java.lang.Math.pow (x,y)
java_lang_math_exp, // implementation of java.lang.Math.exp (x)
+ java_lang_math_fmaF, // implementation of java.lang.Math.fma (x, y, z)
+ java_lang_math_fmaD, // implementation of java.lang.Math.fma (x, y, z)
java_lang_ref_reference_get, // implementation of java.lang.ref.Reference.get()
java_util_zip_CRC32_update, // implementation of java.util.zip.CRC32.update()
java_util_zip_CRC32_updateBytes, // implementation of java.util.zip.CRC32.updateBytes()
--- a/hotspot/src/share/vm/interpreter/templateInterpreterGenerator.cpp Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/share/vm/interpreter/templateInterpreterGenerator.cpp Fri Aug 26 12:17:50 2016 -0700
@@ -239,6 +239,10 @@
method_entry(java_lang_math_log10)
method_entry(java_lang_math_exp )
method_entry(java_lang_math_pow )
+ if (UseFMA) {
+ method_entry(java_lang_math_fmaF)
+ method_entry(java_lang_math_fmaD)
+ }
method_entry(java_lang_ref_reference_get)
AbstractInterpreter::initialize_method_handle_entries();
@@ -445,7 +449,9 @@
case Interpreter::java_lang_math_log10 : // fall thru
case Interpreter::java_lang_math_sqrt : // fall thru
case Interpreter::java_lang_math_pow : // fall thru
- case Interpreter::java_lang_math_exp : entry_point = generate_math_entry(kind); break;
+ case Interpreter::java_lang_math_exp : // fall thru
+ case Interpreter::java_lang_math_fmaD : // fall thru
+ case Interpreter::java_lang_math_fmaF : entry_point = generate_math_entry(kind); break;
case Interpreter::java_lang_ref_reference_get
: entry_point = generate_Reference_get_entry(); break;
case Interpreter::java_util_zip_CRC32_update
--- a/hotspot/src/share/vm/jvmci/vmStructs_jvmci.cpp Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/share/vm/jvmci/vmStructs_jvmci.cpp Fri Aug 26 12:17:50 2016 -0700
@@ -660,7 +660,8 @@
#define VM_LONG_CONSTANTS_CPU(declare_constant, declare_preprocessor_constant, declare_c1_constant, declare_c2_constant, declare_c2_preprocessor_constant) \
declare_preprocessor_constant("VM_Version::CPU_AVX512BW", CPU_AVX512BW) \
declare_preprocessor_constant("VM_Version::CPU_AVX512VL", CPU_AVX512VL) \
- declare_preprocessor_constant("VM_Version::CPU_SHA", CPU_SHA)
+ declare_preprocessor_constant("VM_Version::CPU_SHA", CPU_SHA) \
+ declare_preprocessor_constant("VM_Version::CPU_FMA", CPU_FMA)
#endif
--- a/hotspot/src/share/vm/opto/c2compiler.cpp Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/share/vm/opto/c2compiler.cpp Fri Aug 26 12:17:50 2016 -0700
@@ -416,6 +416,12 @@
case vmIntrinsics::_onSpinWait:
if (!Matcher::match_rule_supported(Op_OnSpinWait)) return false;
break;
+ case vmIntrinsics::_fmaD:
+ if (!UseFMA || !Matcher::match_rule_supported(Op_FmaD)) return false;
+ break;
+ case vmIntrinsics::_fmaF:
+ if (!UseFMA || !Matcher::match_rule_supported(Op_FmaF)) return false;
+ break;
case vmIntrinsics::_hashCode:
case vmIntrinsics::_identityHashCode:
case vmIntrinsics::_getClass:
--- a/hotspot/src/share/vm/opto/classes.hpp Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/share/vm/opto/classes.hpp Fri Aug 26 12:17:50 2016 -0700
@@ -151,6 +151,8 @@
macro(EncodePKlass)
macro(FastLock)
macro(FastUnlock)
+macro(FmaD)
+macro(FmaF)
macro(Goto)
macro(Halt)
macro(HasNegatives)
--- a/hotspot/src/share/vm/opto/library_call.cpp Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/share/vm/opto/library_call.cpp Fri Aug 26 12:17:50 2016 -0700
@@ -317,6 +317,7 @@
bool inline_montgomeryMultiply();
bool inline_montgomerySquare();
bool inline_vectorizedMismatch();
+ bool inline_fma(vmIntrinsics::ID id);
bool inline_profileBoolean();
bool inline_isCompileConstant();
@@ -825,6 +826,10 @@
case vmIntrinsics::_hasNegatives:
return inline_hasNegatives();
+ case vmIntrinsics::_fmaD:
+ case vmIntrinsics::_fmaF:
+ return inline_fma(intrinsic_id());
+
default:
// If you get here, it may be that someone has added a new intrinsic
// to the list in vmSymbols.hpp without implementing it here.
@@ -6657,6 +6662,35 @@
return instof_false; // even if it is NULL
}
+//-------------inline_fma-----------------------------------
+bool LibraryCallKit::inline_fma(vmIntrinsics::ID id) {
+ // Intrinsic expansion of Math.fma: emit one FmaD/FmaF ideal node in place of the call.
+ // The callee is static, so argument(0) is the first operand rather than a receiver.
+ Node* fused = NULL;
+
+ if (id == vmIntrinsics::_fmaD) {
+ assert(callee()->signature()->size() == 6, "fma has 3 parameters of size 2 each.");
+ // A double takes two argument slots, hence indices 0, 2 and 4.
+ Node* x = round_double_node(argument(0));
+ Node* y = round_double_node(argument(2));
+ Node* z = round_double_node(argument(4));
+ fused = _gvn.transform(new FmaDNode(control(), x, y, z));
+ } else if (id == vmIntrinsics::_fmaF) {
+ assert(callee()->signature()->size() == 3, "fma has 3 parameters of size 1 each.");
+ // A float takes a single slot; the operands are used as-is.
+ Node* x = argument(0);
+ Node* y = argument(1);
+ Node* z = argument(2);
+ fused = _gvn.transform(new FmaFNode(control(), x, y, z));
+ } else {
+ // Only the two fma intrinsics are dispatched here.
+ fatal_unexpected_iid(id);
+ }
+ // Node inputs are ordered (control, x, y, z) for x * y + z.
+ set_result(fused);
+ return true;
+}
+
bool LibraryCallKit::inline_profileBoolean() {
Node* counts = argument(1);
const TypeAryPtr* ary = NULL;
--- a/hotspot/src/share/vm/opto/matcher.cpp Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/share/vm/opto/matcher.cpp Fri Aug 26 12:17:50 2016 -0700
@@ -2117,6 +2117,8 @@
case Op_StrInflatedCopy:
case Op_StrCompressedCopy:
case Op_EncodeISOArray:
+ case Op_FmaD:
+ case Op_FmaF:
set_shared(n); // Force result into register (it will be anyways)
break;
case Op_ConP: { // Convert pointers above the centerline to NUL
@@ -2305,6 +2307,15 @@
n->del_req(4);
break;
}
+ case Op_FmaD:
+ case Op_FmaF: {
+ // Restructure into a binary tree for Matching.
+ Node* pair = new BinaryNode(n->in(1), n->in(2));
+ n->set_req(2, pair);
+ n->set_req(1, n->in(3));
+ n->del_req(3);
+ break;
+ }
default:
break;
}
--- a/hotspot/src/share/vm/opto/mulnode.cpp Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/share/vm/opto/mulnode.cpp Fri Aug 26 12:17:50 2016 -0700
@@ -1343,3 +1343,47 @@
return TypeLong::LONG; // Give up
}
+
+//=============================================================================
+//------------------------------Value------------------------------------------
+const Type* FmaDNode::Value(PhaseGVN* phase) const { // type of in(1) * in(2) + in(3); folds to a constant when all three inputs are constants
+ const Type *t1 = phase->type(in(1));
+ if (t1 == Type::TOP) return Type::TOP;
+ if (t1->base() != Type::DoubleCon) return Type::DOUBLE; // not a constant: nothing to fold
+ const Type *t2 = phase->type(in(2));
+ if (t2 == Type::TOP) return Type::TOP;
+ if (t2->base() != Type::DoubleCon) return Type::DOUBLE;
+ const Type *t3 = phase->type(in(3));
+ if (t3 == Type::TOP) return Type::TOP;
+ if (t3->base() != Type::DoubleCon) return Type::DOUBLE;
+#ifndef __STDC_IEC_559__
+ return Type::DOUBLE; // build host does not advertise IEC 60559 arithmetic: leave the node unfolded
+#else
+ double d1 = t1->getd();
+ double d2 = t2->getd();
+ double d3 = t3->getd();
+ return TypeD::make(fma(d1, d2, d3)); // fold at compile time with the host's fma()
+#endif
+}
+
+//=============================================================================
+//------------------------------Value------------------------------------------
+const Type* FmaFNode::Value(PhaseGVN* phase) const { // type of in(1) * in(2) + in(3) for floats; folds to a constant when all three inputs are constants
+ const Type *t1 = phase->type(in(1));
+ if (t1 == Type::TOP) return Type::TOP;
+ if (t1->base() != Type::FloatCon) return Type::FLOAT; // not a constant: nothing to fold
+ const Type *t2 = phase->type(in(2));
+ if (t2 == Type::TOP) return Type::TOP;
+ if (t2->base() != Type::FloatCon) return Type::FLOAT;
+ const Type *t3 = phase->type(in(3));
+ if (t3 == Type::TOP) return Type::TOP;
+ if (t3->base() != Type::FloatCon) return Type::FLOAT;
+#ifndef __STDC_IEC_559__
+ return Type::FLOAT; // build host does not advertise IEC 60559 arithmetic: leave the node unfolded
+#else
+ float f1 = t1->getf();
+ float f2 = t2->getf();
+ float f3 = t3->getf();
+ return TypeF::make(fmaf(f1, f2, f3)); // fmaf, not fma: the C double fma() would round to double and then again to float
+#endif
+}
--- a/hotspot/src/share/vm/opto/mulnode.hpp Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/share/vm/opto/mulnode.hpp Fri Aug 26 12:17:50 2016 -0700
@@ -263,4 +263,26 @@
virtual uint ideal_reg() const { return Op_RegL; }
};
+//------------------------------FmaDNode--------------------------------------
+// Fused multiply-add on doubles: in1 * in2 + in3; c is the control input (in(0)).
+class FmaDNode : public Node {
+public:
+ FmaDNode(Node *c, Node *in1, Node *in2, Node *in3) : Node(c, in1, in2, in3) {}
+ virtual int Opcode() const;
+ const Type *bottom_type() const { return Type::DOUBLE; } // always produces a double
+ virtual uint ideal_reg() const { return Op_RegD; } // result register class: double
+ virtual const Type* Value(PhaseGVN* phase) const; // constant folding; defined in mulnode.cpp
+};
+
+//------------------------------FmaFNode--------------------------------------
+// Fused multiply-add on floats: in1 * in2 + in3; c is the control input (in(0)).
+class FmaFNode : public Node {
+public:
+ FmaFNode(Node *c, Node *in1, Node *in2, Node *in3) : Node(c, in1, in2, in3) {}
+ virtual int Opcode() const;
+ const Type *bottom_type() const { return Type::FLOAT; } // always produces a float
+ virtual uint ideal_reg() const { return Op_RegF; } // result register class: float
+ virtual const Type* Value(PhaseGVN* phase) const; // constant folding; defined in mulnode.cpp
+};
+
#endif // SHARE_VM_OPTO_MULNODE_HPP
--- a/hotspot/src/share/vm/runtime/globals.hpp Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/share/vm/runtime/globals.hpp Fri Aug 26 12:17:50 2016 -0700
@@ -659,6 +659,9 @@
product(bool, UseAES, false, \
"Control whether AES instructions can be used on x86/x64") \
\
+ product(bool, UseFMA, false, \
+ "Control whether FMA instructions can be used") \
+ \
product(bool, UseSHA, false, \
"Control whether SHA instructions can be used " \
"on SPARC, on ARM and on x86") \
--- a/hotspot/src/share/vm/runtime/vmStructs.cpp Thu Sep 01 16:47:53 2016 +0200
+++ b/hotspot/src/share/vm/runtime/vmStructs.cpp Fri Aug 26 12:17:50 2016 -0700
@@ -2105,6 +2105,8 @@
declare_c2_type(OverflowAddLNode, OverflowLNode) \
declare_c2_type(OverflowSubLNode, OverflowLNode) \
declare_c2_type(OverflowMulLNode, OverflowLNode) \
+ declare_c2_type(FmaDNode, Node) \
+ declare_c2_type(FmaFNode, Node) \
\
/*********************/ \
/* Adapter Blob Entries */ \