--- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp Wed Jun 07 08:56:35 2017 -0700
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp Wed Jun 07 13:09:46 2017 -0700
@@ -5092,6 +5092,42 @@
emit_operand(dst, src);
}
+void Assembler::vfmadd231pd(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len) { // packed-double fused multiply-add, register source: dst = src1 * src2 + dst
+ assert(VM_Version::supports_fma(), ""); // callers must only reach this when FMA is available
+ InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); // vex_w is true for this double-precision form
+ int encode = vex_prefix_and_encode(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); // 66 prefix, 0F 38 opcode map
+ emit_int8((unsigned char)0xB8); // opcode byte
+ emit_int8((unsigned char)(0xC0 | encode)); // presumably the register-direct ModRM byte - confirm against the other reg-reg emitters
+}
+
+void Assembler::vfmadd231ps(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len) { // packed-float fused multiply-add, register source: dst = src1 * src2 + dst
+ assert(VM_Version::supports_fma(), ""); // callers must only reach this when FMA is available
+ InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); // vex_w is false for this single-precision form
+ int encode = vex_prefix_and_encode(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); // 66 prefix, 0F 38 opcode map
+ emit_int8((unsigned char)0xB8); // opcode byte (same as the pd form; only vex_w differs)
+ emit_int8((unsigned char)(0xC0 | encode)); // presumably the register-direct ModRM byte - confirm against the other reg-reg emitters
+}
+
+void Assembler::vfmadd231pd(XMMRegister dst, XMMRegister src1, Address src2, int vector_len) { // packed-double fused multiply-add, memory source: dst = src1 * [src2] + dst
+ assert(VM_Version::supports_fma(), ""); // callers must only reach this when FMA is available
+ InstructionMark im(this);
+ InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); // vex_w is true for this double-precision form
+ attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit); // 64-bit elements for the memory operand
+ vex_prefix(src2, src1->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); // note: address first, then src1, then dst - differs from the register form's argument order
+ emit_int8((unsigned char)0xB8); // opcode byte
+ emit_operand(dst, src2); // encodes dst together with the memory operand
+}
+
+void Assembler::vfmadd231ps(XMMRegister dst, XMMRegister src1, Address src2, int vector_len) { // packed-float fused multiply-add, memory source: dst = src1 * [src2] + dst
+ assert(VM_Version::supports_fma(), ""); // callers must only reach this when FMA is available
+ InstructionMark im(this);
+ InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); // vex_w is false for this single-precision form
+ attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_32bit); // 32-bit elements for the memory operand
+ vex_prefix(src2, src1->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); // note: address first, then src1, then dst - differs from the register form's argument order
+ emit_int8((unsigned char)0xB8); // opcode byte (same as the pd form; only vex_w differs)
+ emit_operand(dst, src2); // encodes dst together with the memory operand
+}
+
void Assembler::divpd(XMMRegister dst, XMMRegister src) {
NOT_LP64(assert(VM_Version::supports_sse2(), ""));
InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
--- a/hotspot/src/cpu/x86/vm/assembler_x86.hpp Wed Jun 07 08:56:35 2017 -0700
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.hpp Wed Jun 07 13:09:46 2017 -0700
@@ -1906,6 +1906,11 @@
void vmulpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
void vmulps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
+ void vfmadd231pd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); // packed double FMA, register source; nds/src are named src1/src2 in the definition
+ void vfmadd231ps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); // packed float FMA, register source
+ void vfmadd231pd(XMMRegister dst, XMMRegister nds, Address src, int vector_len); // packed double FMA, memory source
+ void vfmadd231ps(XMMRegister dst, XMMRegister nds, Address src, int vector_len); // packed float FMA, memory source
+
// Divide Packed Floating-Point Values
void divpd(XMMRegister dst, XMMRegister src);
void divps(XMMRegister dst, XMMRegister src);
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp Wed Jun 07 08:56:35 2017 -0700
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp Wed Jun 07 13:09:46 2017 -0700
@@ -3165,8 +3165,37 @@
}
}
-
-
+// Packed-double FMA with register operands: c = a * b + c, then dst = c (the copy is skipped when dst is already c).
+void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
+ Assembler::vfmadd231pd(c, a, b, vector_len); // c is both the addend and the destination of the instruction
+ if (dst != c) {
+ vmovdqu(dst, c); // NOTE(review): vector_len is not passed to this move - confirm it copies the whole vector for every vector_len
+ }
+}
+
+// Packed-float FMA with register operands: c = a * b + c, then dst = c (the copy is skipped when dst is already c).
+void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
+ Assembler::vfmadd231ps(c, a, b, vector_len); // c is both the addend and the destination of the instruction
+ if (dst != c) {
+ vmovdqu(dst, c); // NOTE(review): vector_len is not passed to this move - confirm it copies the whole vector for every vector_len
+ }
+}
+
+// Packed-double FMA with b in memory: c = a * [b] + c, then dst = c (the copy is skipped when dst is already c).
+void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
+ Assembler::vfmadd231pd(c, a, b, vector_len); // c is both the addend and the destination of the instruction
+ if (dst != c) {
+ vmovdqu(dst, c); // NOTE(review): vector_len is not passed to this move - confirm it copies the whole vector for every vector_len
+ }
+}
+
+// Packed-float FMA with b in memory: c = a * [b] + c, then dst = c (the copy is skipped when dst is already c).
+void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
+ Assembler::vfmadd231ps(c, a, b, vector_len); // c is both the addend and the destination of the instruction
+ if (dst != c) {
+ vmovdqu(dst, c); // NOTE(review): vector_len is not passed to this move - confirm it copies the whole vector for every vector_len
+ }
+}
void MacroAssembler::incrementl(AddressLiteral dst) {
if (reachable(dst)) {
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp Wed Jun 07 08:56:35 2017 -0700
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp Wed Jun 07 13:09:46 2017 -0700
@@ -456,6 +456,11 @@
void fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c);
void fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c);
+ void vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len); // packed double: dst = c = a * b + c, b in a register
+ void vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len); // packed float: dst = c = a * b + c, b in a register
+ void vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len); // packed double: dst = c = a * b + c, b in memory
+ void vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len); // packed float: dst = c = a * b + c, b in memory
+
// same as fcmp2int, but using SSE2
void cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less);
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.cpp Wed Jun 07 08:56:35 2017 -0700
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.cpp Wed Jun 07 13:09:46 2017 -0700
@@ -812,7 +812,7 @@
FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
}
- if (supports_fma() && UseSSE >= 2) {
+ if (supports_fma()) {
if (FLAG_IS_DEFAULT(UseFMA)) {
UseFMA = true;
}
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.hpp Wed Jun 07 08:56:35 2017 -0700
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.hpp Wed Jun 07 13:09:46 2017 -0700
@@ -732,7 +732,7 @@
static bool supports_avx256only() { return (supports_avx2() && !supports_evex()); }
static bool supports_avxonly() { return ((supports_avx2() || supports_avx()) && !supports_evex()); }
static bool supports_sha() { return (_features & CPU_SHA) != 0; }
- static bool supports_fma() { return (_features & CPU_FMA) != 0; }
+ static bool supports_fma() { return (_features & CPU_FMA) != 0 && supports_avx(); } // FMA is reported only when the CPU_FMA bit is set and AVX is also supported
static bool supports_vzeroupper() { return (_features & CPU_VZEROUPPER) != 0; }
// Intel features
--- a/hotspot/src/cpu/x86/vm/x86.ad Wed Jun 07 08:56:35 2017 -0700
+++ b/hotspot/src/cpu/x86/vm/x86.ad Wed Jun 07 13:09:46 2017 -0700
@@ -10520,3 +10520,161 @@
ins_pipe( pipe_slow );
%}
+// --------------------------------- FMA --------------------------------------
+
+// Fused multiply-add on 2 packed doubles: c = a * b + c, with a, b and c all in vecX registers
+instruct vfma2D_reg(vecX a, vecX b, vecX c) %{
+ predicate(UseFMA && n->as_Vector()->length() == 2);
+ match(Set c (FmaVD c (Binary a b)));
+ format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed2D" %}
+ ins_cost(150);
+ ins_encode %{
+ int vector_len = 0;
+ __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len); // dst and c are the same register here
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+// Fused multiply-add on 2 packed doubles: c = a * b + c, with a and c in vecX registers and b loaded from memory
+instruct vfma2D_mem(vecX a, memory b, vecX c) %{
+ predicate(UseFMA && n->as_Vector()->length() == 2);
+ match(Set c (FmaVD c (Binary a (LoadVector b))));
+ format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed2D" %}
+ ins_cost(150);
+ ins_encode %{
+ int vector_len = 0;
+ __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len); // dst and c are the same register here
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+
+// Fused multiply-add on 4 packed doubles: c = a * b + c, with a, b and c all in vecY registers
+instruct vfma4D_reg(vecY a, vecY b, vecY c) %{
+ predicate(UseFMA && n->as_Vector()->length() == 4);
+ match(Set c (FmaVD c (Binary a b)));
+ format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed4D" %}
+ ins_cost(150);
+ ins_encode %{
+ int vector_len = 1;
+ __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len); // dst and c are the same register here
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+// Fused multiply-add on 4 packed doubles: c = a * b + c, with a and c in vecY registers and b loaded from memory
+instruct vfma4D_mem(vecY a, memory b, vecY c) %{
+ predicate(UseFMA && n->as_Vector()->length() == 4);
+ match(Set c (FmaVD c (Binary a (LoadVector b))));
+ format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed4D" %}
+ ins_cost(150);
+ ins_encode %{
+ int vector_len = 1;
+ __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len); // dst and c are the same register here
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+// Fused multiply-add on 8 packed doubles: c = a * b + c, with a, b and c all in vecZ registers
+instruct vfma8D_reg(vecZ a, vecZ b, vecZ c) %{
+ predicate(UseFMA && n->as_Vector()->length() == 8);
+ match(Set c (FmaVD c (Binary a b)));
+ format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed8D" %}
+ ins_cost(150);
+ ins_encode %{
+ int vector_len = 2;
+ __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len); // dst and c are the same register here
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+// Fused multiply-add on 8 packed doubles: c = a * b + c, with a and c in vecZ registers and b loaded from memory
+instruct vfma8D_mem(vecZ a, memory b, vecZ c) %{
+ predicate(UseFMA && n->as_Vector()->length() == 8);
+ match(Set c (FmaVD c (Binary a (LoadVector b))));
+ format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed8D" %}
+ ins_cost(150);
+ ins_encode %{
+ int vector_len = 2;
+ __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len); // dst and c are the same register here
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+// Fused multiply-add on 4 packed floats: c = a * b + c, with a, b and c all in vecX registers
+instruct vfma4F_reg(vecX a, vecX b, vecX c) %{
+ predicate(UseFMA && n->as_Vector()->length() == 4);
+ match(Set c (FmaVF c (Binary a b)));
+ format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed4F" %}
+ ins_cost(150);
+ ins_encode %{
+ int vector_len = 0;
+ __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len); // dst and c are the same register here
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+// Fused multiply-add on 4 packed floats: c = a * b + c, with a and c in vecX registers and b loaded from memory
+instruct vfma4F_mem(vecX a, memory b, vecX c) %{
+ predicate(UseFMA && n->as_Vector()->length() == 4);
+ match(Set c (FmaVF c (Binary a (LoadVector b))));
+ format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed4F" %}
+ ins_cost(150);
+ ins_encode %{
+ int vector_len = 0;
+ __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len); // dst and c are the same register here
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+// Fused multiply-add on 8 packed floats: c = a * b + c, with a, b and c all in vecY registers
+instruct vfma8F_reg(vecY a, vecY b, vecY c) %{
+ predicate(UseFMA && n->as_Vector()->length() == 8);
+ match(Set c (FmaVF c (Binary a b)));
+ format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed8F" %}
+ ins_cost(150);
+ ins_encode %{
+ int vector_len = 1;
+ __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len); // dst and c are the same register here
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+// Fused multiply-add on 8 packed floats: c = a * b + c, with a and c in vecY registers and b loaded from memory
+instruct vfma8F_mem(vecY a, memory b, vecY c) %{
+ predicate(UseFMA && n->as_Vector()->length() == 8);
+ match(Set c (FmaVF c (Binary a (LoadVector b))));
+ format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed8F" %}
+ ins_cost(150);
+ ins_encode %{
+ int vector_len = 1;
+ __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len); // dst and c are the same register here
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+// Fused multiply-add on 16 packed floats: c = a * b + c, with a, b and c all in vecZ registers
+instruct vfma16F_reg(vecZ a, vecZ b, vecZ c) %{
+ predicate(UseFMA && n->as_Vector()->length() == 16);
+ match(Set c (FmaVF c (Binary a b)));
+ format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed16F" %}
+ ins_cost(150);
+ ins_encode %{
+ int vector_len = 2;
+ __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len); // dst and c are the same register here
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+// Fused multiply-add on 16 packed floats: c = a * b + c, with a and c in vecZ registers and b loaded from memory
+instruct vfma16F_mem(vecZ a, memory b, vecZ c) %{
+ predicate(UseFMA && n->as_Vector()->length() == 16);
+ match(Set c (FmaVF c (Binary a (LoadVector b))));
+ format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed16F" %}
+ ins_cost(150);
+ ins_encode %{
+ int vector_len = 2;
+ __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len); // dst and c are the same register here
+ %}
+ ins_pipe( pipe_slow );
+%}
--- a/hotspot/src/share/vm/adlc/formssel.cpp Wed Jun 07 08:56:35 2017 -0700
+++ b/hotspot/src/share/vm/adlc/formssel.cpp Wed Jun 07 13:09:46 2017 -0700
@@ -4179,6 +4179,7 @@
"URShiftVB","URShiftVS","URShiftVI","URShiftVL",
"ReplicateB","ReplicateS","ReplicateI","ReplicateL","ReplicateF","ReplicateD",
"LoadVector","StoreVector",
+ "FmaVD", "FmaVF",
// Next are not supported currently.
"PackB","PackS","PackI","PackL","PackF","PackD","Pack2L","Pack2D",
"ExtractB","ExtractUB","ExtractC","ExtractS","ExtractI","ExtractL","ExtractF","ExtractD"
--- a/hotspot/src/share/vm/opto/classes.hpp Wed Jun 07 08:56:35 2017 -0700
+++ b/hotspot/src/share/vm/opto/classes.hpp Wed Jun 07 13:09:46 2017 -0700
@@ -310,6 +310,8 @@
macro(MulReductionVF)
macro(MulVD)
macro(MulReductionVD)
+macro(FmaVD)
+macro(FmaVF)
macro(DivVF)
macro(DivVD)
macro(AbsVF)
--- a/hotspot/src/share/vm/opto/matcher.cpp Wed Jun 07 08:56:35 2017 -0700
+++ b/hotspot/src/share/vm/opto/matcher.cpp Wed Jun 07 13:09:46 2017 -0700
@@ -977,7 +977,6 @@
// Use one stack to keep both: child's node/state and parent's node/index
MStack mstack(max_stack * 2 * 2); // usually: C->live_nodes() * 2 * 2
mstack.push(n, Visit, NULL, -1); // set NULL as parent to indicate root
-
while (mstack.is_nonempty()) {
C->check_node_count(NodeLimitFudgeFactor, "too many nodes matching instructions");
if (C->failing()) return NULL;
@@ -2122,6 +2121,8 @@
case Op_EncodeISOArray:
case Op_FmaD:
case Op_FmaF:
+ case Op_FmaVD:
+ case Op_FmaVF:
set_shared(n); // Force result into register (it will be anyways)
break;
case Op_ConP: { // Convert pointers above the centerline to NUL
@@ -2311,7 +2312,9 @@
break;
}
case Op_FmaD:
- case Op_FmaF: {
+ case Op_FmaF:
+ case Op_FmaVD:
+ case Op_FmaVF: {
// Restructure into a binary tree for Matching.
Node* pair = new BinaryNode(n->in(1), n->in(2));
n->set_req(2, pair);
--- a/hotspot/src/share/vm/opto/superword.cpp Wed Jun 07 08:56:35 2017 -0700
+++ b/hotspot/src/share/vm/opto/superword.cpp Wed Jun 07 13:09:46 2017 -0700
@@ -2324,6 +2324,13 @@
const TypeVect* vt = TypeVect::make(bt, vlen);
vn = new CMoveVDNode(cc, src1, src2, vt);
NOT_PRODUCT(if(is_trace_cmov()) {tty->print("SWPointer::output: created new CMove node %d: ", vn->_idx); vn->dump();})
+ } else if (opc == Op_FmaD || opc == Op_FmaF) {
+ // Promote operands to vector
+ Node* in1 = vector_opd(p, 1);
+ Node* in2 = vector_opd(p, 2);
+ Node* in3 = vector_opd(p, 3);
+ vn = VectorNode::make(opc, in1, in2, in3, vlen, velt_basic_type(n));
+ vlen_in_bytes = vn->as_Vector()->length_in_bytes();
} else {
if (do_reserve_copy()) {
NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SWPointer::output: ShouldNotReachHere, exiting SuperWord");})
--- a/hotspot/src/share/vm/opto/vectornode.cpp Wed Jun 07 08:56:35 2017 -0700
+++ b/hotspot/src/share/vm/opto/vectornode.cpp Wed Jun 07 13:09:46 2017 -0700
@@ -86,6 +86,12 @@
case Op_MulD:
assert(bt == T_DOUBLE, "must be");
return Op_MulVD;
+ case Op_FmaD:
+ assert(bt == T_DOUBLE, "must be");
+ return Op_FmaVD;
+ case Op_FmaF:
+ assert(bt == T_FLOAT, "must be");
+ return Op_FmaVF;
case Op_CMoveD:
assert(bt == T_DOUBLE, "must be");
return Op_CMoveVD;
@@ -259,6 +265,11 @@
*start = 2;
*end = n->req();
break;
+ case Op_FmaD:
+ case Op_FmaF:
+ *start = 1;
+ *end = 4; // 3 vector operands
+ break;
default:
*start = 1;
*end = n->req(); // default is all operands
@@ -328,6 +339,19 @@
}
+VectorNode* VectorNode::make(int opc, Node* n1, Node* n2, Node* n3, uint vlen, BasicType bt) { // factory for three-input vector nodes; opc is the scalar opcode
+ const TypeVect* vt = TypeVect::make(bt, vlen); // vector type of vlen elements of bt
+ int vopc = VectorNode::opcode(opc, bt); // translate the scalar opcode into its vector opcode
+ // This overload must only be called for opcodes that have a vector form (vopc > 0).
+ guarantee(vopc > 0, "Vector for '%s' is not implemented", NodeClassNames[opc]);
+ switch (vopc) {
+ case Op_FmaVD: return new FmaVDNode(n1, n2, n3, vt);
+ case Op_FmaVF: return new FmaVFNode(n1, n2, n3, vt);
+ }
+ fatal("Missed vector creation for '%s'", NodeClassNames[vopc]); // reached when vopc is valid but not one of the cases above
+ return NULL;
+}
+
// Scalar promotion
VectorNode* VectorNode::scalar2vector(Node* s, uint vlen, const Type* opd_t) {
BasicType bt = opd_t->array_element_basic_type();
--- a/hotspot/src/share/vm/opto/vectornode.hpp Wed Jun 07 08:56:35 2017 -0700
+++ b/hotspot/src/share/vm/opto/vectornode.hpp Wed Jun 07 13:09:46 2017 -0700
@@ -62,6 +62,7 @@
static VectorNode* scalar2vector(Node* s, uint vlen, const Type* opd_t);
static VectorNode* shift_count(Node* shift, Node* cnt, uint vlen, BasicType bt);
static VectorNode* make(int opc, Node* n1, Node* n2, uint vlen, BasicType bt);
+ static VectorNode* make(int opc, Node* n1, Node* n2, Node* n3, uint vlen, BasicType bt); // three-input overload (creates FmaVD / FmaVF nodes)
static int opcode(int opc, BasicType bt);
static bool implemented(int opc, uint vlen, BasicType bt);
@@ -260,6 +261,22 @@
virtual int Opcode() const;
};
+//------------------------------FmaVDNode--------------------------------------
+// Vector fused multiply-add double
+class FmaVDNode : public VectorNode {
+public:
+ FmaVDNode(Node* in1, Node* in2, Node* in3, const TypeVect* vt) : VectorNode(in1, in2, in3, vt) {} // three inputs, unlike the two-input arithmetic vector nodes
+ virtual int Opcode() const;
+};
+
+//------------------------------FmaVFNode--------------------------------------
+// Vector fused multiply-add float
+class FmaVFNode : public VectorNode {
+public:
+ FmaVFNode(Node* in1, Node* in2, Node* in3, const TypeVect* vt) : VectorNode(in1, in2, in3, vt) {} // three inputs, unlike the two-input arithmetic vector nodes
+ virtual int Opcode() const;
+};
+
//------------------------------CMoveVDNode--------------------------------------
// Vector multiply double
class CMoveVDNode : public VectorNode {
--- a/hotspot/src/share/vm/runtime/vmStructs.cpp Wed Jun 07 08:56:35 2017 -0700
+++ b/hotspot/src/share/vm/runtime/vmStructs.cpp Wed Jun 07 13:09:46 2017 -0700
@@ -2057,6 +2057,8 @@
declare_c2_type(MulVFNode, VectorNode) \
declare_c2_type(MulReductionVFNode, ReductionNode) \
declare_c2_type(MulVDNode, VectorNode) \
+ declare_c2_type(FmaVDNode, VectorNode) \
+ declare_c2_type(FmaVFNode, VectorNode) \
declare_c2_type(CMoveVDNode, VectorNode) \
declare_c2_type(MulReductionVDNode, ReductionNode) \
declare_c2_type(DivVFNode, VectorNode) \