# HG changeset patch # User vdeshpande # Date 1544654914 28800 # Node ID 4bb6e0871bf72b21fbd8e961c03cfc884b0eb2ad # Parent 9e28eff3d40fba960506b703b8b41a0ffb8d73e6 8214751: X86: Support for VNNI Instructions Reviewed-by: kvn Contributed-by: razvan.a.lupusoru@intel.com, vivek.r.deshpande@intel.com diff -r 9e28eff3d40f -r 4bb6e0871bf7 src/hotspot/cpu/x86/assembler_x86.cpp --- a/src/hotspot/cpu/x86/assembler_x86.cpp Wed Dec 12 15:35:20 2018 -0500 +++ b/src/hotspot/cpu/x86/assembler_x86.cpp Wed Dec 12 14:48:34 2018 -0800 @@ -3966,6 +3966,34 @@ emit_int8((unsigned char)(0xC0 | encode)); } +void Assembler::pmaddwd(XMMRegister dst, XMMRegister src) { + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int8((unsigned char)0xF5); + emit_int8((unsigned char)(0xC0 | encode)); +} + +void Assembler::vpmaddwd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + assert(vector_len == AVX_128bit ? VM_Version::supports_avx() : + (vector_len == AVX_256bit ? VM_Version::supports_avx2() : + (vector_len == AVX_512bit ? 
VM_Version::supports_evex() : 0)), ""); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = simd_prefix_and_encode(dst, nds, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int8((unsigned char)0xF5); + emit_int8((unsigned char)(0xC0 | encode)); +} + +void Assembler::evpdpwssd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + assert(VM_Version::supports_evex(), ""); + assert(VM_Version::supports_vnni(), "must support vnni"); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int8(0x52); + emit_int8((unsigned char)(0xC0 | encode)); +} + // generic void Assembler::pop(Register dst) { int encode = prefix_and_encode(dst->encoding()); diff -r 9e28eff3d40f -r 4bb6e0871bf7 src/hotspot/cpu/x86/assembler_x86.hpp --- a/src/hotspot/cpu/x86/assembler_x86.hpp Wed Dec 12 15:35:20 2018 -0500 +++ b/src/hotspot/cpu/x86/assembler_x86.hpp Wed Dec 12 14:48:34 2018 -0800 @@ -1668,6 +1668,12 @@ void evpmovdb(Address dst, XMMRegister src, int vector_len); + // Multiply add + void pmaddwd(XMMRegister dst, XMMRegister src); + void vpmaddwd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + // Multiply add accumulate + void evpdpwssd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + #ifndef _LP64 // no 32bit push/pop on amd64 void popl(Address dst); #endif diff -r 9e28eff3d40f -r 4bb6e0871bf7 src/hotspot/cpu/x86/vm_version_x86.cpp --- a/src/hotspot/cpu/x86/vm_version_x86.cpp Wed Dec 12 15:35:20 2018 -0500 +++ b/src/hotspot/cpu/x86/vm_version_x86.cpp Wed Dec 12 14:48:34 2018 -0800 @@ -1289,7 +1289,7 @@ if (FLAG_IS_DEFAULT(UseXMMForArrayCopy)) { UseXMMForArrayCopy = true; 
// use SSE2 movq on new Intel cpus } - if (supports_sse4_2() && supports_ht()) { // Newest Intel cpus + if ((supports_sse4_2() && supports_ht()) || supports_avx()) { // Newest Intel cpus if (FLAG_IS_DEFAULT(UseUnalignedLoadStores)) { UseUnalignedLoadStores = true; // use movdqu on newest Intel cpus } diff -r 9e28eff3d40f -r 4bb6e0871bf7 src/hotspot/cpu/x86/vm_version_x86.hpp --- a/src/hotspot/cpu/x86/vm_version_x86.hpp Wed Dec 12 15:35:20 2018 -0500 +++ b/src/hotspot/cpu/x86/vm_version_x86.hpp Wed Dec 12 14:48:34 2018 -0800 @@ -336,6 +336,7 @@ #define CPU_AVX512_VPOPCNTDQ ((uint64_t)UCONST64(0x2000000000)) // Vector popcount #define CPU_VPCLMULQDQ ((uint64_t)UCONST64(0x4000000000)) //Vector carryless multiplication #define CPU_VAES ((uint64_t)UCONST64(0x8000000000)) // Vector AES instructions +#define CPU_VNNI ((uint64_t)UCONST64(0x10000000000)) // Vector Neural Network Instructions (single bit; must not overlap the flags above) enum Extended_Family { // AMD @@ -548,6 +549,8 @@ result |= CPU_VPCLMULQDQ; if (_cpuid_info.sef_cpuid7_ecx.bits.vaes != 0) result |= CPU_VAES; + if (_cpuid_info.sef_cpuid7_ecx.bits.avx512_vnni != 0) + result |= CPU_VNNI; } } if(_cpuid_info.sef_cpuid7_ebx.bits.bmi1 != 0) @@ -828,6 +831,7 @@ static bool supports_vpopcntdq() { return (_features & CPU_AVX512_VPOPCNTDQ) != 0; } static bool supports_vpclmulqdq() { return (_features & CPU_VPCLMULQDQ) != 0; } static bool supports_vaes() { return (_features & CPU_VAES) != 0; } + static bool supports_vnni() { return (_features & CPU_VNNI) != 0; } // Intel features static bool is_intel_family_core() { return is_intel() && diff -r 9e28eff3d40f -r 4bb6e0871bf7 src/hotspot/cpu/x86/x86.ad --- a/src/hotspot/cpu/x86/x86.ad Wed Dec 12 15:35:20 2018 -0500 +++ b/src/hotspot/cpu/x86/x86.ad Wed Dec 12 14:48:34 2018 -0800 @@ -1446,6 +1446,10 @@ if (VM_Version::supports_on_spin_wait() == false) ret_value = false; break; + case Op_MulAddVS2VI: + if (UseSSE < 2) + ret_value = false; + break; } return ret_value; // Per default match rules are supported. 
@@ -9855,6 +9859,118 @@ ins_pipe( pipe_slow ); %} +// --------------------------------- Vector Multiply Add -------------------------------------- + +instruct smuladd4S2I_reg(vecD dst, vecD src1) %{ + predicate(UseSSE >= 2 && UseAVX == 0 && n->as_Vector()->length() == 2); + match(Set dst (MulAddVS2VI dst src1)); + format %{ "pmaddwd $dst,$dst,$src1\t! muladd packed4Sto2I" %} + ins_encode %{ + __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmuladd4S2I_reg(vecD dst, vecD src1, vecD src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + match(Set dst (MulAddVS2VI src1 src2)); + format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packed4Sto2I" %} + ins_encode %{ + int vector_len = 0; + __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct smuladd8S4I_reg(vecX dst, vecX src1) %{ + predicate(UseSSE >= 2 && UseAVX == 0 && n->as_Vector()->length() == 4); + match(Set dst (MulAddVS2VI dst src1)); + format %{ "pmaddwd $dst,$dst,$src1\t! muladd packed8Sto4I" %} + ins_encode %{ + __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmuladd8S4I_reg(vecX dst, vecX src1, vecX src2) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + match(Set dst (MulAddVS2VI src1 src2)); + format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packed8Sto4I" %} + ins_encode %{ + int vector_len = 0; + __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmuladd16S8I_reg(vecY dst, vecY src1, vecY src2) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 8); + match(Set dst (MulAddVS2VI src1 src2)); + format %{ "vpmaddwd $dst,$src1,$src2\t! 
muladd packed16Sto8I" %} + ins_encode %{ + int vector_len = 1; + __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmuladd32S16I_reg(vecZ dst, vecZ src1, vecZ src2) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 16); + match(Set dst (MulAddVS2VI src1 src2)); + format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packed32Sto16I" %} + ins_encode %{ + int vector_len = 2; + __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +// --------------------------------- Vector Multiply Add Add ---------------------------------- + +instruct vmuladdadd4S2I_reg(vecD dst, vecD src1, vecD src2) %{ + predicate(VM_Version::supports_vnni() && UseAVX > 2 && n->as_Vector()->length() == 2); + match(Set dst (AddVI (MulAddVS2VI src1 src2) dst)); + format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packed4Sto2I" %} + ins_encode %{ + int vector_len = 0; + __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmuladdadd8S4I_reg(vecX dst, vecX src1, vecX src2) %{ + predicate(VM_Version::supports_vnni() && UseAVX > 2 && n->as_Vector()->length() == 4); + match(Set dst (AddVI (MulAddVS2VI src1 src2) dst)); + format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packed8Sto4I" %} + ins_encode %{ + int vector_len = 0; + __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmuladdadd16S8I_reg(vecY dst, vecY src1, vecY src2) %{ + predicate(VM_Version::supports_vnni() && UseAVX > 2 && n->as_Vector()->length() == 8); + match(Set dst (AddVI (MulAddVS2VI src1 src2) dst)); + format %{ "evpdpwssd $dst,$src1,$src2\t! 
muladdadd packed16Sto8I" %} + ins_encode %{ + int vector_len = 1; + __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmuladdadd32S16I_reg(vecZ dst, vecZ src1, vecZ src2) %{ + predicate(VM_Version::supports_vnni() && UseAVX > 2 && n->as_Vector()->length() == 16); + match(Set dst (AddVI (MulAddVS2VI src1 src2) dst)); + format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packed32Sto16I" %} + ins_encode %{ + int vector_len = 2; + __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + // --------------------------------- PopCount -------------------------------------- instruct vpopcount2I(vecD dst, vecD src) %{ diff -r 9e28eff3d40f -r 4bb6e0871bf7 src/hotspot/cpu/x86/x86_32.ad --- a/src/hotspot/cpu/x86/x86_32.ad Wed Dec 12 15:35:20 2018 -0500 +++ b/src/hotspot/cpu/x86/x86_32.ad Wed Dec 12 14:48:34 2018 -0800 @@ -7755,6 +7755,16 @@ ins_pipe( ialu_reg_mem_alu0 ); %} +instruct mulAddS2I_rReg(rRegI dst, rRegI src1, rRegI src2, rRegI src3, eFlagsReg cr) +%{ + match(Set dst (MulAddS2I (Binary dst src1) (Binary src2 src3))); + effect(KILL cr, KILL src2); + + expand %{ mulI_rReg(dst, src1, cr); + mulI_rReg(src2, src3, cr); + addI_rReg(dst, src2, cr); %} +%} + // Multiply Register Int to Long instruct mulI2L(eADXRegL dst, eAXRegI src, nadxRegI src1, eFlagsReg flags) %{ // Basic Idea: long = (long)int * (long)int diff -r 9e28eff3d40f -r 4bb6e0871bf7 src/hotspot/cpu/x86/x86_64.ad --- a/src/hotspot/cpu/x86/x86_64.ad Wed Dec 12 15:35:20 2018 -0500 +++ b/src/hotspot/cpu/x86/x86_64.ad Wed Dec 12 14:48:34 2018 -0800 @@ -8175,6 +8175,16 @@ ins_pipe(ialu_reg_mem_alu0); %} +instruct mulAddS2I_rReg(rRegI dst, rRegI src1, rRegI src2, rRegI src3, rFlagsReg cr) +%{ + match(Set dst (MulAddS2I (Binary dst src1) (Binary src2 src3))); + effect(KILL cr, KILL src2); + + expand %{ mulI_rReg(dst, src1, cr); + mulI_rReg(src2, src3, cr); + addI_rReg(dst, 
src2, cr); %} +%} + instruct mulL_rReg(rRegL dst, rRegL src, rFlagsReg cr) %{ match(Set dst (MulL dst src)); diff -r 9e28eff3d40f -r 4bb6e0871bf7 src/hotspot/share/adlc/formssel.cpp --- a/src/hotspot/share/adlc/formssel.cpp Wed Dec 12 15:35:20 2018 -0500 +++ b/src/hotspot/share/adlc/formssel.cpp Wed Dec 12 14:48:34 2018 -0800 @@ -4181,6 +4181,7 @@ "AddReductionVF", "AddReductionVD", "MulReductionVI", "MulReductionVL", "MulReductionVF", "MulReductionVD", + "MulAddVS2VI", "LShiftCntV","RShiftCntV", "LShiftVB","LShiftVS","LShiftVI","LShiftVL", "RShiftVB","RShiftVS","RShiftVI","RShiftVL", diff -r 9e28eff3d40f -r 4bb6e0871bf7 src/hotspot/share/opto/classes.hpp --- a/src/hotspot/share/opto/classes.hpp Wed Dec 12 15:35:20 2018 -0500 +++ b/src/hotspot/share/opto/classes.hpp Wed Dec 12 14:48:34 2018 -0800 @@ -201,6 +201,7 @@ macro(LoopLimit) macro(Mach) macro(MachProj) +macro(MulAddS2I) macro(MaxI) macro(MemBarAcquire) macro(LoadFence) @@ -341,6 +342,7 @@ macro(MulReductionVF) macro(MulVD) macro(MulReductionVD) +macro(MulAddVS2VI) macro(FmaVD) macro(FmaVF) macro(DivVF) diff -r 9e28eff3d40f -r 4bb6e0871bf7 src/hotspot/share/opto/loopnode.hpp --- a/src/hotspot/share/opto/loopnode.hpp Wed Dec 12 15:35:20 2018 -0500 +++ b/src/hotspot/share/opto/loopnode.hpp Wed Dec 12 14:48:34 2018 -0800 @@ -1249,6 +1249,9 @@ // important (common) to do address expressions. 
Node *remix_address_expressions( Node *n ); + // Convert add to muladd to generate MulAddS2I under certain criteria + Node * convert_add_to_muladd(Node * n); + // Attempt to use a conditional move instead of a phi/branch Node *conditional_move( Node *n ); diff -r 9e28eff3d40f -r 4bb6e0871bf7 src/hotspot/share/opto/loopopts.cpp --- a/src/hotspot/share/opto/loopopts.cpp Wed Dec 12 15:35:20 2018 -0500 +++ b/src/hotspot/share/opto/loopopts.cpp Wed Dec 12 14:48:34 2018 -0800 @@ -493,6 +493,54 @@ return NULL; } +// Optimize ((in1[2*i] * in2[2*i]) + (in1[2*i+1] * in2[2*i+1])) +Node *PhaseIdealLoop::convert_add_to_muladd(Node* n) { + assert(n->Opcode() == Op_AddI, "sanity"); + Node * nn = NULL; + Node * in1 = n->in(1); + Node * in2 = n->in(2); + if (in1->Opcode() == Op_MulI && in2->Opcode() == Op_MulI) { + IdealLoopTree* loop_n = get_loop(get_ctrl(n)); + if (loop_n->_head->is_Loop() && loop_n->_head->as_Loop()->is_valid_counted_loop() && + Matcher::match_rule_supported(Op_MulAddS2I) && + Matcher::match_rule_supported(Op_MulAddVS2VI)) { + Node* mul_in1 = in1->in(1); + Node* mul_in2 = in1->in(2); + Node* mul_in3 = in2->in(1); + Node* mul_in4 = in2->in(2); + if (mul_in1->Opcode() == Op_LoadS && + mul_in2->Opcode() == Op_LoadS && + mul_in3->Opcode() == Op_LoadS && + mul_in4->Opcode() == Op_LoadS) { + IdealLoopTree* loop1 = get_loop(get_ctrl(mul_in1)); + IdealLoopTree* loop2 = get_loop(get_ctrl(mul_in2)); + IdealLoopTree* loop3 = get_loop(get_ctrl(mul_in3)); + IdealLoopTree* loop4 = get_loop(get_ctrl(mul_in4)); + IdealLoopTree* loop5 = get_loop(get_ctrl(in1)); + IdealLoopTree* loop6 = get_loop(get_ctrl(in2)); + // All nodes should be in the same counted loop. 
+ if (loop_n == loop1 && loop_n == loop2 && loop_n == loop3 && + loop_n == loop4 && loop_n == loop5 && loop_n == loop6) { + Node* adr1 = mul_in1->in(MemNode::Address); + Node* adr2 = mul_in2->in(MemNode::Address); + Node* adr3 = mul_in3->in(MemNode::Address); + Node* adr4 = mul_in4->in(MemNode::Address); + if (adr1->is_AddP() && adr2->is_AddP() && adr3->is_AddP() && adr4->is_AddP()) { + if ((adr1->in(AddPNode::Base) == adr3->in(AddPNode::Base)) && + (adr2->in(AddPNode::Base) == adr4->in(AddPNode::Base))) { + nn = new MulAddS2INode(mul_in1, mul_in2, mul_in3, mul_in4); + register_new_node(nn, get_ctrl(n)); + _igvn.replace_node(n, nn); + return nn; + } + } + } + } + } + } + return nn; +} + //------------------------------conditional_move------------------------------- // Attempt to replace a Phi with a conditional move. We have some pretty // strict profitability requirements. All Phis at the merge point must @@ -927,6 +975,11 @@ Node *m = remix_address_expressions( n ); if( m ) return m; + if (n_op == Op_AddI) { + Node *nn = convert_add_to_muladd( n ); + if ( nn ) return nn; + } + if (n->is_ConstraintCast()) { Node* dom_cast = n->as_ConstraintCast()->dominating_cast(&_igvn, this); // ConstraintCastNode::dominating_cast() uses node control input to determine domination. 
diff -r 9e28eff3d40f -r 4bb6e0871bf7 src/hotspot/share/opto/matcher.cpp --- a/src/hotspot/share/opto/matcher.cpp Wed Dec 12 15:35:20 2018 -0500 +++ b/src/hotspot/share/opto/matcher.cpp Wed Dec 12 14:48:34 2018 -0800 @@ -2352,6 +2352,15 @@ n->del_req(3); break; } + case Op_MulAddS2I: { + Node* pair1 = new BinaryNode(n->in(1), n->in(2)); + Node* pair2 = new BinaryNode(n->in(3), n->in(4)); + n->set_req(1, pair1); + n->set_req(2, pair2); + n->del_req(4); + n->del_req(3); + break; + } default: break; } diff -r 9e28eff3d40f -r 4bb6e0871bf7 src/hotspot/share/opto/mulnode.hpp --- a/src/hotspot/share/opto/mulnode.hpp Wed Dec 12 15:35:20 2018 -0500 +++ b/src/hotspot/share/opto/mulnode.hpp Wed Dec 12 14:48:34 2018 -0800 @@ -285,4 +285,15 @@ virtual const Type* Value(PhaseGVN* phase) const; }; +//------------------------------MulAddS2INode---------------------------------- +// Multiply shorts into integers and add them. +// Semantics: I_OUT = S1 * S2 + S3 * S4 +class MulAddS2INode : public Node { +public: + MulAddS2INode(Node* in1, Node *in2, Node *in3, Node* in4) : Node(0, in1, in2, in3, in4) {} + virtual int Opcode() const; + const Type *bottom_type() const { return TypeInt::INT; } + virtual uint ideal_reg() const { return Op_RegI; } +}; + #endif // SHARE_VM_OPTO_MULNODE_HPP diff -r 9e28eff3d40f -r 4bb6e0871bf7 src/hotspot/share/opto/superword.cpp --- a/src/hotspot/share/opto/superword.cpp Wed Dec 12 15:35:20 2018 -0500 +++ b/src/hotspot/share/opto/superword.cpp Wed Dec 12 14:48:34 2018 -0800 @@ -645,6 +645,10 @@ // with a different alignment were created before. for (uint i = 0; i < align_to_refs.size(); i++) { MemNode* mr = align_to_refs.at(i)->as_Mem(); + if (mr == mem_ref) { + // Skip when we are looking at same memory operation. 
+ continue; + } if (same_velt_type(mr, mem_ref) && memory_alignment(mr, iv_adjustment) != 0) create_pack = false; @@ -846,6 +850,27 @@ return NULL; } +//------------------span_works_for_memory_size----------------------------- +static bool span_works_for_memory_size(MemNode* mem, int span, int mem_size, int offset) { + bool span_matches_memory = false; + if ((mem_size == type2aelembytes(T_BYTE) || mem_size == type2aelembytes(T_SHORT)) + && ABS(span) == type2aelembytes(T_INT)) { + // There is a mismatch on span size compared to memory. + for (DUIterator_Fast jmax, j = mem->fast_outs(jmax); j < jmax; j++) { + Node* use = mem->fast_out(j); + if (!VectorNode::is_type_transition_to_int(use)) { + return false; + } + } + // If all uses transition to integer, it means that we can successfully align even on mismatch. + return true; + } + else { + span_matches_memory = ABS(span) == mem_size; + } + return span_matches_memory && (ABS(offset) % mem_size) == 0; +} + //------------------------------ref_is_alignable--------------------------- // Can the preloop align the reference to position zero in the vector? bool SuperWord::ref_is_alignable(SWPointer& p) { @@ -862,7 +887,7 @@ int offset = p.offset_in_bytes(); // Stride one accesses are alignable if offset is aligned to memory operation size. // Offset can be unaligned when UseUnalignedAccesses is used. - if (ABS(span) == mem_size && (ABS(offset) % mem_size) == 0) { + if (span_works_for_memory_size(p.mem(), span, mem_size, offset)) { return true; } // If the initial offset from start of the object is computable, @@ -915,6 +940,28 @@ } return false; } +//---------------------------get_vw_bytes_special------------------------ +int SuperWord::get_vw_bytes_special(MemNode* s) { + // Get the vector width in bytes. + int vw = vector_width_in_bytes(s); + + // Check for special case where there is an MulAddS2I usage where short vectors are going to need combined. 
+ BasicType btype = velt_basic_type(s); + if (type2aelembytes(btype) == 2) { + bool should_combine_adjacent = true; + for (DUIterator_Fast imax, i = s->fast_outs(imax); i < imax; i++) { + Node* user = s->fast_out(i); + if (!VectorNode::is_muladds2i(user)) { + should_combine_adjacent = false; + } + } + if (should_combine_adjacent) { + vw = MIN2(Matcher::max_vector_size(btype)*type2aelembytes(btype), vw * 2); + } + } + + return vw; +} //---------------------------get_iv_adjustment--------------------------- // Calculate loop's iv adjustment for this memory ops. @@ -923,7 +970,7 @@ int offset = align_to_ref_p.offset_in_bytes(); int scale = align_to_ref_p.scale_in_bytes(); int elt_size = align_to_ref_p.memory_size(); - int vw = vector_width_in_bytes(mem_ref); + int vw = get_vw_bytes_special(mem_ref); assert(vw > 1, "sanity"); int iv_adjustment; if (scale != 0) { @@ -2303,6 +2350,12 @@ const TypePtr* atyp = n->adr_type(); vn = StoreVectorNode::make(opc, ctl, mem, adr, atyp, val, vlen); vlen_in_bytes = vn->as_StoreVector()->memory_size(); + } else if (VectorNode::is_muladds2i(n)) { + assert(n->req() == 5u, "MulAddS2I should have 4 operands."); + Node* in1 = vector_opd(p, 1); + Node* in2 = vector_opd(p, 2); + vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n)); + vlen_in_bytes = vn->as_Vector()->length_in_bytes(); } else if (n->req() == 3 && !is_cmov_pack(p)) { // Promote operands to vector Node* in1 = NULL; @@ -2615,6 +2668,16 @@ } assert(opd_bt == in->bottom_type()->basic_type(), "all same type"); pk->add_opd(in); + if (VectorNode::is_muladds2i(pi)) { + Node* in2 = pi->in(opd_idx + 2); + assert(my_pack(in2) == NULL, "Should already have been unpacked"); + if (my_pack(in2) != NULL) { + NOT_PRODUCT(if (is_trace_loop_reverse() || TraceLoopOpts) { tty->print_cr("Should already have been unpacked"); }) + return NULL; + } + assert(opd_bt == in2->bottom_type()->basic_type(), "all same type"); + pk->add_opd(in2); + } } _igvn.register_new_node_with_optimizer(pk); 
_phase->set_ctrl(pk, _phase->get_ctrl(opd)); @@ -2692,6 +2755,21 @@ } return true; } + if (VectorNode::is_muladds2i(use)) { + // MulAddS2I takes shorts and produces ints - hence the special checks + // on alignment and size. + if (u_pk->size() * 2 != d_pk->size()) { + return false; + } + for (uint i = 0; i < MIN2(d_pk->size(), u_pk->size()); i++) { + Node* ui = u_pk->at(i); + Node* di = d_pk->at(i); + if (alignment(ui) != alignment(di) * 2) { + return false; + } + } + return true; + } if (u_pk->size() != d_pk->size()) return false; for (uint i = 0; i < u_pk->size(); i++) { @@ -3017,7 +3095,7 @@ NOT_PRODUCT(if(is_trace_alignment()) tty->print("SWPointer::memory_alignment: SWPointer p invalid, return bottom_align");) return bottom_align; } - int vw = vector_width_in_bytes(s); + int vw = get_vw_bytes_special(s); if (vw < 2) { NOT_PRODUCT(if(is_trace_alignment()) tty->print_cr("SWPointer::memory_alignment: vector_width_in_bytes < 2, return bottom_align");) return bottom_align; // No vectors for this type diff -r 9e28eff3d40f -r 4bb6e0871bf7 src/hotspot/share/opto/superword.hpp --- a/src/hotspot/share/opto/superword.hpp Wed Dec 12 15:35:20 2018 -0500 +++ b/src/hotspot/share/opto/superword.hpp Wed Dec 12 14:48:34 2018 -0800 @@ -347,6 +347,7 @@ BasicType bt = velt_basic_type(n); return vector_width(n)*type2aelembytes(bt); } + int get_vw_bytes_special(MemNode* s); MemNode* align_to_ref() { return _align_to_ref; } void set_align_to_ref(MemNode* m) { _align_to_ref = m; } diff -r 9e28eff3d40f -r 4bb6e0871bf7 src/hotspot/share/opto/vectornode.cpp --- a/src/hotspot/share/opto/vectornode.cpp Wed Dec 12 15:35:20 2018 -0500 +++ b/src/hotspot/share/opto/vectornode.cpp Wed Dec 12 14:48:34 2018 -0800 @@ -196,6 +196,8 @@ case Op_StoreF: case Op_StoreD: return Op_StoreVector; + case Op_MulAddS2I: + return Op_MulAddVS2VI; default: return 0; // Unimplemented @@ -214,6 +216,25 @@ return false; } +bool VectorNode::is_type_transition_short_to_int(Node* n) { + switch (n->Opcode()) { + case 
Op_MulAddS2I: + return true; + } + return false; +} + +bool VectorNode::is_type_transition_to_int(Node* n) { + return is_type_transition_short_to_int(n); +} + +bool VectorNode::is_muladds2i(Node* n) { + if (n->Opcode() == Op_MulAddS2I) { + return true; + } + return false; +} + bool VectorNode::is_shift(Node* n) { switch (n->Opcode()) { case Op_LShiftI: @@ -277,6 +298,7 @@ case Op_AndI: case Op_AndL: case Op_OrI: case Op_OrL: case Op_XorI: case Op_XorL: + case Op_MulAddS2I: *start = 1; *end = 3; // 2 vector operands break; @@ -354,6 +376,8 @@ case Op_AndV: return new AndVNode(n1, n2, vt); case Op_OrV: return new OrVNode (n1, n2, vt); case Op_XorV: return new XorVNode(n1, n2, vt); + + case Op_MulAddVS2VI: return new MulAddVS2VINode(n1, n2, vt); default: fatal("Missed vector creation for '%s'", NodeClassNames[vopc]); return NULL; diff -r 9e28eff3d40f -r 4bb6e0871bf7 src/hotspot/share/opto/vectornode.hpp --- a/src/hotspot/share/opto/vectornode.hpp Wed Dec 12 15:35:20 2018 -0500 +++ b/src/hotspot/share/opto/vectornode.hpp Wed Dec 12 14:48:34 2018 -0800 @@ -67,6 +67,9 @@ static int opcode(int opc, BasicType bt); static bool implemented(int opc, uint vlen, BasicType bt); static bool is_shift(Node* n); + static bool is_type_transition_short_to_int(Node* n); + static bool is_type_transition_to_int(Node* n); + static bool is_muladds2i(Node* n); static bool is_invariant_vector(Node* n); // [Start, end) half-open range defining which operands are vectors static void vector_operands(Node* n, uint* start, uint* end); @@ -261,6 +264,14 @@ virtual int Opcode() const; }; +//------------------------------MulAddVS2VINode-------------------------------- +// Vector multiply shorts to int and add adjacent ints. 
+class MulAddVS2VINode : public VectorNode { + public: + MulAddVS2VINode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1, in2, vt) {} + virtual int Opcode() const; +}; + //------------------------------FmaVDNode-------------------------------------- // Vector multiply double class FmaVDNode : public VectorNode { diff -r 9e28eff3d40f -r 4bb6e0871bf7 test/hotspot/jtreg/compiler/loopopts/superword/Vec_MulAddS2I.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/hotspot/jtreg/compiler/loopopts/superword/Vec_MulAddS2I.java Wed Dec 12 14:48:34 2018 -0800 @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ */ + +/** + * @test + * @bug 8214751 + * @summary Add C2 x86 Superword support for VNNI VPDPWSSD Instruction + * @requires os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64" + * + * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250 + * -XX:CompileThresholdScaling=0.1 + * -XX:+SuperWord + * -XX:LoopMaxUnroll=2 + * compiler.loopopts.superword.Vec_MulAddS2I + * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250 + * -XX:CompileThresholdScaling=0.1 + * -XX:-SuperWord + * -XX:LoopMaxUnroll=2 + * compiler.loopopts.superword.Vec_MulAddS2I + * + * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250 + * -XX:CompileThresholdScaling=0.1 + * -XX:+SuperWord + * -XX:LoopMaxUnroll=4 + * compiler.loopopts.superword.Vec_MulAddS2I + * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250 + * -XX:CompileThresholdScaling=0.1 + * -XX:-SuperWord + * -XX:LoopMaxUnroll=4 + * compiler.loopopts.superword.Vec_MulAddS2I + * + * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250 + * -XX:CompileThresholdScaling=0.1 + * -XX:+SuperWord + * -XX:LoopMaxUnroll=8 + * compiler.loopopts.superword.Vec_MulAddS2I + * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250 + * -XX:CompileThresholdScaling=0.1 + * -XX:-SuperWord + * -XX:LoopMaxUnroll=8 + * compiler.loopopts.superword.Vec_MulAddS2I + * + * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250 + * -XX:CompileThresholdScaling=0.1 + * -XX:+SuperWord + * -XX:LoopMaxUnroll=16 + * compiler.loopopts.superword.Vec_MulAddS2I + * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250 + * -XX:CompileThresholdScaling=0.1 + * -XX:-SuperWord + * -XX:LoopMaxUnroll=16 + * compiler.loopopts.superword.Vec_MulAddS2I + */ + +package compiler.loopopts.superword; +import java.util.Random; + +public class Vec_MulAddS2I { + static final int NUM = 1024; + static int[] out 
= new int[NUM]; + static short[] in1 = new short[2*NUM]; + static short[] in2 = new short[2*NUM]; + public static void main(String[] args) throws Exception { + Vec_MulAddS2IInit(in1, in2); + int result = 0; + int valid = 204800000; + for (int j = 0; j < 10000*512; j++) { + result = Vec_MulAddS2IImplement(in1, in2, out); + } + if (result == valid) { + System.out.println("Success"); + } else { + System.out.println("Invalid calculation of element variables in the out array: " + result); + System.out.println("Expected value for each element of out array = " + valid); + throw new Exception("Failed"); + } + } + + public static void Vec_MulAddS2IInit( + short[] in1, + short[] in2) { + for (int i=0; i<2*NUM; i++) { + in1[i] = (short)4; + in2[i] = (short)5; + } + } + + public static int Vec_MulAddS2IImplement( + short[] in1, + short[] in2, + int[] out) { + for (int i = 0; i < NUM; i++) { + out[i] += ((in1[2*i] * in2[2*i]) + (in1[2*i+1] * in2[2*i+1])); + } + Random rand = new Random(); + int n = rand.nextInt(NUM-1); + return out[n]; + } + +}