# HG changeset patch # User sviswanathan # Date 1557261207 25200 # Node ID 1851a532ddfebb914d14a1c82be6c83cf5029fde # Parent 98558b7544c729ffab356531fa0f594ce4e19f4c 8222074: Enhance auto vectorization for x86 Reviewed-by: kvn, vlivanov diff -r 98558b7544c7 -r 1851a532ddfe src/hotspot/cpu/x86/assembler_x86.cpp --- a/src/hotspot/cpu/x86/assembler_x86.cpp Tue May 07 21:53:46 2019 +0200 +++ b/src/hotspot/cpu/x86/assembler_x86.cpp Tue May 07 13:33:27 2019 -0700 @@ -1894,6 +1894,69 @@ emit_int8((unsigned char)(0xC0 | encode)); } +void Assembler::pabsb(XMMRegister dst, XMMRegister src) { + assert(VM_Version::supports_ssse3(), ""); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int8(0x1C); + emit_int8((unsigned char)(0xC0 | encode)); +} + +void Assembler::pabsw(XMMRegister dst, XMMRegister src) { + assert(VM_Version::supports_ssse3(), ""); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int8(0x1D); + emit_int8((unsigned char)(0xC0 | encode)); +} + +void Assembler::pabsd(XMMRegister dst, XMMRegister src) { + assert(VM_Version::supports_ssse3(), ""); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int8(0x1E); + emit_int8((unsigned char)(0xC0 | encode)); +} + +void Assembler::vpabsb(XMMRegister dst, XMMRegister src, int vector_len) { + assert(vector_len == AVX_128bit? VM_Version::supports_avx() : + vector_len == AVX_256bit? VM_Version::supports_avx2() : + vector_len == AVX_512bit? 
VM_Version::supports_avx512bw() : 0, ""); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int8((unsigned char)0x1C); + emit_int8((unsigned char)(0xC0 | encode)); +} + +void Assembler::vpabsw(XMMRegister dst, XMMRegister src, int vector_len) { + assert(vector_len == AVX_128bit? VM_Version::supports_avx() : + vector_len == AVX_256bit? VM_Version::supports_avx2() : + vector_len == AVX_512bit? VM_Version::supports_avx512bw() : 0, ""); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int8((unsigned char)0x1D); + emit_int8((unsigned char)(0xC0 | encode)); +} + +void Assembler::vpabsd(XMMRegister dst, XMMRegister src, int vector_len) { + assert(vector_len == AVX_128bit? VM_Version::supports_avx() : + vector_len == AVX_256bit? VM_Version::supports_avx2() : + vector_len == AVX_512bit? 
VM_Version::supports_evex() : 0, ""); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int8((unsigned char)0x1E); + emit_int8((unsigned char)(0xC0 | encode)); +} + +void Assembler::evpabsq(XMMRegister dst, XMMRegister src, int vector_len) { + assert(UseAVX > 2, ""); + InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int8((unsigned char)0x1F); + emit_int8((unsigned char)(0xC0 | encode)); +} + void Assembler::decl(Address dst) { // Don't use it directly. Use MacroAssembler::decrement() instead. InstructionMark im(this); @@ -3416,10 +3479,19 @@ InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); emit_int8(0x00); - emit_int8(0xC0 | encode); + emit_int8((unsigned char)(0xC0 | encode)); emit_int8(imm8); } +void Assembler::vpermq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + assert(UseAVX > 2, "requires AVX512F"); + InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int8((unsigned char)0x36); + emit_int8((unsigned char)(0xC0 | encode)); +} + void Assembler::vperm2i128(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8) { assert(VM_Version::supports_avx2(), 
""); InstructionAttr attributes(AVX_256bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); @@ -3884,6 +3956,14 @@ emit_int8((unsigned char)(0xC0 | encode)); } +void Assembler::pmovsxbw(XMMRegister dst, XMMRegister src) { + assert(VM_Version::supports_sse4_1(), ""); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int8(0x20); + emit_int8((unsigned char)(0xC0 | encode)); +} + void Assembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) { assert(VM_Version::supports_avx(), ""); InstructionMark im(this); @@ -3905,6 +3985,15 @@ emit_int8((unsigned char) (0xC0 | encode)); } +void Assembler::vpmovsxbw(XMMRegister dst, XMMRegister src, int vector_len) { + assert(vector_len == AVX_128bit? VM_Version::supports_avx() : + vector_len == AVX_256bit? VM_Version::supports_avx2() : + vector_len == AVX_512bit? 
VM_Version::supports_avx512bw() : 0, ""); + InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true); + int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int8(0x20); + emit_int8((unsigned char)(0xC0 | encode)); +} void Assembler::evpmovzxbw(XMMRegister dst, KRegister mask, Address src, int vector_len) { assert(VM_Version::supports_avx512vlbw(), ""); @@ -6277,6 +6366,26 @@ emit_int8((unsigned char)(0xC0 | encode)); } +void Assembler::evpsraq(XMMRegister dst, XMMRegister src, int shift, int vector_len) { + assert(UseAVX > 2, "requires AVX512"); + assert ((VM_Version::supports_avx512vl() || vector_len == 2), "requires AVX512vl"); + InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + int encode = vex_prefix_and_encode(xmm4->encoding(), dst->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int8((unsigned char)0x72); + emit_int8((unsigned char)(0xC0 | encode)); + emit_int8(shift & 0xFF); +} + +void Assembler::evpsraq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) { + assert(UseAVX > 2, "requires AVX512"); + assert ((VM_Version::supports_avx512vl() || vector_len == 2), "requires AVX512vl"); + InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + int encode = vex_prefix_and_encode(dst->encoding(), src->encoding(), shift->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int8((unsigned char)0xE2); + emit_int8((unsigned char)(0xC0 | encode)); +} // logical operations packed integers void Assembler::pand(XMMRegister dst, XMMRegister src) { diff -r 98558b7544c7 -r 1851a532ddfe src/hotspot/cpu/x86/assembler_x86.hpp --- 
a/src/hotspot/cpu/x86/assembler_x86.hpp Tue May 07 21:53:46 2019 +0200 +++ b/src/hotspot/cpu/x86/assembler_x86.hpp Tue May 07 13:33:27 2019 -0700 @@ -1102,6 +1102,15 @@ void cvttpd2dq(XMMRegister dst, XMMRegister src); + //Abs of packed Integer values + void pabsb(XMMRegister dst, XMMRegister src); + void pabsw(XMMRegister dst, XMMRegister src); + void pabsd(XMMRegister dst, XMMRegister src); + void vpabsb(XMMRegister dst, XMMRegister src, int vector_len); + void vpabsw(XMMRegister dst, XMMRegister src, int vector_len); + void vpabsd(XMMRegister dst, XMMRegister src, int vector_len); + void evpabsq(XMMRegister dst, XMMRegister src, int vector_len); + // Divide Scalar Double-Precision Floating-Point Values void divsd(XMMRegister dst, Address src); void divsd(XMMRegister dst, XMMRegister src); @@ -1589,6 +1598,7 @@ // Pemutation of 64bit words void vpermq(XMMRegister dst, XMMRegister src, int imm8, int vector_len); void vpermq(XMMRegister dst, XMMRegister src, int imm8); + void vpermq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); void vperm2i128(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8); void vperm2f128(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8); void evpermi2q(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); @@ -1668,6 +1678,10 @@ void evpmovdb(Address dst, XMMRegister src, int vector_len); + // Sign extend moves + void pmovsxbw(XMMRegister dst, XMMRegister src); + void vpmovsxbw(XMMRegister dst, XMMRegister src, int vector_len); + // Multiply add void pmaddwd(XMMRegister dst, XMMRegister src); void vpmaddwd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); @@ -2094,6 +2108,8 @@ void vpsrad(XMMRegister dst, XMMRegister src, int shift, int vector_len); void vpsraw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len); void vpsrad(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len); + void evpsraq(XMMRegister dst, XMMRegister src, int shift, 
int vector_len); + void evpsraq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len); // And packed integers void pand(XMMRegister dst, XMMRegister src); diff -r 98558b7544c7 -r 1851a532ddfe src/hotspot/cpu/x86/macroAssembler_x86.cpp --- a/src/hotspot/cpu/x86/macroAssembler_x86.cpp Tue May 07 21:53:46 2019 +0200 +++ b/src/hotspot/cpu/x86/macroAssembler_x86.cpp Tue May 07 13:33:27 2019 -0700 @@ -1003,25 +1003,25 @@ } } -void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src) { +void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src, Register scratch_reg) { // Used in sign-masking with aligned address. assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes"); if (reachable(src)) { Assembler::andpd(dst, as_Address(src)); } else { - lea(rscratch1, src); - Assembler::andpd(dst, Address(rscratch1, 0)); - } -} - -void MacroAssembler::andps(XMMRegister dst, AddressLiteral src) { + lea(scratch_reg, src); + Assembler::andpd(dst, Address(scratch_reg, 0)); + } +} + +void MacroAssembler::andps(XMMRegister dst, AddressLiteral src, Register scratch_reg) { // Used in sign-masking with aligned address. 
assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes"); if (reachable(src)) { Assembler::andps(dst, as_Address(src)); } else { - lea(rscratch1, src); - Assembler::andps(dst, Address(rscratch1, 0)); + lea(scratch_reg, src); + Assembler::andps(dst, Address(scratch_reg, 0)); } } @@ -3340,13 +3340,13 @@ Assembler::vmovdqu(dst, src); } -void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src) { +void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, Register scratch_reg) { if (reachable(src)) { vmovdqu(dst, as_Address(src)); } else { - lea(rscratch1, src); - vmovdqu(dst, Address(rscratch1, 0)); + lea(scratch_reg, src); + vmovdqu(dst, Address(scratch_reg, 0)); } } @@ -3698,14 +3698,14 @@ } } -void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src) { +void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src, Register scratch_reg) { // Used in sign-bit flipping with aligned address. assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes"); if (reachable(src)) { Assembler::xorpd(dst, as_Address(src)); } else { - lea(rscratch1, src); - Assembler::xorpd(dst, Address(rscratch1, 0)); + lea(scratch_reg, src); + Assembler::xorpd(dst, Address(scratch_reg, 0)); } } @@ -3726,14 +3726,14 @@ } } -void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src) { +void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src, Register scratch_reg) { // Used in sign-bit flipping with aligned address. 
assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes"); if (reachable(src)) { Assembler::xorps(dst, as_Address(src)); } else { - lea(rscratch1, src); - Assembler::xorps(dst, Address(rscratch1, 0)); + lea(scratch_reg, src); + Assembler::xorps(dst, Address(scratch_reg, 0)); } } @@ -3799,12 +3799,12 @@ Assembler::vpaddw(dst, nds, src, vector_len); } -void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) { +void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) { if (reachable(src)) { Assembler::vpand(dst, nds, as_Address(src), vector_len); } else { - lea(rscratch1, src); - Assembler::vpand(dst, nds, Address(rscratch1, 0), vector_len); + lea(scratch_reg, src); + Assembler::vpand(dst, nds, Address(scratch_reg, 0), vector_len); } } @@ -3873,6 +3873,22 @@ Assembler::vpsraw(dst, nds, shift, vector_len); } +void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) { + assert(UseAVX > 2,""); + if (!VM_Version::supports_avx512vl() && vector_len < 2) { + vector_len = 2; + } + Assembler::evpsraq(dst, nds, shift, vector_len); +} + +void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, int shift, int vector_len) { + assert(UseAVX > 2,""); + if (!VM_Version::supports_avx512vl() && vector_len < 2) { + vector_len = 2; + } + Assembler::evpsraq(dst, nds, shift, vector_len); +} + void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) { assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); Assembler::vpsrlw(dst, nds, shift, vector_len); @@ -3913,21 +3929,21 @@ Assembler::pshuflw(dst, src, mode); } -void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) { +void MacroAssembler::vandpd(XMMRegister dst, 
XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) { if (reachable(src)) { vandpd(dst, nds, as_Address(src), vector_len); } else { - lea(rscratch1, src); - vandpd(dst, nds, Address(rscratch1, 0), vector_len); - } -} - -void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) { + lea(scratch_reg, src); + vandpd(dst, nds, Address(scratch_reg, 0), vector_len); + } +} + +void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) { if (reachable(src)) { vandps(dst, nds, as_Address(src), vector_len); } else { - lea(rscratch1, src); - vandps(dst, nds, Address(rscratch1, 0), vector_len); + lea(scratch_reg, src); + vandps(dst, nds, Address(scratch_reg, 0), vector_len); } } @@ -3995,23 +4011,161 @@ vxorpd(dst, nds, src, Assembler::AVX_128bit); } -void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) { +void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) { if (reachable(src)) { vxorpd(dst, nds, as_Address(src), vector_len); } else { - lea(rscratch1, src); - vxorpd(dst, nds, Address(rscratch1, 0), vector_len); - } -} - -void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) { + lea(scratch_reg, src); + vxorpd(dst, nds, Address(scratch_reg, 0), vector_len); + } +} + +void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) { if (reachable(src)) { vxorps(dst, nds, as_Address(src), vector_len); } else { - lea(rscratch1, src); - vxorps(dst, nds, Address(rscratch1, 0), vector_len); - } -} + lea(scratch_reg, src); + vxorps(dst, nds, Address(scratch_reg, 0), vector_len); + } +} + +void MacroAssembler::vpxor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) { + if (UseAVX > 1 || (vector_len < 1)) { + if 
(reachable(src)) { + Assembler::vpxor(dst, nds, as_Address(src), vector_len); + } else { + lea(scratch_reg, src); + Assembler::vpxor(dst, nds, Address(scratch_reg, 0), vector_len); + } + } + else { + MacroAssembler::vxorpd(dst, nds, src, vector_len, scratch_reg); + } +} + +//------------------------------------------------------------------------------------------- +#ifdef COMPILER2 +// Generic instructions support for use in .ad files C2 code generation + +void MacroAssembler::vabsnegd(int opcode, XMMRegister dst, Register scr) { + if (opcode == Op_AbsVD) { + andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), scr); + } else { + assert((opcode == Op_NegVD),"opcode should be Op_NegD"); + xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scr); + } +} + +void MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) { + if (opcode == Op_AbsVD) { + vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, scr); + } else { + assert((opcode == Op_NegVD),"opcode should be Op_NegD"); + vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, scr); + } +} + +void MacroAssembler::vabsnegf(int opcode, XMMRegister dst, Register scr) { + if (opcode == Op_AbsVF) { + andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), scr); + } else { + assert((opcode == Op_NegVF),"opcode should be Op_NegF"); + xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scr); + } +} + +void MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) { + if (opcode == Op_AbsVF) { + vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, scr); + } else { + assert((opcode == Op_NegVF),"opcode should be Op_NegF"); + vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, scr); + } +} + +void 
MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) { + if (sign) { + pmovsxbw(dst, src); + } else { + pmovzxbw(dst, src); + } +} + +void MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { + if (sign) { + vpmovsxbw(dst, src, vector_len); + } else { + vpmovzxbw(dst, src, vector_len); + } +} + +void MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src) { + if (opcode == Op_RShiftVI) { + psrad(dst, src); + } else if (opcode == Op_LShiftVI) { + pslld(dst, src); + } else { + assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); + psrld(dst, src); + } +} + +void MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + if (opcode == Op_RShiftVI) { + vpsrad(dst, nds, src, vector_len); + } else if (opcode == Op_LShiftVI) { + vpslld(dst, nds, src, vector_len); + } else { + assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); + vpsrld(dst, nds, src, vector_len); + } +} + +void MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src) { + if ((opcode == Op_RShiftVS) || (opcode == Op_RShiftVB)) { + psraw(dst, src); + } else if ((opcode == Op_LShiftVS) || (opcode == Op_LShiftVB)) { + psllw(dst, src); + } else { + assert(((opcode == Op_URShiftVS) || (opcode == Op_URShiftVB)),"opcode should be one of Op_URShiftVS or Op_URShiftVB"); + psrlw(dst, src); + } +} + +void MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + if ((opcode == Op_RShiftVS) || (opcode == Op_RShiftVB)) { + vpsraw(dst, nds, src, vector_len); + } else if ((opcode == Op_LShiftVS) || (opcode == Op_LShiftVB)) { + vpsllw(dst, nds, src, vector_len); + } else { + assert(((opcode == Op_URShiftVS) || (opcode == Op_URShiftVB)),"opcode should be one of Op_URShiftVS or Op_URShiftVB"); + vpsrlw(dst, nds, src, vector_len); + } +} + +void MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src) { + if 
(opcode == Op_RShiftVL) { + psrlq(dst, src); // using srl to implement sra on pre-avx512 systems + } else if (opcode == Op_LShiftVL) { + psllq(dst, src); + } else { + assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); + psrlq(dst, src); + } +} + +void MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + if (opcode == Op_RShiftVL) { + evpsraq(dst, nds, src, vector_len); + } else if (opcode == Op_LShiftVL) { + vpsllq(dst, nds, src, vector_len); + } else { + assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); + vpsrlq(dst, nds, src, vector_len); + } +} +#endif +//------------------------------------------------------------------------------------------- void MacroAssembler::clear_jweak_tag(Register possibly_jweak) { const int32_t inverted_jweak_mask = ~static_cast(JNIHandles::weak_tag_mask); diff -r 98558b7544c7 -r 1851a532ddfe src/hotspot/cpu/x86/macroAssembler_x86.hpp --- a/src/hotspot/cpu/x86/macroAssembler_x86.hpp Tue May 07 21:53:46 2019 +0200 +++ b/src/hotspot/cpu/x86/macroAssembler_x86.hpp Tue May 07 13:33:27 2019 -0700 @@ -877,12 +877,12 @@ // Floating void andpd(XMMRegister dst, Address src) { Assembler::andpd(dst, src); } - void andpd(XMMRegister dst, AddressLiteral src); + void andpd(XMMRegister dst, AddressLiteral src, Register scratch_reg = rscratch1); void andpd(XMMRegister dst, XMMRegister src) { Assembler::andpd(dst, src); } void andps(XMMRegister dst, XMMRegister src) { Assembler::andps(dst, src); } void andps(XMMRegister dst, Address src) { Assembler::andps(dst, src); } - void andps(XMMRegister dst, AddressLiteral src); + void andps(XMMRegister dst, AddressLiteral src, Register scratch_reg = rscratch1); void comiss(XMMRegister dst, XMMRegister src) { Assembler::comiss(dst, src); } void comiss(XMMRegister dst, Address src) { Assembler::comiss(dst, src); } @@ -1066,8 +1066,8 @@ // these are private because users should be doing movflt/movdbl + void movss(XMMRegister dst, 
XMMRegister src) { Assembler::movss(dst, src); } void movss(Address dst, XMMRegister src) { Assembler::movss(dst, src); } - void movss(XMMRegister dst, XMMRegister src) { Assembler::movss(dst, src); } void movss(XMMRegister dst, Address src) { Assembler::movss(dst, src); } void movss(XMMRegister dst, AddressLiteral src); @@ -1105,7 +1105,7 @@ void vmovdqu(Address dst, XMMRegister src); void vmovdqu(XMMRegister dst, Address src); void vmovdqu(XMMRegister dst, XMMRegister src); - void vmovdqu(XMMRegister dst, AddressLiteral src); + void vmovdqu(XMMRegister dst, AddressLiteral src, Register scratch_reg = rscratch1); void evmovdquq(XMMRegister dst, Address src, int vector_len) { Assembler::evmovdquq(dst, src, vector_len); } void evmovdquq(XMMRegister dst, XMMRegister src, int vector_len) { Assembler::evmovdquq(dst, src, vector_len); } void evmovdquq(Address dst, XMMRegister src, int vector_len) { Assembler::evmovdquq(dst, src, vector_len); } @@ -1183,12 +1183,12 @@ // Bitwise Logical XOR of Packed Double-Precision Floating-Point Values void xorpd(XMMRegister dst, XMMRegister src); void xorpd(XMMRegister dst, Address src) { Assembler::xorpd(dst, src); } - void xorpd(XMMRegister dst, AddressLiteral src); + void xorpd(XMMRegister dst, AddressLiteral src, Register scratch_reg = rscratch1); // Bitwise Logical XOR of Packed Single-Precision Floating-Point Values void xorps(XMMRegister dst, XMMRegister src); void xorps(XMMRegister dst, Address src) { Assembler::xorps(dst, src); } - void xorps(XMMRegister dst, AddressLiteral src); + void xorps(XMMRegister dst, AddressLiteral src, Register scratch_reg = rscratch1); // Shuffle Bytes void pshufb(XMMRegister dst, XMMRegister src) { Assembler::pshufb(dst, src); } @@ -1215,7 +1215,7 @@ void vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vpand(dst, nds, src, vector_len); } void vpand(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { Assembler::vpand(dst, nds, src, vector_len); } - 
void vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len); + void vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg = rscratch1); void vpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len); void vpbroadcastw(XMMRegister dst, Address src, int vector_len) { Assembler::vpbroadcastw(dst, src, vector_len); } @@ -1241,6 +1241,9 @@ void vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len); void vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len); + void evpsraq(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len); + void evpsraq(XMMRegister dst, XMMRegister nds, int shift, int vector_len); + void vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len); void vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len); @@ -1260,11 +1263,11 @@ void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vandpd(dst, nds, src, vector_len); } void vandpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { Assembler::vandpd(dst, nds, src, vector_len); } - void vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len); + void vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg = rscratch1); void vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vandps(dst, nds, src, vector_len); } void vandps(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { Assembler::vandps(dst, nds, src, vector_len); } - void vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len); + void vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg = rscratch1); void vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vdivsd(dst, nds, src); } void vdivsd(XMMRegister dst, XMMRegister nds, Address src) { 
Assembler::vdivsd(dst, nds, src); } @@ -1297,11 +1300,11 @@ void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vxorpd(dst, nds, src, vector_len); } void vxorpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { Assembler::vxorpd(dst, nds, src, vector_len); } - void vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len); + void vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg = rscratch1); void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vxorps(dst, nds, src, vector_len); } void vxorps(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { Assembler::vxorps(dst, nds, src, vector_len); } - void vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len); + void vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg = rscratch1); void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { if (UseAVX > 1 || (vector_len < 1)) // vpxor 256 bit is available only in AVX2 @@ -1315,6 +1318,7 @@ else Assembler::vxorpd(dst, nds, src, vector_len); } + void vpxor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg = rscratch1); // Simple version for AVX2 256bit vectors void vpxor(XMMRegister dst, XMMRegister src) { Assembler::vpxor(dst, dst, src, true); } @@ -1601,6 +1605,22 @@ void movl2ptr(Register dst, Address src) { LP64_ONLY(movslq(dst, src)) NOT_LP64(movl(dst, src)); } void movl2ptr(Register dst, Register src) { LP64_ONLY(movslq(dst, src)) NOT_LP64(if (dst != src) movl(dst, src)); } +#ifdef COMPILER2 + // Generic instructions support for use in .ad files C2 code generation + void vabsnegd(int opcode, XMMRegister dst, Register scr); + void vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr); + void vabsnegf(int opcode, XMMRegister dst, Register 
scr); + void vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr); + void vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len); + void vextendbw(bool sign, XMMRegister dst, XMMRegister src); + void vshiftd(int opcode, XMMRegister dst, XMMRegister src); + void vshiftd(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void vshiftw(int opcode, XMMRegister dst, XMMRegister src); + void vshiftw(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void vshiftq(int opcode, XMMRegister dst, XMMRegister src); + void vshiftq(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); +#endif + // C2 compiled method's prolog code. void verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub); diff -r 98558b7544c7 -r 1851a532ddfe src/hotspot/cpu/x86/stubGenerator_x86_32.cpp --- a/src/hotspot/cpu/x86/stubGenerator_x86_32.cpp Tue May 07 21:53:46 2019 +0200 +++ b/src/hotspot/cpu/x86/stubGenerator_x86_32.cpp Tue May 07 13:33:27 2019 -0700 @@ -602,7 +602,59 @@ return start; } - + //--------------------------------------------------------------------------------------------------- + + address generate_vector_mask(const char *stub_name, int32_t mask) { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", stub_name); + address start = __ pc(); + + for (int i = 0; i < 16; i++) { + __ emit_data(mask, relocInfo::none, 0); + } + + return start; + } + + address generate_vector_mask_long_double(const char *stub_name, int32_t maskhi, int32_t masklo) { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", stub_name); + address start = __ pc(); + + for (int i = 0; i < 8; i++) { + __ emit_data(masklo, relocInfo::none, 0); + __ emit_data(maskhi, relocInfo::none, 0); + } + + return start; + } + + //---------------------------------------------------------------------------------------------------- + + 
address generate_vector_byte_perm_mask(const char *stub_name) { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", stub_name); + address start = __ pc(); + + __ emit_data(0x00000001, relocInfo::none, 0); + __ emit_data(0x00000000, relocInfo::none, 0); + __ emit_data(0x00000003, relocInfo::none, 0); + __ emit_data(0x00000000, relocInfo::none, 0); + __ emit_data(0x00000005, relocInfo::none, 0); + __ emit_data(0x00000000, relocInfo::none, 0); + __ emit_data(0x00000007, relocInfo::none, 0); + __ emit_data(0x00000000, relocInfo::none, 0); + __ emit_data(0x00000000, relocInfo::none, 0); + __ emit_data(0x00000000, relocInfo::none, 0); + __ emit_data(0x00000002, relocInfo::none, 0); + __ emit_data(0x00000000, relocInfo::none, 0); + __ emit_data(0x00000004, relocInfo::none, 0); + __ emit_data(0x00000000, relocInfo::none, 0); + __ emit_data(0x00000006, relocInfo::none, 0); + __ emit_data(0x00000000, relocInfo::none, 0); + + return start; + } //---------------------------------------------------------------------------------------------------- // Non-destructive plausibility checks for oops @@ -3823,6 +3875,14 @@ //------------------------------------------------------------------------------------------------------------------------ // entry points that are platform specific + StubRoutines::x86::_vector_float_sign_mask = generate_vector_mask("vector_float_sign_mask", 0x7FFFFFFF); + StubRoutines::x86::_vector_float_sign_flip = generate_vector_mask("vector_float_sign_flip", 0x80000000); + StubRoutines::x86::_vector_double_sign_mask = generate_vector_mask_long_double("vector_double_sign_mask", 0x7FFFFFFF, 0xFFFFFFFF); + StubRoutines::x86::_vector_double_sign_flip = generate_vector_mask_long_double("vector_double_sign_flip", 0x80000000, 0x00000000); + StubRoutines::x86::_vector_short_to_byte_mask = generate_vector_mask("vector_short_to_byte_mask", 0x00ff00ff); + StubRoutines::x86::_vector_byte_perm_mask = 
generate_vector_byte_perm_mask("vector_byte_perm_mask"); + StubRoutines::x86::_vector_long_sign_mask = generate_vector_mask_long_double("vector_long_sign_mask", 0x80000000, 0x00000000); + // support for verify_oop (must happen after universe_init) StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); diff -r 98558b7544c7 -r 1851a532ddfe src/hotspot/cpu/x86/stubGenerator_x86_64.cpp --- a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp Tue May 07 21:53:46 2019 +0200 +++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp Tue May 07 13:33:27 2019 -0700 @@ -979,6 +979,40 @@ return start; } + address generate_vector_mask(const char *stub_name, int64_t mask) { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", stub_name); + address start = __ pc(); + + __ emit_data64(mask, relocInfo::none); + __ emit_data64(mask, relocInfo::none); + __ emit_data64(mask, relocInfo::none); + __ emit_data64(mask, relocInfo::none); + __ emit_data64(mask, relocInfo::none); + __ emit_data64(mask, relocInfo::none); + __ emit_data64(mask, relocInfo::none); + __ emit_data64(mask, relocInfo::none); + + return start; + } + + address generate_vector_byte_perm_mask(const char *stub_name) { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", stub_name); + address start = __ pc(); + + __ emit_data64(0x0000000000000001, relocInfo::none); + __ emit_data64(0x0000000000000003, relocInfo::none); + __ emit_data64(0x0000000000000005, relocInfo::none); + __ emit_data64(0x0000000000000007, relocInfo::none); + __ emit_data64(0x0000000000000000, relocInfo::none); + __ emit_data64(0x0000000000000002, relocInfo::none); + __ emit_data64(0x0000000000000004, relocInfo::none); + __ emit_data64(0x0000000000000006, relocInfo::none); + + return start; + } + // Non-destructive plausibility checks for oops // // Arguments: @@ -5871,6 +5905,13 @@ StubRoutines::x86::_float_sign_flip = generate_fp_mask("float_sign_flip", 0x8000000080000000); 
StubRoutines::x86::_double_sign_mask = generate_fp_mask("double_sign_mask", 0x7FFFFFFFFFFFFFFF); StubRoutines::x86::_double_sign_flip = generate_fp_mask("double_sign_flip", 0x8000000000000000); + StubRoutines::x86::_vector_float_sign_mask = generate_vector_mask("vector_float_sign_mask", 0x7FFFFFFF7FFFFFFF); + StubRoutines::x86::_vector_float_sign_flip = generate_vector_mask("vector_float_sign_flip", 0x8000000080000000); + StubRoutines::x86::_vector_double_sign_mask = generate_vector_mask("vector_double_sign_mask", 0x7FFFFFFFFFFFFFFF); + StubRoutines::x86::_vector_double_sign_flip = generate_vector_mask("vector_double_sign_flip", 0x8000000000000000); + StubRoutines::x86::_vector_short_to_byte_mask = generate_vector_mask("vector_short_to_byte_mask", 0x00ff00ff00ff00ff); + StubRoutines::x86::_vector_byte_perm_mask = generate_vector_byte_perm_mask("vector_byte_perm_mask"); + StubRoutines::x86::_vector_long_sign_mask = generate_vector_mask("vector_long_sign_mask", 0x8000000000000000); // support for verify_oop (must happen after universe_init) StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); diff -r 98558b7544c7 -r 1851a532ddfe src/hotspot/cpu/x86/stubRoutines_x86.cpp --- a/src/hotspot/cpu/x86/stubRoutines_x86.cpp Tue May 07 21:53:46 2019 +0200 +++ b/src/hotspot/cpu/x86/stubRoutines_x86.cpp Tue May 07 13:33:27 2019 -0700 @@ -43,6 +43,13 @@ address StubRoutines::x86::_upper_word_mask_addr = NULL; address StubRoutines::x86::_shuffle_byte_flip_mask_addr = NULL; address StubRoutines::x86::_k256_adr = NULL; +address StubRoutines::x86::_vector_short_to_byte_mask = NULL; +address StubRoutines::x86::_vector_float_sign_mask = NULL; +address StubRoutines::x86::_vector_float_sign_flip = NULL; +address StubRoutines::x86::_vector_double_sign_mask = NULL; +address StubRoutines::x86::_vector_double_sign_flip = NULL; +address StubRoutines::x86::_vector_byte_perm_mask = NULL; +address StubRoutines::x86::_vector_long_sign_mask = NULL; #ifdef _LP64 address 
StubRoutines::x86::_k256_W_adr = NULL; address StubRoutines::x86::_k512_W_addr = NULL; diff -r 98558b7544c7 -r 1851a532ddfe src/hotspot/cpu/x86/stubRoutines_x86.hpp --- a/src/hotspot/cpu/x86/stubRoutines_x86.hpp Tue May 07 21:53:46 2019 +0200 +++ b/src/hotspot/cpu/x86/stubRoutines_x86.hpp Tue May 07 13:33:27 2019 -0700 @@ -102,6 +102,7 @@ static address double_sign_flip() { return _double_sign_flip; } + #else // !LP64 private: @@ -139,6 +140,13 @@ //k256 table for sha256 static juint _k256[]; static address _k256_adr; + static address _vector_short_to_byte_mask; + static address _vector_float_sign_mask; + static address _vector_float_sign_flip; + static address _vector_double_sign_mask; + static address _vector_double_sign_flip; + static address _vector_byte_perm_mask; + static address _vector_long_sign_mask; #ifdef _LP64 static juint _k256_W[]; static address _k256_W_adr; @@ -212,6 +220,33 @@ static address upper_word_mask_addr() { return _upper_word_mask_addr; } static address shuffle_byte_flip_mask_addr() { return _shuffle_byte_flip_mask_addr; } static address k256_addr() { return _k256_adr; } + + static address vector_short_to_byte_mask() { + return _vector_short_to_byte_mask; + } + static address vector_float_sign_mask() { + return _vector_float_sign_mask; + } + + static address vector_float_sign_flip() { + return _vector_float_sign_flip; + } + + static address vector_double_sign_mask() { + return _vector_double_sign_mask; + } + + static address vector_double_sign_flip() { + return _vector_double_sign_flip; + } + + static address vector_byte_perm_mask() { + return _vector_byte_perm_mask; + } + + static address vector_long_sign_mask() { + return _vector_long_sign_mask; + } #ifdef _LP64 static address k256_W_addr() { return _k256_W_adr; } static address k512_W_addr() { return _k512_W_addr; } diff -r 98558b7544c7 -r 1851a532ddfe src/hotspot/cpu/x86/x86.ad --- a/src/hotspot/cpu/x86/x86.ad Tue May 07 21:53:46 2019 +0200 +++ b/src/hotspot/cpu/x86/x86.ad Tue May 07 
13:33:27 2019 -0700 @@ -1372,14 +1372,20 @@ static address double_signmask() { return (address)double_signmask_pool; } static address double_signflip() { return (address)double_signflip_pool; } #endif - - + static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); } + static address vector_byte_perm_mask() { return StubRoutines::x86::vector_byte_perm_mask(); } + static address vector_long_sign_mask() { return StubRoutines::x86::vector_long_sign_mask(); } + +//============================================================================= const bool Matcher::match_rule_supported(int opcode) { if (!has_match_rule(opcode)) return false; bool ret_value = true; switch (opcode) { + case Op_AbsVL: + if (UseAVX < 3) + ret_value = false; case Op_PopCountI: case Op_PopCountL: if (!UsePopCountInstruction) @@ -1402,6 +1408,9 @@ if (UseAVX < 3) // only EVEX : vector connectivity becomes an issue here ret_value = false; break; + case Op_AbsVB: + case Op_AbsVS: + case Op_AbsVI: case Op_AddReductionVI: if (UseSSE < 3) // requires at least SSE3 ret_value = false; @@ -1447,9 +1456,19 @@ ret_value = false; break; case Op_MulAddVS2VI: + case Op_RShiftVL: + case Op_AbsVD: + case Op_NegVD: if (UseSSE < 2) ret_value = false; break; + case Op_MulVB: + case Op_LShiftVB: + case Op_RShiftVB: + case Op_URShiftVB: + if (UseSSE < 4) + ret_value = false; + break; #ifdef _LP64 case Op_MaxD: case Op_MaxF: @@ -1470,24 +1489,42 @@ bool ret_value = match_rule_supported(opcode); if (ret_value) { switch (opcode) { + case Op_AbsVB: case Op_AddVB: case Op_SubVB: if ((vlen == 64) && (VM_Version::supports_avx512bw() == false)) ret_value = false; break; - case Op_URShiftVS: - case Op_RShiftVS: - case Op_LShiftVS: - case Op_MulVS: + case Op_AbsVS: case Op_AddVS: case Op_SubVS: + case Op_MulVS: + case Op_LShiftVS: + case Op_RShiftVS: + case Op_URShiftVS: if ((vlen == 32) && (VM_Version::supports_avx512bw() == false)) ret_value = false; break; + case Op_MulVB: + case 
Op_LShiftVB: + case Op_RShiftVB: + case Op_URShiftVB: + if ((vlen == 32 && UseAVX < 2) || + ((vlen == 64) && (VM_Version::supports_avx512bw() == false))) + ret_value = false; + break; + case Op_NegVF: + if ((vlen == 16) && (VM_Version::supports_avx512dq() == false)) + ret_value = false; + break; case Op_CMoveVF: if (vlen != 8) ret_value = false; break; + case Op_NegVD: + if ((vlen == 8) && (VM_Version::supports_avx512dq() == false)) + ret_value = false; + break; case Op_CMoveVD: if (vlen != 4) ret_value = false; @@ -7302,6 +7339,186 @@ // --------------------------------- MUL -------------------------------------- +// Byte vector mul +instruct mul4B_reg(vecS dst, vecS src1, vecS src2, vecS tmp, rRegI scratch) %{ + predicate(UseSSE > 3 && n->as_Vector()->length() == 4); + match(Set dst (MulVB src1 src2)); + effect(TEMP dst, TEMP tmp, TEMP scratch); + format %{"pmovsxbw $tmp,$src1\n\t" + "pmovsxbw $dst,$src2\n\t" + "pmullw $tmp,$dst\n\t" + "movdqu $dst,[0x00ff00ff0x00ff00ff]\n\t" + "pand $dst,$tmp\n\t" + "packuswb $dst,$dst\t! mul packed4B" %} + ins_encode %{ + __ pmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister); + __ pmovsxbw($dst$$XMMRegister, $src2$$XMMRegister); + __ pmullw($tmp$$XMMRegister, $dst$$XMMRegister); + __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register); + __ pand($dst$$XMMRegister, $tmp$$XMMRegister); + __ packuswb($dst$$XMMRegister, $dst$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct mul8B_reg(vecD dst, vecD src1, vecD src2, vecD tmp, rRegI scratch) %{ + predicate(UseSSE > 3 && n->as_Vector()->length() == 8); + match(Set dst (MulVB src1 src2)); + effect(TEMP dst, TEMP tmp, TEMP scratch); + format %{"pmovsxbw $tmp,$src1\n\t" + "pmovsxbw $dst,$src2\n\t" + "pmullw $tmp,$dst\n\t" + "movdqu $dst,[0x00ff00ff0x00ff00ff]\n\t" + "pand $dst,$tmp\n\t" + "packuswb $dst,$dst\t! 
mul packed8B" %} + ins_encode %{ + __ pmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister); + __ pmovsxbw($dst$$XMMRegister, $src2$$XMMRegister); + __ pmullw($tmp$$XMMRegister, $dst$$XMMRegister); + __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register); + __ pand($dst$$XMMRegister, $tmp$$XMMRegister); + __ packuswb($dst$$XMMRegister, $dst$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct mul16B_reg(vecX dst, vecX src1, vecX src2, vecX tmp1, vecX tmp2, rRegI scratch) %{ + predicate(UseSSE > 3 && n->as_Vector()->length() == 16); + match(Set dst (MulVB src1 src2)); + effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch); + format %{"pmovsxbw $tmp1,$src1\n\t" + "pmovsxbw $tmp2,$src2\n\t" + "pmullw $tmp1,$tmp2\n\t" + "pshufd $tmp2,$src1,0xEE\n\t" + "pshufd $dst,$src2,0xEE\n\t" + "pmovsxbw $tmp2,$tmp2\n\t" + "pmovsxbw $dst,$dst\n\t" + "pmullw $tmp2,$dst\n\t" + "movdqu $dst,[0x00ff00ff0x00ff00ff]\n\t" + "pand $tmp2,$dst\n\t" + "pand $dst,$tmp1\n\t" + "packuswb $dst,$tmp2\t! 
mul packed16B" %} + ins_encode %{ + __ pmovsxbw($tmp1$$XMMRegister, $src1$$XMMRegister); + __ pmovsxbw($tmp2$$XMMRegister, $src2$$XMMRegister); + __ pmullw($tmp1$$XMMRegister, $tmp2$$XMMRegister); + __ pshufd($tmp2$$XMMRegister, $src1$$XMMRegister, 0xEE); + __ pshufd($dst$$XMMRegister, $src2$$XMMRegister, 0xEE); + __ pmovsxbw($tmp2$$XMMRegister, $tmp2$$XMMRegister); + __ pmovsxbw($dst$$XMMRegister, $dst$$XMMRegister); + __ pmullw($tmp2$$XMMRegister, $dst$$XMMRegister); + __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register); + __ pand($tmp2$$XMMRegister, $dst$$XMMRegister); + __ pand($dst$$XMMRegister, $tmp1$$XMMRegister); + __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmul16B_reg_avx(vecX dst, vecX src1, vecX src2, vecX tmp, rRegI scratch) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 16); + match(Set dst (MulVB src1 src2)); + effect(TEMP dst, TEMP tmp, TEMP scratch); + format %{"vpmovsxbw $tmp,$src1\n\t" + "vpmovsxbw $dst,$src2\n\t" + "vpmullw $tmp,$tmp,$dst\n\t" + "vmovdqu $dst,[0x00ff00ff0x00ff00ff]\n\t" + "vpand $dst,$dst,$tmp\n\t" + "vextracti128_high $tmp,$dst\n\t" + "vpackuswb $dst,$dst,$dst\n\t! 
mul packed16B" %} + ins_encode %{ + int vector_len = 1; + __ vpmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister, vector_len); + __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vector_len); + __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, vector_len); + __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register); + __ vpand($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len); + __ vextracti128_high($tmp$$XMMRegister, $dst$$XMMRegister); + __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, 0); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmul32B_reg_avx(vecY dst, vecY src1, vecY src2, vecY tmp1, vecY tmp2, rRegI scratch) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 32); + match(Set dst (MulVB src1 src2)); + effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch); + format %{"vextracti128_high $tmp1,$src1\n\t" + "vextracti128_high $dst,$src2\n\t" + "vpmovsxbw $tmp1,$tmp1\n\t" + "vpmovsxbw $dst,$dst\n\t" + "vpmullw $tmp1,$tmp1,$dst\n\t" + "vpmovsxbw $tmp2,$src1\n\t" + "vpmovsxbw $dst,$src2\n\t" + "vpmullw $tmp2,$tmp2,$dst\n\t" + "vmovdqu $dst, [0x00ff00ff0x00ff00ff]\n\t" + "vpbroadcastd $dst, $dst\n\t" + "vpand $tmp1,$tmp1,$dst\n\t" + "vpand $dst,$dst,$tmp2\n\t" + "vpackuswb $dst,$dst,$tmp1\n\t" + "vpermq $dst, $dst, 0xD8\t! 
mul packed32B" %} + ins_encode %{ + int vector_len = 1; + __ vextracti128_high($tmp1$$XMMRegister, $src1$$XMMRegister); + __ vextracti128_high($dst$$XMMRegister, $src2$$XMMRegister); + __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vector_len); + __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vector_len); + __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vector_len); + __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vector_len); + __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vector_len); + __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len); + __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register); + __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len); + __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vector_len); + __ vpand($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister, vector_len); + __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp1$$XMMRegister, vector_len); + __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmul64B_reg_avx(vecZ dst, vecZ src1, vecZ src2, vecZ tmp1, vecZ tmp2, rRegI scratch) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 64); + match(Set dst (MulVB src1 src2)); + effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch); + format %{"vextracti64x4_high $tmp1,$src1\n\t" + "vextracti64x4_high $dst,$src2\n\t" + "vpmovsxbw $tmp1,$tmp1\n\t" + "vpmovsxbw $dst,$dst\n\t" + "vpmullw $tmp1,$tmp1,$dst\n\t" + "vpmovsxbw $tmp2,$src1\n\t" + "vpmovsxbw $dst,$src2\n\t" + "vpmullw $tmp2,$tmp2,$dst\n\t" + "vmovdqu $dst, [0x00ff00ff0x00ff00ff]\n\t" + "vpbroadcastd $dst, $dst\n\t" + "vpand $tmp1,$tmp1,$dst\n\t" + "vpand $tmp2,$tmp2,$dst\n\t" + "vpackuswb $dst,$tmp1,$tmp2\n\t" + "evmovdquq $tmp2,[0x0604020007050301]\n\t" + "vpermq $dst,$tmp2,$dst,0x01\t! 
mul packed64B" %} + + ins_encode %{ + int vector_len = 2; + __ vextracti64x4_high($tmp1$$XMMRegister, $src1$$XMMRegister); + __ vextracti64x4_high($dst$$XMMRegister, $src2$$XMMRegister); + __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vector_len); + __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vector_len); + __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vector_len); + __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vector_len); + __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vector_len); + __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len); + __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register); + __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len); + __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vector_len); + __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len); + __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vector_len); + __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vector_len, $scratch$$Register); + __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len); + + %} + ins_pipe( pipe_slow ); +%} + // Shorts/Chars vector mul instruct vmul2S(vecS dst, vecS src) %{ predicate(UseAVX == 0 && n->as_Vector()->length() == 2); @@ -8024,20 +8241,6 @@ ins_pipe( pipe_slow ); %} -// ------------------------------ Shift --------------------------------------- - -// Left and right shift count vectors are the same on x86 -// (only lowest bits of xmm reg are used for count). -instruct vshiftcnt(vecS dst, rRegI cnt) %{ - match(Set dst (LShiftCntV cnt)); - match(Set dst (RShiftCntV cnt)); - format %{ "movd $dst,$cnt\t! 
load shift count" %} - ins_encode %{ - __ movdl($dst$$XMMRegister, $cnt$$Register); - %} - ins_pipe( pipe_slow ); -%} - // --------------------------------- Sqrt -------------------------------------- // Floating point vector sqrt @@ -8195,1092 +8398,478 @@ ins_pipe( pipe_slow ); %} -// ------------------------------ LeftShift ----------------------------------- - -// Shorts/Chars vector left shift -instruct vsll2S(vecS dst, vecS shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 2); - match(Set dst (LShiftVS dst shift)); - format %{ "psllw $dst,$shift\t! left shift packed2S" %} - ins_encode %{ - __ psllw($dst$$XMMRegister, $shift$$XMMRegister); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsll2S_imm(vecS dst, immI8 shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 2); - match(Set dst (LShiftVS dst shift)); - format %{ "psllw $dst,$shift\t! left shift packed2S" %} - ins_encode %{ - __ psllw($dst$$XMMRegister, (int)$shift$$constant); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsll2S_reg(vecS dst, vecS src, vecS shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 2); - match(Set dst (LShiftVS src shift)); - format %{ "vpsllw $dst,$src,$shift\t! left shift packed2S" %} - ins_encode %{ - int vector_len = 0; - __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsll2S_reg_imm(vecS dst, vecS src, immI8 shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 2); - match(Set dst (LShiftVS src shift)); - format %{ "vpsllw $dst,$src,$shift\t! left shift packed2S" %} - ins_encode %{ - int vector_len = 0; - __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsll4S(vecD dst, vecS shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 4); - match(Set dst (LShiftVS dst shift)); - format %{ "psllw $dst,$shift\t! 
left shift packed4S" %} - ins_encode %{ - __ psllw($dst$$XMMRegister, $shift$$XMMRegister); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsll4S_imm(vecD dst, immI8 shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 4); - match(Set dst (LShiftVS dst shift)); - format %{ "psllw $dst,$shift\t! left shift packed4S" %} - ins_encode %{ - __ psllw($dst$$XMMRegister, (int)$shift$$constant); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsll4S_reg(vecD dst, vecD src, vecS shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 4); - match(Set dst (LShiftVS src shift)); - format %{ "vpsllw $dst,$src,$shift\t! left shift packed4S" %} - ins_encode %{ - int vector_len = 0; - __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsll4S_reg_imm(vecD dst, vecD src, immI8 shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 4); - match(Set dst (LShiftVS src shift)); - format %{ "vpsllw $dst,$src,$shift\t! left shift packed4S" %} - ins_encode %{ - int vector_len = 0; - __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsll8S(vecX dst, vecS shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 8); - match(Set dst (LShiftVS dst shift)); - format %{ "psllw $dst,$shift\t! left shift packed8S" %} - ins_encode %{ - __ psllw($dst$$XMMRegister, $shift$$XMMRegister); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsll8S_imm(vecX dst, immI8 shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 8); - match(Set dst (LShiftVS dst shift)); - format %{ "psllw $dst,$shift\t! left shift packed8S" %} - ins_encode %{ - __ psllw($dst$$XMMRegister, (int)$shift$$constant); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsll8S_reg(vecX dst, vecX src, vecS shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 8); - match(Set dst (LShiftVS src shift)); - format %{ "vpsllw $dst,$src,$shift\t! 
left shift packed8S" %} - ins_encode %{ - int vector_len = 0; - __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsll8S_reg_imm(vecX dst, vecX src, immI8 shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 8); - match(Set dst (LShiftVS src shift)); - format %{ "vpsllw $dst,$src,$shift\t! left shift packed8S" %} - ins_encode %{ - int vector_len = 0; - __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsll16S_reg(vecY dst, vecY src, vecS shift) %{ +// ------------------------------ Shift --------------------------------------- + +// Left and right shift count vectors are the same on x86 +// (only lowest bits of xmm reg are used for count). +instruct vshiftcnt(vecS dst, rRegI cnt) %{ + match(Set dst (LShiftCntV cnt)); + match(Set dst (RShiftCntV cnt)); + format %{ "movdl $dst,$cnt\t! load shift count" %} + ins_encode %{ + __ movdl($dst$$XMMRegister, $cnt$$Register); + %} + ins_pipe( pipe_slow ); +%} + +instruct vshiftcntimm(vecS dst, immI8 cnt, rRegI tmp) %{ + match(Set dst cnt); + effect(TEMP tmp); + format %{ "movl $tmp,$cnt\t" + "movdl $dst,$tmp\t! load shift count" %} + ins_encode %{ + __ movl($tmp$$Register, $cnt$$constant); + __ movdl($dst$$XMMRegister, $tmp$$Register); + %} + ins_pipe( pipe_slow ); +%} + +// Byte vector shift +instruct vshift4B(vecS dst, vecS src, vecS shift, vecS tmp, rRegI scratch) %{ + predicate(UseSSE > 3 && n->as_Vector()->length() == 4); + match(Set dst (LShiftVB src shift)); + match(Set dst (RShiftVB src shift)); + match(Set dst (URShiftVB src shift)); + effect(TEMP dst, TEMP tmp, TEMP scratch); + format %{"vextendbw $tmp,$src\n\t" + "vshiftw $tmp,$shift\n\t" + "movdqu $dst,[0x00ff00ff0x00ff00ff]\n\t" + "pand $dst,$tmp\n\t" + "packuswb $dst,$dst\n\t ! 
packed4B shift" %} + ins_encode %{ + int opcode = this->as_Mach()->ideal_Opcode(); + + __ vextendbw(opcode, $tmp$$XMMRegister, $src$$XMMRegister); + __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister); + __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register); + __ pand($dst$$XMMRegister, $tmp$$XMMRegister); + __ packuswb($dst$$XMMRegister, $dst$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vshift8B(vecD dst, vecD src, vecS shift, vecD tmp, rRegI scratch) %{ + predicate(UseSSE > 3 && n->as_Vector()->length() == 8); + match(Set dst (LShiftVB src shift)); + match(Set dst (RShiftVB src shift)); + match(Set dst (URShiftVB src shift)); + effect(TEMP dst, TEMP tmp, TEMP scratch); + format %{"vextendbw $tmp,$src\n\t" + "vshiftw $tmp,$shift\n\t" + "movdqu $dst,[0x00ff00ff0x00ff00ff]\n\t" + "pand $dst,$tmp\n\t" + "packuswb $dst,$dst\n\t ! packed8B shift" %} + ins_encode %{ + int opcode = this->as_Mach()->ideal_Opcode(); + + __ vextendbw(opcode, $tmp$$XMMRegister, $src$$XMMRegister); + __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister); + __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register); + __ pand($dst$$XMMRegister, $tmp$$XMMRegister); + __ packuswb($dst$$XMMRegister, $dst$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vshift16B(vecX dst, vecX src, vecS shift, vecX tmp1, vecX tmp2, rRegI scratch) %{ + predicate(UseSSE > 3 && UseAVX <= 1 && n->as_Vector()->length() == 16); + match(Set dst (LShiftVB src shift)); + match(Set dst (RShiftVB src shift)); + match(Set dst (URShiftVB src shift)); + effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch); + format %{"vextendbw $tmp1,$src\n\t" + "vshiftw $tmp1,$shift\n\t" + "pshufd $tmp2,$src\n\t" + "vextendbw $tmp2,$tmp2\n\t" + "vshiftw $tmp2,$shift\n\t" + "movdqu $dst,[0x00ff00ff0x00ff00ff]\n\t" + "pand $tmp2,$dst\n\t" + "pand $dst,$tmp1\n\t" + "packuswb $dst,$tmp2\n\t! 
packed16B shift" %} + ins_encode %{ + int opcode = this->as_Mach()->ideal_Opcode(); + + __ vextendbw(opcode, $tmp1$$XMMRegister, $src$$XMMRegister); + __ vshiftw(opcode, $tmp1$$XMMRegister, $shift$$XMMRegister); + __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0xE); + __ vextendbw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister); + __ vshiftw(opcode, $tmp2$$XMMRegister, $shift$$XMMRegister); + __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register); + __ pand($tmp2$$XMMRegister, $dst$$XMMRegister); + __ pand($dst$$XMMRegister, $tmp1$$XMMRegister); + __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vshift16B_avx(vecX dst, vecX src, vecS shift, vecX tmp, rRegI scratch) %{ predicate(UseAVX > 1 && n->as_Vector()->length() == 16); - match(Set dst (LShiftVS src shift)); - format %{ "vpsllw $dst,$src,$shift\t! left shift packed16S" %} - ins_encode %{ - int vector_len = 1; - __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsll16S_reg_imm(vecY dst, vecY src, immI8 shift) %{ - predicate(UseAVX > 1 && n->as_Vector()->length() == 16); - match(Set dst (LShiftVS src shift)); - format %{ "vpsllw $dst,$src,$shift\t! left shift packed16S" %} - ins_encode %{ + match(Set dst (LShiftVB src shift)); + match(Set dst (RShiftVB src shift)); + match(Set dst (URShiftVB src shift)); + effect(TEMP dst, TEMP tmp, TEMP scratch); + format %{"vextendbw $tmp,$src\n\t" + "vshiftw $tmp,$tmp,$shift\n\t" + "vpand $tmp,$tmp,[0x00ff00ff0x00ff00ff]\n\t" + "vextracti128_high $dst,$tmp\n\t" + "vpackuswb $dst,$tmp,$dst\n\t! 
packed16B shift" %} + ins_encode %{ + int opcode = this->as_Mach()->ideal_Opcode(); + int vector_len = 1; - __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsll32S_reg(vecZ dst, vecZ src, vecS shift) %{ - predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32); - match(Set dst (LShiftVS src shift)); - format %{ "vpsllw $dst,$src,$shift\t! left shift packed32S" %} - ins_encode %{ - int vector_len = 2; - __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsll32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{ - predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32); - match(Set dst (LShiftVS src shift)); - format %{ "vpsllw $dst,$src,$shift\t! left shift packed32S" %} - ins_encode %{ + __ vextendbw(opcode, $tmp$$XMMRegister, $src$$XMMRegister, vector_len); + __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len); + __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vector_len, $scratch$$Register); + __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister); + __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0); + %} + ins_pipe( pipe_slow ); +%} + +instruct vshift32B_avx(vecY dst, vecY src, vecS shift, vecY tmp, rRegI scratch) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 32); + match(Set dst (LShiftVB src shift)); + match(Set dst (RShiftVB src shift)); + match(Set dst (URShiftVB src shift)); + effect(TEMP dst, TEMP tmp, TEMP scratch); + format %{"vextracti128_high $tmp,$src\n\t" + "vextendbw $tmp,$tmp\n\t" + "vextendbw $dst,$src\n\t" + "vshiftw $tmp,$tmp,$shift\n\t" + "vshiftw $dst,$dst,$shift\n\t" + "vpand $tmp,$tmp,[0x00ff00ff0x00ff00ff]\n\t" + "vpand $dst,$dst,[0x00ff00ff0x00ff00ff]\n\t" + "vpackuswb $dst,$dst,$tmp\n\t" + "vpermq 
$dst,$dst,0xD8\n\t! packed32B shift" %} + ins_encode %{ + int opcode = this->as_Mach()->ideal_Opcode(); + + int vector_len = 1; + __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister); + __ vextendbw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len); + __ vextendbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, vector_len); + __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len); + __ vshiftw(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $shift$$XMMRegister, vector_len); + __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vector_len, $scratch$$Register); + __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vector_len, $scratch$$Register); + __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len); + __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vshift64B_avx(vecZ dst, vecZ src, vecS shift, vecZ tmp1, vecZ tmp2, rRegI scratch) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 64); + match(Set dst (LShiftVB src shift)); + match(Set dst (RShiftVB src shift)); + match(Set dst (URShiftVB src shift)); + effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch); + format %{"vextracti64x4 $tmp1,$src\n\t" + "vextendbw $tmp1,$tmp1\n\t" + "vextendbw $tmp2,$src\n\t" + "vshiftw $tmp1,$tmp1,$shift\n\t" + "vshiftw $tmp2,$tmp2,$shift\n\t" + "vmovdqu $dst,[0x00ff00ff0x00ff00ff]\n\t" + "vpbroadcastd $dst,$dst\n\t" + "vpand $tmp1,$tmp1,$dst\n\t" + "vpand $tmp2,$tmp2,$dst\n\t" + "vpackuswb $dst,$tmp1,$tmp2\n\t" + "evmovdquq $tmp2, [0x0604020007050301]\n\t" + "vpermq $dst,$tmp2,$dst\n\t! 
packed64B shift" %} + ins_encode %{ + int opcode = this->as_Mach()->ideal_Opcode(); + int vector_len = 2; - __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -// Integers vector left shift -instruct vsll2I(vecD dst, vecS shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 2); - match(Set dst (LShiftVI dst shift)); - format %{ "pslld $dst,$shift\t! left shift packed2I" %} - ins_encode %{ - __ pslld($dst$$XMMRegister, $shift$$XMMRegister); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsll2I_imm(vecD dst, immI8 shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 2); - match(Set dst (LShiftVI dst shift)); - format %{ "pslld $dst,$shift\t! left shift packed2I" %} - ins_encode %{ - __ pslld($dst$$XMMRegister, (int)$shift$$constant); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsll2I_reg(vecD dst, vecD src, vecS shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 2); - match(Set dst (LShiftVI src shift)); - format %{ "vpslld $dst,$src,$shift\t! left shift packed2I" %} - ins_encode %{ - int vector_len = 0; - __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsll2I_reg_imm(vecD dst, vecD src, immI8 shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 2); - match(Set dst (LShiftVI src shift)); - format %{ "vpslld $dst,$src,$shift\t! left shift packed2I" %} - ins_encode %{ - int vector_len = 0; - __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsll4I(vecX dst, vecS shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 4); - match(Set dst (LShiftVI dst shift)); - format %{ "pslld $dst,$shift\t! 
left shift packed4I" %} - ins_encode %{ - __ pslld($dst$$XMMRegister, $shift$$XMMRegister); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsll4I_imm(vecX dst, immI8 shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 4); - match(Set dst (LShiftVI dst shift)); - format %{ "pslld $dst,$shift\t! left shift packed4I" %} - ins_encode %{ - __ pslld($dst$$XMMRegister, (int)$shift$$constant); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsll4I_reg(vecX dst, vecX src, vecS shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 4); - match(Set dst (LShiftVI src shift)); - format %{ "vpslld $dst,$src,$shift\t! left shift packed4I" %} - ins_encode %{ - int vector_len = 0; - __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsll4I_reg_imm(vecX dst, vecX src, immI8 shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 4); - match(Set dst (LShiftVI src shift)); - format %{ "vpslld $dst,$src,$shift\t! left shift packed4I" %} - ins_encode %{ - int vector_len = 0; - __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsll8I_reg(vecY dst, vecY src, vecS shift) %{ - predicate(UseAVX > 1 && n->as_Vector()->length() == 8); - match(Set dst (LShiftVI src shift)); - format %{ "vpslld $dst,$src,$shift\t! left shift packed8I" %} - ins_encode %{ - int vector_len = 1; - __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsll8I_reg_imm(vecY dst, vecY src, immI8 shift) %{ - predicate(UseAVX > 1 && n->as_Vector()->length() == 8); - match(Set dst (LShiftVI src shift)); - format %{ "vpslld $dst,$src,$shift\t! 
left shift packed8I" %} - ins_encode %{ - int vector_len = 1; - __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsll16I_reg(vecZ dst, vecZ src, vecS shift) %{ - predicate(UseAVX > 2 && n->as_Vector()->length() == 16); - match(Set dst (LShiftVI src shift)); - format %{ "vpslld $dst,$src,$shift\t! left shift packed16I" %} - ins_encode %{ - int vector_len = 2; - __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsll16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{ - predicate(UseAVX > 2 && n->as_Vector()->length() == 16); - match(Set dst (LShiftVI src shift)); - format %{ "vpslld $dst,$src,$shift\t! left shift packed16I" %} - ins_encode %{ - int vector_len = 2; - __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -// Longs vector left shift -instruct vsll2L(vecX dst, vecS shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 2); - match(Set dst (LShiftVL dst shift)); - format %{ "psllq $dst,$shift\t! left shift packed2L" %} - ins_encode %{ - __ psllq($dst$$XMMRegister, $shift$$XMMRegister); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsll2L_imm(vecX dst, immI8 shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 2); - match(Set dst (LShiftVL dst shift)); - format %{ "psllq $dst,$shift\t! left shift packed2L" %} - ins_encode %{ - __ psllq($dst$$XMMRegister, (int)$shift$$constant); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsll2L_reg(vecX dst, vecX src, vecS shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 2); - match(Set dst (LShiftVL src shift)); - format %{ "vpsllq $dst,$src,$shift\t! 
left shift packed2L" %} - ins_encode %{ - int vector_len = 0; - __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsll2L_reg_imm(vecX dst, vecX src, immI8 shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 2); - match(Set dst (LShiftVL src shift)); - format %{ "vpsllq $dst,$src,$shift\t! left shift packed2L" %} - ins_encode %{ - int vector_len = 0; - __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsll4L_reg(vecY dst, vecY src, vecS shift) %{ - predicate(UseAVX > 1 && n->as_Vector()->length() == 4); - match(Set dst (LShiftVL src shift)); - format %{ "vpsllq $dst,$src,$shift\t! left shift packed4L" %} - ins_encode %{ - int vector_len = 1; - __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsll4L_reg_imm(vecY dst, vecY src, immI8 shift) %{ - predicate(UseAVX > 1 && n->as_Vector()->length() == 4); - match(Set dst (LShiftVL src shift)); - format %{ "vpsllq $dst,$src,$shift\t! left shift packed4L" %} - ins_encode %{ - int vector_len = 1; - __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsll8L_reg(vecZ dst, vecZ src, vecS shift) %{ - predicate(UseAVX > 2 && n->as_Vector()->length() == 8); - match(Set dst (LShiftVL src shift)); - format %{ "vpsllq $dst,$src,$shift\t! left shift packed8L" %} - ins_encode %{ - int vector_len = 2; - __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsll8L_reg_imm(vecZ dst, vecZ src, immI8 shift) %{ - predicate(UseAVX > 2 && n->as_Vector()->length() == 8); - match(Set dst (LShiftVL src shift)); - format %{ "vpsllq $dst,$src,$shift\t! 
left shift packed8L" %} - ins_encode %{ - int vector_len = 2; - __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -// ----------------------- LogicalRightShift ----------------------------------- + __ vextracti64x4($tmp1$$XMMRegister, $src$$XMMRegister, 1); + __ vextendbw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, vector_len); + __ vextendbw(opcode, $tmp2$$XMMRegister, $src$$XMMRegister, vector_len); + __ vshiftw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, $shift$$XMMRegister, vector_len); + __ vshiftw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vector_len); + __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register); + __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len); + __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vector_len); + __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len); + __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vector_len); + __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vector_len, $scratch$$Register); + __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} // Shorts vector logical right shift produces incorrect Java result // for negative data because java code convert short value into int with // sign extension before a shift. But char vectors are fine since chars are // unsigned values. - -instruct vsrl2S(vecS dst, vecS shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 2); - match(Set dst (URShiftVS dst shift)); - format %{ "psrlw $dst,$shift\t! 
logical right shift packed2S" %} - ins_encode %{ - __ psrlw($dst$$XMMRegister, $shift$$XMMRegister); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl2S_imm(vecS dst, immI8 shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 2); - match(Set dst (URShiftVS dst shift)); - format %{ "psrlw $dst,$shift\t! logical right shift packed2S" %} - ins_encode %{ - __ psrlw($dst$$XMMRegister, (int)$shift$$constant); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl2S_reg(vecS dst, vecS src, vecS shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 2); +// Shorts/Chars vector left shift +instruct vshist2S(vecS dst, vecS src, vecS shift) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (LShiftVS src shift)); + match(Set dst (RShiftVS src shift)); match(Set dst (URShiftVS src shift)); - format %{ "vpsrlw $dst,$src,$shift\t! logical right shift packed2S" %} - ins_encode %{ - int vector_len = 0; - __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl2S_reg_imm(vecS dst, vecS src, immI8 shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 2); - match(Set dst (URShiftVS src shift)); - format %{ "vpsrlw $dst,$src,$shift\t! logical right shift packed2S" %} - ins_encode %{ - int vector_len = 0; - __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl4S(vecD dst, vecS shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 4); - match(Set dst (URShiftVS dst shift)); - format %{ "psrlw $dst,$shift\t! logical right shift packed4S" %} - ins_encode %{ - __ psrlw($dst$$XMMRegister, $shift$$XMMRegister); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl4S_imm(vecD dst, immI8 shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 4); - match(Set dst (URShiftVS dst shift)); - format %{ "psrlw $dst,$shift\t! 
logical right shift packed4S" %} - ins_encode %{ - __ psrlw($dst$$XMMRegister, (int)$shift$$constant); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl4S_reg(vecD dst, vecD src, vecS shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + format %{ "vshiftw $dst,$src,$shift\t! shift packed2S" %} + ins_encode %{ + int opcode = this->as_Mach()->ideal_Opcode(); + if (UseAVX == 0) { + if ($dst$$XMMRegister != $src$$XMMRegister) + __ movflt($dst$$XMMRegister, $src$$XMMRegister); + __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister); + } else { + int vector_len = 0; + __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); + } + %} + ins_pipe( pipe_slow ); +%} + +instruct vshift4S(vecD dst, vecD src, vecS shift) %{ + predicate(n->as_Vector()->length() == 4); + match(Set dst (LShiftVS src shift)); + match(Set dst (RShiftVS src shift)); match(Set dst (URShiftVS src shift)); - format %{ "vpsrlw $dst,$src,$shift\t! logical right shift packed4S" %} - ins_encode %{ - int vector_len = 0; - __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl4S_reg_imm(vecD dst, vecD src, immI8 shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + format %{ "vshiftw $dst,$src,$shift\t! 
shift packed4S" %} + ins_encode %{ + int opcode = this->as_Mach()->ideal_Opcode(); + if (UseAVX == 0) { + if ($dst$$XMMRegister != $src$$XMMRegister) + __ movdbl($dst$$XMMRegister, $src$$XMMRegister); + __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister); + + } else { + int vector_len = 0; + __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); + } + %} + ins_pipe( pipe_slow ); +%} + +instruct vshift8S(vecX dst, vecX src, vecS shift) %{ + predicate(n->as_Vector()->length() == 8); + match(Set dst (LShiftVS src shift)); + match(Set dst (RShiftVS src shift)); match(Set dst (URShiftVS src shift)); - format %{ "vpsrlw $dst,$src,$shift\t! logical right shift packed4S" %} - ins_encode %{ - int vector_len = 0; - __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl8S(vecX dst, vecS shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 8); - match(Set dst (URShiftVS dst shift)); - format %{ "psrlw $dst,$shift\t! logical right shift packed8S" %} - ins_encode %{ - __ psrlw($dst$$XMMRegister, $shift$$XMMRegister); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl8S_imm(vecX dst, immI8 shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 8); - match(Set dst (URShiftVS dst shift)); - format %{ "psrlw $dst,$shift\t! logical right shift packed8S" %} - ins_encode %{ - __ psrlw($dst$$XMMRegister, (int)$shift$$constant); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl8S_reg(vecX dst, vecX src, vecS shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 8); + format %{ "vshiftw $dst,$src,$shift\t! 
shift packed8S" %} + ins_encode %{ + int opcode = this->as_Mach()->ideal_Opcode(); + if (UseAVX == 0) { + if ($dst$$XMMRegister != $src$$XMMRegister) + __ movdqu($dst$$XMMRegister, $src$$XMMRegister); + __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister); + } else { + int vector_len = 0; + __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); + } + %} + ins_pipe( pipe_slow ); +%} + +instruct vshift16S(vecY dst, vecY src, vecS shift) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 16); + match(Set dst (LShiftVS src shift)); + match(Set dst (RShiftVS src shift)); match(Set dst (URShiftVS src shift)); - format %{ "vpsrlw $dst,$src,$shift\t! logical right shift packed8S" %} - ins_encode %{ - int vector_len = 0; - __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl8S_reg_imm(vecX dst, vecX src, immI8 shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 8); - match(Set dst (URShiftVS src shift)); - format %{ "vpsrlw $dst,$src,$shift\t! logical right shift packed8S" %} - ins_encode %{ - int vector_len = 0; - __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl16S_reg(vecY dst, vecY src, vecS shift) %{ - predicate(UseAVX > 1 && n->as_Vector()->length() == 16); - match(Set dst (URShiftVS src shift)); - format %{ "vpsrlw $dst,$src,$shift\t! logical right shift packed16S" %} + format %{ "vshiftw $dst,$src,$shift\t! 
shift packed16S" %} ins_encode %{ int vector_len = 1; - __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl16S_reg_imm(vecY dst, vecY src, immI8 shift) %{ - predicate(UseAVX > 1 && n->as_Vector()->length() == 16); + int opcode = this->as_Mach()->ideal_Opcode(); + __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vshift32S(vecZ dst, vecZ src, vecS shift) %{ + predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32); + match(Set dst (LShiftVS src shift)); + match(Set dst (RShiftVS src shift)); match(Set dst (URShiftVS src shift)); - format %{ "vpsrlw $dst,$src,$shift\t! logical right shift packed16S" %} - ins_encode %{ - int vector_len = 1; - __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl32S_reg(vecZ dst, vecZ src, vecS shift) %{ - predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32); - match(Set dst (URShiftVS src shift)); - format %{ "vpsrlw $dst,$src,$shift\t! logical right shift packed32S" %} - ins_encode %{ - int vector_len = 2; - __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{ - predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32); - match(Set dst (URShiftVS src shift)); - format %{ "vpsrlw $dst,$src,$shift\t! logical right shift packed32S" %} + format %{ "vshiftw $dst,$src,$shift\t! 
shift packed32S" %} ins_encode %{ int vector_len = 2; - __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -// Integers vector logical right shift -instruct vsrl2I(vecD dst, vecS shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 2); - match(Set dst (URShiftVI dst shift)); - format %{ "psrld $dst,$shift\t! logical right shift packed2I" %} - ins_encode %{ - __ psrld($dst$$XMMRegister, $shift$$XMMRegister); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl2I_imm(vecD dst, immI8 shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 2); - match(Set dst (URShiftVI dst shift)); - format %{ "psrld $dst,$shift\t! logical right shift packed2I" %} - ins_encode %{ - __ psrld($dst$$XMMRegister, (int)$shift$$constant); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl2I_reg(vecD dst, vecD src, vecS shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 2); - match(Set dst (URShiftVI src shift)); - format %{ "vpsrld $dst,$src,$shift\t! logical right shift packed2I" %} - ins_encode %{ - int vector_len = 0; - __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl2I_reg_imm(vecD dst, vecD src, immI8 shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 2); + int opcode = this->as_Mach()->ideal_Opcode(); + __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +// Integers vector left shift +instruct vshift2I(vecD dst, vecD src, vecS shift) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (LShiftVI src shift)); + match(Set dst (RShiftVI src shift)); match(Set dst (URShiftVI src shift)); - format %{ "vpsrld $dst,$src,$shift\t! 
logical right shift packed2I" %} - ins_encode %{ - int vector_len = 0; - __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl4I(vecX dst, vecS shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 4); - match(Set dst (URShiftVI dst shift)); - format %{ "psrld $dst,$shift\t! logical right shift packed4I" %} - ins_encode %{ - __ psrld($dst$$XMMRegister, $shift$$XMMRegister); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl4I_imm(vecX dst, immI8 shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 4); - match(Set dst (URShiftVI dst shift)); - format %{ "psrld $dst,$shift\t! logical right shift packed4I" %} - ins_encode %{ - __ psrld($dst$$XMMRegister, (int)$shift$$constant); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl4I_reg(vecX dst, vecX src, vecS shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + format %{ "vshiftd $dst,$src,$shift\t! shift packed2I" %} + ins_encode %{ + int opcode = this->as_Mach()->ideal_Opcode(); + if (UseAVX == 0) { + if ($dst$$XMMRegister != $src$$XMMRegister) + __ movdbl($dst$$XMMRegister, $src$$XMMRegister); + __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister); + } else { + int vector_len = 0; + __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); + } + %} + ins_pipe( pipe_slow ); +%} + +instruct vshift4I(vecX dst, vecX src, vecS shift) %{ + predicate(n->as_Vector()->length() == 4); + match(Set dst (LShiftVI src shift)); + match(Set dst (RShiftVI src shift)); match(Set dst (URShiftVI src shift)); - format %{ "vpsrld $dst,$src,$shift\t! 
logical right shift packed4I" %} - ins_encode %{ - int vector_len = 0; - __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl4I_reg_imm(vecX dst, vecX src, immI8 shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + format %{ "vshiftd $dst,$src,$shift\t! shift packed4I" %} + ins_encode %{ + int opcode = this->as_Mach()->ideal_Opcode(); + if (UseAVX == 0) { + if ($dst$$XMMRegister != $src$$XMMRegister) + __ movdqu($dst$$XMMRegister, $src$$XMMRegister); + __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister); + } else { + int vector_len = 0; + __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); + } + %} + ins_pipe( pipe_slow ); +%} + +instruct vshift8I(vecY dst, vecY src, vecS shift) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 8); + match(Set dst (LShiftVI src shift)); + match(Set dst (RShiftVI src shift)); match(Set dst (URShiftVI src shift)); - format %{ "vpsrld $dst,$src,$shift\t! logical right shift packed4I" %} - ins_encode %{ - int vector_len = 0; - __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl8I_reg(vecY dst, vecY src, vecS shift) %{ - predicate(UseAVX > 1 && n->as_Vector()->length() == 8); - match(Set dst (URShiftVI src shift)); - format %{ "vpsrld $dst,$src,$shift\t! logical right shift packed8I" %} + format %{ "vshiftd $dst,$src,$shift\t! 
shift packed8I" %} ins_encode %{ int vector_len = 1; - __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl8I_reg_imm(vecY dst, vecY src, immI8 shift) %{ - predicate(UseAVX > 1 && n->as_Vector()->length() == 8); + int opcode = this->as_Mach()->ideal_Opcode(); + __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vshift16I(vecZ dst, vecZ src, vecS shift) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 16); + match(Set dst (LShiftVI src shift)); + match(Set dst (RShiftVI src shift)); match(Set dst (URShiftVI src shift)); - format %{ "vpsrld $dst,$src,$shift\t! logical right shift packed8I" %} - ins_encode %{ - int vector_len = 1; - __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl16I_reg(vecZ dst, vecZ src, vecS shift) %{ - predicate(UseAVX > 2 && n->as_Vector()->length() == 16); - match(Set dst (URShiftVI src shift)); - format %{ "vpsrld $dst,$src,$shift\t! logical right shift packed16I" %} - ins_encode %{ - int vector_len = 2; - __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{ - predicate(UseAVX > 2 && n->as_Vector()->length() == 16); - match(Set dst (URShiftVI src shift)); - format %{ "vpsrld $dst,$src,$shift\t! logical right shift packed16I" %} + format %{ "vshiftd $dst,$src,$shift\t! shift packed16I" %} ins_encode %{ int vector_len = 2; - __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -// Longs vector logical right shift -instruct vsrl2L(vecX dst, vecS shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 2); - match(Set dst (URShiftVL dst shift)); - format %{ "psrlq $dst,$shift\t! 
logical right shift packed2L" %} - ins_encode %{ + int opcode = this->as_Mach()->ideal_Opcode(); + __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +// Longs vector shift +instruct vshift2L(vecX dst, vecX src, vecS shift) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (LShiftVL src shift)); + match(Set dst (URShiftVL src shift)); + format %{ "vshiftq $dst,$src,$shift\t! shift packed2L" %} + ins_encode %{ + int opcode = this->as_Mach()->ideal_Opcode(); + if (UseAVX == 0) { + if ($dst$$XMMRegister != $src$$XMMRegister) + __ movdqu($dst$$XMMRegister, $src$$XMMRegister); + __ vshiftq(opcode, $dst$$XMMRegister, $shift$$XMMRegister); + } else { + int vector_len = 0; + __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); + } + %} + ins_pipe( pipe_slow ); +%} + +instruct vshift4L(vecY dst, vecY src, vecS shift) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 4); + match(Set dst (LShiftVL src shift)); + match(Set dst (URShiftVL src shift)); + format %{ "vshiftq $dst,$src,$shift\t! left shift packed4L" %} + ins_encode %{ + int vector_len = 1; + int opcode = this->as_Mach()->ideal_Opcode(); + __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vshift8L(vecZ dst, vecZ src, vecS shift) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 8); + match(Set dst (LShiftVL src shift)); + match(Set dst (RShiftVL src shift)); + match(Set dst (URShiftVL src shift)); + format %{ "vshiftq $dst,$src,$shift\t! 
shift packed8L" %} + ins_encode %{ + int vector_len = 2; + int opcode = this->as_Mach()->ideal_Opcode(); + __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +// -------------------ArithmeticRightShift ----------------------------------- +// Long vector arithmetic right shift +instruct vsra2L_reg(vecX dst, vecX src, vecS shift, vecX tmp, rRegI scratch) %{ + predicate(UseSSE >= 2 && n->as_Vector()->length() == 2); + match(Set dst (RShiftVL src shift)); + effect(TEMP dst, TEMP tmp, TEMP scratch); + format %{ "movdqu $dst,$src\n\t" + "psrlq $dst,$shift\n\t" + "movdqu $tmp,[0x8000000000000000]\n\t" + "psrlq $tmp,$shift\n\t" + "pxor $dst,$tmp\n\t" + "psubq $dst,$tmp\t! arithmetic right shift packed2L" %} + ins_encode %{ + __ movdqu($dst$$XMMRegister, $src$$XMMRegister); __ psrlq($dst$$XMMRegister, $shift$$XMMRegister); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl2L_imm(vecX dst, immI8 shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 2); - match(Set dst (URShiftVL dst shift)); - format %{ "psrlq $dst,$shift\t! logical right shift packed2L" %} - ins_encode %{ - __ psrlq($dst$$XMMRegister, (int)$shift$$constant); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl2L_reg(vecX dst, vecX src, vecS shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 2); - match(Set dst (URShiftVL src shift)); - format %{ "vpsrlq $dst,$src,$shift\t! logical right shift packed2L" %} - ins_encode %{ - int vector_len = 0; - __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl2L_reg_imm(vecX dst, vecX src, immI8 shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 2); - match(Set dst (URShiftVL src shift)); - format %{ "vpsrlq $dst,$src,$shift\t! 
logical right shift packed2L" %} - ins_encode %{ - int vector_len = 0; - __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl4L_reg(vecY dst, vecY src, vecS shift) %{ + __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), $scratch$$Register); + __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister); + __ pxor($dst$$XMMRegister, $tmp$$XMMRegister); + __ psubq($dst$$XMMRegister, $tmp$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsra2L_reg_evex(vecX dst, vecX src, vecS shift) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 2); + match(Set dst (RShiftVL src shift)); + format %{ "evpsraq $dst,$src,$shift\t! arithmetic right shift packed2L" %} + ins_encode %{ + int vector_len = 0; + __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsra4L_reg(vecY dst, vecY src, vecS shift, vecY tmp, rRegI scratch) %{ predicate(UseAVX > 1 && n->as_Vector()->length() == 4); - match(Set dst (URShiftVL src shift)); - format %{ "vpsrlq $dst,$src,$shift\t! logical right shift packed4L" %} + match(Set dst (RShiftVL src shift)); + effect(TEMP dst, TEMP tmp, TEMP scratch); + format %{ "vpsrlq $dst,$src,$shift\n\t" + "vmovdqu $tmp,[0x8000000000000000]\n\t" + "vpsrlq $tmp,$tmp,$shift\n\t" + "vpxor $dst,$dst,$tmp\n\t" + "vpsubq $dst,$dst,$tmp\t! arithmetic right shift packed4L" %} ins_encode %{ int vector_len = 1; __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl4L_reg_imm(vecY dst, vecY src, immI8 shift) %{ - predicate(UseAVX > 1 && n->as_Vector()->length() == 4); - match(Set dst (URShiftVL src shift)); - format %{ "vpsrlq $dst,$src,$shift\t! 
logical right shift packed4L" %} + __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), $scratch$$Register); + __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len); + __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len); + __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vsra4L_reg_evex(vecY dst, vecY src, vecS shift) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 4); + match(Set dst (RShiftVL src shift)); + format %{ "evpsraq $dst,$src,$shift\t! arithmetic right shift packed4L" %} ins_encode %{ int vector_len = 1; - __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl8L_reg(vecZ dst, vecZ src, vecS shift) %{ - predicate(UseAVX > 2 && n->as_Vector()->length() == 8); - match(Set dst (URShiftVL src shift)); - format %{ "vpsrlq $dst,$src,$shift\t! logical right shift packed8L" %} - ins_encode %{ - int vector_len = 2; - __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsrl8L_reg_imm(vecZ dst, vecZ src, immI8 shift) %{ - predicate(UseAVX > 2 && n->as_Vector()->length() == 8); - match(Set dst (URShiftVL src shift)); - format %{ "vpsrlq $dst,$src,$shift\t! logical right shift packed8L" %} - ins_encode %{ - int vector_len = 2; - __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -// ------------------- ArithmeticRightShift ----------------------------------- - -// Shorts/Chars vector arithmetic right shift -instruct vsra2S(vecS dst, vecS shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 2); - match(Set dst (RShiftVS dst shift)); - format %{ "psraw $dst,$shift\t! 
arithmetic right shift packed2S" %} - ins_encode %{ - __ psraw($dst$$XMMRegister, $shift$$XMMRegister); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra2S_imm(vecS dst, immI8 shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 2); - match(Set dst (RShiftVS dst shift)); - format %{ "psraw $dst,$shift\t! arithmetic right shift packed2S" %} - ins_encode %{ - __ psraw($dst$$XMMRegister, (int)$shift$$constant); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra2S_reg(vecS dst, vecS src, vecS shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 2); - match(Set dst (RShiftVS src shift)); - format %{ "vpsraw $dst,$src,$shift\t! arithmetic right shift packed2S" %} - ins_encode %{ - int vector_len = 0; - __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra2S_reg_imm(vecS dst, vecS src, immI8 shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 2); - match(Set dst (RShiftVS src shift)); - format %{ "vpsraw $dst,$src,$shift\t! arithmetic right shift packed2S" %} - ins_encode %{ - int vector_len = 0; - __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra4S(vecD dst, vecS shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 4); - match(Set dst (RShiftVS dst shift)); - format %{ "psraw $dst,$shift\t! arithmetic right shift packed4S" %} - ins_encode %{ - __ psraw($dst$$XMMRegister, $shift$$XMMRegister); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra4S_imm(vecD dst, immI8 shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 4); - match(Set dst (RShiftVS dst shift)); - format %{ "psraw $dst,$shift\t! 
arithmetic right shift packed4S" %} - ins_encode %{ - __ psraw($dst$$XMMRegister, (int)$shift$$constant); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra4S_reg(vecD dst, vecD src, vecS shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 4); - match(Set dst (RShiftVS src shift)); - format %{ "vpsraw $dst,$src,$shift\t! arithmetic right shift packed4S" %} - ins_encode %{ - int vector_len = 0; - __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra4S_reg_imm(vecD dst, vecD src, immI8 shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 4); - match(Set dst (RShiftVS src shift)); - format %{ "vpsraw $dst,$src,$shift\t! arithmetic right shift packed4S" %} - ins_encode %{ - int vector_len = 0; - __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra8S(vecX dst, vecS shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 8); - match(Set dst (RShiftVS dst shift)); - format %{ "psraw $dst,$shift\t! arithmetic right shift packed8S" %} - ins_encode %{ - __ psraw($dst$$XMMRegister, $shift$$XMMRegister); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra8S_imm(vecX dst, immI8 shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 8); - match(Set dst (RShiftVS dst shift)); - format %{ "psraw $dst,$shift\t! arithmetic right shift packed8S" %} - ins_encode %{ - __ psraw($dst$$XMMRegister, (int)$shift$$constant); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra8S_reg(vecX dst, vecX src, vecS shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 8); - match(Set dst (RShiftVS src shift)); - format %{ "vpsraw $dst,$src,$shift\t! 
arithmetic right shift packed8S" %} - ins_encode %{ - int vector_len = 0; - __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra8S_reg_imm(vecX dst, vecX src, immI8 shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 8); - match(Set dst (RShiftVS src shift)); - format %{ "vpsraw $dst,$src,$shift\t! arithmetic right shift packed8S" %} - ins_encode %{ - int vector_len = 0; - __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra16S_reg(vecY dst, vecY src, vecS shift) %{ - predicate(UseAVX > 1 && n->as_Vector()->length() == 16); - match(Set dst (RShiftVS src shift)); - format %{ "vpsraw $dst,$src,$shift\t! arithmetic right shift packed16S" %} - ins_encode %{ - int vector_len = 1; - __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra16S_reg_imm(vecY dst, vecY src, immI8 shift) %{ - predicate(UseAVX > 1 && n->as_Vector()->length() == 16); - match(Set dst (RShiftVS src shift)); - format %{ "vpsraw $dst,$src,$shift\t! arithmetic right shift packed16S" %} - ins_encode %{ - int vector_len = 1; - __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra32S_reg(vecZ dst, vecZ src, vecS shift) %{ - predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32); - match(Set dst (RShiftVS src shift)); - format %{ "vpsraw $dst,$src,$shift\t! 
arithmetic right shift packed32S" %} - ins_encode %{ - int vector_len = 2; - __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{ - predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32); - match(Set dst (RShiftVS src shift)); - format %{ "vpsraw $dst,$src,$shift\t! arithmetic right shift packed32S" %} - ins_encode %{ - int vector_len = 2; - __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -// Integers vector arithmetic right shift -instruct vsra2I(vecD dst, vecS shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 2); - match(Set dst (RShiftVI dst shift)); - format %{ "psrad $dst,$shift\t! arithmetic right shift packed2I" %} - ins_encode %{ - __ psrad($dst$$XMMRegister, $shift$$XMMRegister); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra2I_imm(vecD dst, immI8 shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 2); - match(Set dst (RShiftVI dst shift)); - format %{ "psrad $dst,$shift\t! arithmetic right shift packed2I" %} - ins_encode %{ - __ psrad($dst$$XMMRegister, (int)$shift$$constant); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra2I_reg(vecD dst, vecD src, vecS shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 2); - match(Set dst (RShiftVI src shift)); - format %{ "vpsrad $dst,$src,$shift\t! arithmetic right shift packed2I" %} - ins_encode %{ - int vector_len = 0; - __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra2I_reg_imm(vecD dst, vecD src, immI8 shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 2); - match(Set dst (RShiftVI src shift)); - format %{ "vpsrad $dst,$src,$shift\t! 
arithmetic right shift packed2I" %} - ins_encode %{ - int vector_len = 0; - __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra4I(vecX dst, vecS shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 4); - match(Set dst (RShiftVI dst shift)); - format %{ "psrad $dst,$shift\t! arithmetic right shift packed4I" %} - ins_encode %{ - __ psrad($dst$$XMMRegister, $shift$$XMMRegister); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra4I_imm(vecX dst, immI8 shift) %{ - predicate(UseAVX == 0 && n->as_Vector()->length() == 4); - match(Set dst (RShiftVI dst shift)); - format %{ "psrad $dst,$shift\t! arithmetic right shift packed4I" %} - ins_encode %{ - __ psrad($dst$$XMMRegister, (int)$shift$$constant); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra4I_reg(vecX dst, vecX src, vecS shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 4); - match(Set dst (RShiftVI src shift)); - format %{ "vpsrad $dst,$src,$shift\t! arithmetic right shift packed4I" %} - ins_encode %{ - int vector_len = 0; - __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra4I_reg_imm(vecX dst, vecX src, immI8 shift) %{ - predicate(UseAVX > 0 && n->as_Vector()->length() == 4); - match(Set dst (RShiftVI src shift)); - format %{ "vpsrad $dst,$src,$shift\t! arithmetic right shift packed4I" %} - ins_encode %{ - int vector_len = 0; - __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra8I_reg(vecY dst, vecY src, vecS shift) %{ - predicate(UseAVX > 1 && n->as_Vector()->length() == 8); - match(Set dst (RShiftVI src shift)); - format %{ "vpsrad $dst,$src,$shift\t! 
arithmetic right shift packed8I" %} - ins_encode %{ - int vector_len = 1; - __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra8I_reg_imm(vecY dst, vecY src, immI8 shift) %{ - predicate(UseAVX > 1 && n->as_Vector()->length() == 8); - match(Set dst (RShiftVI src shift)); - format %{ "vpsrad $dst,$src,$shift\t! arithmetic right shift packed8I" %} - ins_encode %{ - int vector_len = 1; - __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra16I_reg(vecZ dst, vecZ src, vecS shift) %{ - predicate(UseAVX > 2 && n->as_Vector()->length() == 16); - match(Set dst (RShiftVI src shift)); - format %{ "vpsrad $dst,$src,$shift\t! arithmetic right shift packed16I" %} - ins_encode %{ - int vector_len = 2; - __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -instruct vsra16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{ - predicate(UseAVX > 2 && n->as_Vector()->length() == 16); - match(Set dst (RShiftVI src shift)); - format %{ "vpsrad $dst,$src,$shift\t! arithmetic right shift packed16I" %} - ins_encode %{ - int vector_len = 2; - __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len); - %} - ins_pipe( pipe_slow ); -%} - -// There are no longs vector arithmetic right shift instructions. 
- + __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} // --------------------------------- AND -------------------------------------- @@ -9708,6 +9297,291 @@ ins_pipe( pipe_slow ); %} +// --------------------------------- ABS -------------------------------------- +// a = |a| +instruct vabs4B_reg(vecS dst, vecS src) %{ + predicate(UseSSE > 2 && n->as_Vector()->length() == 4); + match(Set dst (AbsVB src)); + format %{ "pabsb $dst,$src\t# $dst = |$src| abs packed4B" %} + ins_encode %{ + __ pabsb($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vabs8B_reg(vecD dst, vecD src) %{ + predicate(UseSSE > 2 && n->as_Vector()->length() == 8); + match(Set dst (AbsVB src)); + format %{ "pabsb $dst,$src\t# $dst = |$src| abs packed8B" %} + ins_encode %{ + __ pabsb($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vabs16B_reg(vecX dst, vecX src) %{ + predicate(UseSSE > 2 && n->as_Vector()->length() == 16); + match(Set dst (AbsVB src)); + format %{ "pabsb $dst,$src\t# $dst = |$src| abs packed16B" %} + ins_encode %{ + __ pabsb($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vabs32B_reg(vecY dst, vecY src) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 32); + match(Set dst (AbsVB src)); + format %{ "vpabsb $dst,$src\t# $dst = |$src| abs packed32B" %} + ins_encode %{ + int vector_len = 1; + __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vabs64B_reg(vecZ dst, vecZ src) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 64); + match(Set dst (AbsVB src)); + format %{ "vpabsb $dst,$src\t# $dst = |$src| abs packed64B" %} + ins_encode %{ + int vector_len = 2; + __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vabs2S_reg(vecD dst, vecD src) %{ + predicate(UseSSE > 2 && 
n->as_Vector()->length() == 2); + match(Set dst (AbsVS src)); + format %{ "pabsw $dst,$src\t# $dst = |$src| abs packed2S" %} + ins_encode %{ + __ pabsw($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vabs4S_reg(vecD dst, vecD src) %{ + predicate(UseSSE > 2 && n->as_Vector()->length() == 4); + match(Set dst (AbsVS src)); + format %{ "pabsw $dst,$src\t# $dst = |$src| abs packed4S" %} + ins_encode %{ + __ pabsw($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vabs8S_reg(vecX dst, vecX src) %{ + predicate(UseSSE > 2 && n->as_Vector()->length() == 8); + match(Set dst (AbsVS src)); + format %{ "pabsw $dst,$src\t# $dst = |$src| abs packed8S" %} + ins_encode %{ + __ pabsw($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vabs16S_reg(vecY dst, vecY src) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 16); + match(Set dst (AbsVS src)); + format %{ "vpabsw $dst,$src\t# $dst = |$src| abs packed16S" %} + ins_encode %{ + int vector_len = 1; + __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vabs32S_reg(vecZ dst, vecZ src) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 32); + match(Set dst (AbsVS src)); + format %{ "vpabsw $dst,$src\t# $dst = |$src| abs packed32S" %} + ins_encode %{ + int vector_len = 2; + __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vabs2I_reg(vecD dst, vecD src) %{ + predicate(UseSSE > 2 && n->as_Vector()->length() == 2); + match(Set dst (AbsVI src)); + format %{ "pabsd $dst,$src\t# $dst = |$src| abs packed2I" %} + ins_encode %{ + __ pabsd($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vabs4I_reg(vecX dst, vecX src) %{ + predicate(UseSSE > 2 && n->as_Vector()->length() == 4); + match(Set dst (AbsVI src)); + format %{ "pabsd $dst,$src\t# $dst = |$src| abs packed4I" %} + ins_encode 
%{ + __ pabsd($dst$$XMMRegister, $src$$XMMRegister); + %} + ins_pipe( pipe_slow ); +%} + +instruct vabs8I_reg(vecY dst, vecY src) %{ + predicate(UseAVX > 1 && n->as_Vector()->length() == 8); + match(Set dst (AbsVI src)); + format %{ "vpabsd $dst,$src\t# $dst = |$src| abs packed8I" %} + ins_encode %{ + int vector_len = 1; + __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vabs16I_reg(vecZ dst, vecZ src) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 16); + match(Set dst (AbsVI src)); + format %{ "vpabsd $dst,$src\t# $dst = |$src| abs packed16I" %} + ins_encode %{ + int vector_len = 2; + __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vabs2L_reg(vecX dst, vecX src) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 2); + match(Set dst (AbsVL src)); + format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packed2L" %} + ins_encode %{ + int vector_len = 0; + __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vabs4L_reg(vecY dst, vecY src) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 4); + match(Set dst (AbsVL src)); + format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packed4L" %} + ins_encode %{ + int vector_len = 1; + __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vabs8L_reg(vecZ dst, vecZ src) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 8); + match(Set dst (AbsVL src)); + format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packed8L" %} + ins_encode %{ + int vector_len = 2; + __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +// --------------------------------- ABSNEG -------------------------------------- + +instruct vabsneg2D(vecX dst, vecX src, rRegI scratch) %{ + predicate(UseSSE >= 2 && n->as_Vector()->length() == 2); + match(Set dst (AbsVD src)); + 
match(Set dst (NegVD src)); + effect(TEMP scratch); + format %{ "vabsnegd $dst,$src,[mask]\t# absneg packed2D" %} + ins_encode %{ + int opcode = this->as_Mach()->ideal_Opcode(); + if ($dst$$XMMRegister != $src$$XMMRegister) + __ movdqu($dst$$XMMRegister, $src$$XMMRegister); + __ vabsnegd(opcode, $dst$$XMMRegister, $scratch$$Register); + %} + ins_pipe( pipe_slow ); +%} + +instruct vabsneg4D(vecY dst, vecY src, rRegI scratch) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 4); + match(Set dst (AbsVD src)); + match(Set dst (NegVD src)); + effect(TEMP scratch); + format %{ "vabsnegd $dst,$src,[mask]\t# absneg packed4D" %} + ins_encode %{ + int opcode = this->as_Mach()->ideal_Opcode(); + int vector_len = 1; + __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, vector_len, $scratch$$Register); + %} + ins_pipe( pipe_slow ); +%} + +instruct vabsneg8D(vecZ dst, vecZ src, rRegI scratch) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 8); + match(Set dst (AbsVD src)); + match(Set dst (NegVD src)); + effect(TEMP scratch); + format %{ "vabsnegd $dst,$src,[mask]\t# absneg packed8D" %} + ins_encode %{ + int opcode = this->as_Mach()->ideal_Opcode(); + int vector_len = 2; + __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, vector_len, $scratch$$Register); + %} + ins_pipe( pipe_slow ); +%} + +instruct vabsneg2F(vecD dst, vecD src, rRegI scratch) %{ + predicate(UseSSE > 0 && n->as_Vector()->length() == 2); + match(Set dst (AbsVF src)); + match(Set dst (NegVF src)); + effect(TEMP scratch); + format %{ "vabsnegf $dst,$src,[mask]\t# absneg packed2F" %} + ins_encode %{ + int opcode = this->as_Mach()->ideal_Opcode(); + if ($dst$$XMMRegister != $src$$XMMRegister) + __ movdqu($dst$$XMMRegister, $src$$XMMRegister); + __ vabsnegf(opcode, $dst$$XMMRegister, $scratch$$Register); + %} + ins_pipe( pipe_slow ); +%} + +instruct vabsneg4F(vecX dst, rRegI scratch) %{ + predicate(UseSSE > 0 && n->as_Vector()->length() == 4); + match(Set dst (AbsVF dst)); + match(Set dst 
(NegVF dst)); + effect(TEMP scratch); + format %{ "vabsnegf $dst,[mask]\t# absneg packed4F" %} + ins_cost(150); + ins_encode %{ + int opcode = this->as_Mach()->ideal_Opcode(); + __ vabsnegf(opcode, $dst$$XMMRegister, $scratch$$Register); + %} + ins_pipe( pipe_slow ); +%} + +instruct vabsneg8F(vecY dst, vecY src, rRegI scratch) %{ + predicate(UseAVX > 0 && n->as_Vector()->length() == 8); + match(Set dst (AbsVF src)); + match(Set dst (NegVF src)); + effect(TEMP scratch); + format %{ "vabsnegf $dst,$src,[mask]\t# absneg packed8F" %} + ins_cost(150); + ins_encode %{ + int opcode = this->as_Mach()->ideal_Opcode(); + int vector_len = 1; + __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, vector_len, $scratch$$Register); + %} + ins_pipe( pipe_slow ); +%} + +instruct vabsneg16F(vecZ dst, vecZ src, rRegI scratch) %{ + predicate(UseAVX > 2 && n->as_Vector()->length() == 16); + match(Set dst (AbsVF src)); + match(Set dst (NegVF src)); + effect(TEMP scratch); + format %{ "vabsnegf $dst,$src,[mask]\t# absneg packed16F" %} + ins_cost(150); + ins_encode %{ + int opcode = this->as_Mach()->ideal_Opcode(); + int vector_len = 2; + __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, vector_len, $scratch$$Register); + %} + ins_pipe( pipe_slow ); +%} + // --------------------------------- FMA -------------------------------------- // a * b + c diff -r 98558b7544c7 -r 1851a532ddfe src/hotspot/cpu/x86/x86_32.ad --- a/src/hotspot/cpu/x86/x86_32.ad Tue May 07 21:53:46 2019 +0200 +++ b/src/hotspot/cpu/x86/x86_32.ad Tue May 07 13:33:27 2019 -0700 @@ -8949,6 +8949,28 @@ ins_pipe(ialu_reg_reg_alu0); %} +// Integer Absolute Instructions +instruct absI_rReg(rRegI dst, rRegI src, rRegI tmp, eFlagsReg cr) +%{ + match(Set dst (AbsI src)); + effect(TEMP dst, TEMP tmp, KILL cr); + format %{ "movl $tmp, $src\n\t" + "sarl $tmp, 31\n\t" + "movl $dst, $src\n\t" + "xorl $dst, $tmp\n\t" + "subl $dst, $tmp\n" + %} + ins_encode %{ + __ movl($tmp$$Register, $src$$Register); + __ 
sarl($tmp$$Register, 31); + __ movl($dst$$Register, $src$$Register); + __ xorl($dst$$Register, $tmp$$Register); + __ subl($dst$$Register, $tmp$$Register); + %} + + ins_pipe(ialu_reg_reg); +%} + //----------Long Instructions------------------------------------------------ // Add Long Register with Register instruct addL_eReg(eRegL dst, eRegL src, eFlagsReg cr) %{ diff -r 98558b7544c7 -r 1851a532ddfe src/hotspot/cpu/x86/x86_64.ad --- a/src/hotspot/cpu/x86/x86_64.ad Tue May 07 21:53:46 2019 +0200 +++ b/src/hotspot/cpu/x86/x86_64.ad Tue May 07 13:33:27 2019 -0700 @@ -8181,6 +8181,52 @@ ins_pipe( pipe_cmpxchg ); %} +//----------Abs Instructions------------------------------------------- + +// Integer Absolute Instructions +instruct absI_rReg(rRegI dst, rRegI src, rRegI tmp, rFlagsReg cr) +%{ + match(Set dst (AbsI src)); + effect(TEMP dst, TEMP tmp, KILL cr); + format %{ "movl $tmp, $src\n\t" + "sarl $tmp, 31\n\t" + "movl $dst, $src\n\t" + "xorl $dst, $tmp\n\t" + "subl $dst, $tmp\n" + %} + ins_encode %{ + __ movl($tmp$$Register, $src$$Register); + __ sarl($tmp$$Register, 31); + __ movl($dst$$Register, $src$$Register); + __ xorl($dst$$Register, $tmp$$Register); + __ subl($dst$$Register, $tmp$$Register); + %} + + ins_pipe(ialu_reg_reg); +%} + +// Long Absolute Instructions +instruct absL_rReg(rRegL dst, rRegL src, rRegL tmp, rFlagsReg cr) +%{ + match(Set dst (AbsL src)); + effect(TEMP dst, TEMP tmp, KILL cr); + format %{ "movq $tmp, $src\n\t" + "sarq $tmp, 63\n\t" + "movq $dst, $src\n\t" + "xorq $dst, $tmp\n\t" + "subq $dst, $tmp\n" + %} + ins_encode %{ + __ movq($tmp$$Register, $src$$Register); + __ sarq($tmp$$Register, 63); + __ movq($dst$$Register, $src$$Register); + __ xorq($dst$$Register, $tmp$$Register); + __ subq($dst$$Register, $tmp$$Register); + %} + + ins_pipe(ialu_reg_reg); +%} + //----------Subtraction Instructions------------------------------------------- // Integer Subtraction Instructions diff -r 98558b7544c7 -r 1851a532ddfe 
src/hotspot/share/adlc/formssel.cpp --- a/src/hotspot/share/adlc/formssel.cpp Tue May 07 21:53:46 2019 +0200 +++ b/src/hotspot/share/adlc/formssel.cpp Tue May 07 13:33:27 2019 -0700 @@ -3808,7 +3808,7 @@ "MaxI","MinI","MaxF","MinF","MaxD","MinD", "MaxV", "MinV", "MulI","MulL","MulF","MulD", - "MulVS","MulVI","MulVL","MulVF","MulVD", + "MulVB","MulVS","MulVI","MulVL","MulVF","MulVD", "OrI","OrL", "OrV", "XorI","XorL", @@ -4175,10 +4175,10 @@ static const char *vector_list[] = { "AddVB","AddVS","AddVI","AddVL","AddVF","AddVD", "SubVB","SubVS","SubVI","SubVL","SubVF","SubVD", - "MulVS","MulVI","MulVL","MulVF","MulVD", + "MulVB","MulVS","MulVI","MulVL","MulVF","MulVD", "CMoveVD", "CMoveVF", "DivVF","DivVD", - "AbsVF","AbsVD", + "AbsVB","AbsVS","AbsVI","AbsVL","AbsVF","AbsVD", "NegVF","NegVD", "SqrtVD","SqrtVF", "AndV" ,"XorV" ,"OrV", diff -r 98558b7544c7 -r 1851a532ddfe src/hotspot/share/classfile/vmSymbols.cpp --- a/src/hotspot/share/classfile/vmSymbols.cpp Tue May 07 21:53:46 2019 +0200 +++ b/src/hotspot/share/classfile/vmSymbols.cpp Tue May 07 13:33:27 2019 -0700 @@ -363,6 +363,9 @@ case vmIntrinsics::_isInstance: case vmIntrinsics::_currentThread: case vmIntrinsics::_dabs: + case vmIntrinsics::_fabs: + case vmIntrinsics::_iabs: + case vmIntrinsics::_labs: case vmIntrinsics::_dsqrt: case vmIntrinsics::_dsin: case vmIntrinsics::_dcos: @@ -404,6 +407,9 @@ case vmIntrinsics::_longBitsToDouble: case vmIntrinsics::_currentThread: case vmIntrinsics::_dabs: + case vmIntrinsics::_fabs: + case vmIntrinsics::_iabs: + case vmIntrinsics::_labs: case vmIntrinsics::_dsqrt: case vmIntrinsics::_dsin: case vmIntrinsics::_dcos: @@ -567,6 +573,9 @@ case vmIntrinsics::_doubleToRawLongBits: case vmIntrinsics::_longBitsToDouble: case vmIntrinsics::_dabs: + case vmIntrinsics::_fabs: + case vmIntrinsics::_iabs: + case vmIntrinsics::_labs: case vmIntrinsics::_dsqrt: case vmIntrinsics::_dsin: case vmIntrinsics::_dcos: diff -r 98558b7544c7 -r 1851a532ddfe 
src/hotspot/share/classfile/vmSymbols.hpp --- a/src/hotspot/share/classfile/vmSymbols.hpp Tue May 07 21:53:46 2019 +0200 +++ b/src/hotspot/share/classfile/vmSymbols.hpp Tue May 07 13:33:27 2019 -0700 @@ -472,6 +472,7 @@ template(float_int_signature, "(F)I") \ template(double_long_signature, "(D)J") \ template(double_double_signature, "(D)D") \ + template(float_float_signature, "(F)F") \ template(int_float_signature, "(I)F") \ template(long_int_signature, "(J)I") \ template(long_long_signature, "(J)J") \ @@ -771,6 +772,9 @@ do_name(fma_name, "fma") \ \ do_intrinsic(_dabs, java_lang_Math, abs_name, double_double_signature, F_S) \ + do_intrinsic(_fabs, java_lang_Math, abs_name, float_float_signature, F_S) \ + do_intrinsic(_iabs, java_lang_Math, abs_name, int_int_signature, F_S) \ + do_intrinsic(_labs, java_lang_Math, abs_name, long_long_signature, F_S) \ do_intrinsic(_dsin, java_lang_Math, sin_name, double_double_signature, F_S) \ do_intrinsic(_dcos, java_lang_Math, cos_name, double_double_signature, F_S) \ do_intrinsic(_dtan, java_lang_Math, tan_name, double_double_signature, F_S) \ diff -r 98558b7544c7 -r 1851a532ddfe src/hotspot/share/opto/c2compiler.cpp --- a/src/hotspot/share/opto/c2compiler.cpp Tue May 07 21:53:46 2019 +0200 +++ b/src/hotspot/share/opto/c2compiler.cpp Tue May 07 13:33:27 2019 -0700 @@ -460,6 +460,9 @@ case vmIntrinsics::_dcos: case vmIntrinsics::_dtan: case vmIntrinsics::_dabs: + case vmIntrinsics::_fabs: + case vmIntrinsics::_iabs: + case vmIntrinsics::_labs: case vmIntrinsics::_datan2: case vmIntrinsics::_dsqrt: case vmIntrinsics::_dexp: diff -r 98558b7544c7 -r 1851a532ddfe src/hotspot/share/opto/classes.hpp --- a/src/hotspot/share/opto/classes.hpp Tue May 07 21:53:46 2019 +0200 +++ b/src/hotspot/share/opto/classes.hpp Tue May 07 13:33:27 2019 -0700 @@ -30,6 +30,7 @@ macro(AbsD) macro(AbsF) macro(AbsI) +macro(AbsL) macro(AddD) macro(AddF) macro(AddI) @@ -335,6 +336,7 @@ macro(SubVL) macro(SubVF) macro(SubVD) +macro(MulVB) macro(MulVS) 
macro(MulVI) macro(MulReductionVI) @@ -349,6 +351,10 @@ macro(FmaVF) macro(DivVF) macro(DivVD) +macro(AbsVB) +macro(AbsVS) +macro(AbsVI) +macro(AbsVL) macro(AbsVF) macro(AbsVD) macro(NegVF) diff -r 98558b7544c7 -r 1851a532ddfe src/hotspot/share/opto/library_call.cpp --- a/src/hotspot/share/opto/library_call.cpp Tue May 07 21:53:46 2019 +0200 +++ b/src/hotspot/share/opto/library_call.cpp Tue May 07 13:33:27 2019 -0700 @@ -227,6 +227,7 @@ bool runtime_math(const TypeFunc* call_type, address funcAddr, const char* funcName); bool inline_math_native(vmIntrinsics::ID id); bool inline_math(vmIntrinsics::ID id); + bool inline_double_math(vmIntrinsics::ID id); template bool inline_math_overflow(Node* arg1, Node* arg2); void inline_math_mathExact(Node* math, Node* test); @@ -533,6 +534,9 @@ case vmIntrinsics::_dcos: case vmIntrinsics::_dtan: case vmIntrinsics::_dabs: + case vmIntrinsics::_fabs: + case vmIntrinsics::_iabs: + case vmIntrinsics::_labs: case vmIntrinsics::_datan2: case vmIntrinsics::_dsqrt: case vmIntrinsics::_dexp: @@ -1793,7 +1797,7 @@ // public static double Math.sqrt(double) // public static double Math.log(double) // public static double Math.log10(double) -bool LibraryCallKit::inline_math(vmIntrinsics::ID id) { +bool LibraryCallKit::inline_double_math(vmIntrinsics::ID id) { Node* arg = round_double_node(argument(0)); Node* n = NULL; switch (id) { @@ -1805,6 +1809,23 @@ return true; } +//------------------------------inline_math----------------------------------- +// public static float Math.abs(float) +// public static int Math.abs(int) +// public static long Math.abs(long) +bool LibraryCallKit::inline_math(vmIntrinsics::ID id) { + Node* arg = argument(0); + Node* n = NULL; + switch (id) { + case vmIntrinsics::_fabs: n = new AbsFNode( arg); break; + case vmIntrinsics::_iabs: n = new AbsINode( arg); break; + case vmIntrinsics::_labs: n = new AbsLNode( arg); break; + default: fatal_unexpected_iid(id); break; + } + set_result(_gvn.transform(n)); + return 
true; +} + //------------------------------runtime_math----------------------------- bool LibraryCallKit::runtime_math(const TypeFunc* call_type, address funcAddr, const char* funcName) { assert(call_type == OptoRuntime::Math_DD_D_Type() || call_type == OptoRuntime::Math_D_D_Type(), @@ -1855,8 +1876,11 @@ runtime_math(OptoRuntime::Math_D_D_Type(), FN_PTR(SharedRuntime::dlog10), "LOG10"); // These intrinsics are supported on all hardware - case vmIntrinsics::_dsqrt: return Matcher::match_rule_supported(Op_SqrtD) ? inline_math(id) : false; - case vmIntrinsics::_dabs: return Matcher::has_match_rule(Op_AbsD) ? inline_math(id) : false; + case vmIntrinsics::_dsqrt: return Matcher::match_rule_supported(Op_SqrtD) ? inline_double_math(id) : false; + case vmIntrinsics::_dabs: return Matcher::has_match_rule(Op_AbsD) ? inline_double_math(id) : false; + case vmIntrinsics::_fabs: return Matcher::match_rule_supported(Op_AbsF) ? inline_math(id) : false; + case vmIntrinsics::_iabs: return Matcher::match_rule_supported(Op_AbsI) ? inline_math(id) : false; + case vmIntrinsics::_labs: return Matcher::match_rule_supported(Op_AbsL) ? inline_math(id) : false; case vmIntrinsics::_dexp: return StubRoutines::dexp() != NULL ? diff -r 98558b7544c7 -r 1851a532ddfe src/hotspot/share/opto/subnode.hpp --- a/src/hotspot/share/opto/subnode.hpp Tue May 07 21:53:46 2019 +0200 +++ b/src/hotspot/share/opto/subnode.hpp Tue May 07 13:33:27 2019 -0700 @@ -350,6 +350,17 @@ virtual uint ideal_reg() const { return Op_RegI; } }; +//------------------------------AbsLNode--------------------------------------- +// Absolute value a long. Since a naive graph involves control flow, we +// "match" it in the ideal world (so the control flow can be removed). 
+class AbsLNode : public AbsNode { +public: + AbsLNode( Node *in1 ) : AbsNode(in1) {} + virtual int Opcode() const; + const Type *bottom_type() const { return TypeLong::LONG; } + virtual uint ideal_reg() const { return Op_RegL; } +}; + //------------------------------AbsFNode--------------------------------------- // Absolute value a float, a common float-point idiom with a cheap hardware // implemention on most chips. Since a naive graph involves control flow, we diff -r 98558b7544c7 -r 1851a532ddfe src/hotspot/share/opto/superword.cpp --- a/src/hotspot/share/opto/superword.cpp Tue May 07 21:53:46 2019 +0200 +++ b/src/hotspot/share/opto/superword.cpp Tue May 07 13:33:27 2019 -0700 @@ -2453,6 +2453,7 @@ } } else if (opc == Op_SqrtF || opc == Op_SqrtD || opc == Op_AbsF || opc == Op_AbsD || + opc == Op_AbsI || opc == Op_AbsL || opc == Op_NegF || opc == Op_NegD || opc == Op_PopCountI) { assert(n->req() == 2, "only one input expected"); diff -r 98558b7544c7 -r 1851a532ddfe src/hotspot/share/opto/vectornode.cpp --- a/src/hotspot/share/opto/vectornode.cpp Tue May 07 21:53:46 2019 +0200 +++ b/src/hotspot/share/opto/vectornode.cpp Tue May 07 13:33:27 2019 -0700 @@ -70,8 +70,8 @@ return Op_SubVD; case Op_MulI: switch (bt) { - case T_BOOLEAN: - case T_BYTE: return 0; // Unimplemented + case T_BOOLEAN:return 0; + case T_BYTE: return Op_MulVB; case T_CHAR: case T_SHORT: return Op_MulVS; case T_INT: return Op_MulVI; @@ -104,6 +104,18 @@ case Op_DivD: assert(bt == T_DOUBLE, "must be"); return Op_DivVD; + case Op_AbsI: + switch (bt) { + case T_BOOLEAN: + case T_CHAR: return 0; // abs does not make sense for unsigned + case T_BYTE: return Op_AbsVB; + case T_SHORT: return Op_AbsVS; + case T_INT: return Op_AbsVI; + default: ShouldNotReachHere(); return 0; + } + case Op_AbsL: + assert(bt == T_LONG, "must be"); + return Op_AbsVL; case Op_AbsF: assert(bt == T_FLOAT, "must be"); return Op_AbsVF; @@ -350,6 +362,7 @@ case Op_SubVF: return new SubVFNode(n1, n2, vt); case Op_SubVD: return 
new SubVDNode(n1, n2, vt); + case Op_MulVB: return new MulVBNode(n1, n2, vt); case Op_MulVS: return new MulVSNode(n1, n2, vt); case Op_MulVI: return new MulVINode(n1, n2, vt); case Op_MulVL: return new MulVLNode(n1, n2, vt); @@ -359,6 +372,10 @@ case Op_DivVF: return new DivVFNode(n1, n2, vt); case Op_DivVD: return new DivVDNode(n1, n2, vt); + case Op_AbsVB: return new AbsVBNode(n1, vt); + case Op_AbsVS: return new AbsVSNode(n1, vt); + case Op_AbsVI: return new AbsVINode(n1, vt); + case Op_AbsVL: return new AbsVLNode(n1, vt); case Op_AbsVF: return new AbsVFNode(n1, vt); case Op_AbsVD: return new AbsVDNode(n1, vt); diff -r 98558b7544c7 -r 1851a532ddfe src/hotspot/share/opto/vectornode.hpp --- a/src/hotspot/share/opto/vectornode.hpp Tue May 07 21:53:46 2019 +0200 +++ b/src/hotspot/share/opto/vectornode.hpp Tue May 07 13:33:27 2019 -0700 @@ -224,6 +224,14 @@ virtual int Opcode() const; }; +//------------------------------MulVBNode-------------------------------------- +// Vector multiply byte +class MulVBNode : public VectorNode { + public: + MulVBNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1, in2, vt) {} + virtual int Opcode() const; +}; + //------------------------------MulVSNode-------------------------------------- // Vector multiply short class MulVSNode : public VectorNode { @@ -360,6 +368,38 @@ virtual int Opcode() const; }; +//------------------------------AbsVBNode-------------------------------------- +// Vector Abs byte +class AbsVBNode : public VectorNode { +public: + AbsVBNode(Node* in, const TypeVect* vt) : VectorNode(in, vt) {} + virtual int Opcode() const; +}; + +//------------------------------AbsVSNode-------------------------------------- +// Vector Abs short +class AbsVSNode : public VectorNode { +public: + AbsVSNode(Node* in, const TypeVect* vt) : VectorNode(in, vt) {} + virtual int Opcode() const; +}; + +//------------------------------AbsVINode-------------------------------------- +// Vector Abs int +class AbsVINode : public 
VectorNode { +public: + AbsVINode(Node* in, const TypeVect* vt) : VectorNode(in, vt) {} + virtual int Opcode() const; +}; + +//------------------------------AbsVLNode-------------------------------------- +// Vector Abs long +class AbsVLNode : public VectorNode { +public: + AbsVLNode(Node* in, const TypeVect* vt) : VectorNode(in, vt) {} + virtual int Opcode() const; +}; + //------------------------------AbsVFNode-------------------------------------- // Vector Abs float class AbsVFNode : public VectorNode { diff -r 98558b7544c7 -r 1851a532ddfe src/hotspot/share/runtime/vmStructs.cpp --- a/src/hotspot/share/runtime/vmStructs.cpp Tue May 07 21:53:46 2019 +0200 +++ b/src/hotspot/share/runtime/vmStructs.cpp Tue May 07 13:33:27 2019 -0700 @@ -1758,6 +1758,10 @@ declare_c2_type(ReverseBytesLNode, Node) \ declare_c2_type(ReductionNode, Node) \ declare_c2_type(VectorNode, Node) \ + declare_c2_type(AbsVBNode, VectorNode) \ + declare_c2_type(AbsVSNode, VectorNode) \ + declare_c2_type(AbsVINode, VectorNode) \ + declare_c2_type(AbsVLNode, VectorNode) \ declare_c2_type(AddVBNode, VectorNode) \ declare_c2_type(AddVSNode, VectorNode) \ declare_c2_type(AddVINode, VectorNode) \ @@ -1774,6 +1778,7 @@ declare_c2_type(SubVLNode, VectorNode) \ declare_c2_type(SubVFNode, VectorNode) \ declare_c2_type(SubVDNode, VectorNode) \ + declare_c2_type(MulVBNode, VectorNode) \ declare_c2_type(MulVSNode, VectorNode) \ declare_c2_type(MulVLNode, VectorNode) \ declare_c2_type(MulReductionVLNode, ReductionNode) \ @@ -1782,6 +1787,8 @@ declare_c2_type(MulVFNode, VectorNode) \ declare_c2_type(MulReductionVFNode, ReductionNode) \ declare_c2_type(MulVDNode, VectorNode) \ + declare_c2_type(NegVFNode, VectorNode) \ + declare_c2_type(NegVDNode, VectorNode) \ declare_c2_type(FmaVDNode, VectorNode) \ declare_c2_type(FmaVFNode, VectorNode) \ declare_c2_type(CMoveVFNode, VectorNode) \ diff -r 98558b7544c7 -r 1851a532ddfe src/java.base/share/classes/java/lang/Math.java --- 
a/src/java.base/share/classes/java/lang/Math.java Tue May 07 21:53:46 2019 +0200 +++ b/src/java.base/share/classes/java/lang/Math.java Tue May 07 13:33:27 2019 -0700 @@ -1353,6 +1353,7 @@ * @param a the argument whose absolute value is to be determined * @return the absolute value of the argument. */ + @HotSpotIntrinsicCandidate public static int abs(int a) { return (a < 0) ? -a : a; } @@ -1370,6 +1371,7 @@ * @param a the argument whose absolute value is to be determined * @return the absolute value of the argument. */ + @HotSpotIntrinsicCandidate public static long abs(long a) { return (a < 0) ? -a : a; } @@ -1394,6 +1396,7 @@ * @param a the argument whose absolute value is to be determined * @return the absolute value of the argument. */ + @HotSpotIntrinsicCandidate public static float abs(float a) { return (a <= 0.0F) ? 0.0F - a : a; } diff -r 98558b7544c7 -r 1851a532ddfe src/jdk.internal.vm.compiler/share/classes/org.graalvm.compiler.hotspot.test/src/org/graalvm/compiler/hotspot/test/CheckGraalIntrinsics.java --- a/src/jdk.internal.vm.compiler/share/classes/org.graalvm.compiler.hotspot.test/src/org/graalvm/compiler/hotspot/test/CheckGraalIntrinsics.java Tue May 07 21:53:46 2019 +0200 +++ b/src/jdk.internal.vm.compiler/share/classes/org.graalvm.compiler.hotspot.test/src/org/graalvm/compiler/hotspot/test/CheckGraalIntrinsics.java Tue May 07 13:33:27 2019 -0700 @@ -398,6 +398,9 @@ if (isJDK13OrHigher()) { add(toBeInvestigated, + "java/lang/Math.abs(F)F", + "java/lang/Math.abs(I)I", + "java/lang/Math.abs(J)J", "java/lang/Math.max(DD)D", "java/lang/Math.max(FF)F", "java/lang/Math.min(DD)D", diff -r 98558b7544c7 -r 1851a532ddfe test/hotspot/jtreg/compiler/c2/cr6340864/TestDoubleVect.java --- a/test/hotspot/jtreg/compiler/c2/cr6340864/TestDoubleVect.java Tue May 07 21:53:46 2019 +0200 +++ b/test/hotspot/jtreg/compiler/c2/cr6340864/TestDoubleVect.java Tue May 07 13:33:27 2019 -0700 @@ -86,6 +86,7 @@ test_divc_n(a0, a1); test_divv(a0, a1, -VALUE); test_diva(a0, a1, a3); 
+ test_negc(a0, a1); } // Test and verify results System.out.println("Verification"); @@ -339,6 +340,16 @@ for (int i=12; i 0) @@ -469,6 +481,13 @@ end = System.currentTimeMillis(); System.out.println("test_diva_n: " + (end - start)); + start = System.currentTimeMillis(); + for (int i=0; i