# HG changeset patch # User srukmannagar # Date 1522113372 25200 # Node ID 3b1570be85578fa33e1d050dbdea6758fa2d2d36 # Parent 537ef53e26af061bbc8621d631a6fe81902ef6ec 8200067: Add support for vpclmulqdq for crc32 Reviewed-by: kvn diff -r 537ef53e26af -r 3b1570be8557 src/hotspot/cpu/x86/assembler_x86.cpp --- a/src/hotspot/cpu/x86/assembler_x86.cpp Mon Mar 26 17:40:54 2018 -0700 +++ b/src/hotspot/cpu/x86/assembler_x86.cpp Mon Mar 26 18:16:12 2018 -0700 @@ -4080,6 +4080,16 @@ emit_operand(dst, src); emit_int8(mode & 0xFF); } +void Assembler::evshufi64x2(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8, int vector_len) { + assert(VM_Version::supports_evex(), "requires EVEX support"); + assert(vector_len == Assembler::AVX_256bit || vector_len == Assembler::AVX_512bit, ""); + InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); + emit_int8(0x43); + emit_int8((unsigned char)(0xC0 | encode)); + emit_int8(imm8 & 0xFF); +} void Assembler::psrldq(XMMRegister dst, int shift) { // Shift left 128 bit value in dst XMMRegister by shift number of bytes. 
@@ -6201,6 +6211,27 @@ emit_operand(dst, src); } +void Assembler::evpxorq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + assert(VM_Version::supports_evex(), "requires EVEX support"); + InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int8((unsigned char)0xEF); + emit_int8((unsigned char)(0xC0 | encode)); +} + +void Assembler::evpxorq(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { + assert(VM_Version::supports_evex(), "requires EVEX support"); + assert(dst != xnoreg, "sanity"); + InstructionMark im(this); + InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit); + vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int8((unsigned char)0xEF); + emit_operand(dst, src); +} + // vinserti forms @@ -6786,6 +6817,16 @@ emit_int8((unsigned char)mask); } +void Assembler::evpclmulqdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int mask, int vector_len) { + assert(VM_Version::supports_vpclmulqdq(), "Requires vector carryless multiplication support"); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); + emit_int8(0x44); + emit_int8((unsigned char)(0xC0 | encode)); + emit_int8((unsigned char)mask); +} + void Assembler::vzeroupper() { if (VM_Version::supports_vzeroupper()) { 
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); diff -r 537ef53e26af -r 3b1570be8557 src/hotspot/cpu/x86/assembler_x86.hpp --- a/src/hotspot/cpu/x86/assembler_x86.hpp Mon Mar 26 17:40:54 2018 -0700 +++ b/src/hotspot/cpu/x86/assembler_x86.hpp Mon Mar 26 18:16:12 2018 -0700 @@ -1663,6 +1663,9 @@ void pshuflw(XMMRegister dst, XMMRegister src, int mode); void pshuflw(XMMRegister dst, Address src, int mode); + // Shuffle packed values at 128 bit granularity + void evshufi64x2(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8, int vector_len); + // Shift Right by bytes Logical DoubleQuadword Immediate void psrldq(XMMRegister dst, int shift); // Shift Left by bytes Logical DoubleQuadword Immediate @@ -2046,6 +2049,9 @@ void pxor(XMMRegister dst, XMMRegister src); void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); void vpxor(XMMRegister dst, XMMRegister nds, Address src, int vector_len); + void evpxorq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void evpxorq(XMMRegister dst, XMMRegister nds, Address src, int vector_len); + // vinserti forms void vinserti128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8); @@ -2108,7 +2114,7 @@ // Carry-Less Multiplication Quadword void pclmulqdq(XMMRegister dst, XMMRegister src, int mask); void vpclmulqdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int mask); - + void evpclmulqdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int mask, int vector_len); // AVX instruction which is used to clear upper 128 bits of YMM registers and // to avoid transaction penalty between AVX and SSE states. 
There is no // penalty if legacy SSE instructions are encoded using VEX prefix because diff -r 537ef53e26af -r 3b1570be8557 src/hotspot/cpu/x86/macroAssembler_x86.cpp --- a/src/hotspot/cpu/x86/macroAssembler_x86.cpp Mon Mar 26 17:40:54 2018 -0700 +++ b/src/hotspot/cpu/x86/macroAssembler_x86.cpp Mon Mar 26 18:16:12 2018 -0700 @@ -10120,6 +10120,16 @@ } /** +* Fold four 128-bit data chunks +*/ +void MacroAssembler::fold_128bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) { + evpclmulhdq(xtmp, xK, xcrc, Assembler::AVX_512bit); // [127:64] + evpclmulldq(xcrc, xK, xcrc, Assembler::AVX_512bit); // [63:0] + evpxorq(xcrc, xcrc, Address(buf, offset), Assembler::AVX_512bit /* vector_len */); + evpxorq(xcrc, xcrc, xtmp, Assembler::AVX_512bit /* vector_len */); +} + +/** * Fold 128-bit data chunk */ void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) { @@ -10224,6 +10234,34 @@ shrl(len, 4); jcc(Assembler::zero, L_tail_restore); + // Fold total 512 bits of polynomial on each iteration + if (VM_Version::supports_vpclmulqdq()) { + Label Parallel_loop, L_No_Parallel; + + cmpl(len, 8); + jccb(Assembler::less, L_No_Parallel); + + movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32)); + evmovdquq(xmm1, Address(buf, 0), Assembler::AVX_512bit); + movdl(xmm5, crc); + evpxorq(xmm1, xmm1, xmm5, Assembler::AVX_512bit); + addptr(buf, 64); + subl(len, 7); + evshufi64x2(xmm0, xmm0, xmm0, 0x00, Assembler::AVX_512bit); // propagate the mask from 128 bits to 512 bits + + BIND(Parallel_loop); + fold_128bit_crc32_avx512(xmm1, xmm0, xmm5, buf, 0); + addptr(buf, 64); + subl(len, 4); + jcc(Assembler::greater, Parallel_loop); + + vextracti64x2(xmm2, xmm1, 0x01); + vextracti64x2(xmm3, xmm1, 0x02); + vextracti64x2(xmm4, xmm1, 0x03); + jmp(L_fold_512b); + + BIND(L_No_Parallel); + } // Fold crc into first bytes of vector movdqa(xmm1, Address(buf, 0)); movdl(rax, xmm1); diff
-r 537ef53e26af -r 3b1570be8557 src/hotspot/cpu/x86/macroAssembler_x86.hpp --- a/src/hotspot/cpu/x86/macroAssembler_x86.hpp Mon Mar 26 17:40:54 2018 -0700 +++ b/src/hotspot/cpu/x86/macroAssembler_x86.hpp Mon Mar 26 18:16:12 2018 -0700 @@ -1498,6 +1498,14 @@ // 0x11 - multiply upper 64 bits [64:127] Assembler::vpclmulqdq(dst, nds, src, 0x11); } + void evpclmulldq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + // 0x00 - multiply lower 64 bits [0:63] + Assembler::evpclmulqdq(dst, nds, src, 0x00, vector_len); + } + void evpclmulhdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + // 0x11 - multiply upper 64 bits [64:127] + Assembler::evpclmulqdq(dst, nds, src, 0x11, vector_len); + } // Data @@ -1723,6 +1731,7 @@ // Fold 8-bit data void fold_8bit_crc32(Register crc, Register table, Register tmp); void fold_8bit_crc32(XMMRegister crc, Register table, XMMRegister xtmp, Register tmp); + void fold_128bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset); // Compress char[] array to byte[]. 
void char_array_compress(Register src, Register dst, Register len, diff -r 537ef53e26af -r 3b1570be8557 src/hotspot/cpu/x86/vm_version_x86.cpp --- a/src/hotspot/cpu/x86/vm_version_x86.cpp Mon Mar 26 17:40:54 2018 -0700 +++ b/src/hotspot/cpu/x86/vm_version_x86.cpp Mon Mar 26 18:16:12 2018 -0700 @@ -665,6 +665,7 @@ _features &= ~CPU_AVX512BW; _features &= ~CPU_AVX512VL; _features &= ~CPU_AVX512_VPOPCNTDQ; + _features &= ~CPU_VPCLMULQDQ; } if (UseAVX < 2) diff -r 537ef53e26af -r 3b1570be8557 src/hotspot/cpu/x86/vm_version_x86.hpp --- a/src/hotspot/cpu/x86/vm_version_x86.hpp Mon Mar 26 17:40:54 2018 -0700 +++ b/src/hotspot/cpu/x86/vm_version_x86.hpp Mon Mar 26 18:16:12 2018 -0700 @@ -334,6 +334,7 @@ #define CPU_FMA ((uint64_t)UCONST64(0x800000000)) // FMA instructions #define CPU_VZEROUPPER ((uint64_t)UCONST64(0x1000000000)) // Vzeroupper instruction #define CPU_AVX512_VPOPCNTDQ ((uint64_t)UCONST64(0x2000000000)) // Vector popcount +#define CPU_VPCLMULQDQ ((uint64_t)UCONST64(0x4000000000)) // Vector carryless multiplication enum Extended_Family { // AMD @@ -542,6 +543,8 @@ result |= CPU_AVX512VL; if (_cpuid_info.sef_cpuid7_ecx.bits.avx512_vpopcntdq != 0) result |= CPU_AVX512_VPOPCNTDQ; + if (_cpuid_info.sef_cpuid7_ecx.bits.vpclmulqdq != 0) + result |= CPU_VPCLMULQDQ; } } if(_cpuid_info.sef_cpuid7_ebx.bits.bmi1 != 0) @@ -819,6 +822,7 @@ static bool supports_fma() { return (_features & CPU_FMA) != 0 && supports_avx(); } static bool supports_vzeroupper() { return (_features & CPU_VZEROUPPER) != 0; } static bool supports_vpopcntdq() { return (_features & CPU_AVX512_VPOPCNTDQ) != 0; } + static bool supports_vpclmulqdq() { return (_features & CPU_VPCLMULQDQ) != 0; } // Intel features static bool is_intel_family_core() { return is_intel() &&