--- a/src/hotspot/cpu/x86/assembler_x86.cpp Mon Mar 26 17:40:54 2018 -0700
+++ b/src/hotspot/cpu/x86/assembler_x86.cpp Mon Mar 26 18:16:12 2018 -0700
@@ -4080,6 +4080,16 @@
emit_operand(dst, src);
emit_int8(mode & 0xFF);
}
+
+void Assembler::evshufi64x2(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8, int vector_len) {
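+ // VSHUFI64X2 (EVEX.66.0F3A.W1 43 /r ib): select 128-bit lanes from nds and src according to imm8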
+ assert(VM_Version::supports_evex(), "requires EVEX support");
+ assert(vector_len == Assembler::AVX_256bit || vector_len == Assembler::AVX_512bit, "unsupported vector length");
+ InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
+ attributes.set_is_evex_instruction();
+ int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+ emit_int8(0x43);
+ emit_int8((unsigned char)(0xC0 | encode));
+ emit_int8(imm8 & 0xFF);
+}
+
void Assembler::psrldq(XMMRegister dst, int shift) {
// Shift right 128 bit value in dst XMMRegister by shift number of bytes.
@@ -6201,6 +6211,27 @@
emit_operand(dst, src);
}
+
+void Assembler::evpxorq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+ assert(VM_Version::supports_evex(), "requires EVEX support");
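+ // EVEX-encoded VPXORQ (66 0F EF /r, W1): packed quadword xor, register-register form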
+ InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
+ attributes.set_is_evex_instruction();
+ int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+ emit_int8((unsigned char)0xEF);
+ emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::evpxorq(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+ assert(VM_Version::supports_evex(), "requires EVEX support");
+ assert(dst != xnoreg, "sanity");
+ InstructionMark im(this);
+ InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
+ attributes.set_is_evex_instruction();
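+ // EVEX_FV/EVEX_64bit: full-vector tuple with 64-bit elements, so the
+ // encoder can use a compressed (disp8*N) displacement for this operand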
+ attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit);
+ vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+ emit_int8((unsigned char)0xEF);
+ emit_operand(dst, src);
+}
+
// vinserti forms
@@ -6786,6 +6817,16 @@
emit_int8((unsigned char)mask);
}
+
+void Assembler::evpclmulqdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int mask, int vector_len) {
+ assert(VM_Version::supports_vpclmulqdq(), "requires vector carryless multiplication support");
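+ // EVEX-encoded VPCLMULQDQ: the imm8 mask selects which 64-bit half of each
+ // 128-bit lane is multiplied (0x00 = low x low, 0x11 = high x high)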
+ InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
+ attributes.set_is_evex_instruction();
+ int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+ emit_int8(0x44);
+ emit_int8((unsigned char)(0xC0 | encode));
+ emit_int8((unsigned char)mask);
+}
+
void Assembler::vzeroupper() {
if (VM_Version::supports_vzeroupper()) {
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
--- a/src/hotspot/cpu/x86/assembler_x86.hpp Mon Mar 26 17:40:54 2018 -0700
+++ b/src/hotspot/cpu/x86/assembler_x86.hpp Mon Mar 26 18:16:12 2018 -0700
@@ -1663,6 +1663,9 @@
void pshuflw(XMMRegister dst, XMMRegister src, int mode);
void pshuflw(XMMRegister dst, Address src, int mode);
+ // Shuffle packed values at 128-bit granularity
+ void evshufi64x2(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8, int vector_len);
+
// Shift Right by bytes Logical DoubleQuadword Immediate
void psrldq(XMMRegister dst, int shift);
// Shift Left by bytes Logical DoubleQuadword Immediate
@@ -2046,6 +2049,9 @@
void pxor(XMMRegister dst, XMMRegister src);
void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vpxor(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
+ void evpxorq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+ void evpxorq(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
+
// vinserti forms
void vinserti128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
@@ -2108,7 +2114,7 @@
// Carry-Less Multiplication Quadword
void pclmulqdq(XMMRegister dst, XMMRegister src, int mask);
void vpclmulqdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int mask);
-
+ void evpclmulqdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int mask, int vector_len);
// AVX instruction which is used to clear upper 128 bits of YMM registers and
// to avoid transaction penalty between AVX and SSE states. There is no
// penalty if legacy SSE instructions are encoded using VEX prefix because
--- a/src/hotspot/cpu/x86/macroAssembler_x86.cpp Mon Mar 26 17:40:54 2018 -0700
+++ b/src/hotspot/cpu/x86/macroAssembler_x86.cpp Mon Mar 26 18:16:12 2018 -0700
@@ -10120,6 +10120,16 @@
}
/**
+ * Fold four 128-bit data chunks
+ */
+void MacroAssembler::fold_128bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
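+ // Multiply the high and low 64-bit halves of each 128-bit lane of xcrc by
+ // the fold constants in xK, then xor both products with the next 64 bytes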
+ evpclmulhdq(xtmp, xK, xcrc, Assembler::AVX_512bit); // [127:64]
+ evpclmulldq(xcrc, xK, xcrc, Assembler::AVX_512bit); // [63:0]
+ evpxorq(xcrc, xcrc, Address(buf, offset), Assembler::AVX_512bit /* vector_len */);
+ evpxorq(xcrc, xcrc, xtmp, Assembler::AVX_512bit /* vector_len */);
+}
+
+/**
* Fold 128-bit data chunk
*/
void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
@@ -10224,6 +10234,34 @@
shrl(len, 4);
jcc(Assembler::zero, L_tail_restore);
+ // Fold total 512 bits of polynomial on each iteration
+ if (VM_Version::supports_vpclmulqdq()) {
+ Label Parallel_loop, L_No_Parallel;
+
+ cmpl(len, 8);
+ jccb(Assembler::less, L_No_Parallel);
+
+ movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32));
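+ // load 128 bits of fold constants (offset 32 in the masks table); broadcast below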
+ evmovdquq(xmm1, Address(buf, 0), Assembler::AVX_512bit);
+ movdl(xmm5, crc);
+ evpxorq(xmm1, xmm1, xmm5, Assembler::AVX_512bit);
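+ // xor the incoming CRC value into the low 32 bits of the first chunk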
+ addptr(buf, 64);
+ subl(len, 7);
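+ // len is in 16-byte chunks (see shrl(len, 4) above); the 64-byte load above
+ // accounts for 4 of the 7 subtracted here, the rest stays in reserve for the tail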
+ evshufi64x2(xmm0, xmm0, xmm0, 0x00, Assembler::AVX_512bit); // broadcast the 128-bit fold constants to all four 128-bit lanes
+
+ BIND(Parallel_loop);
+ fold_128bit_crc32_avx512(xmm1, xmm0, xmm5, buf, 0);
+ addptr(buf, 64);
+ subl(len, 4);
+ jcc(Assembler::greater, Parallel_loop);
+
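+ // split the 512-bit accumulator into four 128-bit registers and rejoin
+ // the existing 512-bit folding code at L_fold_512b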
+ vextracti64x2(xmm2, xmm1, 0x01);
+ vextracti64x2(xmm3, xmm1, 0x02);
+ vextracti64x2(xmm4, xmm1, 0x03);
+ jmp(L_fold_512b);
+
+ BIND(L_No_Parallel);
+ }
// Fold crc into first bytes of vector
movdqa(xmm1, Address(buf, 0));
movdl(rax, xmm1);
--- a/src/hotspot/cpu/x86/macroAssembler_x86.hpp Mon Mar 26 17:40:54 2018 -0700
+++ b/src/hotspot/cpu/x86/macroAssembler_x86.hpp Mon Mar 26 18:16:12 2018 -0700
@@ -1498,6 +1498,14 @@
// 0x11 - multiply upper 64 bits [64:127]
Assembler::vpclmulqdq(dst, nds, src, 0x11);
}
+ void evpclmulldq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+ // 0x00 - multiply lower 64 bits [0:63]
+ Assembler::evpclmulqdq(dst, nds, src, 0x00, vector_len);
+ }
+ void evpclmulhdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+ // 0x11 - multiply upper 64 bits [64:127]
+ Assembler::evpclmulqdq(dst, nds, src, 0x11, vector_len);
+ }
// Data
@@ -1723,6 +1731,7 @@
// Fold 8-bit data
void fold_8bit_crc32(Register crc, Register table, Register tmp);
void fold_8bit_crc32(XMMRegister crc, Register table, XMMRegister xtmp, Register tmp);
+ void fold_128bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset);
// Compress char[] array to byte[].
void char_array_compress(Register src, Register dst, Register len,
--- a/src/hotspot/cpu/x86/vm_version_x86.cpp Mon Mar 26 17:40:54 2018 -0700
+++ b/src/hotspot/cpu/x86/vm_version_x86.cpp Mon Mar 26 18:16:12 2018 -0700
@@ -665,6 +665,7 @@
_features &= ~CPU_AVX512BW;
_features &= ~CPU_AVX512VL;
_features &= ~CPU_AVX512_VPOPCNTDQ;
+ _features &= ~CPU_VPCLMULQDQ;
}
if (UseAVX < 2)
--- a/src/hotspot/cpu/x86/vm_version_x86.hpp Mon Mar 26 17:40:54 2018 -0700
+++ b/src/hotspot/cpu/x86/vm_version_x86.hpp Mon Mar 26 18:16:12 2018 -0700
@@ -334,6 +334,7 @@
#define CPU_FMA ((uint64_t)UCONST64(0x800000000)) // FMA instructions
#define CPU_VZEROUPPER ((uint64_t)UCONST64(0x1000000000)) // Vzeroupper instruction
#define CPU_AVX512_VPOPCNTDQ ((uint64_t)UCONST64(0x2000000000)) // Vector popcount
+#define CPU_VPCLMULQDQ ((uint64_t)UCONST64(0x4000000000)) // Vector carryless multiplication
enum Extended_Family {
// AMD
@@ -542,6 +543,8 @@
result |= CPU_AVX512VL;
if (_cpuid_info.sef_cpuid7_ecx.bits.avx512_vpopcntdq != 0)
result |= CPU_AVX512_VPOPCNTDQ;
+ if (_cpuid_info.sef_cpuid7_ecx.bits.vpclmulqdq != 0)
+ result |= CPU_VPCLMULQDQ;
}
}
if(_cpuid_info.sef_cpuid7_ebx.bits.bmi1 != 0)
@@ -819,6 +822,7 @@
static bool supports_fma() { return (_features & CPU_FMA) != 0 && supports_avx(); }
static bool supports_vzeroupper() { return (_features & CPU_VZEROUPPER) != 0; }
static bool supports_vpopcntdq() { return (_features & CPU_AVX512_VPOPCNTDQ) != 0; }
+ static bool supports_vpclmulqdq() { return (_features & CPU_VPCLMULQDQ) != 0; }
// Intel features
static bool is_intel_family_core() { return is_intel() &&