# HG changeset patch
# User srukmannagar
# Date 1522113372 25200
# Node ID 3b1570be85578fa33e1d050dbdea6758fa2d2d36
# Parent  537ef53e26af061bbc8621d631a6fe81902ef6ec
8200067: Add support for vpclmulqdq for crc32
Reviewed-by: kvn

diff -r 537ef53e26af -r 3b1570be8557 src/hotspot/cpu/x86/assembler_x86.cpp
--- a/src/hotspot/cpu/x86/assembler_x86.cpp	Mon Mar 26 17:40:54 2018 -0700
+++ b/src/hotspot/cpu/x86/assembler_x86.cpp	Mon Mar 26 18:16:12 2018 -0700
@@ -4080,6 +4080,16 @@
   emit_operand(dst, src);
   emit_int8(mode & 0xFF);
 }
+void Assembler::evshufi64x2(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8, int vector_len) { // Emit the register-register form of the EVEX shuffle (66 prefix, 0F 3A map, opcode 0x43) followed by an 8-bit immediate.
+  assert(VM_Version::supports_evex(), "requires EVEX support");
+  assert(vector_len == Assembler::AVX_256bit || vector_len == Assembler::AVX_512bit, ""); // only 256-bit and 512-bit vector lengths are accepted
+  InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); // vex_w mirrors supports_evex(), which the assert above requires to be true
+  attributes.set_is_evex_instruction(); // mark the instruction as EVEX-encoded
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  emit_int8(0x43); // opcode byte
+  emit_int8((unsigned char)(0xC0 | encode)); // 0xC0 sets the two top bits: register-direct operand form
+  emit_int8(imm8 & 0xFF); // immediate, truncated to one byte
+}
 
 void Assembler::psrldq(XMMRegister dst, int shift) {
   // Shift left 128 bit value in dst XMMRegister by shift number of bytes.
@@ -6201,6 +6211,27 @@
   emit_operand(dst, src);
 }
 
+void Assembler::evpxorq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { // Emit the register-register form of the EVEX quadword XOR (66 prefix, 0F map, W set, opcode 0xEF).
+  assert(VM_Version::supports_evex(), "requires EVEX support");
+  InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
+  attributes.set_is_evex_instruction(); // mark the instruction as EVEX-encoded
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  emit_int8((unsigned char)0xEF); // opcode byte
+  emit_int8((unsigned char)(0xC0 | encode)); // 0xC0 sets the two top bits: register-direct operand form
+}
+
+void Assembler::evpxorq(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { // Emit the memory-source form of the EVEX quadword XOR (66 prefix, 0F map, W set, opcode 0xEF).
+  assert(VM_Version::supports_evex(), "requires EVEX support");
+  assert(dst != xnoreg, "sanity");
+  InstructionMark im(this); // NOTE(review): presumably records the instruction start for the memory operand's relocation - confirm
+  InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
+  attributes.set_is_evex_instruction(); // mark the instruction as EVEX-encoded
+  attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit); // address attributes: EVEX_FV tuple type, 64-bit input size
+  vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  emit_int8((unsigned char)0xEF); // opcode byte
+  emit_operand(dst, src); // encode dst together with the memory operand
+}
+
 
 // vinserti forms
 
@@ -6786,6 +6817,16 @@
   emit_int8((unsigned char)mask);
 }
 
+void Assembler::evpclmulqdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int mask, int vector_len) { // Emit the register-register form of the EVEX carry-less multiply (66 prefix, 0F 3A map, opcode 0x44); mask is emitted as the immediate byte.
+  assert(VM_Version::supports_vpclmulqdq(), "Requires vector carryless multiplication support");
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
+  attributes.set_is_evex_instruction(); // mark the instruction as EVEX-encoded
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  emit_int8(0x44); // opcode byte
+  emit_int8((unsigned char)(0xC0 | encode)); // 0xC0 sets the two top bits: register-direct operand form
+  emit_int8((unsigned char)mask); // immediate: callers in MacroAssembler pass 0x00 (lower quadwords) or 0x11 (upper quadwords)
+}
+
 void Assembler::vzeroupper() {
   if (VM_Version::supports_vzeroupper()) {
     InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
diff -r 537ef53e26af -r 3b1570be8557 src/hotspot/cpu/x86/assembler_x86.hpp
--- a/src/hotspot/cpu/x86/assembler_x86.hpp	Mon Mar 26 17:40:54 2018 -0700
+++ b/src/hotspot/cpu/x86/assembler_x86.hpp	Mon Mar 26 18:16:12 2018 -0700
@@ -1663,6 +1663,9 @@
   void pshuflw(XMMRegister dst, XMMRegister src, int mode);
   void pshuflw(XMMRegister dst, Address src,     int mode);
 
+  // Shuffle packed values at 128-bit granularity (EVEX-encoded; imm8 is emitted as the instruction's immediate byte)
+  void evshufi64x2(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8, int vector_len);
+
   // Shift Right by bytes Logical DoubleQuadword Immediate
   void psrldq(XMMRegister dst, int shift);
   // Shift Left by bytes Logical DoubleQuadword Immediate
@@ -2046,6 +2049,9 @@
   void pxor(XMMRegister dst, XMMRegister src);
   void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
   void vpxor(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
+  void evpxorq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); // EVEX-encoded quadword XOR, register source
+  void evpxorq(XMMRegister dst, XMMRegister nds, Address src, int vector_len); // EVEX-encoded quadword XOR, memory source
+
 
   // vinserti forms
   void vinserti128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
@@ -2108,7 +2114,8 @@
   // Carry-Less Multiplication Quadword
   void pclmulqdq(XMMRegister dst, XMMRegister src, int mask);
   void vpclmulqdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int mask);
+  void evpclmulqdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int mask, int vector_len); // EVEX-encoded form with an explicit vector length
 
   // AVX instruction which is used to clear upper 128 bits of YMM registers and
   // to avoid transaction penalty between AVX and SSE states. There is no
   // penalty if legacy SSE instructions are encoded using VEX prefix because
diff -r 537ef53e26af -r 3b1570be8557 src/hotspot/cpu/x86/macroAssembler_x86.cpp
--- a/src/hotspot/cpu/x86/macroAssembler_x86.cpp	Mon Mar 26 17:40:54 2018 -0700
+++ b/src/hotspot/cpu/x86/macroAssembler_x86.cpp	Mon Mar 26 18:16:12 2018 -0700
@@ -10120,6 +10120,16 @@
 }
 
 /**
+ * Fold four 128-bit data chunks held in one 512-bit register (xcrc) with the 64 bytes at buf + offset; xK holds the multiplier, xtmp is overwritten.
+ */
+void MacroAssembler::fold_128bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
+  evpclmulhdq(xtmp, xK, xcrc, Assembler::AVX_512bit); // xtmp = carry-less product of the upper quadwords [127:64]
+  evpclmulldq(xcrc, xK, xcrc, Assembler::AVX_512bit); // xcrc = carry-less product of the lower quadwords [63:0]
+  evpxorq(xcrc, xcrc, Address(buf, offset), Assembler::AVX_512bit /* vector_len */); // xor in the data at buf + offset
+  evpxorq(xcrc, xcrc, xtmp, Assembler::AVX_512bit /* vector_len */); // xor in the upper-quadword product
+}
+
+/**
  * Fold 128-bit data chunk
  */
 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
@@ -10224,6 +10234,34 @@
   shrl(len, 4);
   jcc(Assembler::zero, L_tail_restore);
 
+  // Fold total 512 bits of polynomial on each iteration
+  if (VM_Version::supports_vpclmulqdq()) { // this path is emitted only when the VPCLMULQDQ feature bit is set
+    Label Parallel_loop, L_No_Parallel;
+
+    cmpl(len, 8);
+    jccb(Assembler::less, L_No_Parallel); // len < 8: skip the 512-bit path
+
+    movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32)); // load 128 bits from offset 32 of the crc_by128 masks table
+    evmovdquq(xmm1, Address(buf, 0), Assembler::AVX_512bit); // load the first 64 bytes of data
+    movdl(xmm5, crc);
+    evpxorq(xmm1, xmm1, xmm5, Assembler::AVX_512bit); // xor the incoming crc into the loaded data
+    addptr(buf, 64);
+    subl(len, 7); // NOTE(review): presumably the 4 chunks just loaded plus a bias of 3 expected by the code after L_fold_512b - confirm
+    evshufi64x2(xmm0, xmm0, xmm0, 0x00, Assembler::AVX_512bit); // propagate the mask from 128 bits to 512 bits
+
+    BIND(Parallel_loop);
+    fold_128bit_crc32_avx512(xmm1, xmm0, xmm5, buf, 0); // fold xmm1 with the next 64 bytes; xmm5 is used as scratch
+    addptr(buf, 64);
+    subl(len, 4);
+    jcc(Assembler::greater, Parallel_loop); // iterate while len > 0
+
+    vextracti64x2(xmm2, xmm1, 0x01); // extract from xmm1 with selectors 1..3 into xmm2..xmm4
+    vextracti64x2(xmm3, xmm1, 0x02);
+    vextracti64x2(xmm4, xmm1, 0x03);
+    jmp(L_fold_512b); // continue at L_fold_512b (bound outside this hunk)
+
+    BIND(L_No_Parallel);
+  }
   // Fold crc into first bytes of vector
   movdqa(xmm1, Address(buf, 0));
   movdl(rax, xmm1);
diff -r 537ef53e26af -r 3b1570be8557 src/hotspot/cpu/x86/macroAssembler_x86.hpp
--- a/src/hotspot/cpu/x86/macroAssembler_x86.hpp	Mon Mar 26 17:40:54 2018 -0700
+++ b/src/hotspot/cpu/x86/macroAssembler_x86.hpp	Mon Mar 26 18:16:12 2018 -0700
@@ -1498,6 +1498,14 @@
     // 0x11 - multiply upper 64 bits [64:127]
     Assembler::vpclmulqdq(dst, nds, src, 0x11);
   }
+  void evpclmulldq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+    // mask 0x00 - multiply lower 64 bits [0:63]; vector_len is passed through to the EVEX encoder
+    Assembler::evpclmulqdq(dst, nds, src, 0x00, vector_len);
+  }
+  void evpclmulhdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+    // mask 0x11 - multiply upper 64 bits [64:127]; vector_len is passed through to the EVEX encoder
+    Assembler::evpclmulqdq(dst, nds, src, 0x11, vector_len);
+  }
 
   // Data
 
@@ -1723,6 +1731,7 @@
   // Fold 8-bit data
   void fold_8bit_crc32(Register crc, Register table, Register tmp);
   void fold_8bit_crc32(XMMRegister crc, Register table, XMMRegister xtmp, Register tmp);
+  void fold_128bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset); // 512-bit fold step using EVEX carry-less multiply; xtmp is overwritten
 
   // Compress char[] array to byte[].
   void char_array_compress(Register src, Register dst, Register len,
diff -r 537ef53e26af -r 3b1570be8557 src/hotspot/cpu/x86/vm_version_x86.cpp
--- a/src/hotspot/cpu/x86/vm_version_x86.cpp	Mon Mar 26 17:40:54 2018 -0700
+++ b/src/hotspot/cpu/x86/vm_version_x86.cpp	Mon Mar 26 18:16:12 2018 -0700
@@ -665,6 +665,7 @@
     _features &= ~CPU_AVX512BW;
     _features &= ~CPU_AVX512VL;
     _features &= ~CPU_AVX512_VPOPCNTDQ;
+    _features &= ~CPU_VPCLMULQDQ; // cleared together with the AVX-512 feature bits above
   }
 
   if (UseAVX < 2)
diff -r 537ef53e26af -r 3b1570be8557 src/hotspot/cpu/x86/vm_version_x86.hpp
--- a/src/hotspot/cpu/x86/vm_version_x86.hpp	Mon Mar 26 17:40:54 2018 -0700
+++ b/src/hotspot/cpu/x86/vm_version_x86.hpp	Mon Mar 26 18:16:12 2018 -0700
@@ -334,6 +334,7 @@
 #define CPU_FMA ((uint64_t)UCONST64(0x800000000))      // FMA instructions
 #define CPU_VZEROUPPER ((uint64_t)UCONST64(0x1000000000))       // Vzeroupper instruction
 #define CPU_AVX512_VPOPCNTDQ ((uint64_t)UCONST64(0x2000000000)) // Vector popcount
+#define CPU_VPCLMULQDQ ((uint64_t)UCONST64(0x4000000000)) // Vector carry-less multiplication
 
   enum Extended_Family {
     // AMD
@@ -542,6 +543,8 @@
           result |= CPU_AVX512VL;
         if (_cpuid_info.sef_cpuid7_ecx.bits.avx512_vpopcntdq != 0)
           result |= CPU_AVX512_VPOPCNTDQ;
+        if (_cpuid_info.sef_cpuid7_ecx.bits.vpclmulqdq != 0) // vpclmulqdq bit of the sef_cpuid7_ecx CPUID info
+          result |= CPU_VPCLMULQDQ;
       }
     }
     if(_cpuid_info.sef_cpuid7_ebx.bits.bmi1 != 0)
@@ -819,6 +822,7 @@
   static bool supports_fma()        { return (_features & CPU_FMA) != 0 && supports_avx(); }
   static bool supports_vzeroupper() { return (_features & CPU_VZEROUPPER) != 0; }
   static bool supports_vpopcntdq()  { return (_features & CPU_AVX512_VPOPCNTDQ) != 0; }
+  static bool supports_vpclmulqdq() { return (_features & CPU_VPCLMULQDQ) != 0; } // true when the CPU_VPCLMULQDQ feature bit is set
 
   // Intel features
   static bool is_intel_family_core() { return is_intel() &&