8154975: Update for vectorizedMismatch with AVX512

author:    vdeshpande
date:      Wed, 27 Apr 2016 13:37:07 -0700
changeset: 38138:8514e24123c8
parent:    38137:55c5f1d1ba6c
child:     38139:cf6f5c1b7205

8154975: Update for vectorizedMismatch with AVX512
Reviewed-by: kvn
hotspot/src/cpu/x86/vm/assembler_x86.cpp
hotspot/src/cpu/x86/vm/assembler_x86.hpp
hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp
--- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp	Mon Apr 25 15:14:02 2016 +0200
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp	Wed Apr 27 13:37:07 2016 -0700
@@ -2323,6 +2323,15 @@
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
+// ktest sets ZF if (src1 & src2) == 0 and CF if (~src1 & src2) == 0
+void Assembler::ktestql(KRegister src1, KRegister src2) {
+  assert(VM_Version::supports_avx512bw(), "");
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
+  int encode = vex_prefix_and_encode(src1->encoding(), 0, src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
+  emit_int8((unsigned char)0x99);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
 void Assembler::movb(Address dst, int imm8) {
   InstructionMark im(this);
    prefix(dst);
@@ -2491,6 +2500,19 @@
   emit_operand(src, dst);
 }
 
+void Assembler::evmovdqub(KRegister mask, XMMRegister dst, Address src, int vector_len) {
+  assert(VM_Version::supports_avx512vlbw(), "");
+  assert(is_vector_masking(), "");    // For stub code use only
+  InstructionMark im(this);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
+  attributes.set_embedded_opmask_register_specifier(mask);
+  attributes.set_is_evex_instruction();
+  vex_prefix(src, 0, dst->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
+  emit_int8(0x6F);
+  emit_operand(dst, src);
+}
+
 void Assembler::evmovdquw(XMMRegister dst, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
@@ -3285,6 +3307,19 @@
   emit_operand(as_Register(dst_enc), src);
 }
 
+void Assembler::evpcmpeqb(KRegister mask, KRegister kdst, XMMRegister nds, Address src, int vector_len) {
+  assert(VM_Version::supports_avx512vlbw(), "");
+  assert(is_vector_masking(), "");    // For stub code use only
+  InstructionMark im(this);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
+  attributes.set_embedded_opmask_register_specifier(mask);
+  attributes.set_is_evex_instruction();
+  vex_prefix(src, nds->encoding(), kdst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  emit_int8(0x74);
+  emit_operand(as_Register(kdst->encoding()), src);
+}
+
 // In this context, the dst vector contains the components that are equal, non equal components are zeroed in dst
 void Assembler::pcmpeqw(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_sse2(), "");
@@ -6938,7 +6973,7 @@
   emit_int8(byte3);
 
   // P2: byte 4 as zL'Lbv'aaa
-  int byte4 = (_attributes->is_no_reg_mask()) ? 0 : 1; // kregs are implemented in the low 3 bits as aaa (hard code k1, it will be initialized for now)
+  int byte4 = (_attributes->is_no_reg_mask()) ? 0 : _attributes->get_embedded_opmask_register_specifier(); // kregs are encoded in the low 3 bits as aaa
   // EVEX.v` for extending EVEX.vvvv or VIDX
   byte4 |= (evex_v ? 0: EVEX_V);
  // third EVEX.b for broadcast actions
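
For context on the P2 change above: the aaa field of EVEX byte 4, which used to be hard-coded to k1, now comes from the InstructionAttr object. Below is a minimal standalone sketch of that byte's layout, assuming a hypothetical helper evex_p2 and flag parameters of my own naming; it is an illustration, not HotSpot code.

#include <cstdint>

// EVEX P2 (byte 4) layout, high to low: z | L'L | b | V' | aaa
static uint8_t evex_p2(bool no_reg_mask, int opmask /* k0..k7 */,
                       bool evex_v, bool evex_b, int evex_ll, bool evex_z) {
  uint8_t byte4 = no_reg_mask ? 0 : (uint8_t)(opmask & 0x7); // aaa: embedded opmask register
  if (!evex_v) byte4 |= 0x08;               // V': extends EVEX.vvvv (stored inverted)
  if (evex_b)  byte4 |= 0x10;               // b: broadcast / rounding / SAE control
  byte4 |= (uint8_t)((evex_ll & 0x3) << 5); // L'L: vector length (128/256/512)
  if (evex_z)  byte4 |= 0x80;               // z: zeroing vs. merging under the mask
  return byte4;
}

With opmask == 1 and the other flags clear this reproduces the old hard-coded behavior, which is why the InstructionAttr constructor below defaults the specifier to 1.
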
--- a/hotspot/src/cpu/x86/vm/assembler_x86.hpp	Mon Apr 25 15:14:02 2016 +0200
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.hpp	Wed Apr 27 13:37:07 2016 -0700
@@ -606,6 +606,7 @@
   bool _legacy_mode_vl;
   bool _legacy_mode_vlbw;
   bool _is_managed;
+  bool _vector_masking;    // For stub code use only
 
   class InstructionAttr *_attributes;
 
@@ -813,6 +814,7 @@
     _legacy_mode_vl = (VM_Version::supports_avx512vl() == false);
     _legacy_mode_vlbw = (VM_Version::supports_avx512vlbw() == false);
     _is_managed = false;
+    _vector_masking = false;
     _attributes = NULL;
   }
 
@@ -823,6 +825,11 @@
   void clear_managed(void) { _is_managed = false; }
   bool is_managed(void) { return _is_managed; }
 
+  // Following functions are for stub code use only
+  void set_vector_masking(void) { _vector_masking = true; }
+  void clear_vector_masking(void) { _vector_masking = false; }
+  bool is_vector_masking(void) { return _vector_masking; }
+
   void lea(Register dst, Address src);
 
   void mov(Register dst, Register src);
@@ -1354,6 +1362,8 @@
   void kortestdl(KRegister dst, KRegister src);
   void kortestql(KRegister dst, KRegister src);
 
+  void ktestql(KRegister dst, KRegister src);
+
   void movdl(XMMRegister dst, Register src);
   void movdl(Register dst, XMMRegister src);
   void movdl(XMMRegister dst, Address src);
@@ -1381,6 +1391,7 @@
   void evmovdqub(Address dst, XMMRegister src, int vector_len);
   void evmovdqub(XMMRegister dst, Address src, int vector_len);
   void evmovdqub(XMMRegister dst, XMMRegister src, int vector_len);
+  void evmovdqub(KRegister mask, XMMRegister dst, Address src, int vector_len);
   void evmovdquw(Address dst, XMMRegister src, int vector_len);
   void evmovdquw(XMMRegister dst, Address src, int vector_len);
   void evmovdquw(XMMRegister dst, XMMRegister src, int vector_len);
@@ -1534,6 +1545,7 @@
   void vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
   void evpcmpeqb(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len);
   void evpcmpeqb(KRegister kdst, XMMRegister nds, Address src, int vector_len);
+  void evpcmpeqb(KRegister mask, KRegister kdst, XMMRegister nds, Address src, int vector_len);
 
   void pcmpeqw(XMMRegister dst, XMMRegister src);
   void vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
@@ -2098,7 +2110,8 @@
       _evex_encoding(0),
       _is_clear_context(false),
       _is_extended_context(false),
-      _current_assembler(NULL) {
+      _current_assembler(NULL),
+      _embedded_opmask_register_specifier(1) { // default to k1; set_embedded_opmask_register_specifier() overrides it
     if (UseAVX < 3) _legacy_mode = true;
   }
 
@@ -2122,6 +2135,7 @@
   int  _evex_encoding;
   bool _is_clear_context;
   bool _is_extended_context;
+  int _embedded_opmask_register_specifier;
 
   Assembler *_current_assembler;
 
@@ -2139,6 +2153,7 @@
   int  get_evex_encoding(void) const { return _evex_encoding; }
   bool is_clear_context(void) const { return _is_clear_context; }
   bool is_extended_context(void) const { return _is_extended_context; }
+  int get_embedded_opmask_register_specifier(void) const { return _embedded_opmask_register_specifier; }
 
   // Set the vector len manually
   void set_vector_len(int vector_len) { _avx_vector_len = vector_len; }
@@ -2172,6 +2187,11 @@
     }
   }
 
+  // Set embedded opmask register specifier.
+  void set_embedded_opmask_register_specifier(KRegister mask) {
+    _embedded_opmask_register_specifier = mask->encoding() & 0x7;
+  }
+
 };
 
 #endif // CPU_X86_VM_ASSEMBLER_X86_HPP
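
Taken together, the header changes define a protocol: the masked instruction forms assert is_vector_masking(), so stub code must bracket them with set_vector_masking()/clear_vector_masking(), and the chosen k-register reaches the aaa bits via set_embedded_opmask_register_specifier(). A condensed sketch of the call pattern, mirroring the vectorized_mismatch change below (tail_count and off are stand-ins for that code's registers and offsets):

// Inside a MacroAssembler-generated stub (sketch only):
//   set_vector_masking();                // open the stub masking context
//   kmovql(k3, k1);                      // save k1, the default embedded opmask
//   mov64(tmp2, 0xFFFFFFFFFFFFFFFF);
//   shlxq(tmp2, tmp2, tail_count);       // tmp2 = ~0 << tail_count
//   notq(tmp2);                          // low tail_count bits set
//   kmovql(k1, tmp2);                    // k1 now selects only the tail bytes
//   evmovdqub(k1, xmm0, Address(obja, off), Assembler::AVX_512bit);     // masked load
//   evpcmpeqb(k1, k7, xmm0, Address(objb, off), Assembler::AVX_512bit); // masked compare
//   ktestql(k7, k1);                     // CF == 1 iff all selected bytes compared equal
//   kmovql(k1, k3);                      // restore k1
//   clear_vector_masking();              // close the stub masking context
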
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp	Mon Apr 25 15:14:02 2016 +0200
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp	Wed Apr 27 13:37:07 2016 -0700
@@ -9425,6 +9425,7 @@
 void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
   Register result, Register tmp1, Register tmp2, XMMRegister rymm0, XMMRegister rymm1, XMMRegister rymm2){
   assert(UseSSE42Intrinsics, "SSE4.2 must be enabled.");
+  Label VECTOR64_LOOP, VECTOR64_TAIL, VECTOR64_NOT_EQUAL, VECTOR32_TAIL;
   Label VECTOR32_LOOP, VECTOR16_LOOP, VECTOR8_LOOP, VECTOR4_LOOP;
   Label VECTOR16_TAIL, VECTOR8_TAIL, VECTOR4_TAIL;
   Label VECTOR32_NOT_EQUAL, VECTOR16_NOT_EQUAL, VECTOR8_NOT_EQUAL, VECTOR4_NOT_EQUAL;
@@ -9437,11 +9438,64 @@
   shlq(length);
   xorq(result, result);
 
+  if ((UseAVX > 2) &&
+      VM_Version::supports_avx512vlbw()) {
+    set_vector_masking();  // opening of the stub context for programming mask registers
+    cmpq(length, 64);
+    jcc(Assembler::less, VECTOR32_TAIL);
+    movq(tmp1, length);
+    andq(tmp1, 0x3F);      // tail count
+    andq(length, ~(0x3F)); // vector count
+
+    bind(VECTOR64_LOOP);
+    // AVX512 code to compare 64 byte vectors.
+    evmovdqub(rymm0, Address(obja, result), Assembler::AVX_512bit);
+    evpcmpeqb(k7, rymm0, Address(objb, result), Assembler::AVX_512bit);
+    kortestql(k7, k7);
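+    // kortestql sets CF only when k7 is all ones (every byte equal), so CF == 0 means a mismatch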
+    jcc(Assembler::aboveEqual, VECTOR64_NOT_EQUAL);     // mismatch
+    addq(result, 64);
+    subq(length, 64);
+    jccb(Assembler::notZero, VECTOR64_LOOP);
+
+    testq(tmp1, tmp1);
+    jcc(Assembler::zero, SAME_TILL_END);
+
+    bind(VECTOR64_TAIL);
+    // AVX512 code to compare the remaining tail of up to 63 bytes.
+    // Save k1
+    kmovql(k3, k1);
+    mov64(tmp2, 0xFFFFFFFFFFFFFFFF);
+    shlxq(tmp2, tmp2, tmp1);
+    notq(tmp2);
+    kmovql(k1, tmp2);
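+    // k1 = low 'tmp1' (tail count) bits set; it selects only the tail bytes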
+
+    evmovdqub(k1, rymm0, Address(obja, result), Assembler::AVX_512bit);
+    evpcmpeqb(k1, k7, rymm0, Address(objb, result), Assembler::AVX_512bit);
+
+    ktestql(k7, k1);
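+    // CF == 1 iff (~k7 & k1) == 0, i.e. every selected tail byte compared equal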
+    // Restore k1
+    kmovql(k1, k3);
+    jcc(Assembler::below, SAME_TILL_END);     // no mismatch
+
+    bind(VECTOR64_NOT_EQUAL);
+    kmovql(tmp1, k7);
+    notq(tmp1);
+    tzcntq(tmp1, tmp1);
+    addq(result, tmp1);
+    shrq(result);   // convert byte offset to element index (shift count in cl = log2_array_indxscale)
+    jmp(DONE);
+    bind(VECTOR32_TAIL);
+    clear_vector_masking();   // closing of the stub context for programming mask registers
+  }
+
   cmpq(length, 8);
   jcc(Assembler::equal, VECTOR8_LOOP);
   jcc(Assembler::less, VECTOR4_TAIL);
 
-  if (UseAVX >= 2){
+  if (UseAVX >= 2) {
 
     cmpq(length, 16);
     jcc(Assembler::equal, VECTOR16_LOOP);
@@ -9549,7 +9601,7 @@
   jccb(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
   jmpb(SAME_TILL_END);
 
-  if (UseAVX >= 2){
+  if (UseAVX >= 2) {
     bind(VECTOR32_NOT_EQUAL);
     vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_256bit);
     vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_256bit);
@@ -9562,7 +9614,7 @@
   }
 
   bind(VECTOR16_NOT_EQUAL);
-  if (UseAVX >= 2){
+  if (UseAVX >= 2) {
     vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_128bit);
     vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_128bit);
     pxor(rymm0, rymm2);
@@ -9593,7 +9645,6 @@
   bind(DONE);
 }
 
-
 //Helper functions for square_to_len()
 
 /**
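
As a reading aid, here is a scalar C++ model of the AVX-512 path added above. It is a sketch under stated assumptions: mismatch_ref is a hypothetical name, __builtin_ctzll (a GCC/Clang builtin) stands in for tzcntq, and it returns a byte index, whereas the real code finally shifts result right by log2_array_indxscale to produce an element index.

#include <cstdint>

// Scalar model of the 64-byte AVX-512 mismatch loop plus masked tail.
// Returns the byte index of the first mismatch, or -1 if the ranges are equal.
long mismatch_ref(const uint8_t* a, const uint8_t* b, long len_bytes) {
  long tail = len_bytes & 0x3F;           // andq(tmp1, 0x3F)    - tail count
  long vlen = len_bytes & ~0x3FL;         // andq(length, ~0x3F) - vector count
  long i = 0;
  for (; i < vlen; i += 64) {             // VECTOR64_LOOP
    uint64_t eq = 0;                      // evpcmpeqb result, one bit per byte
    for (int j = 0; j < 64; j++)
      eq |= (uint64_t)(a[i + j] == b[i + j]) << j;
    if (eq != ~0ULL) {                    // kortestql: CF == 0 -> mismatch
      return i + __builtin_ctzll(~eq);    // notq + tzcntq
    }
  }
  if (tail != 0) {                        // VECTOR64_TAIL
    uint64_t mask = ~(~0ULL << tail);     // mov64 / shlxq / notq mask build
    uint64_t eq = 0;
    for (int j = 0; j < tail; j++)
      eq |= (uint64_t)(a[i + j] == b[i + j]) << j;
    if ((eq & mask) != mask) {            // ktestql: CF == 0 -> mismatch
      return i + __builtin_ctzll(~eq & mask);
    }
  }
  return -1;                              // SAME_TILL_END
}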