8154974: AVX-512 equipped inflate, has_negatives & compress intrinsics
author kvn
Thu, 05 May 2016 17:16:08 -0700
changeset 38239 4d8b8ba74fea
parent 38238 1bbcc430c78d
child 38240 28e0cafd5222
8154974: AVX-512 equipped inflate, has_negatives & compress intrinsics Reviewed-by: kvn Contributed-by: tomasz.wojtowicz@intel.com
hotspot/src/cpu/x86/vm/assembler_x86.cpp
hotspot/src/cpu/x86/vm/assembler_x86.hpp
hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp
--- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp	Thu May 05 10:03:26 2016 -0700
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp	Thu May 05 17:16:08 2016 -0700
@@ -2332,6 +2332,22 @@
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
+void Assembler::ktestq(KRegister src1, KRegister src2) {
+  assert(VM_Version::supports_avx512bw(), "");
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
+  int encode = vex_prefix_and_encode(src1->encoding(), 0, src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
+  emit_int8((unsigned char)0x99);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::ktestd(KRegister src1, KRegister src2) {
+  assert(VM_Version::supports_avx512bw(), "");
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
+  int encode = vex_prefix_and_encode(src1->encoding(), 0, src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  emit_int8((unsigned char)0x99);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
 void Assembler::movb(Address dst, int imm8) {
   InstructionMark im(this);
    prefix(dst);
@@ -2500,7 +2516,7 @@
   emit_operand(src, dst);
 }
 
-void Assembler::evmovdqub(KRegister mask, XMMRegister dst, Address src, int vector_len) {
+void Assembler::evmovdqub(XMMRegister dst, KRegister mask, Address src, int vector_len) {
   assert(VM_Version::supports_avx512vlbw(), "");
   assert(is_vector_masking(), "");    // For stub code use only
   InstructionMark im(this);
@@ -2513,16 +2529,6 @@
   emit_operand(dst, src);
 }
 
-void Assembler::evmovdquw(XMMRegister dst, XMMRegister src, int vector_len) {
-  assert(VM_Version::supports_evex(), "");
-  InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
-  attributes.set_is_evex_instruction();
-  int prefix = (_legacy_mode_bw) ? VEX_SIMD_F2 : VEX_SIMD_F3;
-  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), (Assembler::VexSimdPrefix)prefix, VEX_OPCODE_0F, &attributes);
-  emit_int8(0x6F);
-  emit_int8((unsigned char)(0xC0 | encode));
-}
-
 void Assembler::evmovdquw(XMMRegister dst, Address src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
   InstructionMark im(this);
@@ -2535,6 +2541,19 @@
   emit_operand(dst, src);
 }
 
+void Assembler::evmovdquw(XMMRegister dst, KRegister mask, Address src, int vector_len) {
+  assert(is_vector_masking(), "");
+  assert(VM_Version::supports_avx512vlbw(), "");
+  InstructionMark im(this);
+  InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
+  attributes.set_embedded_opmask_register_specifier(mask);
+  attributes.set_is_evex_instruction();
+  vex_prefix(src, 0, dst->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
+  emit_int8(0x6F);
+  emit_operand(dst, src);
+}
+
 void Assembler::evmovdquw(Address dst, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
   assert(src != xnoreg, "sanity");
@@ -2548,6 +2567,19 @@
   emit_operand(src, dst);
 }
 
+void Assembler::evmovdquw(Address dst, KRegister mask, XMMRegister src, int vector_len) {
+  assert(VM_Version::supports_avx512vlbw(), "");
+  assert(src != xnoreg, "sanity");
+  InstructionMark im(this);
+  InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
+  attributes.set_embedded_opmask_register_specifier(mask);
+  attributes.set_is_evex_instruction();
+  vex_prefix(dst, 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
+  emit_int8(0x7F);
+  emit_operand(src, dst);
+}
+
 void Assembler::evmovdqul(XMMRegister dst, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
@@ -3295,10 +3327,71 @@
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
+void Assembler::evpcmpgtb(KRegister kdst, XMMRegister nds, Address src, int vector_len) {
+  assert(VM_Version::supports_avx512vlbw(), "");
+  InstructionMark im(this);
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
+  attributes.set_is_evex_instruction();
+  int dst_enc = kdst->encoding();
+  vex_prefix(src, nds->encoding(), dst_enc, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  emit_int8(0x64);
+  emit_operand(as_Register(dst_enc), src);
+}
+
+void Assembler::evpcmpgtb(KRegister kdst, KRegister mask, XMMRegister nds, Address src, int vector_len) {
+  assert(is_vector_masking(), "");
+  assert(VM_Version::supports_avx512vlbw(), "");
+  InstructionMark im(this);
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
+  attributes.set_embedded_opmask_register_specifier(mask);
+  attributes.set_is_evex_instruction();
+  int dst_enc = kdst->encoding();
+  vex_prefix(src, nds->encoding(), dst_enc, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  emit_int8(0x64);
+  emit_operand(as_Register(dst_enc), src);
+}
+
+void Assembler::evpcmpuw(KRegister kdst, XMMRegister nds, XMMRegister src, ComparisonPredicate vcc, int vector_len) {
+  assert(VM_Version::supports_avx512vlbw(), "");
+  InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  attributes.set_is_evex_instruction();
+  int encode = vex_prefix_and_encode(kdst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  emit_int8(0x3E);
+  emit_int8((unsigned char)(0xC0 | encode));
+  emit_int8(vcc);
+}
+
+void Assembler::evpcmpuw(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src, ComparisonPredicate vcc, int vector_len) {
+  assert(is_vector_masking(), "");
+  assert(VM_Version::supports_avx512vlbw(), "");
+  InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  attributes.set_embedded_opmask_register_specifier(mask);
+  attributes.set_is_evex_instruction();
+  int encode = vex_prefix_and_encode(kdst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  emit_int8(0x3E);
+  emit_int8((unsigned char)(0xC0 | encode));
+  emit_int8(vcc);
+}
+
+void Assembler::evpcmpuw(KRegister kdst, XMMRegister nds, Address src, ComparisonPredicate vcc, int vector_len) {
+  assert(VM_Version::supports_avx512vlbw(), "");
+  InstructionMark im(this);
+  InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
+  attributes.set_is_evex_instruction();
+  int dst_enc = kdst->encoding();
+  vex_prefix(src, nds->encoding(), kdst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  emit_int8(0x3E);
+  emit_operand(as_Register(dst_enc), src);
+  emit_int8(vcc);
+}
+
 void Assembler::evpcmpeqb(KRegister kdst, XMMRegister nds, Address src, int vector_len) {
   assert(VM_Version::supports_avx512bw(), "");
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
   attributes.set_is_evex_instruction();
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   int dst_enc = kdst->encoding();
@@ -3307,7 +3400,7 @@
   emit_operand(as_Register(dst_enc), src);
 }
 
-void Assembler::evpcmpeqb(KRegister mask, KRegister kdst, XMMRegister nds, Address src, int vector_len) {
+void Assembler::evpcmpeqb(KRegister kdst, KRegister mask, XMMRegister nds, Address src, int vector_len) {
   assert(VM_Version::supports_avx512vlbw(), "");
   assert(is_vector_masking(), "");    // For stub code use only
   InstructionMark im(this);
@@ -3620,6 +3713,46 @@
   emit_operand(dst, src);
 }
 
+void Assembler::evpmovzxbw(XMMRegister dst, KRegister mask, Address src, int vector_len) {
+  assert(is_vector_masking(), "");
+  assert(VM_Version::supports_avx512vlbw(), "");
+  assert(dst != xnoreg, "sanity");
+  InstructionMark im(this);
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_HVM, /* input_size_in_bits */ EVEX_NObit);
+  attributes.set_embedded_opmask_register_specifier(mask);
+  attributes.set_is_evex_instruction();
+  vex_prefix(src, 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  emit_int8(0x30);
+  emit_operand(dst, src);
+}
+
+void Assembler::evpmovwb(Address dst, XMMRegister src, int vector_len) {
+  assert(VM_Version::supports_avx512vlbw(), "");
+  assert(src != xnoreg, "sanity");
+  InstructionMark im(this);
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_HVM, /* input_size_in_bits */ EVEX_NObit);
+  attributes.set_is_evex_instruction();
+  vex_prefix(dst, 0, src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F_38, &attributes);
+  emit_int8(0x30);
+  emit_operand(src, dst);
+}
+
+void Assembler::evpmovwb(Address dst, KRegister mask, XMMRegister src, int vector_len) {
+  assert(is_vector_masking(), "");
+  assert(VM_Version::supports_avx512vlbw(), "");
+  assert(src != xnoreg, "sanity");
+  InstructionMark im(this);
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_HVM, /* input_size_in_bits */ EVEX_NObit);
+  attributes.set_embedded_opmask_register_specifier(mask);
+  attributes.set_is_evex_instruction();
+  vex_prefix(dst, 0, src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F_38, &attributes);
+  emit_int8(0x30);
+  emit_operand(src, dst);
+}
+
 // generic
 void Assembler::pop(Register dst) {
   int encode = prefix_and_encode(dst->encoding());
@@ -6406,7 +6539,6 @@
   emit_int8(0x77);
 }
 
-
 #ifndef _LP64
 // 32bit only pieces of the assembler
 
@@ -6973,7 +7105,10 @@
   emit_int8(byte3);
 
   // P2: byte 4 as zL'Lbv'aaa
-  int byte4 = (_attributes->is_no_reg_mask()) ? 0 : _attributes->get_embedded_opmask_register_specifier(); // kregs are implemented in the low 3 bits as aaa (hard code k1, it will be initialized for now)
+  // kregs are implemented in the low 3 bits as aaa (hard code k1, it will be initialized for now)
+  int byte4 = (_attributes->is_no_reg_mask()) ?
+              0 :
+              _attributes->get_embedded_opmask_register_specifier();
   // EVEX.v` for extending EVEX.vvvv or VIDX
   byte4 |= (evex_v ? 0: EVEX_V);
   // third EXEC.b for broadcast actions
--- a/hotspot/src/cpu/x86/vm/assembler_x86.hpp	Thu May 05 10:03:26 2016 -0700
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.hpp	Thu May 05 17:16:08 2016 -0700
@@ -587,6 +587,16 @@
 #endif
   };
 
+  enum ComparisonPredicate {
+    eq = 0,
+    lt = 1,
+    le = 2,
+    _false = 3,
+    neq = 4,
+    nlt = 5,
+    nle = 6,
+    _true = 7
+  };
 
 
   // NOTE: The general philopsophy of the declarations here is that 64bit versions
@@ -830,7 +840,6 @@
   void clear_vector_masking(void) { _vector_masking = false; }
   bool is_vector_masking(void) { return _vector_masking; }
 
-
   void lea(Register dst, Address src);
 
   void mov(Register dst, Register src);
@@ -1362,6 +1371,9 @@
   void kortestdl(KRegister dst, KRegister src);
   void kortestql(KRegister dst, KRegister src);
 
+  void ktestq(KRegister src1, KRegister src2);
+  void ktestd(KRegister src1, KRegister src2);
+
   void ktestql(KRegister dst, KRegister src);
 
   void movdl(XMMRegister dst, Register src);
@@ -1391,10 +1403,11 @@
   void evmovdqub(Address dst, XMMRegister src, int vector_len);
   void evmovdqub(XMMRegister dst, Address src, int vector_len);
   void evmovdqub(XMMRegister dst, XMMRegister src, int vector_len);
-  void evmovdqub(KRegister mask, XMMRegister dst, Address src, int vector_len);
+  void evmovdqub(XMMRegister dst, KRegister mask, Address src, int vector_len);
   void evmovdquw(Address dst, XMMRegister src, int vector_len);
+  void evmovdquw(Address dst, KRegister mask, XMMRegister src, int vector_len);
   void evmovdquw(XMMRegister dst, Address src, int vector_len);
-  void evmovdquw(XMMRegister dst, XMMRegister src, int vector_len);
+  void evmovdquw(XMMRegister dst, KRegister mask, Address src, int vector_len);
   void evmovdqul(Address dst, XMMRegister src, int vector_len);
   void evmovdqul(XMMRegister dst, Address src, int vector_len);
   void evmovdqul(XMMRegister dst, XMMRegister src, int vector_len);
@@ -1545,7 +1558,14 @@
   void vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
   void evpcmpeqb(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len);
   void evpcmpeqb(KRegister kdst, XMMRegister nds, Address src, int vector_len);
-  void evpcmpeqb(KRegister mask, KRegister kdst, XMMRegister nds, Address src, int vector_len);
+  void evpcmpeqb(KRegister kdst, KRegister mask, XMMRegister nds, Address src, int vector_len);
+
+  void evpcmpgtb(KRegister kdst, XMMRegister nds, Address src, int vector_len);
+  void evpcmpgtb(KRegister kdst, KRegister mask, XMMRegister nds, Address src, int vector_len);
+
+  void evpcmpuw(KRegister kdst, XMMRegister nds, XMMRegister src, ComparisonPredicate vcc, int vector_len);
+  void evpcmpuw(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src, ComparisonPredicate vcc, int vector_len);
+  void evpcmpuw(KRegister kdst, XMMRegister nds, Address src, ComparisonPredicate vcc, int vector_len);
 
   void pcmpeqw(XMMRegister dst, XMMRegister src);
   void vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
@@ -1589,7 +1609,11 @@
   void pmovzxbw(XMMRegister dst, XMMRegister src);
   void pmovzxbw(XMMRegister dst, Address src);
 
-  void vpmovzxbw(XMMRegister dst, Address src, int vector_len);
+  void vpmovzxbw( XMMRegister dst, Address src, int vector_len);
+  void evpmovzxbw(XMMRegister dst, KRegister mask, Address src, int vector_len);
+
+  void evpmovwb(Address dst, XMMRegister src, int vector_len);
+  void evpmovwb(Address dst, KRegister mask, XMMRegister src, int vector_len);
 
 #ifndef _LP64 // no 32bit push/pop on amd64
   void popl(Address dst);
@@ -1839,6 +1863,8 @@
   void vsubss(XMMRegister dst, XMMRegister nds, Address src);
   void vsubss(XMMRegister dst, XMMRegister nds, XMMRegister src);
 
+  void shlxl(Register dst, Register src1, Register src2);
+  void shlxq(Register dst, Register src1, Register src2);
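+  // shlxl/shlxq are BMI2 instructions; the AVX-512 string intrinsics that emit them
+  // are guarded by VM_Version::supports_bmi2().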
 
   //====================VECTOR ARITHMETIC=====================================
 
@@ -2073,9 +2099,6 @@
   void cmppd(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop, int vector_len);
   void vpblendd(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len);
 
-  void shlxl(Register dst, Register src1, Register src2);
-  void shlxq(Register dst, Register src1, Register src2);
-
  protected:
   // Next instructions require address alignment 16 bytes SSE mode.
   // They should be called only from corresponding MacroAssembler instructions.
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp	Thu May 05 10:03:26 2016 -0700
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp	Thu May 05 17:16:08 2016 -0700
@@ -8251,10 +8251,19 @@
 
 // Search for Non-ASCII character (Negative byte value) in a byte array,
 // return true if it has any and false otherwise.
+//   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
+//   @HotSpotIntrinsicCandidate
+//   private static boolean hasNegatives(byte[] ba, int off, int len) {
+//     for (int i = off; i < off + len; i++) {
+//       if (ba[i] < 0) {
+//         return true;
+//       }
+//     }
+//     return false;
+//   }
 void MacroAssembler::has_negatives(Register ary1, Register len,
-                                   Register result, Register tmp1,
-                                   XMMRegister vec1, XMMRegister vec2) {
-
+  Register result, Register tmp1,
+  XMMRegister vec1, XMMRegister vec2) {
   // rsi: byte array
   // rcx: len
   // rax: result
@@ -8267,79 +8276,161 @@
   testl(len, len);
   jcc(Assembler::zero, FALSE_LABEL);
 
-  movl(result, len); // copy
-
-  if (UseAVX >= 2 && UseSSE >= 2) {
-    // With AVX2, use 32-byte vector compare
-    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
-
-    // Compare 32-byte vectors
-    andl(result, 0x0000001f);  //   tail count (in bytes)
-    andl(len, 0xffffffe0);   // vector count (in bytes)
-    jcc(Assembler::zero, COMPARE_TAIL);
+  if ((UseAVX > 2) && // AVX512
+    VM_Version::supports_avx512vlbw() &&
+    VM_Version::supports_bmi2()) {
+
+    set_vector_masking();  // opening of the stub context for programming mask registers
+
+    Label test_64_loop, test_tail;
+    Register tmp3_aliased = len;
+
+    movl(tmp1, len);
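+    // vec2 := all zeroes; the signed evpcmpgtb compares below then set a mask bit
+    // exactly where 0 > byte, i.e. where the byte is negative.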
+    vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
+
+    andl(tmp1, 64 - 1);   // tail count (in bytes), 0x3F
+    andl(len, ~(64 - 1));    // vector count (in bytes)
+    jccb(Assembler::zero, test_tail);
 
     lea(ary1, Address(ary1, len, Address::times_1));
     negptr(len);
 
-    movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
-    movdl(vec2, tmp1);
-    vpbroadcastd(vec2, vec2);
-
-    bind(COMPARE_WIDE_VECTORS);
-    vmovdqu(vec1, Address(ary1, len, Address::times_1));
-    vptest(vec1, vec2);
+    bind(test_64_loop);
+    // Check whether any of the 64 bytes in this chunk is negative
+    evpcmpgtb(k2, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
+    kortestql(k2, k2);
     jcc(Assembler::notZero, TRUE_LABEL);
-    addptr(len, 32);
-    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
-
-    testl(result, result);
+
+    addptr(len, 64);
+    jccb(Assembler::notZero, test_64_loop);
+
+
+    bind(test_tail);
+    // bail out when there is nothing to be done
+    testl(tmp1, -1);
     jcc(Assembler::zero, FALSE_LABEL);
 
-    vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
-    vptest(vec1, vec2);
+    // Save k1
+    kmovql(k3, k1);
+
+    // ~(~0 << len) applied up to two times (for 32-bit scenario)
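+    // e.g. tmp1 == 5: ~0 << 5 == 0x...FFE0, negated == 0x1F, so k1 selects only the
+    // five remaining byte lanes.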
+#ifdef _LP64
+    mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
+    shlxq(tmp3_aliased, tmp3_aliased, tmp1);
+    notq(tmp3_aliased);
+    kmovql(k1, tmp3_aliased);
+#else
+    Label k_init;
+    jmp(k_init);
+
+    // We cannot read 64 bits from a general purpose register, so we move the data
+    // required to compose 64 1's into the instruction stream.
+    // We emit a 64-byte wide series of the elements 0..63, which is later used as the
+    // compare target against the tail count contained in the tmp1 register.
+    // The result is a k1 register holding tmp1 consecutive 1's, counting from the
+    // least significant bit.
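+    // e.g. tmp1 == 3: comparing the broadcast value 3 against {0, 1, 2, 3, ...} with a
+    // signed greater-than sets only the three lowest bits of k1 (3 > 0, 1, 2).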
+    address tmp = pc();
+    emit_int64(0x0706050403020100);
+    emit_int64(0x0F0E0D0C0B0A0908);
+    emit_int64(0x1716151413121110);
+    emit_int64(0x1F1E1D1C1B1A1918);
+    emit_int64(0x2726252423222120);
+    emit_int64(0x2F2E2D2C2B2A2928);
+    emit_int64(0x3736353433323130);
+    emit_int64(0x3F3E3D3C3B3A3938);
+
+    bind(k_init);
+    lea(len, InternalAddress(tmp));
+    // create mask to test for negative byte inside a vector
+    evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
+    evpcmpgtb(k1, vec1, Address(len, 0), Assembler::AVX_512bit);
+
+#endif
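+    // Masked compare of the tail: only the bytes selected by k1 participate, and the
+    // ktestq/notZero pair below branches to TRUE_LABEL when any of them is negative.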
+    evpcmpgtb(k2, k1, vec2, Address(ary1, 0), Assembler::AVX_512bit);
+    ktestq(k2, k1);
+    // Restore k1
+    kmovql(k1, k3);
     jcc(Assembler::notZero, TRUE_LABEL);
+
     jmp(FALSE_LABEL);
 
-    bind(COMPARE_TAIL); // len is zero
-    movl(len, result);
-    // Fallthru to tail compare
-  } else if (UseSSE42Intrinsics) {
-    assert(UseSSE >= 4, "SSE4 must be  for SSE4.2 intrinsics to be available");
-    // With SSE4.2, use double quad vector compare
-    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
-
-    // Compare 16-byte vectors
-    andl(result, 0x0000000f);  //   tail count (in bytes)
-    andl(len, 0xfffffff0);   // vector count (in bytes)
-    jccb(Assembler::zero, COMPARE_TAIL);
-
-    lea(ary1, Address(ary1, len, Address::times_1));
-    negptr(len);
-
-    movl(tmp1, 0x80808080);
-    movdl(vec2, tmp1);
-    pshufd(vec2, vec2, 0);
-
-    bind(COMPARE_WIDE_VECTORS);
-    movdqu(vec1, Address(ary1, len, Address::times_1));
-    ptest(vec1, vec2);
-    jcc(Assembler::notZero, TRUE_LABEL);
-    addptr(len, 16);
-    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
-
-    testl(result, result);
-    jcc(Assembler::zero, FALSE_LABEL);
-
-    movdqu(vec1, Address(ary1, result, Address::times_1, -16));
-    ptest(vec1, vec2);
-    jccb(Assembler::notZero, TRUE_LABEL);
-    jmpb(FALSE_LABEL);
-
-    bind(COMPARE_TAIL); // len is zero
-    movl(len, result);
-    // Fallthru to tail compare
-  }
-
+    clear_vector_masking();   // closing of the stub context for programming mask registers
+  }
+  else {
+    movl(result, len); // copy
+
+    if (UseAVX == 2 && UseSSE >= 2) {
+      // With AVX2, use 32-byte vector compare
+      Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
+
+      // Compare 32-byte vectors
+      andl(result, 0x0000001f);  //   tail count (in bytes)
+      andl(len, 0xffffffe0);   // vector count (in bytes)
+      jccb(Assembler::zero, COMPARE_TAIL);
+
+      lea(ary1, Address(ary1, len, Address::times_1));
+      negptr(len);
+
+      movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
+      movdl(vec2, tmp1);
+      vpbroadcastd(vec2, vec2);
+
+      bind(COMPARE_WIDE_VECTORS);
+      vmovdqu(vec1, Address(ary1, len, Address::times_1));
+      vptest(vec1, vec2);
+      jccb(Assembler::notZero, TRUE_LABEL);
+      addptr(len, 32);
+      jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
+
+      testl(result, result);
+      jccb(Assembler::zero, FALSE_LABEL);
+
+      vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
+      vptest(vec1, vec2);
+      jccb(Assembler::notZero, TRUE_LABEL);
+      jmpb(FALSE_LABEL);
+
+      bind(COMPARE_TAIL); // len is zero
+      movl(len, result);
+      // Fallthru to tail compare
+    }
+    else if (UseSSE42Intrinsics) {
+      assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
+      // With SSE4.2, use double quad vector compare
+      Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
+
+      // Compare 16-byte vectors
+      andl(result, 0x0000000f);  //   tail count (in bytes)
+      andl(len, 0xfffffff0);   // vector count (in bytes)
+      jccb(Assembler::zero, COMPARE_TAIL);
+
+      lea(ary1, Address(ary1, len, Address::times_1));
+      negptr(len);
+
+      movl(tmp1, 0x80808080);
+      movdl(vec2, tmp1);
+      pshufd(vec2, vec2, 0);
+
+      bind(COMPARE_WIDE_VECTORS);
+      movdqu(vec1, Address(ary1, len, Address::times_1));
+      ptest(vec1, vec2);
+      jccb(Assembler::notZero, TRUE_LABEL);
+      addptr(len, 16);
+      jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
+
+      testl(result, result);
+      jccb(Assembler::zero, FALSE_LABEL);
+
+      movdqu(vec1, Address(ary1, result, Address::times_1, -16));
+      ptest(vec1, vec2);
+      jccb(Assembler::notZero, TRUE_LABEL);
+      jmpb(FALSE_LABEL);
+
+      bind(COMPARE_TAIL); // len is zero
+      movl(len, result);
+      // Fallthru to tail compare
+    }
+  }
   // Compare 4-byte vectors
   andl(len, 0xfffffffc); // vector count (in bytes)
   jccb(Assembler::zero, COMPARE_CHAR);
@@ -8387,7 +8478,6 @@
     vpxor(vec2, vec2);
   }
 }
-
 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
 void MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
                                    Register limit, Register result, Register chr,
@@ -8833,10 +8923,23 @@
 }
 
 // encode char[] to byte[] in ISO_8859_1
+//   @HotSpotIntrinsicCandidate
+//   private static int implEncodeISOArray(byte[] sa, int sp,
+//                                         byte[] da, int dp, int len) {
+//     int i = 0;
+//     for (; i < len; i++) {
+//       char c = StringUTF16.getChar(sa, sp++);
+//       if (c > '\u00FF')
+//         break;
+//       da[dp++] = (byte)c;
+//     }
+//     return i;
+//   }
 void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
-                                      XMMRegister tmp1Reg, XMMRegister tmp2Reg,
-                                      XMMRegister tmp3Reg, XMMRegister tmp4Reg,
-                                      Register tmp5, Register result) {
+  XMMRegister tmp1Reg, XMMRegister tmp2Reg,
+  XMMRegister tmp3Reg, XMMRegister tmp4Reg,
+  Register tmp5, Register result) {
+
   // rsi: src
   // rdi: dst
   // rdx: len
@@ -8851,6 +8954,7 @@
   // check for zero length
   testl(len, len);
   jcc(Assembler::zero, L_done);
+
   movl(result, len);
 
   // Setup pointers
@@ -8959,6 +9063,7 @@
 
   bind(L_copy_1_char_exit);
   addptr(result, len); // len is negative count of not processed elements
+
   bind(L_done);
 }
 
@@ -9470,8 +9575,8 @@
     notq(tmp2);
     kmovql(k1, tmp2);
 
-    evmovdqub(k1, rymm0, Address(obja, result), Assembler::AVX_512bit);
-    evpcmpeqb(k1, k7, rymm0, Address(objb, result), Assembler::AVX_512bit);
+    evmovdqub(rymm0, k1, Address(obja, result), Assembler::AVX_512bit);
+    evpcmpeqb(k7, k1, rymm0, Address(objb, result), Assembler::AVX_512bit);
 
     ktestql(k7, k1);
     // Restore k1
@@ -10830,13 +10935,24 @@
 #undef BIND
 #undef BLOCK_COMMENT
 
-
 // Compress char[] array to byte[].
+//   ..\jdk\src\java.base\share\classes\java\lang\StringUTF16.java
+//   @HotSpotIntrinsicCandidate
+//   private static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
+//     for (int i = 0; i < len; i++) {
+//       int c = src[srcOff++];
+//       if (c >>> 8 != 0) {
+//         return 0;
+//       }
+//       dst[dstOff++] = (byte)c;
+//     }
+//     return len;
+//   }
 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
-                                         XMMRegister tmp1Reg, XMMRegister tmp2Reg,
-                                         XMMRegister tmp3Reg, XMMRegister tmp4Reg,
-                                         Register tmp5, Register result) {
-  Label copy_chars_loop, return_length, return_zero, done;
+  XMMRegister tmp1Reg, XMMRegister tmp2Reg,
+  XMMRegister tmp3Reg, XMMRegister tmp4Reg,
+  Register tmp5, Register result) {
+  Label copy_chars_loop, return_length, return_zero, done, below_threshold;
 
   // rsi: src
   // rdi: dst
@@ -10853,11 +10969,141 @@
   // save length for return
   push(len);
 
+  if ((UseAVX > 2) && // AVX512
+    VM_Version::supports_avx512vlbw() &&
+    VM_Version::supports_bmi2()) {
+
+    set_vector_masking();  // opening of the stub context for programming mask registers
+
+    Label copy_32_loop, copy_loop_tail, copy_just_portion_of_candidates;
+
+    // alignment
+    Label post_alignment;
+
+    // If the length of the string is less than 32, handle it the
+    // old-fashioned way
+    testl(len, -32);
+    jcc(Assembler::zero, below_threshold);
+
+    // First check whether a character is compressible (<= 0xFF).
+    // Create mask to test for Unicode chars inside zmm vector
+    movl(result, 0x00FF);
+    evpbroadcastw(tmp2Reg, result, Assembler::AVX_512bit);
+
+    testl(len, -64);
+    jcc(Assembler::zero, post_alignment);
+
+    // Save k1
+    kmovql(k3, k1);
+
+    movl(tmp5, dst);
+    andl(tmp5, (64 - 1));
+    negl(tmp5);
+    andl(tmp5, (64 - 1));
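+    // tmp5 == (-dst) & 63: the number of chars to compress before dst reaches a
+    // 64-byte boundary.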
+
+    // bail out when there is nothing to be done
+    testl(tmp5, 0xFFFFFFFF);
+    jcc(Assembler::zero, post_alignment);
+
+    // ~(~0 << len), where len is the # of remaining elements to process
+    movl(result, 0xFFFFFFFF);
+    shlxl(result, result, tmp5);
+    notl(result);
+
+    kmovdl(k1, result);
+
+    evmovdquw(tmp1Reg, k1, Address(src, 0), Assembler::AVX_512bit);
+    evpcmpuw(k2, k1, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
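+    // If every char selected by k1 compared <= 0xFF the whole chunk is compressible;
+    // otherwise ktestd leaves CF clear and we take the partial-copy path.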
+    ktestd(k2, k1);
+    jcc(Assembler::carryClear, copy_just_portion_of_candidates);
+
+    evpmovwb(Address(dst, 0), k1, tmp1Reg, Assembler::AVX_512bit);
+
+    addptr(src, tmp5);
+    addptr(src, tmp5);
+    addptr(dst, tmp5);
+    subl(len, tmp5);
+
+    bind(post_alignment);
+    // end of alignment
+
+    movl(tmp5, len);
+    andl(tmp5, (32 - 1));   // tail count (in chars)
+    andl(len, ~(32 - 1));    // vector count (in chars)
+    jcc(Assembler::zero, copy_loop_tail);
+
+    lea(src, Address(src, len, Address::times_2));
+    lea(dst, Address(dst, len, Address::times_1));
+    negptr(len);
+
+    bind(copy_32_loop);
+    evmovdquw(tmp1Reg, Address(src, len, Address::times_2), Assembler::AVX_512bit);
+    evpcmpuw(k2, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
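+    // kortest of k2 with itself sets CF only when all 32 mask bits are set, i.e. when
+    // every char in the chunk is <= 0xFF.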
+    kortestdl(k2, k2);
+    jcc(Assembler::carryClear, copy_just_portion_of_candidates);
+
+    // All elements in the current processed chunk are valid candidates for
+    // compression. Write the truncated byte elements to memory.
+    evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit);
+    addptr(len, 32);
+    jcc(Assembler::notZero, copy_32_loop);
+
+    bind(copy_loop_tail);
+    // bail out when there is nothing to be done
+    testl(tmp5, 0xFFFFFFFF);
+    jcc(Assembler::zero, return_length);
+
+    // Save k1
+    kmovql(k3, k1);
+
+    movl(len, tmp5);
+
+    // ~(~0 << len), where len is the # of remaining elements to process
+    movl(result, 0xFFFFFFFF);
+    shlxl(result, result, len);
+    notl(result);
+
+    kmovdl(k1, result);
+
+    evmovdquw(tmp1Reg, k1, Address(src, 0), Assembler::AVX_512bit);
+    evpcmpuw(k2, k1, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
+    ktestd(k2, k1);
+    jcc(Assembler::carryClear, copy_just_portion_of_candidates);
+
+    evpmovwb(Address(dst, 0), k1, tmp1Reg, Assembler::AVX_512bit);
+    // Restore k1
+    kmovql(k1, k3);
+
+    jmp(return_length);
+
+    bind(copy_just_portion_of_candidates);
+    kmovdl(tmp5, k2);
+    tzcntl(tmp5, tmp5);
+
+    // ~(~0 << tmp5), where tmp5 is the number of elements in the chunk preceding
+    // the first element larger than 0xFF
+    movl(result, 0xFFFFFFFF);
+    shlxl(result, result, tmp5);
+    notl(result);
+
+    kmovdl(k1, result);
+
+    evpmovwb(Address(dst, 0), k1, tmp1Reg, Assembler::AVX_512bit);
+    // Restore k1
+    kmovql(k1, k3);
+
+    jmp(return_zero);
+
+    clear_vector_masking();   // closing of the stub context for programming mask registers
+  }
   if (UseSSE42Intrinsics) {
     assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
     Label copy_32_loop, copy_16, copy_tail;
 
+    bind(below_threshold);
+
     movl(result, len);
+
     movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vectors
 
     // vectored compression
@@ -10939,10 +11185,16 @@
 }
 
 // Inflate byte[] array to char[].
+//   ..\jdk\src\java.base\share\classes\java\lang\StringLatin1.java
+//   @HotSpotIntrinsicCandidate
+//   private static void inflate(byte[] src, int srcOff, char[] dst, int dstOff, int len) {
+//     for (int i = 0; i < len; i++) {
+//       dst[dstOff++] = (char)(src[srcOff++] & 0xff);
+//     }
+//   }
 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
-                                        XMMRegister tmp1, Register tmp2) {
-  Label copy_chars_loop, done;
-
+  XMMRegister tmp1, Register tmp2) {
+  Label copy_chars_loop, done, below_threshold;
   // rsi: src
   // rdi: dst
   // rdx: len
@@ -10953,20 +11205,109 @@
   // rdx holds length
   assert_different_registers(src, dst, len, tmp2);
 
+  if ((UseAVX > 2) && // AVX512
+    VM_Version::supports_avx512vlbw() &&
+    VM_Version::supports_bmi2()) {
+
+    set_vector_masking();  // opening of the stub context for programming mask registers
+
+    Label copy_32_loop, copy_tail;
+    Register tmp3_aliased = len;
+
+    // If the length of the string is less than 16, handle it the
+    // old-fashioned way
+    testl(len, -16);
+    jcc(Assembler::zero, below_threshold);
+
+    // In order to use only one arithmetic operation for the main loop we use
+    // this pre-calculation
+    movl(tmp2, len);
+    andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop
+    andl(len, -32);     // vector count
+    jccb(Assembler::zero, copy_tail);
+
+    lea(src, Address(src, len, Address::times_1));
+    lea(dst, Address(dst, len, Address::times_2));
+    negptr(len);
+
+
+    // inflate 32 chars per iter
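+    // (zero-extend 32 bytes to 32 words in a zmm register, then store all 64 bytes)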
+    bind(copy_32_loop);
+    vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_512bit);
+    evmovdquw(Address(dst, len, Address::times_2), tmp1, Assembler::AVX_512bit);
+    addptr(len, 32);
+    jcc(Assembler::notZero, copy_32_loop);
+
+    bind(copy_tail);
+    // bail out when there is nothing to be done
+    testl(tmp2, -1); // we don't destroy the contents of tmp2 here
+    jcc(Assembler::zero, done);
+
+    // Save k1
+    kmovql(k2, k1);
+
+    // ~(~0 << length), where length is the # of remaining elements to process
+    movl(tmp3_aliased, -1);
+    shlxl(tmp3_aliased, tmp3_aliased, tmp2);
+    notl(tmp3_aliased);
+    kmovdl(k1, tmp3_aliased);
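+    // e.g. tmp2 == 7: k1 == 0x7F, so the masked load below zero-extends only the seven
+    // remaining bytes and the masked store writes only the corresponding seven chars.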
+    evpmovzxbw(tmp1, k1, Address(src, 0), Assembler::AVX_512bit);
+    evmovdquw(Address(dst, 0), k1, tmp1, Assembler::AVX_512bit);
+
+    // Restore k1
+    kmovql(k1, k2);
+    jmp(done);
+
+    clear_vector_masking();   // closing of the stub context for programming mask registers
+  }
   if (UseSSE42Intrinsics) {
     assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
-    Label copy_8_loop, copy_bytes, copy_tail;
+    Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail;
 
     movl(tmp2, len);
-    andl(tmp2, 0x00000007);   // tail count (in chars)
-    andl(len, 0xfffffff8);    // vector count (in chars)
-    jccb(Assembler::zero, copy_tail);
+
+    if (UseAVX > 1) {
+      andl(tmp2, (16 - 1));
+      andl(len, -16);
+      jccb(Assembler::zero, copy_new_tail);
+    } else {
+      andl(tmp2, 0x00000007);   // tail count (in chars)
+      andl(len, 0xfffffff8);    // vector count (in chars)
+      jccb(Assembler::zero, copy_tail);
+    }
 
     // vectored inflation
     lea(src, Address(src, len, Address::times_1));
     lea(dst, Address(dst, len, Address::times_2));
     negptr(len);
 
+    if (UseAVX > 1) {
+      bind(copy_16_loop);
+      vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_256bit);
+      vmovdqu(Address(dst, len, Address::times_2), tmp1);
+      addptr(len, 16);
+      jcc(Assembler::notZero, copy_16_loop);
+
+      bind(below_threshold);
+      bind(copy_new_tail);
+      if (UseAVX > 2) {
+        movl(tmp2, len);
+      }
+      else {
+        movl(len, tmp2);
+      }
+      andl(tmp2, 0x00000007);
+      andl(len, 0xFFFFFFF8);
+      jccb(Assembler::zero, copy_tail);
+
+      pmovzxbw(tmp1, Address(src, 0));
+      movdqu(Address(dst, 0), tmp1);
+      addptr(src, 8);
+      addptr(dst, 2 * 8);
+
+      jmp(copy_tail, true);
+    }
+
     // inflate 8 chars per iter
     bind(copy_8_loop);
     pmovzxbw(tmp1, Address(src, len, Address::times_1));  // unpack to 8 words
@@ -11005,7 +11346,6 @@
   bind(done);
 }
 
-
 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
   switch (cond) {
     // Note some conditions are synonyms for others