8142980: SKX SpecJvm2008 - Derby
author mcberg
Thu, 19 Nov 2015 16:07:22 -0800
changeset 34203 6817dadf6c7e
parent 34202 5d19ca9c25a8
child 34204 5ad1ba3afecc
8142980: SKX SpecJvm2008 - Derby
Summary: Fix EVEX and AVX512 problems found by testing on 64-bit SKX and KNL EVEX enabled platforms.
Reviewed-by: iveresov, kvn, vlivanov
hotspot/src/cpu/x86/vm/assembler_x86.cpp
hotspot/src/cpu/x86/vm/assembler_x86.hpp
hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp
hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp
hotspot/src/cpu/x86/vm/sharedRuntime_x86_32.cpp
hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp
--- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp	Mon Nov 16 14:19:10 2015 +0100
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp	Thu Nov 19 16:07:22 2015 -0800
@@ -2152,6 +2152,23 @@
   emit_int8(0xC0 | encode);
 }
 
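+// Note: kmovwl/kmovdl load a 16-/32-bit value from a general-purpose register
+// into an opmask (k) register (opcode 0x92); evpcmpeq* below produce such
+// masks and kortest* tests them.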
+void Assembler::kmovwl(KRegister dst, Register src) {
+  NOT_LP64(assert(VM_Version::supports_evex(), ""));
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
+  int encode = kreg_prefix_and_encode(dst, knoreg, src, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
+  emit_int8((unsigned char)0x92);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::kmovdl(KRegister dst, Register src) {
+  NOT_LP64(assert(VM_Version::supports_evex(), ""));
+  VexSimdPrefix pre = !_legacy_mode_bw ? VEX_SIMD_F2 : VEX_SIMD_NONE;
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
+  int encode = kreg_prefix_and_encode(dst, knoreg, src, pre, VEX_OPCODE_0F, &attributes);
+  emit_int8((unsigned char)0x92);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
 void Assembler::kmovql(KRegister dst, KRegister src) {
   NOT_LP64(assert(VM_Version::supports_evex(), ""));
   InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
@@ -2187,20 +2204,39 @@
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
-void Assembler::kmovdl(KRegister dst, Register src) {
-  NOT_LP64(assert(VM_Version::supports_evex(), ""));
-  VexSimdPrefix pre = !_legacy_mode_bw ? VEX_SIMD_F2 : VEX_SIMD_NONE;
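+// Note: the kortest* emitters below OR two opmask registers and only update
+// EFLAGS: ZF is set when the OR is all zeroes, CF when it is all ones. This
+// lets callers branch on the masks produced by evpcmpeq* without moving them
+// to a general-purpose register first.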
+// This instruction produces ZF or CF flags
+void Assembler::kortestbl(KRegister src1, KRegister src2) {
+  NOT_LP64(assert(VM_Version::supports_avx512dq(), ""));
   InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
-  int encode = kreg_prefix_and_encode(dst, knoreg, src, pre, VEX_OPCODE_0F, &attributes);
-  emit_int8((unsigned char)0x92);
-  emit_int8((unsigned char)(0xC0 | encode));
-}
-
-void Assembler::kmovwl(KRegister dst, Register src) {
+  int encode = kreg_prefix_and_encode(src1, knoreg, src2, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  emit_int8((unsigned char)0x98);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+// This instruction produces ZF or CF flags
+void Assembler::kortestwl(KRegister src1, KRegister src2) {
   NOT_LP64(assert(VM_Version::supports_evex(), ""));
   InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
-  int encode = kreg_prefix_and_encode(dst, knoreg, src, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
-  emit_int8((unsigned char)0x92);
+  int encode = kreg_prefix_and_encode(src1, knoreg, src2, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
+  emit_int8((unsigned char)0x98);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+// This instruction produces ZF or CF flags
+void Assembler::kortestdl(KRegister src1, KRegister src2) {
+  NOT_LP64(assert(VM_Version::supports_avx512bw(), ""));
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
+  int encode = kreg_prefix_and_encode(src1, knoreg, src2, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  emit_int8((unsigned char)0x98);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+// This instruction produces ZF or CF flags
+void Assembler::kortestql(KRegister src1, KRegister src2) {
+  NOT_LP64(assert(VM_Version::supports_avx512bw(), ""));
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
+  int encode = kreg_prefix_and_encode(src1, knoreg, src2, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
+  emit_int8((unsigned char)0x98);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
@@ -2337,6 +2373,63 @@
 }
 
 // Move Unaligned EVEX enabled Vector (programmable : 8,16,32,64)
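+// Note: evmovdqub/evmovdquw emit the byte/word-granular EVEX unaligned moves
+// (vmovdqu8/vmovdqu16: F2-prefixed 0x6F for loads, 0x7F for stores, with
+// EVEX.W selecting byte vs. word); the element size only becomes observable
+// when a write mask is applied.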
+void Assembler::evmovdqub(XMMRegister dst, XMMRegister src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
+  emit_int8(0x6F);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::evmovdqub(XMMRegister dst, Address src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  InstructionMark im(this);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
+  vex_prefix(src, 0, dst->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
+  emit_int8(0x6F);
+  emit_operand(dst, src);
+}
+
+void Assembler::evmovdqub(Address dst, XMMRegister src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  assert(src != xnoreg, "sanity");
+  InstructionMark im(this);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
+  vex_prefix(dst, 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
+  emit_int8(0x7F);
+  emit_operand(src, dst);
+}
+
+void Assembler::evmovdquw(XMMRegister dst, XMMRegister src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
+  emit_int8(0x6F);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::evmovdquw(XMMRegister dst, Address src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  InstructionMark im(this);
+  InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
+  vex_prefix(src, 0, dst->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
+  emit_int8(0x6F);
+  emit_operand(dst, src);
+}
+
+void Assembler::evmovdquw(Address dst, XMMRegister src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  assert(src != xnoreg, "sanity");
+  InstructionMark im(this);
+  InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
+  vex_prefix(dst, 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
+  emit_int8(0x7F);
+  emit_operand(src, dst);
+}
+
 void Assembler::evmovdqul(XMMRegister dst, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
@@ -3033,6 +3126,36 @@
   emit_int8(imm8);
 }
 
+// In this context, the dst vector contains the components that are equal; non-equal components are zeroed in dst
+void Assembler::pcmpeqb(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  emit_int8(0x74);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+// In this context, the dst vector contains the components that are equal; non-equal components are zeroed in dst
+void Assembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  assert(VM_Version::supports_avx(), "");
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
+  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
+  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  emit_int8(0x74);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+// In this context, kdst is written with the mask used to process the equal components
+void Assembler::evpcmpeqb(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len) {
+  assert(VM_Version::supports_avx512bw(), "");
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
+  int encode = vex_prefix_and_encode(kdst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  emit_int8(0x74);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
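+// Note: unlike the VEX compares, the EVEX forms (evpcmpeq*) write an opmask
+// (k) register rather than a vector of all-ones/all-zero lanes, so callers
+// test the result with kortest*/kmov* instead of pmovmskb/ptest.
+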
+// In this context, the dst vector contains the components that are equal; non-equal components are zeroed in dst
 void Assembler::pcmpeqw(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
@@ -3041,9 +3164,9 @@
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
+// In this context, the dst vector contains the components that are equal; non-equal components are zeroed in dst
 void Assembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
-  assert(!VM_Version::supports_evex(), "");
   InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
@@ -3051,6 +3174,87 @@
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
+// In this context, kdst is written with the mask used to process the equal components
+void Assembler::evpcmpeqw(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len) {
+  assert(VM_Version::supports_avx512bw(), "");
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
+  int encode = vex_prefix_and_encode(kdst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  emit_int8(0x75);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+// In this context, the dst vector contains the components that are equal; non-equal components are zeroed in dst
+void Assembler::pcmpeqd(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  emit_int8(0x76);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+// In this context, the dst vector contains the components that are equal; non-equal components are zeroed in dst
+void Assembler::vpcmpeqd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  assert(VM_Version::supports_avx(), "");
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
+  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
+  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  emit_int8(0x76);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+// In this context, kdst is written with the mask used to process the equal components
+void Assembler::evpcmpeqd(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
+  int encode = vex_prefix_and_encode(kdst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  emit_int8(0x76);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+// In this context, the dst vector contains the components that are equal; non-equal components are zeroed in dst
+void Assembler::pcmpeqq(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse4_1(), ""));
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  emit_int8(0x29);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+// In this context, the dst vector contains the components that are equal; non-equal components are zeroed in dst
+void Assembler::vpcmpeqq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  assert(VM_Version::supports_avx(), "");
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
+  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
+  int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  emit_int8(0x29);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+// In this context, kdst is written with the mask used to process the equal components
+void Assembler::evpcmpeqq(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
+  int encode = vex_prefix_and_encode(kdst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  emit_int8(0x29);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+// In this context, kdst is written with the mask used to process the equal components
+void Assembler::evpcmpeqq(KRegister kdst, XMMRegister nds, Address src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  InstructionMark im(this);
+  InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit);
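+  // EVEX_FV with a 64-bit input size selects the full-vector tuple so that a
+  // short displacement in 'src' can use the compressed disp8*N encoding.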
+  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
+  int dst_enc = kdst->encoding();
+  vex_prefix(src, nds_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  emit_int8(0x29);
+  emit_operand(as_Register(dst_enc), src);
+}
+
 void Assembler::pmovmskb(Register dst, XMMRegister src) {
   assert(VM_Version::supports_sse2(), "");
   InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
@@ -3139,11 +3343,11 @@
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
-void Assembler::vpmovzxbw(XMMRegister dst, Address src) {
+void Assembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
   InstructionMark im(this);
   assert(dst != xnoreg, "sanity");
-  InstructionAttr attributes(AVX_256bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ false);
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_HVM, /* input_size_in_bits */ EVEX_NObit);
   vex_prefix(src, 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x30);
--- a/hotspot/src/cpu/x86/vm/assembler_x86.hpp	Mon Nov 16 14:19:10 2015 +0100
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.hpp	Thu Nov 19 16:07:22 2015 -0800
@@ -1331,13 +1331,18 @@
 
   void movddup(XMMRegister dst, XMMRegister src);
 
+  void kmovwl(KRegister dst, Register src);
+  void kmovdl(KRegister dst, Register src);
   void kmovql(KRegister dst, KRegister src);
   void kmovql(KRegister dst, Register src);
-  void kmovdl(KRegister dst, Register src);
-  void kmovwl(KRegister dst, Register src);
   void kmovql(Address dst, KRegister src);
   void kmovql(KRegister dst, Address src);
 
+  void kortestbl(KRegister dst, KRegister src);
+  void kortestwl(KRegister dst, KRegister src);
+  void kortestdl(KRegister dst, KRegister src);
+  void kortestql(KRegister dst, KRegister src);
+
   void movdl(XMMRegister dst, Register src);
   void movdl(Register dst, XMMRegister src);
   void movdl(XMMRegister dst, Address src);
@@ -1362,6 +1367,12 @@
   void vmovdqu(XMMRegister dst, XMMRegister src);
 
    // Move Unaligned 512bit Vector
+  void evmovdqub(Address dst, XMMRegister src, int vector_len);
+  void evmovdqub(XMMRegister dst, Address src, int vector_len);
+  void evmovdqub(XMMRegister dst, XMMRegister src, int vector_len);
+  void evmovdquw(Address dst, XMMRegister src, int vector_len);
+  void evmovdquw(XMMRegister dst, Address src, int vector_len);
+  void evmovdquw(XMMRegister dst, XMMRegister src, int vector_len);
   void evmovdqul(Address dst, XMMRegister src, int vector_len);
   void evmovdqul(XMMRegister dst, Address src, int vector_len);
   void evmovdqul(XMMRegister dst, XMMRegister src, int vector_len);
@@ -1507,8 +1518,22 @@
   void pcmpestri(XMMRegister xmm1, XMMRegister xmm2, int imm8);
   void pcmpestri(XMMRegister xmm1, Address src, int imm8);
 
+  void pcmpeqb(XMMRegister dst, XMMRegister src);
+  void vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+  void evpcmpeqb(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len);
+
   void pcmpeqw(XMMRegister dst, XMMRegister src);
   void vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+  void evpcmpeqw(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len);
+
+  void pcmpeqd(XMMRegister dst, XMMRegister src);
+  void vpcmpeqd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+  void evpcmpeqd(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len);
+
+  void pcmpeqq(XMMRegister dst, XMMRegister src);
+  void vpcmpeqq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+  void evpcmpeqq(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len);
+  void evpcmpeqq(KRegister kdst, XMMRegister nds, Address src, int vector_len);
 
   void pmovmskb(Register dst, XMMRegister src);
   void vpmovmskb(Register dst, XMMRegister src);
@@ -1529,7 +1554,7 @@
   void pmovzxbw(XMMRegister dst, XMMRegister src);
   void pmovzxbw(XMMRegister dst, Address src);
 
-  void vpmovzxbw(XMMRegister dst, Address src);
+  void vpmovzxbw(XMMRegister dst, Address src, int vector_len);
 
 #ifndef _LP64 // no 32bit push/pop on amd64
   void popl(Address dst);
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp	Mon Nov 16 14:19:10 2015 +0100
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp	Thu Nov 19 16:07:22 2015 -0800
@@ -3949,6 +3949,236 @@
   testl(dst, as_Address(src));
 }
 
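+// The MacroAssembler wrappers below all follow the same pattern: these
+// instructions either have no EVEX form or require AVX512BW, so on EVEX
+// targets without BW support (e.g. KNL) they must be emitted with legacy/VEX
+// encodings, which cannot address xmm16-xmm31. If an operand lives in that
+// upper bank, the wrapper spills xmm0 (and xmm1 when both operands are high)
+// to the stack with 512-bit moves, performs the operation in the lower bank,
+// copies the result back, and restores the scratch registers.
+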
+void MacroAssembler::pcmpeqb(XMMRegister dst, XMMRegister src) {
+  int dst_enc = dst->encoding();
+  int src_enc = src->encoding();
+  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
+    Assembler::pcmpeqb(dst, src);
+  } else if ((dst_enc < 16) && (src_enc < 16)) {
+    Assembler::pcmpeqb(dst, src);
+  } else if (src_enc < 16) {
+    subptr(rsp, 64);
+    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
+    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
+    Assembler::pcmpeqb(xmm0, src);
+    movdqu(dst, xmm0);
+    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
+    addptr(rsp, 64);
+  } else if (dst_enc < 16) {
+    subptr(rsp, 64);
+    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
+    evmovdqul(xmm0, src, Assembler::AVX_512bit);
+    Assembler::pcmpeqb(dst, xmm0);
+    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
+    addptr(rsp, 64);
+  } else {
+    subptr(rsp, 64);
+    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
+    subptr(rsp, 64);
+    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
+    movdqu(xmm0, src);
+    movdqu(xmm1, dst);
+    Assembler::pcmpeqb(xmm1, xmm0);
+    movdqu(dst, xmm1);
+    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
+    addptr(rsp, 64);
+    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
+    addptr(rsp, 64);
+  }
+}
+
+void MacroAssembler::pcmpeqw(XMMRegister dst, XMMRegister src) {
+  int dst_enc = dst->encoding();
+  int src_enc = src->encoding();
+  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
+    Assembler::pcmpeqw(dst, src);
+  } else if ((dst_enc < 16) && (src_enc < 16)) {
+    Assembler::pcmpeqw(dst, src);
+  } else if (src_enc < 16) {
+    subptr(rsp, 64);
+    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
+    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
+    Assembler::pcmpeqw(xmm0, src);
+    movdqu(dst, xmm0);
+    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
+    addptr(rsp, 64);
+  } else if (dst_enc < 16) {
+    subptr(rsp, 64);
+    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
+    evmovdqul(xmm0, src, Assembler::AVX_512bit);
+    Assembler::pcmpeqw(dst, xmm0);
+    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
+    addptr(rsp, 64);
+  } else {
+    subptr(rsp, 64);
+    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
+    subptr(rsp, 64);
+    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
+    movdqu(xmm0, src);
+    movdqu(xmm1, dst);
+    Assembler::pcmpeqw(xmm1, xmm0);
+    movdqu(dst, xmm1);
+    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
+    addptr(rsp, 64);
+    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
+    addptr(rsp, 64);
+  }
+}
+
+void MacroAssembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
+  int dst_enc = dst->encoding();
+  if (dst_enc < 16) {
+    Assembler::pcmpestri(dst, src, imm8);
+  } else {
+    subptr(rsp, 64);
+    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
+    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
+    Assembler::pcmpestri(xmm0, src, imm8);
+    movdqu(dst, xmm0);
+    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
+    addptr(rsp, 64);
+  }
+}
+
+void MacroAssembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
+  int dst_enc = dst->encoding();
+  int src_enc = src->encoding();
+  if ((dst_enc < 16) && (src_enc < 16)) {
+    Assembler::pcmpestri(dst, src, imm8);
+  } else if (src_enc < 16) {
+    subptr(rsp, 64);
+    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
+    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
+    Assembler::pcmpestri(xmm0, src, imm8);
+    movdqu(dst, xmm0);
+    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
+    addptr(rsp, 64);
+  } else if (dst_enc < 16) {
+    subptr(rsp, 64);
+    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
+    evmovdqul(xmm0, src, Assembler::AVX_512bit);
+    Assembler::pcmpestri(dst, xmm0, imm8);
+    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
+    addptr(rsp, 64);
+  } else {
+    subptr(rsp, 64);
+    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
+    subptr(rsp, 64);
+    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
+    movdqu(xmm0, src);
+    movdqu(xmm1, dst);
+    Assembler::pcmpestri(xmm1, xmm0, imm8);
+    movdqu(dst, xmm1);
+    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
+    addptr(rsp, 64);
+    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
+    addptr(rsp, 64);
+  }
+}
+
+void MacroAssembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
+  int dst_enc = dst->encoding();
+  int src_enc = src->encoding();
+  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
+    Assembler::pmovzxbw(dst, src);
+  } else if ((dst_enc < 16) && (src_enc < 16)) {
+    Assembler::pmovzxbw(dst, src);
+  } else if (src_enc < 16) {
+    subptr(rsp, 64);
+    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
+    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
+    Assembler::pmovzxbw(xmm0, src);
+    movdqu(dst, xmm0);
+    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
+    addptr(rsp, 64);
+  } else if (dst_enc < 16) {
+    subptr(rsp, 64);
+    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
+    evmovdqul(xmm0, src, Assembler::AVX_512bit);
+    Assembler::pmovzxbw(dst, xmm0);
+    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
+    addptr(rsp, 64);
+  } else {
+    subptr(rsp, 64);
+    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
+    subptr(rsp, 64);
+    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
+    movdqu(xmm0, src);
+    movdqu(xmm1, dst);
+    Assembler::pmovzxbw(xmm1, xmm0);
+    movdqu(dst, xmm1);
+    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
+    addptr(rsp, 64);
+    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
+    addptr(rsp, 64);
+  }
+}
+
+void MacroAssembler::pmovzxbw(XMMRegister dst, Address src) {
+  int dst_enc = dst->encoding();
+  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
+    Assembler::pmovzxbw(dst, src);
+  } else if (dst_enc < 16) {
+    Assembler::pmovzxbw(dst, src);
+  } else {
+    subptr(rsp, 64);
+    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
+    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
+    Assembler::pmovzxbw(xmm0, src);
+    movdqu(dst, xmm0);
+    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
+    addptr(rsp, 64);
+  }
+}
+
+void MacroAssembler::pmovmskb(Register dst, XMMRegister src) {
+  int src_enc = src->encoding();
+  if (src_enc < 16) {
+    Assembler::pmovmskb(dst, src);
+  } else {
+    subptr(rsp, 64);
+    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
+    evmovdqul(xmm0, src, Assembler::AVX_512bit);
+    Assembler::pmovmskb(dst, xmm0);
+    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
+    addptr(rsp, 64);
+  }
+}
+
+void MacroAssembler::ptest(XMMRegister dst, XMMRegister src) {
+  int dst_enc = dst->encoding();
+  int src_enc = src->encoding();
+  if ((dst_enc < 16) && (src_enc < 16)) {
+    Assembler::ptest(dst, src);
+  } else if (src_enc < 16) {
+    subptr(rsp, 64);
+    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
+    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
+    Assembler::ptest(xmm0, src);
+    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
+    addptr(rsp, 64);
+  } else if (dst_enc < 16) {
+    subptr(rsp, 64);
+    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
+    evmovdqul(xmm0, src, Assembler::AVX_512bit);
+    Assembler::ptest(dst, xmm0);
+    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
+    addptr(rsp, 64);
+  } else {
+    subptr(rsp, 64);
+    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
+    subptr(rsp, 64);
+    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
+    movdqu(xmm0, src);
+    movdqu(xmm1, dst);
+    Assembler::ptest(xmm1, xmm0);
+    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
+    addptr(rsp, 64);
+    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
+    addptr(rsp, 64);
+  }
+}
+
 void MacroAssembler::sqrtsd(XMMRegister dst, AddressLiteral src) {
   if (reachable(src)) {
     Assembler::sqrtsd(dst, as_Address(src));
@@ -4256,6 +4486,214 @@
   }
 }
 
+void MacroAssembler::vpbroadcastw(XMMRegister dst, XMMRegister src) {
+  int dst_enc = dst->encoding();
+  int src_enc = src->encoding();
+  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
+    Assembler::vpbroadcastw(dst, src);
+  } else if ((dst_enc < 16) && (src_enc < 16)) {
+    Assembler::vpbroadcastw(dst, src);
+  } else if (src_enc < 16) {
+    subptr(rsp, 64);
+    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
+    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
+    Assembler::vpbroadcastw(xmm0, src);
+    movdqu(dst, xmm0);
+    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
+    addptr(rsp, 64);
+  } else if (dst_enc < 16) {
+    subptr(rsp, 64);
+    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
+    evmovdqul(xmm0, src, Assembler::AVX_512bit);
+    Assembler::vpbroadcastw(dst, xmm0);
+    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
+    addptr(rsp, 64);
+  } else {
+    subptr(rsp, 64);
+    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
+    subptr(rsp, 64);
+    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
+    movdqu(xmm0, src);
+    movdqu(xmm1, dst);
+    Assembler::vpbroadcastw(xmm1, xmm0);
+    movdqu(dst, xmm1);
+    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
+    addptr(rsp, 64);
+    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
+    addptr(rsp, 64);
+  }
+}
+
+void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  int dst_enc = dst->encoding();
+  int nds_enc = nds->encoding();
+  int src_enc = src->encoding();
+  assert(dst_enc == nds_enc, "");
+  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
+    Assembler::vpcmpeqb(dst, nds, src, vector_len);
+  } else if ((dst_enc < 16) && (src_enc < 16)) {
+    Assembler::vpcmpeqb(dst, nds, src, vector_len);
+  } else if (src_enc < 16) {
+    subptr(rsp, 64);
+    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
+    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
+    Assembler::vpcmpeqb(xmm0, xmm0, src, vector_len);
+    movdqu(dst, xmm0);
+    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
+    addptr(rsp, 64);
+  } else if (dst_enc < 16) {
+    subptr(rsp, 64);
+    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
+    evmovdqul(xmm0, src, Assembler::AVX_512bit);
+    Assembler::vpcmpeqb(dst, dst, xmm0, vector_len);
+    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
+    addptr(rsp, 64);
+  } else {
+    subptr(rsp, 64);
+    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
+    subptr(rsp, 64);
+    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
+    movdqu(xmm0, src);
+    movdqu(xmm1, dst);
+    Assembler::vpcmpeqb(xmm1, xmm1, xmm0, vector_len);
+    movdqu(dst, xmm1);
+    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
+    addptr(rsp, 64);
+    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
+    addptr(rsp, 64);
+  }
+}
+
+void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  int dst_enc = dst->encoding();
+  int nds_enc = nds->encoding();
+  int src_enc = src->encoding();
+  assert(dst_enc == nds_enc, "");
+  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
+    Assembler::vpcmpeqw(dst, nds, src, vector_len);
+  } else if ((dst_enc < 16) && (src_enc < 16)) {
+    Assembler::vpcmpeqw(dst, nds, src, vector_len);
+  } else if (src_enc < 16) {
+    subptr(rsp, 64);
+    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
+    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
+    Assembler::vpcmpeqw(xmm0, xmm0, src, vector_len);
+    movdqu(dst, xmm0);
+    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
+    addptr(rsp, 64);
+  } else if (dst_enc < 16) {
+    subptr(rsp, 64);
+    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
+    evmovdqul(xmm0, src, Assembler::AVX_512bit);
+    Assembler::vpcmpeqw(dst, dst, xmm0, vector_len);
+    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
+    addptr(rsp, 64);
+  } else {
+    subptr(rsp, 64);
+    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
+    subptr(rsp, 64);
+    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
+    movdqu(xmm0, src);
+    movdqu(xmm1, dst);
+    Assembler::vpcmpeqw(xmm1, xmm1, xmm0, vector_len);
+    movdqu(dst, xmm1);
+    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
+    addptr(rsp, 64);
+    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
+    addptr(rsp, 64);
+  }
+}
+
+void MacroAssembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) {
+  int dst_enc = dst->encoding();
+  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
+    Assembler::vpmovzxbw(dst, src, vector_len);
+  } else if (dst_enc < 16) {
+    Assembler::vpmovzxbw(dst, src, vector_len);
+  } else {
+    subptr(rsp, 64);
+    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
+    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
+    Assembler::vpmovzxbw(xmm0, src, vector_len);
+    movdqu(dst, xmm0);
+    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
+    addptr(rsp, 64);
+  }
+}
+
+void MacroAssembler::vpmovmskb(Register dst, XMMRegister src) {
+  int src_enc = src->encoding();
+  if (src_enc < 16) {
+    Assembler::vpmovmskb(dst, src);
+  } else {
+    subptr(rsp, 64);
+    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
+    evmovdqul(xmm0, src, Assembler::AVX_512bit);
+    Assembler::vpmovmskb(dst, xmm0);
+    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
+    addptr(rsp, 64);
+  }
+}
+
+void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  int dst_enc = dst->encoding();
+  int nds_enc = nds->encoding();
+  int src_enc = src->encoding();
+  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
+    Assembler::vpmullw(dst, nds, src, vector_len);
+  } else if ((dst_enc < 16) && (src_enc < 16)) {
+    Assembler::vpmullw(dst, dst, src, vector_len);
+  } else if ((dst_enc < 16) && (nds_enc < 16)) {
+    // use nds as scratch for src
+    evmovdqul(nds, src, Assembler::AVX_512bit);
+    Assembler::vpmullw(dst, dst, nds, vector_len);
+  } else if ((src_enc < 16) && (nds_enc < 16)) {
+    // use nds as scratch for dst
+    evmovdqul(nds, dst, Assembler::AVX_512bit);
+    Assembler::vpmullw(nds, nds, src, vector_len);
+    evmovdqul(dst, nds, Assembler::AVX_512bit);
+  } else if (dst_enc < 16) {
+    // use nds as scratch to save xmm0, then use xmm0 to hold src
+    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
+    evmovdqul(xmm0, src, Assembler::AVX_512bit);
+    Assembler::vpmullw(dst, dst, xmm0, vector_len);
+    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
+  } else {
+    // worst-case scenario: all regs are in the upper bank
+    subptr(rsp, 64);
+    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
+    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
+    evmovdqul(xmm1, src, Assembler::AVX_512bit);
+    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
+    Assembler::vpmullw(xmm0, xmm0, xmm1, vector_len);
+    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
+    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
+    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
+    addptr(rsp, 64);
+  }
+}
+
+void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  int dst_enc = dst->encoding();
+  int nds_enc = nds->encoding();
+  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
+    Assembler::vpmullw(dst, nds, src, vector_len);
+  } else if (dst_enc < 16) {
+    Assembler::vpmullw(dst, dst, src, vector_len);
+  } else if (nds_enc < 16) {
+    // implies dst_enc is in the upper bank; use nds as scratch
+    evmovdqul(nds, dst, Assembler::AVX_512bit);
+    Assembler::vpmullw(nds, nds, src, vector_len);
+    evmovdqul(dst, nds, Assembler::AVX_512bit);
+  } else {
+    // worst-case scenario: all regs are in the upper bank
+    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
+    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
+    Assembler::vpmullw(xmm0, xmm0, src, vector_len);
+    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
+  }
+}
+
 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   int dst_enc = dst->encoding();
   int nds_enc = nds->encoding();
@@ -4374,66 +4812,6 @@
   }
 }
 
-
-void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
-  int dst_enc = dst->encoding();
-  int nds_enc = nds->encoding();
-  int src_enc = src->encoding();
-  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
-    Assembler::vpmullw(dst, nds, src, vector_len);
-  } else if ((dst_enc < 16) && (src_enc < 16)) {
-    Assembler::vpmullw(dst, dst, src, vector_len);
-  } else if ((dst_enc < 16) && (nds_enc < 16)) {
-    // use nds as scratch for src
-    evmovdqul(nds, src, Assembler::AVX_512bit);
-    Assembler::vpmullw(dst, dst, nds, vector_len);
-  } else if ((src_enc < 16) && (nds_enc < 16)) {
-    // use nds as scratch for dst
-    evmovdqul(nds, dst, Assembler::AVX_512bit);
-    Assembler::vpmullw(nds, nds, src, vector_len);
-    evmovdqul(dst, nds, Assembler::AVX_512bit);
-  } else if (dst_enc < 16) {
-    // use nds as scatch for xmm0 to hold src
-    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, src, Assembler::AVX_512bit);
-    Assembler::vpmullw(dst, dst, xmm0, vector_len);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-  } else {
-    // worse case scenario, all regs are in the upper bank
-    subptr(rsp, 64);
-    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
-    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm1, src, Assembler::AVX_512bit);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::vpmullw(xmm0, xmm0, xmm1, vector_len);
-    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
-    addptr(rsp, 64);
-  }
-}
-
-void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
-  int dst_enc = dst->encoding();
-  int nds_enc = nds->encoding();
-  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
-    Assembler::vpmullw(dst, nds, src, vector_len);
-  } else if (dst_enc < 16) {
-    Assembler::vpmullw(dst, dst, src, vector_len);
-  } else if (nds_enc < 16) {
-    // implies dst_enc in upper bank with src as scratch
-    evmovdqul(nds, dst, Assembler::AVX_512bit);
-    Assembler::vpmullw(nds, nds, src, vector_len);
-    evmovdqul(dst, nds, Assembler::AVX_512bit);
-  } else {
-    // worse case scenario, all regs in upper bank
-    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::vpmullw(xmm0, xmm0, src, vector_len);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-  }
-}
-
 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
   int dst_enc = dst->encoding();
   int nds_enc = nds->encoding();
@@ -4638,6 +5016,40 @@
   }
 }
 
+void MacroAssembler::vptest(XMMRegister dst, XMMRegister src) {
+  int dst_enc = dst->encoding();
+  int src_enc = src->encoding();
+  if ((dst_enc < 16) && (src_enc < 16)) {
+    Assembler::vptest(dst, src);
+  } else if (src_enc < 16) {
+    subptr(rsp, 64);
+    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
+    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
+    Assembler::vptest(xmm0, src);
+    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
+    addptr(rsp, 64);
+  } else if (dst_enc < 16) {
+    subptr(rsp, 64);
+    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
+    evmovdqul(xmm0, src, Assembler::AVX_512bit);
+    Assembler::vptest(dst, xmm0);
+    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
+    addptr(rsp, 64);
+  } else {
+    subptr(rsp, 64);
+    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
+    subptr(rsp, 64);
+    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
+    movdqu(xmm0, src);
+    movdqu(xmm1, dst);
+    Assembler::vptest(xmm1, xmm0);
+    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
+    addptr(rsp, 64);
+    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
+    addptr(rsp, 64);
+  }
+}
+
 // This instruction exists within macros, ergo we cannot control its input
 // when emitted through those patterns.
 void MacroAssembler::punpcklbw(XMMRegister dst, XMMRegister src) {
@@ -7722,7 +8134,7 @@
       vmovdqu(vec1, Address(str1, result, scale));
       vpxor(vec1, Address(str2, result, scale));
     } else {
-      vpmovzxbw(vec1, Address(str1, result, scale1));
+      vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
       vpxor(vec1, Address(str2, result, scale2));
     }
     vptest(vec1, vec1);
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp	Mon Nov 16 14:19:10 2015 +0100
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp	Thu Nov 19 16:07:22 2015 -0800
@@ -1004,6 +1004,19 @@
     Assembler::pclmulqdq(dst, src, 0x11);
   }
 
+  void pcmpeqb(XMMRegister dst, XMMRegister src);
+  void pcmpeqw(XMMRegister dst, XMMRegister src);
+
+  void pcmpestri(XMMRegister dst, Address src, int imm8);
+  void pcmpestri(XMMRegister dst, XMMRegister src, int imm8);
+
+  void pmovzxbw(XMMRegister dst, XMMRegister src);
+  void pmovzxbw(XMMRegister dst, Address src);
+
+  void pmovmskb(Register dst, XMMRegister src);
+
+  void ptest(XMMRegister dst, XMMRegister src);
+
   void sqrtsd(XMMRegister dst, XMMRegister src)    { Assembler::sqrtsd(dst, src); }
   void sqrtsd(XMMRegister dst, Address src)        { Assembler::sqrtsd(dst, src); }
   void sqrtsd(XMMRegister dst, AddressLiteral src);
@@ -1061,15 +1074,23 @@
   void vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
   void vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
 
+  void vpbroadcastw(XMMRegister dst, XMMRegister src);
+
+  void vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+  void vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+
+  void vpmovzxbw(XMMRegister dst, Address src, int vector_len);
+  void vpmovmskb(Register dst, XMMRegister src);
+
+  void vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+  void vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
+
   void vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
   void vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
 
   void vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
   void vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
 
-  void vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
-  void vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
-
   void vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
   void vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len);
 
@@ -1079,6 +1100,8 @@
   void vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
   void vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len);
 
+  void vptest(XMMRegister dst, XMMRegister src);
+
   void punpcklbw(XMMRegister dst, XMMRegister src);
   void punpcklbw(XMMRegister dst, Address src) { Assembler::punpcklbw(dst, src); }
 
--- a/hotspot/src/cpu/x86/vm/sharedRuntime_x86_32.cpp	Mon Nov 16 14:19:10 2015 +0100
+++ b/hotspot/src/cpu/x86/vm/sharedRuntime_x86_32.cpp	Thu Nov 19 16:07:22 2015 -0800
@@ -198,7 +198,7 @@
     }
   }
 
-  if (vect_words > 0) {
+  if (save_vectors) {
     assert(vect_words*wordSize == 128, "");
    __ subptr(rsp, 128); // Save upper half of YMM registers
     for (int n = 0; n < num_xmm_regs; n++) {
@@ -266,21 +266,7 @@
 #else
   assert(!restore_vectors, "vectors are generated only by C2");
 #endif
-  int off = xmm0_off;
-  int delta = xmm1_off - off;
-
-  if (UseSSE == 1) {
-    assert(additional_frame_bytes == 0, "");
-    for (int n = 0; n < num_xmm_regs; n++) {
-      __ movflt(as_XMMRegister(n), Address(rsp, off*wordSize));
-      off += delta;
-    }
-  } else if (UseSSE >= 2) {
-    for (int n = 0; n < num_xmm_regs; n++) {
-      __ movdqu(as_XMMRegister(n), Address(rsp, off*wordSize+additional_frame_bytes));
-      off += delta;
-    }
-  }
+
   if (restore_vectors) {
     assert(additional_frame_bytes == 128, "");
     if (UseAVX > 2) {
@@ -296,6 +282,23 @@
     }
    __ addptr(rsp, additional_frame_bytes); // Save upper half of YMM registers
   }
+
+  int off = xmm0_off;
+  int delta = xmm1_off - off;
+
+  if (UseSSE == 1) {
+    for (int n = 0; n < num_xmm_regs; n++) {
+      __ movflt(as_XMMRegister(n), Address(rsp, off*wordSize));
+      off += delta;
+    }
+  } else if (UseSSE >= 2) {
+    // additional_frame_bytes is only populated for the restore_vectors case; otherwise it is 0
+    for (int n = 0; n < num_xmm_regs; n++) {
+      __ movdqu(as_XMMRegister(n), Address(rsp, off*wordSize+additional_frame_bytes));
+      off += delta;
+    }
+  }
+
   __ pop_FPU_state();
   __ addptr(rsp, FPU_regs_live*wordSize); // Pop FPU registers
 
--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Mon Nov 16 14:19:10 2015 +0100
+++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Thu Nov 19 16:07:22 2015 -0800
@@ -1439,8 +1439,8 @@
       // Copy 64-bytes per iteration
       __ BIND(L_loop);
       if (UseAVX > 2) {
-        __ evmovdqul(xmm0, Address(from, qword_count, Address::times_8, 32), Assembler::AVX_512bit);
-        __ evmovdqul(Address(dest, qword_count, Address::times_8, 32), xmm0, Assembler::AVX_512bit);
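+        // A single 512-bit move covers the whole 64-byte chunk, so it must
+        // start at displacement 0; the previous displacement of 32 shifted
+        // the copy window by half a chunk.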
+        __ evmovdqul(xmm0, Address(from, qword_count, Address::times_8, 0), Assembler::AVX_512bit);
+        __ evmovdqul(Address(dest, qword_count, Address::times_8, 0), xmm0, Assembler::AVX_512bit);
       } else if (UseAVX == 2) {
         __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
         __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);