8076276: Add support for AVX512
author kvn
Fri, 08 May 2015 11:49:20 -0700
changeset 30624 2e1803c8a26d
parent 30596 0322b394e7fd
child 30627 d79f16a39a8c
8076276: Add support for AVX512
Reviewed-by: kvn, roland
Contributed-by: michael.c.berg@intel.com
hotspot/src/cpu/x86/vm/assembler_x86.cpp
hotspot/src/cpu/x86/vm/assembler_x86.hpp
hotspot/src/cpu/x86/vm/c1_FrameMap_x86.cpp
hotspot/src/cpu/x86/vm/c1_FrameMap_x86.hpp
hotspot/src/cpu/x86/vm/c1_LinearScan_x86.hpp
hotspot/src/cpu/x86/vm/c1_Runtime1_x86.cpp
hotspot/src/cpu/x86/vm/c2_init_x86.cpp
hotspot/src/cpu/x86/vm/frame_x86.hpp
hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp
hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp
hotspot/src/cpu/x86/vm/register_definitions_x86.cpp
hotspot/src/cpu/x86/vm/register_x86.cpp
hotspot/src/cpu/x86/vm/register_x86.hpp
hotspot/src/cpu/x86/vm/sharedRuntime_x86_32.cpp
hotspot/src/cpu/x86/vm/sharedRuntime_x86_64.cpp
hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp
hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp
hotspot/src/cpu/x86/vm/vm_version_x86.cpp
hotspot/src/cpu/x86/vm/vm_version_x86.hpp
hotspot/src/cpu/x86/vm/vmreg_x86.cpp
hotspot/src/cpu/x86/vm/vmreg_x86.hpp
hotspot/src/cpu/x86/vm/vmreg_x86.inline.hpp
hotspot/src/cpu/x86/vm/x86.ad
hotspot/src/cpu/x86/vm/x86_32.ad
hotspot/src/cpu/x86/vm/x86_64.ad
hotspot/src/share/vm/adlc/archDesc.cpp
hotspot/src/share/vm/adlc/formssel.cpp
hotspot/src/share/vm/c1/c1_LinearScan.cpp
hotspot/src/share/vm/opto/c2_globals.hpp
hotspot/src/share/vm/opto/chaitin.cpp
hotspot/src/share/vm/opto/chaitin.hpp
hotspot/src/share/vm/opto/classes.hpp
hotspot/src/share/vm/opto/compile.cpp
hotspot/src/share/vm/opto/matcher.cpp
hotspot/src/share/vm/opto/opcodes.cpp
hotspot/src/share/vm/opto/opcodes.hpp
hotspot/src/share/vm/opto/optoreg.hpp
hotspot/src/share/vm/opto/output.cpp
hotspot/src/share/vm/opto/regmask.cpp
hotspot/src/share/vm/opto/regmask.hpp
hotspot/src/share/vm/opto/type.cpp
hotspot/src/share/vm/opto/type.hpp
hotspot/src/share/vm/opto/vectornode.cpp
hotspot/src/share/vm/opto/vectornode.hpp
hotspot/src/share/vm/runtime/vmStructs.cpp
hotspot/test/compiler/loopopts/superword/SumRed_Long.java
--- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp	Thu May 07 15:34:45 2015 -0700
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp	Fri May 08 11:49:20 2015 -0700
@@ -54,6 +54,36 @@
 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
 // Implementation of AddressLiteral
 
+// A 2-D table for managing compressed displacement (disp8) on EVEX-enabled platforms.
+unsigned char tuple_table[Assembler::EVEX_ETUP + 1][Assembler::AVX_512bit + 1] = {
+  // -----------------Table 4.5 -------------------- //
+  16, 32, 64,  // EVEX_FV(0)
+  4,  4,  4,   // EVEX_FV(1) - with Evex.b
+  16, 32, 64,  // EVEX_FV(2) - with Evex.w
+  8,  8,  8,   // EVEX_FV(3) - with Evex.w and Evex.b
+  8,  16, 32,  // EVEX_HV(0)
+  4,  4,  4,   // EVEX_HV(1) - with Evex.b
+  // -----------------Table 4.6 -------------------- //
+  16, 32, 64,  // EVEX_FVM(0)
+  1,  1,  1,   // EVEX_T1S(0)
+  2,  2,  2,   // EVEX_T1S(1)
+  4,  4,  4,   // EVEX_T1S(2)
+  8,  8,  8,   // EVEX_T1S(3)
+  4,  4,  4,   // EVEX_T1F(0)
+  8,  8,  8,   // EVEX_T1F(1)
+  8,  8,  8,   // EVEX_T2(0)
+  0,  16, 16,  // EVEX_T2(1)
+  0,  16, 16,  // EVEX_T4(0)
+  0,  0,  32,  // EVEX_T4(1)
+  0,  0,  32,  // EVEX_T8(0)
+  8,  16, 32,  // EVEX_HVM(0)
+  4,  8,  16,  // EVEX_QVM(0)
+  2,  4,  8,   // EVEX_OVM(0)
+  16, 16, 16,  // EVEX_M128(0)
+  8,  32, 64,  // EVEX_DUP(0)
+  0,  0,  0    // EVEX_NTUP
+};
+
 AddressLiteral::AddressLiteral(address target, relocInfo::relocType rtype) {
   _is_lval = false;
   _target = target;
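The tuple_table added in the hunk above drives the EVEX disp8*N compressed-displacement scheme: each row is a tuple type (per Tables 4-5 and 4-6 of the Intel SDM) and the three columns give the scaling factor N for 128-, 256- and 512-bit vector lengths. A minimal standalone sketch of the arithmetic, assuming N is whatever tuple_table would return for the operand; the helper name and the example factors are illustrative only and not part of the changeset:

    // Hypothetical illustration of EVEX disp8*N compression.
    // N is the per-tuple scaling factor, e.g. N = 64 for a full-vector
    // (EVEX_FV) memory operand at 512-bit vector length.
    #include <cstdio>

    static bool compress_disp8(int disp, int N, int* disp8_out) {
      if (N == 0) return false;                 // combination not encodable
      if (disp % N != 0) return false;          // must be an exact multiple of N
      int scaled = disp / N;
      if (scaled < -0x80 || scaled >= 0x80) return false;  // must fit in a signed byte
      *disp8_out = scaled;
      return true;
    }

    int main() {
      int d8;
      if (compress_disp8(256, 64, &d8))   // 256 = 4 * 64, so disp8 = 4
        printf("disp8 = %d\n", d8);
      if (!compress_disp8(100, 64, &d8))  // 100 is not a multiple of 64
        printf("fall back to a full disp32\n");
      return 0;
    }

Emitting disp/N as a single signed byte rather than a 4-byte displacement is what keeps EVEX-encoded memory operands compact.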
@@ -183,8 +213,9 @@
 // make this go away someday
 void Assembler::emit_data(jint data, relocInfo::relocType rtype, int format) {
   if (rtype == relocInfo::none)
-        emit_int32(data);
-  else  emit_data(data, Relocation::spec_simple(rtype), format);
+    emit_int32(data);
+  else
+    emit_data(data, Relocation::spec_simple(rtype), format);
 }
 
 void Assembler::emit_data(jint data, RelocationHolder const& rspec, int format) {
@@ -273,6 +304,177 @@
 }
 
 
+bool Assembler::query_compressed_disp_byte(int disp, bool is_evex_inst, int vector_len,
+                                           int cur_tuple_type, int in_size_in_bits, int cur_encoding) {
+  int mod_idx = 0;
+  // We will test if the displacement fits the compressed format and, if so,
+  // apply the compression to the displacement iff the result is 8-bit.
+  if (VM_Version::supports_evex() && is_evex_inst) {
+    switch (cur_tuple_type) {
+    case EVEX_FV:
+      if ((cur_encoding & VEX_W) == VEX_W) {
+        mod_idx += 2 + (((cur_encoding & EVEX_Rb) == EVEX_Rb) ? 1 : 0);
+      } else {
+        mod_idx = ((cur_encoding & EVEX_Rb) == EVEX_Rb) ? 1 : 0;
+      }
+      break;
+
+    case EVEX_HV:
+      mod_idx = ((cur_encoding & EVEX_Rb) == EVEX_Rb) ? 1 : 0;
+      break;
+
+    case EVEX_FVM:
+      break;
+
+    case EVEX_T1S:
+      switch (in_size_in_bits) {
+      case EVEX_8bit:
+        break;
+
+      case EVEX_16bit:
+        mod_idx = 1;
+        break;
+
+      case EVEX_32bit:
+        mod_idx = 2;
+        break;
+
+      case EVEX_64bit:
+        mod_idx = 3;
+        break;
+      }
+      break;
+
+    case EVEX_T1F:
+    case EVEX_T2:
+    case EVEX_T4:
+      mod_idx = (in_size_in_bits == EVEX_64bit) ? 1 : 0;
+      break;
+
+    case EVEX_T8:
+      break;
+
+    case EVEX_HVM:
+      break;
+
+    case EVEX_QVM:
+      break;
+
+    case EVEX_OVM:
+      break;
+
+    case EVEX_M128:
+      break;
+
+    case EVEX_DUP:
+      break;
+
+    default:
+      assert(0, "no valid evex tuple_table entry");
+      break;
+    }
+
+    if (vector_len >= AVX_128bit && vector_len <= AVX_512bit) {
+      int disp_factor = tuple_table[cur_tuple_type + mod_idx][vector_len];
+      if ((disp % disp_factor) == 0) {
+        int new_disp = disp / disp_factor;
+        if ((-0x80 <= new_disp && new_disp < 0x80)) {
+          disp = new_disp;
+        }
+      } else {
+        return false;
+      }
+    }
+  }
+  return (-0x80 <= disp && disp < 0x80);
+}
+
+
+bool Assembler::emit_compressed_disp_byte(int &disp) {
+  int mod_idx = 0;
+  // We will test if the displacement fits the compressed format and, if so,
+  // apply the compression to the displacement iff the result is 8-bit.
+  if (VM_Version::supports_evex() && is_evex_instruction) {
+    switch (tuple_type) {
+    case EVEX_FV:
+      if ((evex_encoding & VEX_W) == VEX_W) {
+        mod_idx += 2 + (((evex_encoding & EVEX_Rb) == EVEX_Rb) ? 1 : 0);
+      } else {
+        mod_idx = ((evex_encoding & EVEX_Rb) == EVEX_Rb) ? 1 : 0;
+      }
+      break;
+
+    case EVEX_HV:
+      mod_idx = ((evex_encoding & EVEX_Rb) == EVEX_Rb) ? 1 : 0;
+      break;
+
+    case EVEX_FVM:
+      break;
+
+    case EVEX_T1S:
+      switch (input_size_in_bits) {
+      case EVEX_8bit:
+        break;
+
+      case EVEX_16bit:
+        mod_idx = 1;
+        break;
+
+      case EVEX_32bit:
+        mod_idx = 2;
+        break;
+
+      case EVEX_64bit:
+        mod_idx = 3;
+        break;
+      }
+      break;
+
+    case EVEX_T1F:
+    case EVEX_T2:
+    case EVEX_T4:
+      mod_idx = (input_size_in_bits == EVEX_64bit) ? 1 : 0;
+      break;
+
+    case EVEX_T8:
+      break;
+
+    case EVEX_HVM:
+      break;
+
+    case EVEX_QVM:
+      break;
+
+    case EVEX_OVM:
+      break;
+
+    case EVEX_M128:
+      break;
+
+    case EVEX_DUP:
+      break;
+
+    default:
+      assert(0, "no valid evex tuple_table entry");
+      break;
+    }
+
+    if (avx_vector_len >= AVX_128bit && avx_vector_len <= AVX_512bit) {
+      int disp_factor = tuple_table[tuple_type + mod_idx][avx_vector_len];
+      if ((disp % disp_factor) == 0) {
+        int new_disp = disp / disp_factor;
+        if (is8bit(new_disp)) {
+          disp = new_disp;
+        }
+      } else {
+        return false;
+      }
+    }
+  }
+  return is8bit(disp);
+}
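Both query_compressed_disp_byte above and emit_compressed_disp_byte select a tuple_table row by adding a small offset (mod_idx) to the base tuple type; for the full-vector tuple the EVEX.W and EVEX.b bits pick one of four adjacent rows. A hedged restatement of that selection using a hypothetical helper, consistent with the table values shown earlier but not taken from the changeset:

    // Hypothetical restatement of the EVEX_FV row selection.
    static int fv_mod_idx(bool evex_w, bool evex_b) {
      // EVEX_FV(0): neither flag -> N = 16/32/64
      // EVEX_FV(1): b only       -> N = 4 (32-bit embedded broadcast)
      // EVEX_FV(2): W only       -> N = 16/32/64
      // EVEX_FV(3): W and b      -> N = 8 (64-bit embedded broadcast)
      return (evex_w ? 2 : 0) + (evex_b ? 1 : 0);
    }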
+
+
 void Assembler::emit_operand(Register reg, Register base, Register index,
                              Address::ScaleFactor scale, int disp,
                              RelocationHolder const& rspec,
@@ -296,7 +498,7 @@
         assert(index != rsp, "illegal addressing mode");
         emit_int8(0x04 | regenc);
         emit_int8(scale << 6 | indexenc | baseenc);
-      } else if (is8bit(disp) && rtype == relocInfo::none) {
+      } else if (emit_compressed_disp_byte(disp) && rtype == relocInfo::none) {
         // [base + index*scale + imm8]
         // [01 reg 100][ss index base] imm8
         assert(index != rsp, "illegal addressing mode");
@@ -318,7 +520,7 @@
         // [00 reg 100][00 100 100]
         emit_int8(0x04 | regenc);
         emit_int8(0x24);
-      } else if (is8bit(disp) && rtype == relocInfo::none) {
+      } else if (emit_compressed_disp_byte(disp) && rtype == relocInfo::none) {
         // [rsp + imm8]
         // [01 reg 100][00 100 100] disp8
         emit_int8(0x44 | regenc);
@@ -339,7 +541,7 @@
         // [base]
         // [00 reg base]
         emit_int8(0x00 | regenc | baseenc);
-      } else if (is8bit(disp) && rtype == relocInfo::none) {
+      } else if (emit_compressed_disp_byte(disp) && rtype == relocInfo::none) {
         // [base + disp8]
         // [01 reg base] disp8
         emit_int8(0x40 | regenc | baseenc);
@@ -389,11 +591,20 @@
       emit_data(disp, rspec, disp32_operand);
     }
   }
+  is_evex_instruction = false;
 }
 
 void Assembler::emit_operand(XMMRegister reg, Register base, Register index,
                              Address::ScaleFactor scale, int disp,
                              RelocationHolder const& rspec) {
+  if (UseAVX > 2) {
+    int xreg_enc = reg->encoding();
+    if (xreg_enc > 15) {
+      XMMRegister new_reg = as_XMMRegister(xreg_enc & 0xf);
+      emit_operand((Register)new_reg, base, index, scale, disp, rspec);
+      return;
+    }
+  }
   emit_operand((Register)reg, base, index, scale, disp, rspec);
 }
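With EVEX enabled (UseAVX > 2) the XMM file grows to xmm0-xmm31, but the ModRM byte still only holds the low four bits of the register number; the extra bit travels in the EVEX prefix. The hunk above therefore masks the encoding with 0xf before delegating to the general emit_operand. A tiny illustrative helper, hypothetical and not from the changeset:

    // Only the low 4 bits of an EVEX register number are visible in ModRM;
    // e.g. xmm20 (encoding 20) appears as 4 in the reg field.
    static int modrm_visible_encoding(int xmm_encoding) {
      return xmm_encoding & 0xf;
    }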
 
@@ -686,6 +897,29 @@
     debug_only(has_disp32 = true); // has both kinds of operands!
     break;
 
+  case 0x62: // EVEX_4bytes
+    assert((UseAVX > 0), "shouldn't have EVEX prefix");
+    assert(ip == inst+1, "no prefixes allowed");
+    // no EVEX collisions, all instructions that have 0x62 opcodes
+    // have EVEX versions and are subopcodes of 0x66
+    ip++; // skip P0 and examine W in P1
+    is_64bit = ((VEX_W & *ip) == VEX_W);
+    ip++; // move to P2
+    ip++; // skip P2, move to opcode
+    // To find the end of instruction (which == end_pc_operand).
+    switch (0xFF & *ip) {
+    case 0x61: // pcmpestri r, r/a, #8
+    case 0x70: // pshufd r, r/a, #8
+    case 0x73: // psrldq r, #8
+      tail_size = 1;  // the imm8
+      break;
+    default:
+      break;
+    }
+    ip++; // skip opcode
+    debug_only(has_disp32 = true); // has both kinds of operands!
+    break;
+
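The new 0x62 case teaches the instruction-length decoder about the fixed 4-byte EVEX prefix: the 0x62 escape byte is followed by payload bytes P0, P1 and P2, and the W bit examined above lives in P1. A hedged sketch of that layout (bit positions as documented in the Intel SDM; the struct and helper are illustrative only):

    // Hypothetical sketch of the EVEX prefix bytes being skipped above.
    struct EvexPrefix {
      unsigned char escape;  // always 0x62
      unsigned char p0;      // R, X, B, R' and the mm opcode-map bits
      unsigned char p1;      // W (bit 7), vvvv, pp
      unsigned char p2;      // z, L'L (vector length), b, V', aaa (opmask)
    };

    static bool evex_w_set(const unsigned char* insn) {
      // insn points at the 0x62 byte; P1 is two bytes further on.
      return (insn[2] & 0x80) != 0;
    }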
   case 0xD1: // sal a, 1; sar a, 1; shl a, 1; shr a, 1
   case 0xD3: // sal a, %cl; sar a, %cl; shl a, %cl; shr a, %cl
   case 0xD9: // fld_s a; fst_s a; fstp_s a; fldcw a
@@ -985,12 +1219,22 @@
 
 void Assembler::addsd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0x58, dst, src, VEX_SIMD_F2);
+  if (VM_Version::supports_evex()) {
+    emit_simd_arith_q(0x58, dst, src, VEX_SIMD_F2);
+  } else {
+    emit_simd_arith(0x58, dst, src, VEX_SIMD_F2);
+  }
 }
 
 void Assembler::addsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0x58, dst, src, VEX_SIMD_F2);
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_T1S;
+    input_size_in_bits = EVEX_64bit;
+    emit_simd_arith_q(0x58, dst, src, VEX_SIMD_F2);
+  } else {
+    emit_simd_arith(0x58, dst, src, VEX_SIMD_F2);
+  }
 }
 
 void Assembler::addss(XMMRegister dst, XMMRegister src) {
@@ -1000,20 +1244,26 @@
 
 void Assembler::addss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_T1S;
+    input_size_in_bits = EVEX_32bit;
+  }
   emit_simd_arith(0x58, dst, src, VEX_SIMD_F3);
 }
 
 void Assembler::aesdec(XMMRegister dst, Address src) {
   assert(VM_Version::supports_aes(), "");
   InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+  simd_prefix(dst, dst, src, VEX_SIMD_66, false,
+              VEX_OPCODE_0F_38, false, AVX_128bit, true);
   emit_int8((unsigned char)0xDE);
   emit_operand(dst, src);
 }
 
 void Assembler::aesdec(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_aes(), "");
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, false,
+                                      VEX_OPCODE_0F_38, false, AVX_128bit, true);
   emit_int8((unsigned char)0xDE);
   emit_int8(0xC0 | encode);
 }
@@ -1021,14 +1271,16 @@
 void Assembler::aesdeclast(XMMRegister dst, Address src) {
   assert(VM_Version::supports_aes(), "");
   InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+  simd_prefix(dst, dst, src, VEX_SIMD_66, false,
+              VEX_OPCODE_0F_38, false, AVX_128bit, true);
   emit_int8((unsigned char)0xDF);
   emit_operand(dst, src);
 }
 
 void Assembler::aesdeclast(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_aes(), "");
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, false,
+                                      VEX_OPCODE_0F_38, false, AVX_128bit, true);
   emit_int8((unsigned char)0xDF);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -1036,14 +1288,16 @@
 void Assembler::aesenc(XMMRegister dst, Address src) {
   assert(VM_Version::supports_aes(), "");
   InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+  simd_prefix(dst, dst, src, VEX_SIMD_66, false,
+              VEX_OPCODE_0F_38, false, AVX_128bit, true);
   emit_int8((unsigned char)0xDC);
   emit_operand(dst, src);
 }
 
 void Assembler::aesenc(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_aes(), "");
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, false,
+                                      VEX_OPCODE_0F_38, false, AVX_128bit, true);
   emit_int8((unsigned char)0xDC);
   emit_int8(0xC0 | encode);
 }
@@ -1051,14 +1305,16 @@
 void Assembler::aesenclast(XMMRegister dst, Address src) {
   assert(VM_Version::supports_aes(), "");
   InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+  simd_prefix(dst, dst, src, VEX_SIMD_66, false,
+              VEX_OPCODE_0F_38, false, AVX_128bit, true);
   emit_int8((unsigned char)0xDD);
   emit_operand(dst, src);
 }
 
 void Assembler::aesenclast(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_aes(), "");
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, false,
+                                      VEX_OPCODE_0F_38, false, AVX_128bit, true);
   emit_int8((unsigned char)0xDD);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -1091,7 +1347,7 @@
 
 void Assembler::andnl(Register dst, Register src1, Register src2) {
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  int encode = vex_prefix_0F38_and_encode(dst, src1, src2);
+  int encode = vex_prefix_0F38_and_encode(dst, src1, src2, false);
   emit_int8((unsigned char)0xF2);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -1099,7 +1355,7 @@
 void Assembler::andnl(Register dst, Register src1, Address src2) {
   InstructionMark im(this);
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  vex_prefix_0F38(dst, src1, src2);
+  vex_prefix_0F38(dst, src1, src2, false);
   emit_int8((unsigned char)0xF2);
   emit_operand(dst, src2);
 }
@@ -1126,7 +1382,7 @@
 
 void Assembler::blsil(Register dst, Register src) {
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  int encode = vex_prefix_0F38_and_encode(rbx, dst, src);
+  int encode = vex_prefix_0F38_and_encode(rbx, dst, src, false);
   emit_int8((unsigned char)0xF3);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -1134,14 +1390,14 @@
 void Assembler::blsil(Register dst, Address src) {
   InstructionMark im(this);
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  vex_prefix_0F38(rbx, dst, src);
+  vex_prefix_0F38(rbx, dst, src, false);
   emit_int8((unsigned char)0xF3);
   emit_operand(rbx, src);
 }
 
 void Assembler::blsmskl(Register dst, Register src) {
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  int encode = vex_prefix_0F38_and_encode(rdx, dst, src);
+  int encode = vex_prefix_0F38_and_encode(rdx, dst, src, false);
   emit_int8((unsigned char)0xF3);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -1149,14 +1405,14 @@
 void Assembler::blsmskl(Register dst, Address src) {
   InstructionMark im(this);
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  vex_prefix_0F38(rdx, dst, src);
+  vex_prefix_0F38(rdx, dst, src, false);
   emit_int8((unsigned char)0xF3);
   emit_operand(rdx, src);
 }
 
 void Assembler::blsrl(Register dst, Register src) {
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  int encode = vex_prefix_0F38_and_encode(rcx, dst, src);
+  int encode = vex_prefix_0F38_and_encode(rcx, dst, src, false);
   emit_int8((unsigned char)0xF3);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -1164,7 +1420,7 @@
 void Assembler::blsrl(Register dst, Address src) {
   InstructionMark im(this);
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
-  vex_prefix_0F38(rcx, dst, src);
+  vex_prefix_0F38(rcx, dst, src, false);
   emit_int8((unsigned char)0xF3);
   emit_operand(rcx, src);
 }
@@ -1312,22 +1568,36 @@
   // NOTE: dbx seems to decode this as comiss even though the
   // 0x66 is there. Strangely ucomisd comes out correct
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_66);
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_T1S;
+    input_size_in_bits = EVEX_64bit;
+    emit_simd_arith_nonds_q(0x2F, dst, src, VEX_SIMD_66, true);
+  } else {
+    emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_66);
+  }
 }
 
 void Assembler::comisd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_66);
+  if (VM_Version::supports_evex()) {
+    emit_simd_arith_nonds_q(0x2F, dst, src, VEX_SIMD_66, true);
+  } else {
+    emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_66);
+  }
 }
 
 void Assembler::comiss(XMMRegister dst, Address src) {
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_T1S;
+    input_size_in_bits = EVEX_32bit;
+  }
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_NONE);
+  emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_NONE, true);
 }
 
 void Assembler::comiss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_NONE);
+  emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_NONE, true);
 }
 
 void Assembler::cpuid() {
@@ -1347,36 +1617,61 @@
 
 void Assembler::cvtsd2ss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0x5A, dst, src, VEX_SIMD_F2);
+  if (VM_Version::supports_evex()) {
+    emit_simd_arith_q(0x5A, dst, src, VEX_SIMD_F2);
+  } else {
+    emit_simd_arith(0x5A, dst, src, VEX_SIMD_F2);
+  }
 }
 
 void Assembler::cvtsd2ss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0x5A, dst, src, VEX_SIMD_F2);
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_T1F;
+    input_size_in_bits = EVEX_64bit;
+    emit_simd_arith_q(0x5A, dst, src, VEX_SIMD_F2);
+  } else {
+    emit_simd_arith(0x5A, dst, src, VEX_SIMD_F2);
+  }
 }
 
 void Assembler::cvtsi2sdl(XMMRegister dst, Register src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
+  int encode = 0;
+  if (VM_Version::supports_evex()) {
+    encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F2, true);
+  } else {
+    encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2, false);
+  }
   emit_int8(0x2A);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::cvtsi2sdl(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0x2A, dst, src, VEX_SIMD_F2);
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_T1S;
+    input_size_in_bits = EVEX_32bit;
+    emit_simd_arith_q(0x2A, dst, src, VEX_SIMD_F2, true);
+  } else {
+    emit_simd_arith(0x2A, dst, src, VEX_SIMD_F2);
+  }
 }
 
 void Assembler::cvtsi2ssl(XMMRegister dst, Register src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3, true);
   emit_int8(0x2A);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::cvtsi2ssl(XMMRegister dst, Address src) {
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_T1S;
+    input_size_in_bits = EVEX_32bit;
+  }
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_simd_arith(0x2A, dst, src, VEX_SIMD_F3);
+  emit_simd_arith(0x2A, dst, src, VEX_SIMD_F3, true);
 }
 
 void Assembler::cvtss2sd(XMMRegister dst, XMMRegister src) {
@@ -1385,6 +1680,10 @@
 }
 
 void Assembler::cvtss2sd(XMMRegister dst, Address src) {
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_T1S;
+    input_size_in_bits = EVEX_32bit;
+  }
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith(0x5A, dst, src, VEX_SIMD_F3);
 }
@@ -1392,14 +1691,14 @@
 
 void Assembler::cvttsd2sil(Register dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F2);
+  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, true);
   emit_int8(0x2C);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::cvttss2sil(Register dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F3);
+  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F3, VEX_OPCODE_0F, true);
   emit_int8(0x2C);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -1414,15 +1713,29 @@
 
 void Assembler::divsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0x5E, dst, src, VEX_SIMD_F2);
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_T1S;
+    input_size_in_bits = EVEX_64bit;
+    emit_simd_arith_q(0x5E, dst, src, VEX_SIMD_F2);
+  } else {
+    emit_simd_arith(0x5E, dst, src, VEX_SIMD_F2);
+  }
 }
 
 void Assembler::divsd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0x5E, dst, src, VEX_SIMD_F2);
+  if (VM_Version::supports_evex()) {
+    emit_simd_arith_q(0x5E, dst, src, VEX_SIMD_F2);
+  } else {
+    emit_simd_arith(0x5E, dst, src, VEX_SIMD_F2);
+  }
 }
 
 void Assembler::divss(XMMRegister dst, Address src) {
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_T1S;
+    input_size_in_bits = EVEX_32bit;
+  }
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   emit_simd_arith(0x5E, dst, src, VEX_SIMD_F3);
 }
@@ -1675,7 +1988,11 @@
 
 void Assembler::movapd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith_nonds(0x28, dst, src, VEX_SIMD_66);
+  if (VM_Version::supports_evex()) {
+    emit_simd_arith_nonds_q(0x28, dst, src, VEX_SIMD_66, true);
+  } else {
+    emit_simd_arith_nonds(0x28, dst, src, VEX_SIMD_66);
+  }
 }
 
 void Assembler::movaps(XMMRegister dst, XMMRegister src) {
@@ -1685,7 +2002,8 @@
 
 void Assembler::movlhps(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  int encode = simd_prefix_and_encode(dst, src, src, VEX_SIMD_NONE);
+  int encode = simd_prefix_and_encode(dst, src, src, VEX_SIMD_NONE, true, VEX_OPCODE_0F,
+                                      false, AVX_128bit);
   emit_int8(0x16);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -1698,6 +2016,51 @@
   emit_operand(dst, src);
 }
 
+void Assembler::kmovq(KRegister dst, KRegister src) {
+  NOT_LP64(assert(VM_Version::supports_evex(), ""));
+  int encode = kreg_prefix_and_encode(dst, knoreg, src, VEX_SIMD_NONE,
+                                      true, VEX_OPCODE_0F, true);
+  emit_int8((unsigned char)0x90);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::kmovq(KRegister dst, Address src) {
+  NOT_LP64(assert(VM_Version::supports_evex(), ""));
+  int dst_enc = dst->encoding();
+  int nds_enc = 0;
+  vex_prefix(src, nds_enc, dst_enc, VEX_SIMD_NONE,
+             VEX_OPCODE_0F, true, AVX_128bit, true, true);
+  emit_int8((unsigned char)0x90);
+  emit_operand((Register)dst, src);
+}
+
+void Assembler::kmovq(Address dst, KRegister src) {
+  NOT_LP64(assert(VM_Version::supports_evex(), ""));
+  int src_enc = src->encoding();
+  int nds_enc = 0;
+  vex_prefix(dst, nds_enc, src_enc, VEX_SIMD_NONE,
+             VEX_OPCODE_0F, true, AVX_128bit, true, true);
+  emit_int8((unsigned char)0x90);
+  emit_operand((Register)src, dst);
+}
+
+void Assembler::kmovql(KRegister dst, Register src) {
+  NOT_LP64(assert(VM_Version::supports_evex(), ""));
+  bool supports_bw = VM_Version::supports_avx512bw();
+  VexSimdPrefix pre = supports_bw ? VEX_SIMD_F2 : VEX_SIMD_NONE;
+  int encode = kreg_prefix_and_encode(dst, knoreg, src, pre, true,
+                                      VEX_OPCODE_0F, supports_bw);
+  emit_int8((unsigned char)0x92);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::kmovdl(KRegister dst, Register src) {
+  NOT_LP64(assert(VM_Version::supports_evex(), ""));
+  VexSimdPrefix pre = VM_Version::supports_avx512bw() ? VEX_SIMD_F2 : VEX_SIMD_NONE;
+  int encode = kreg_prefix_and_encode(dst, knoreg, src, pre, true, VEX_OPCODE_0F, false);
+  emit_int8((unsigned char)0x92);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
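The kmovq/kmovql/kmovdl helpers above are the first users of the new AVX-512 opmask registers (k0-k7, declared elsewhere in this changeset). A hedged sketch of how stub code might exercise them together with the 512-bit evmovdqu added further down; the function name is made up, the register choices are illustrative, and the include path is an assumption based on the usual HotSpot layout:

    // Hypothetical stub fragment, not from this changeset.
    #include "asm/macroAssembler.hpp"

    static void emit_512bit_copy_fragment(MacroAssembler* masm) {
    #define __ masm->
      __ movl(rax, 0xff);   // a candidate mask for eight 64-bit lanes
      __ kmovql(k1, rax);   // GPR -> opmask register k1 (mask not consumed below;
                            // evmovdqu in this changeset is an unmasked move)
      __ evmovdqu(xmm0, Address(rsi, 0), Assembler::AVX_512bit);  // 64-byte load
      __ evmovdqu(Address(rdi, 0), xmm0, Assembler::AVX_512bit);  // 64-byte store
    #undef __
    }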
 
 void Assembler::movb(Address dst, int imm8) {
   InstructionMark im(this);
@@ -1718,7 +2081,7 @@
 
 void Assembler::movdl(XMMRegister dst, Register src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66);
+  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66, true);
   emit_int8(0x6E);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -1726,23 +2089,31 @@
 void Assembler::movdl(Register dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   // swap src/dst to get correct prefix
-  int encode = simd_prefix_and_encode(src, dst, VEX_SIMD_66);
+  int encode = simd_prefix_and_encode(src, dst, VEX_SIMD_66, true);
   emit_int8(0x7E);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::movdl(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, src, VEX_SIMD_66);
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_T1S;
+    input_size_in_bits = EVEX_32bit;
+  }
+  InstructionMark im(this);
+  simd_prefix(dst, src, VEX_SIMD_66, true, VEX_OPCODE_0F);
   emit_int8(0x6E);
   emit_operand(dst, src);
 }
 
 void Assembler::movdl(Address dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, src, VEX_SIMD_66);
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_T1S;
+    input_size_in_bits = EVEX_32bit;
+  }
+  InstructionMark im(this);
+  simd_prefix(dst, src, VEX_SIMD_66, true);
   emit_int8(0x7E);
   emit_operand(src, dst);
 }
@@ -1754,11 +2125,17 @@
 
 void Assembler::movdqa(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_FVM;
+  }
   emit_simd_arith_nonds(0x6F, dst, src, VEX_SIMD_66);
 }
 
 void Assembler::movdqu(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_FVM;
+  }
   emit_simd_arith_nonds(0x6F, dst, src, VEX_SIMD_F3);
 }
 
@@ -1769,8 +2146,11 @@
 
 void Assembler::movdqu(Address dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, src, VEX_SIMD_F3);
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_FVM;
+  }
+  InstructionMark im(this);
+  simd_prefix(dst, src, VEX_SIMD_F3, false);
   emit_int8(0x7F);
   emit_operand(src, dst);
 }
@@ -1778,28 +2158,77 @@
 // Move Unaligned 256bit Vector
 void Assembler::vmovdqu(XMMRegister dst, XMMRegister src) {
   assert(UseAVX > 0, "");
-  bool vector256 = true;
-  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_F3, vector256);
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_FVM;
+  }
+  int vector_len = AVX_256bit;
+  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_F3, vector_len);
   emit_int8(0x6F);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::vmovdqu(XMMRegister dst, Address src) {
   assert(UseAVX > 0, "");
-  InstructionMark im(this);
-  bool vector256 = true;
-  vex_prefix(dst, xnoreg, src, VEX_SIMD_F3, vector256);
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_FVM;
+  }
+  InstructionMark im(this);
+  int vector_len = AVX_256bit;
+  vex_prefix(dst, xnoreg, src, VEX_SIMD_F3, vector_len, false);
   emit_int8(0x6F);
   emit_operand(dst, src);
 }
 
 void Assembler::vmovdqu(Address dst, XMMRegister src) {
   assert(UseAVX > 0, "");
-  InstructionMark im(this);
-  bool vector256 = true;
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_FVM;
+  }
+  InstructionMark im(this);
+  int vector_len = AVX_256bit;
   // swap src<->dst for encoding
   assert(src != xnoreg, "sanity");
-  vex_prefix(src, xnoreg, dst, VEX_SIMD_F3, vector256);
+  vex_prefix(src, xnoreg, dst, VEX_SIMD_F3, vector_len, false);
+  emit_int8(0x7F);
+  emit_operand(src, dst);
+}
+
+// Move Unaligned EVEX-enabled Vector (programmable: 8, 16, 32, 64)
+void Assembler::evmovdqu(XMMRegister dst, XMMRegister src, int vector_len) {
+  assert(UseAVX > 0, "");
+  int src_enc = src->encoding();
+  int dst_enc = dst->encoding();
+  int encode = vex_prefix_and_encode(dst_enc, 0, src_enc, VEX_SIMD_F3, VEX_OPCODE_0F,
+                                     true, vector_len, false, false);
+  emit_int8(0x6F);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::evmovdqu(XMMRegister dst, Address src, int vector_len) {
+  assert(UseAVX > 0, "");
+  InstructionMark im(this);
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_FVM;
+    vex_prefix_q(dst, xnoreg, src, VEX_SIMD_F3, vector_len, false);
+  } else {
+    vex_prefix(dst, xnoreg, src, VEX_SIMD_F3, vector_len, false);
+  }
+  emit_int8(0x6F);
+  emit_operand(dst, src);
+}
+
+void Assembler::evmovdqu(Address dst, XMMRegister src, int vector_len) {
+  assert(UseAVX > 0, "");
+  InstructionMark im(this);
+  assert(src != xnoreg, "sanity");
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_FVM;
+    // swap src<->dst for encoding
+    vex_prefix_q(src, xnoreg, dst, VEX_SIMD_F3, vector_len, false);
+  } else {
+    // swap src<->dst for encoding
+    vex_prefix(src, xnoreg, dst, VEX_SIMD_F3, vector_len, false);
+  }
   emit_int8(0x7F);
   emit_operand(src, dst);
 }
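Throughout this change the old bool vector256 flag becomes an int vector_len, and evmovdqu adds the 512-bit forms. A small sketch of the relationship the AVX_128bit/AVX_256bit/AVX_512bit constants appear to have, inferred from the way vector_len indexes the three columns of tuple_table; the numeric values are an assumption, not quoted from this hunk:

    // Assumed encoding of the vector-length selector: it doubles as a
    // column index into tuple_table.
    enum AssumedAvxVectorLen {
      AVX_128bit = 0,
      AVX_256bit = 1,
      AVX_512bit = 2
    };

    static int vector_bytes(int vector_len) {
      return 16 << vector_len;   // 16, 32 or 64 bytes
    }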
@@ -1845,7 +2274,11 @@
 // The selection is done in MacroAssembler::movdbl() and movflt().
 void Assembler::movlpd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0x12, dst, src, VEX_SIMD_66);
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_T1S;
+    input_size_in_bits = EVEX_32bit;
+  }
+  emit_simd_arith(0x12, dst, src, VEX_SIMD_66, true);
 }
 
 void Assembler::movq( MMXRegister dst, Address src ) {
@@ -1871,7 +2304,13 @@
 void Assembler::movq(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
-  simd_prefix(dst, src, VEX_SIMD_F3);
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_T1S;
+    input_size_in_bits = EVEX_64bit;
+    simd_prefix_q(dst, xnoreg, src, VEX_SIMD_F3, true);
+  } else {
+    simd_prefix(dst, src, VEX_SIMD_F3, true, VEX_OPCODE_0F);
+  }
   emit_int8(0x7E);
   emit_operand(dst, src);
 }
@@ -1879,7 +2318,14 @@
 void Assembler::movq(Address dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
-  simd_prefix(dst, src, VEX_SIMD_66);
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_T1S;
+    input_size_in_bits = EVEX_64bit;
+    simd_prefix(src, xnoreg, dst, VEX_SIMD_66, true,
+                VEX_OPCODE_0F, true, AVX_128bit);
+  } else {
+    simd_prefix(dst, src, VEX_SIMD_66, true);
+  }
   emit_int8((unsigned char)0xD6);
   emit_operand(src, dst);
 }
@@ -1902,36 +2348,60 @@
 
 void Assembler::movsd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0x10, dst, src, VEX_SIMD_F2);
+  if (VM_Version::supports_evex()) {
+    emit_simd_arith_q(0x10, dst, src, VEX_SIMD_F2, true);
+  } else {
+    emit_simd_arith(0x10, dst, src, VEX_SIMD_F2);
+  }
 }
 
 void Assembler::movsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith_nonds(0x10, dst, src, VEX_SIMD_F2);
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_T1S;
+    input_size_in_bits = EVEX_64bit;
+    emit_simd_arith_nonds_q(0x10, dst, src, VEX_SIMD_F2, true);
+  } else {
+    emit_simd_arith_nonds(0x10, dst, src, VEX_SIMD_F2);
+  }
 }
 
 void Assembler::movsd(Address dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
-  simd_prefix(dst, src, VEX_SIMD_F2);
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_T1S;
+    input_size_in_bits = EVEX_64bit;
+    simd_prefix_q(src, xnoreg, dst, VEX_SIMD_F2);
+  } else {
+    simd_prefix(src, xnoreg, dst, VEX_SIMD_F2, false);
+  }
   emit_int8(0x11);
   emit_operand(src, dst);
 }
 
 void Assembler::movss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_simd_arith(0x10, dst, src, VEX_SIMD_F3);
+  emit_simd_arith(0x10, dst, src, VEX_SIMD_F3, true);
 }
 
 void Assembler::movss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_simd_arith_nonds(0x10, dst, src, VEX_SIMD_F3);
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_T1S;
+    input_size_in_bits = EVEX_32bit;
+  }
+  emit_simd_arith_nonds(0x10, dst, src, VEX_SIMD_F3, true);
 }
 
 void Assembler::movss(Address dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  InstructionMark im(this);
-  simd_prefix(dst, src, VEX_SIMD_F3);
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_T1S;
+    input_size_in_bits = EVEX_32bit;
+  }
+  InstructionMark im(this);
+  simd_prefix(dst, src, VEX_SIMD_F3, false);
   emit_int8(0x11);
   emit_operand(src, dst);
 }
@@ -2023,16 +2493,30 @@
 
 void Assembler::mulsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0x59, dst, src, VEX_SIMD_F2);
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_T1S;
+    input_size_in_bits = EVEX_64bit;
+    emit_simd_arith_q(0x59, dst, src, VEX_SIMD_F2);
+  } else {
+    emit_simd_arith(0x59, dst, src, VEX_SIMD_F2);
+  }
 }
 
 void Assembler::mulsd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0x59, dst, src, VEX_SIMD_F2);
+  if (VM_Version::supports_evex()) {
+    emit_simd_arith_q(0x59, dst, src, VEX_SIMD_F2);
+  } else {
+    emit_simd_arith(0x59, dst, src, VEX_SIMD_F2);
+  }
 }
 
 void Assembler::mulss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_T1S;
+    input_size_in_bits = EVEX_32bit;
+  }
   emit_simd_arith(0x59, dst, src, VEX_SIMD_F3);
 }
 
@@ -2332,22 +2816,30 @@
 void Assembler::packuswb(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
-  emit_simd_arith(0x67, dst, src, VEX_SIMD_66);
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_FV;
+    input_size_in_bits = EVEX_32bit;
+  }
+  emit_simd_arith(0x67, dst, src, VEX_SIMD_66,
+                  false, (VM_Version::supports_avx512dq() == false));
 }
 
 void Assembler::packuswb(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0x67, dst, src, VEX_SIMD_66);
-}
-
-void Assembler::vpackuswb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
-  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
-  emit_vex_arith(0x67, dst, nds, src, VEX_SIMD_66, vector256);
-}
-
-void Assembler::vpermq(XMMRegister dst, XMMRegister src, int imm8, bool vector256) {
+  emit_simd_arith(0x67, dst, src, VEX_SIMD_66,
+                  false, (VM_Version::supports_avx512dq() == false));
+}
+
+void Assembler::vpackuswb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  assert(UseAVX > 0, "some form of AVX must be enabled");
+  emit_vex_arith(0x67, dst, nds, src, VEX_SIMD_66, vector_len,
+                 false, (VM_Version::supports_avx512dq() == false));
+}
+
+void Assembler::vpermq(XMMRegister dst, XMMRegister src, int imm8, int vector_len) {
   assert(VM_Version::supports_avx2(), "");
-  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, true, vector256);
+  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, false,
+                                      VEX_OPCODE_0F_3A, true, vector_len);
   emit_int8(0x00);
   emit_int8(0xC0 | encode);
   emit_int8(imm8);
@@ -2361,7 +2853,8 @@
 void Assembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
   assert(VM_Version::supports_sse4_2(), "");
   InstructionMark im(this);
-  simd_prefix(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A);
+  simd_prefix(dst, xnoreg, src, VEX_SIMD_66, false, VEX_OPCODE_0F_3A,
+              false, AVX_128bit, true);
   emit_int8(0x61);
   emit_operand(dst, src);
   emit_int8(imm8);
@@ -2369,7 +2862,8 @@
 
 void Assembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
   assert(VM_Version::supports_sse4_2(), "");
-  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_3A);
+  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, false,
+                                      VEX_OPCODE_0F_3A, false, AVX_128bit, true);
   emit_int8(0x61);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(imm8);
@@ -2377,7 +2871,8 @@
 
 void Assembler::pextrd(Register dst, XMMRegister src, int imm8) {
   assert(VM_Version::supports_sse4_1(), "");
-  int encode = simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, false);
+  int encode = simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, VEX_SIMD_66, true, VEX_OPCODE_0F_3A,
+                                      false, AVX_128bit, (VM_Version::supports_avx512dq() == false));
   emit_int8(0x16);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(imm8);
@@ -2385,7 +2880,8 @@
 
 void Assembler::pextrq(Register dst, XMMRegister src, int imm8) {
   assert(VM_Version::supports_sse4_1(), "");
-  int encode = simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, true);
+  int encode = simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, VEX_SIMD_66, true, VEX_OPCODE_0F_3A,
+                                      false, AVX_128bit, (VM_Version::supports_avx512dq() == false));
   emit_int8(0x16);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(imm8);
@@ -2393,7 +2889,8 @@
 
 void Assembler::pinsrd(XMMRegister dst, Register src, int imm8) {
   assert(VM_Version::supports_sse4_1(), "");
-  int encode = simd_prefix_and_encode(dst, dst, as_XMMRegister(src->encoding()), VEX_SIMD_66, VEX_OPCODE_0F_3A, false);
+  int encode = simd_prefix_and_encode(dst, dst, as_XMMRegister(src->encoding()), VEX_SIMD_66, true, VEX_OPCODE_0F_3A,
+                                      false, AVX_128bit, (VM_Version::supports_avx512dq() == false));
   emit_int8(0x22);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(imm8);
@@ -2401,7 +2898,8 @@
 
 void Assembler::pinsrq(XMMRegister dst, Register src, int imm8) {
   assert(VM_Version::supports_sse4_1(), "");
-  int encode = simd_prefix_and_encode(dst, dst, as_XMMRegister(src->encoding()), VEX_SIMD_66, VEX_OPCODE_0F_3A, true);
+  int encode = simd_prefix_and_encode(dst, dst, as_XMMRegister(src->encoding()), VEX_SIMD_66, true, VEX_OPCODE_0F_3A,
+                                      false, AVX_128bit, (VM_Version::supports_avx512dq() == false));
   emit_int8(0x22);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(imm8);
@@ -2409,15 +2907,18 @@
 
 void Assembler::pmovzxbw(XMMRegister dst, Address src) {
   assert(VM_Version::supports_sse4_1(), "");
-  InstructionMark im(this);
-  simd_prefix(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_HVM;
+  }
+  InstructionMark im(this);
+  simd_prefix(dst, src, VEX_SIMD_66, false, VEX_OPCODE_0F_38);
   emit_int8(0x30);
   emit_operand(dst, src);
 }
 
 void Assembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_sse4_1(), "");
-  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, false, VEX_OPCODE_0F_38);
   emit_int8(0x30);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -2520,15 +3021,20 @@
 
 void Assembler::pshufb(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_ssse3(), "");
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, false, VEX_OPCODE_0F_38,
+                                      false, AVX_128bit, (VM_Version::supports_avx512bw() == false));
   emit_int8(0x00);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::pshufb(XMMRegister dst, Address src) {
   assert(VM_Version::supports_ssse3(), "");
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_FVM;
+  }
+  InstructionMark im(this);
+  simd_prefix(dst, dst, src, VEX_SIMD_66, false, VEX_OPCODE_0F_38,
+              false, AVX_128bit, (VM_Version::supports_avx512bw() == false));
   emit_int8(0x00);
   emit_operand(dst, src);
 }
@@ -2545,8 +3051,12 @@
   assert(isByte(mode), "invalid value");
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
-  InstructionMark im(this);
-  simd_prefix(dst, src, VEX_SIMD_66);
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_FV;
+    input_size_in_bits = EVEX_32bit;
+  }
+  InstructionMark im(this);
+  simd_prefix(dst, src, VEX_SIMD_66, false);
   emit_int8(0x70);
   emit_operand(dst, src);
   emit_int8(mode & 0xFF);
@@ -2555,7 +3065,8 @@
 void Assembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
   assert(isByte(mode), "invalid value");
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith_nonds(0x70, dst, src, VEX_SIMD_F2);
+  emit_simd_arith_nonds(0x70, dst, src, VEX_SIMD_F2, false,
+                        (VM_Version::supports_avx512bw() == false));
   emit_int8(mode & 0xFF);
 }
 
@@ -2563,8 +3074,12 @@
   assert(isByte(mode), "invalid value");
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
-  InstructionMark im(this);
-  simd_prefix(dst, src, VEX_SIMD_F2);
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_FVM;
+  }
+  InstructionMark im(this);
+  simd_prefix(dst, xnoreg, src, VEX_SIMD_F2, false, VEX_OPCODE_0F,
+              false, AVX_128bit, (VM_Version::supports_avx512bw() == false));
   emit_int8(0x70);
   emit_operand(dst, src);
   emit_int8(mode & 0xFF);
@@ -2573,7 +3088,8 @@
 void Assembler::psrldq(XMMRegister dst, int shift) {
   // Shift 128 bit value in xmm register by number of bytes.
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode(xmm3, dst, dst, VEX_SIMD_66);
+  int encode = simd_prefix_and_encode(xmm3, dst, dst, VEX_SIMD_66, true, VEX_OPCODE_0F,
+                                      false, AVX_128bit, (VM_Version::supports_avx512bw() == false));
   emit_int8(0x73);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(shift);
@@ -2583,14 +3099,15 @@
   assert(VM_Version::supports_sse4_1(), "");
   assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
   InstructionMark im(this);
-  simd_prefix(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+  simd_prefix(dst, src, VEX_SIMD_66, false, VEX_OPCODE_0F_38);
   emit_int8(0x17);
   emit_operand(dst, src);
 }
 
 void Assembler::ptest(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_sse4_1(), "");
-  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66,
+                                      false, VEX_OPCODE_0F_38);
   emit_int8(0x17);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -2598,19 +3115,20 @@
 void Assembler::vptest(XMMRegister dst, Address src) {
   assert(VM_Version::supports_avx(), "");
   InstructionMark im(this);
-  bool vector256 = true;
+  int vector_len = AVX_256bit;
   assert(dst != xnoreg, "sanity");
   int dst_enc = dst->encoding();
   // swap src<->dst for encoding
-  vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector256);
+  vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len);
   emit_int8(0x17);
   emit_operand(dst, src);
 }
 
 void Assembler::vptest(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
-  bool vector256 = true;
-  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_38);
+  int vector_len = AVX_256bit;
+  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66,
+                                     vector_len, VEX_OPCODE_0F_38);
   emit_int8(0x17);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -2618,6 +3136,9 @@
 void Assembler::punpcklbw(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_FVM;
+  }
   emit_simd_arith(0x60, dst, src, VEX_SIMD_66);
 }
 
@@ -2629,6 +3150,10 @@
 void Assembler::punpckldq(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_FV;
+    input_size_in_bits = EVEX_32bit;
+  }
   emit_simd_arith(0x62, dst, src, VEX_SIMD_66);
 }
 
@@ -2838,12 +3363,22 @@
 
 void Assembler::sqrtsd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0x51, dst, src, VEX_SIMD_F2);
+  if (VM_Version::supports_evex()) {
+    emit_simd_arith_q(0x51, dst, src, VEX_SIMD_F2);
+  } else {
+    emit_simd_arith(0x51, dst, src, VEX_SIMD_F2);
+  }
 }
 
 void Assembler::sqrtsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0x51, dst, src, VEX_SIMD_F2);
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_T1S;
+    input_size_in_bits = EVEX_64bit;
+    emit_simd_arith_q(0x51, dst, src, VEX_SIMD_F2);
+  } else {
+    emit_simd_arith(0x51, dst, src, VEX_SIMD_F2);
+  }
 }
 
 void Assembler::sqrtss(XMMRegister dst, XMMRegister src) {
@@ -2857,6 +3392,10 @@
 
 void Assembler::sqrtss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_T1S;
+    input_size_in_bits = EVEX_32bit;
+  }
   emit_simd_arith(0x51, dst, src, VEX_SIMD_F3);
 }
 
@@ -2907,12 +3446,20 @@
 
 void Assembler::subsd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0x5C, dst, src, VEX_SIMD_F2);
+  if (VM_Version::supports_evex()) {
+    emit_simd_arith_q(0x5C, dst, src, VEX_SIMD_F2);
+  } else {
+    emit_simd_arith(0x5C, dst, src, VEX_SIMD_F2);
+  }
 }
 
 void Assembler::subsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0x5C, dst, src, VEX_SIMD_F2);
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_T1S;
+    input_size_in_bits = EVEX_64bit;
+  }
+  emit_simd_arith_q(0x5C, dst, src, VEX_SIMD_F2);
 }
 
 void Assembler::subss(XMMRegister dst, XMMRegister src) {
@@ -2922,6 +3469,10 @@
 
 void Assembler::subss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_T1S;
+    input_size_in_bits = EVEX_32bit;
+  }
   emit_simd_arith(0x5C, dst, src, VEX_SIMD_F3);
 }
 
@@ -2978,22 +3529,36 @@
 
 void Assembler::ucomisd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_66);
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_T1S;
+    input_size_in_bits = EVEX_64bit;
+    emit_simd_arith_nonds_q(0x2E, dst, src, VEX_SIMD_66, true);
+  } else {
+    emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_66);
+  }
 }
 
 void Assembler::ucomisd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_66);
+  if (VM_Version::supports_evex()) {
+    emit_simd_arith_nonds_q(0x2E, dst, src, VEX_SIMD_66, true);
+  } else {
+    emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_66);
+  }
 }
 
 void Assembler::ucomiss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_NONE);
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_T1S;
+    input_size_in_bits = EVEX_32bit;
+  }
+  emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_NONE, true);
 }
 
 void Assembler::ucomiss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_NONE);
+  emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_NONE, true);
 }
 
 void Assembler::xabort(int8_t imm8) {
@@ -3075,82 +3640,138 @@
 
 void Assembler::vaddsd(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
-  emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false);
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_T1S;
+    input_size_in_bits = EVEX_64bit;
+    emit_vex_arith_q(0x58, dst, nds, src, VEX_SIMD_F2, AVX_128bit);
+  } else {
+    emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_F2, AVX_128bit);
+  }
 }
 
 void Assembler::vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
-  emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false);
+  if (VM_Version::supports_evex()) {
+    emit_vex_arith_q(0x58, dst, nds, src, VEX_SIMD_F2, AVX_128bit);
+  } else {
+    emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_F2, AVX_128bit);
+  }
 }
 
 void Assembler::vaddss(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
-  emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false);
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_T1S;
+    input_size_in_bits = EVEX_32bit;
+  }
+  emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_F3, AVX_128bit);
 }
 
 void Assembler::vaddss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
-  emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false);
+  emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_F3, AVX_128bit);
 }
 
 void Assembler::vdivsd(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
-  emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false);
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_T1S;
+    input_size_in_bits = EVEX_64bit;
+    emit_vex_arith_q(0x5E, dst, nds, src, VEX_SIMD_F2, AVX_128bit);
+  } else {
+    emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_F2, AVX_128bit);
+  }
 }
 
 void Assembler::vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
-  emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false);
+  if (VM_Version::supports_evex()) {
+    emit_vex_arith_q(0x5E, dst, nds, src, VEX_SIMD_F2, AVX_128bit);
+  } else {
+    emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_F2, AVX_128bit);
+  }
 }
 
 void Assembler::vdivss(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
-  emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false);
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_T1S;
+    input_size_in_bits = EVEX_32bit;
+  }
+  emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_F3, AVX_128bit);
 }
 
 void Assembler::vdivss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
-  emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false);
+  emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_F3, AVX_128bit);
 }
 
 void Assembler::vmulsd(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
-  emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false);
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_T1S;
+    input_size_in_bits = EVEX_64bit;
+    emit_vex_arith_q(0x59, dst, nds, src, VEX_SIMD_F2, AVX_128bit);
+  } else {
+    emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_F2, AVX_128bit);
+  }
 }
 
 void Assembler::vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
-  emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false);
+  if (VM_Version::supports_evex()) {
+    emit_vex_arith_q(0x59, dst, nds, src, VEX_SIMD_F2, AVX_128bit);
+  } else {
+    emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_F2, AVX_128bit);
+  }
 }
 
 void Assembler::vmulss(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
-  emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false);
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_T1S;
+    input_size_in_bits = EVEX_32bit;
+  }
+  emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_F3, AVX_128bit);
 }
 
 void Assembler::vmulss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
-  emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false);
+  emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_F3, AVX_128bit);
 }
 
 void Assembler::vsubsd(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
-  emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false);
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_T1S;
+    input_size_in_bits = EVEX_64bit;
+    emit_vex_arith_q(0x5C, dst, nds, src, VEX_SIMD_F2, AVX_128bit);
+  } else {
+    emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_F2, AVX_128bit);
+  }
 }
 
 void Assembler::vsubsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
-  emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false);
+  if (VM_Version::supports_evex()) {
+    emit_vex_arith_q(0x5C, dst, nds, src, VEX_SIMD_F2, AVX_128bit);
+  } else {
+    emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_F2, AVX_128bit);
+  }
 }
 
 void Assembler::vsubss(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
-  emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false);
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_T1S;
+    input_size_in_bits = EVEX_32bit;
+  }
+  emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_F3, AVX_128bit);
 }
 
 void Assembler::vsubss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
-  emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false);
+  emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_F3, AVX_128bit);
 }
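
The tuple_type / input_size_in_bits values recorded by the memory-operand forms above feed the EVEX compressed-displacement (disp8*N) rule: a one-byte displacement is only usable when it is a multiple of N, where N is derived from the tuple type and operand size. A minimal standalone sketch of that rule follows; the function name, the sample N of 8 for a 64-bit T1S operand, and the example displacements are illustrative, not HotSpot code.

#include <cstdint>
#include <cstdio>

// Standalone sketch of the EVEX disp8*N rule (illustrative; not HotSpot code).
static bool fits_compressed_disp8(int32_t disp, int32_t n, int8_t* encoded) {
  if (n <= 0 || disp % n != 0) {
    return false;                        // displacement must be a multiple of N
  }
  int32_t scaled = disp / n;
  if (scaled < -128 || scaled > 127) {
    return false;                        // scaled value must fit in a signed byte
  }
  *encoded = static_cast<int8_t>(scaled);
  return true;
}

int main() {
  int8_t enc = 0;
  // T1S with a 64-bit element (e.g. the scalar double memory forms) gives N = 8,
  // so a displacement of 0x40 compresses to 8 and fits in one byte.
  bool ok = fits_compressed_disp8(0x40, 8, &enc);
  std::printf("disp 0x40, N=8 -> ok=%d, encoded=%d\n", ok, enc);
  // 0x41 is not a multiple of N, so the encoder must fall back to a 32-bit disp.
  ok = fits_compressed_disp8(0x41, 8, &enc);
  std::printf("disp 0x41, N=8 -> ok=%d\n", ok);
  return 0;
}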
 
 //====================VECTOR ARITHMETIC=====================================
@@ -3159,7 +3780,11 @@
 
 void Assembler::addpd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0x58, dst, src, VEX_SIMD_66);
+  if (VM_Version::supports_evex()) {
+    emit_simd_arith_q(0x58, dst, src, VEX_SIMD_66);
+  } else {
+    emit_simd_arith(0x58, dst, src, VEX_SIMD_66);
+  }
 }
 
 void Assembler::addps(XMMRegister dst, XMMRegister src) {
@@ -3167,29 +3792,47 @@
   emit_simd_arith(0x58, dst, src, VEX_SIMD_NONE);
 }
 
-void Assembler::vaddpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+void Assembler::vaddpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
-  emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_66, vector256);
-}
-
-void Assembler::vaddps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  if (VM_Version::supports_evex()) {
+    emit_vex_arith_q(0x58, dst, nds, src, VEX_SIMD_66, vector_len);
+  } else {
+    emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_66, vector_len);
+  }
+}
+
+void Assembler::vaddps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_NONE, vector_len);
+}
+
+void Assembler::vaddpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
-  emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_NONE, vector256);
-}
-
-void Assembler::vaddpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_FV;
+    input_size_in_bits = EVEX_64bit;
+    emit_vex_arith_q(0x58, dst, nds, src, VEX_SIMD_66, vector_len);
+  } else {
+    emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_66, vector_len);
+  }
+}
+
+void Assembler::vaddps(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
-  emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_66, vector256);
-}
-
-void Assembler::vaddps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
-  assert(VM_Version::supports_avx(), "");
-  emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_NONE, vector256);
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_FV;
+    input_size_in_bits = EVEX_32bit;
+  }
+  emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_NONE, vector_len);
 }
 
 void Assembler::subpd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0x5C, dst, src, VEX_SIMD_66);
+  if (VM_Version::supports_evex()) {
+    emit_simd_arith_q(0x5C, dst, src, VEX_SIMD_66);
+  } else {
+    emit_simd_arith(0x5C, dst, src, VEX_SIMD_66);
+  }
 }
 
 void Assembler::subps(XMMRegister dst, XMMRegister src) {
@@ -3197,29 +3840,47 @@
   emit_simd_arith(0x5C, dst, src, VEX_SIMD_NONE);
 }
 
-void Assembler::vsubpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+void Assembler::vsubpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
-  emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_66, vector256);
-}
-
-void Assembler::vsubps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  if (VM_Version::supports_evex()) {
+    emit_vex_arith_q(0x5C, dst, nds, src, VEX_SIMD_66, vector_len);
+  } else {
+    emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_66, vector_len);
+  }
+}
+
+void Assembler::vsubps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_NONE, vector_len);
+}
+
+void Assembler::vsubpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
-  emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_NONE, vector256);
-}
-
-void Assembler::vsubpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_FV;
+    input_size_in_bits = EVEX_64bit;
+    emit_vex_arith_q(0x5C, dst, nds, src, VEX_SIMD_66, vector_len);
+  } else {
+    emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_66, vector_len);
+  }
+}
+
+void Assembler::vsubps(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
-  emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_66, vector256);
-}
-
-void Assembler::vsubps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
-  assert(VM_Version::supports_avx(), "");
-  emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_NONE, vector256);
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_FV;
+    input_size_in_bits = EVEX_32bit;
+  }
+  emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_NONE, vector_len);
 }
 
 void Assembler::mulpd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0x59, dst, src, VEX_SIMD_66);
+  if (VM_Version::supports_evex()) {
+    emit_simd_arith_q(0x59, dst, src, VEX_SIMD_66);
+  } else {
+    emit_simd_arith(0x59, dst, src, VEX_SIMD_66);
+  }
 }
 
 void Assembler::mulps(XMMRegister dst, XMMRegister src) {
@@ -3227,29 +3888,47 @@
   emit_simd_arith(0x59, dst, src, VEX_SIMD_NONE);
 }
 
-void Assembler::vmulpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+void Assembler::vmulpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
-  emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_66, vector256);
-}
-
-void Assembler::vmulps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  if (VM_Version::supports_evex()) {
+    emit_vex_arith_q(0x59, dst, nds, src, VEX_SIMD_66, vector_len);
+  } else {
+    emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_66, vector_len);
+  }
+}
+
+void Assembler::vmulps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_NONE, vector_len);
+}
+
+void Assembler::vmulpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
-  emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_NONE, vector256);
-}
-
-void Assembler::vmulpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_FV;
+    input_size_in_bits = EVEX_64bit;
+    emit_vex_arith_q(0x59, dst, nds, src, VEX_SIMD_66, vector_len);
+  } else {
+    emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_66, vector_len);
+  }
+}
+
+void Assembler::vmulps(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
-  emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_66, vector256);
-}
-
-void Assembler::vmulps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
-  assert(VM_Version::supports_avx(), "");
-  emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_NONE, vector256);
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_FV;
+    input_size_in_bits = EVEX_32bit;
+  }
+  emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_NONE, vector_len);
 }
 
 void Assembler::divpd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0x5E, dst, src, VEX_SIMD_66);
+  if (VM_Version::supports_evex()) {
+    emit_simd_arith_q(0x5E, dst, src, VEX_SIMD_66);
+  } else {
+    emit_simd_arith(0x5E, dst, src, VEX_SIMD_66);
+  }
 }
 
 void Assembler::divps(XMMRegister dst, XMMRegister src) {
@@ -3257,118 +3936,199 @@
   emit_simd_arith(0x5E, dst, src, VEX_SIMD_NONE);
 }
 
-void Assembler::vdivpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+void Assembler::vdivpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
-  emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_66, vector256);
-}
-
-void Assembler::vdivps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  if (VM_Version::supports_evex()) {
+    emit_vex_arith_q(0x5E, dst, nds, src, VEX_SIMD_66, vector_len);
+  } else {
+    emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_66, vector_len);
+  }
+}
+
+void Assembler::vdivps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  assert(VM_Version::supports_avx(), "");
+  emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_NONE, vector_len);
+}
+
+void Assembler::vdivpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
-  emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_NONE, vector256);
-}
-
-void Assembler::vdivpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_FV;
+    input_size_in_bits = EVEX_64bit;
+    emit_vex_arith_q(0x5E, dst, nds, src, VEX_SIMD_66, vector_len);
+  } else {
+    emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_66, vector_len);
+  }
+}
+
+void Assembler::vdivps(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
-  emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_66, vector256);
-}
-
-void Assembler::vdivps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
-  assert(VM_Version::supports_avx(), "");
-  emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_NONE, vector256);
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_FV;
+    input_size_in_bits = EVEX_32bit;
+  }
+  emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_NONE, vector_len);
 }
 
 void Assembler::andpd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0x54, dst, src, VEX_SIMD_66);
+  if (VM_Version::supports_evex() && VM_Version::supports_avx512dq()) {
+    emit_simd_arith_q(0x54, dst, src, VEX_SIMD_66);
+  } else {
+    emit_simd_arith(0x54, dst, src, VEX_SIMD_66, false, true);
+  }
 }
 
 void Assembler::andps(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_simd_arith(0x54, dst, src, VEX_SIMD_NONE);
+  emit_simd_arith(0x54, dst, src, VEX_SIMD_NONE, false,
+                  (VM_Version::supports_avx512dq() == false));
 }
 
 void Assembler::andps(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_simd_arith(0x54, dst, src, VEX_SIMD_NONE);
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_FV;
+    input_size_in_bits = EVEX_32bit;
+  }
+  emit_simd_arith(0x54, dst, src, VEX_SIMD_NONE,
+                  false, (VM_Version::supports_avx512dq() == false));
 }
 
 void Assembler::andpd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0x54, dst, src, VEX_SIMD_66);
-}
-
-void Assembler::vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  if (VM_Version::supports_evex() && VM_Version::supports_avx512dq()) {
+    tuple_type = EVEX_FV;
+    input_size_in_bits = EVEX_64bit;
+    emit_simd_arith_q(0x54, dst, src, VEX_SIMD_66);
+  } else {
+    emit_simd_arith(0x54, dst, src, VEX_SIMD_66, false, true);
+  }
+}
+
+void Assembler::vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
-  emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_66, vector256);
-}
-
-void Assembler::vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  if (VM_Version::supports_evex() && VM_Version::supports_avx512dq()) {
+    emit_vex_arith_q(0x54, dst, nds, src, VEX_SIMD_66, vector_len);
+  } else {
+    emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_66, vector_len, true);
+  }
+}
+
+void Assembler::vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
-  emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_NONE, vector256);
-}
-
-void Assembler::vandpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  bool legacy_mode = (VM_Version::supports_avx512dq() == false);
+  emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_NONE, vector_len, legacy_mode);
+}
+
+void Assembler::vandpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
-  emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_66, vector256);
-}
-
-void Assembler::vandps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  if (VM_Version::supports_evex() && VM_Version::supports_avx512dq()) {
+    tuple_type = EVEX_FV;
+    input_size_in_bits = EVEX_64bit;
+    emit_vex_arith_q(0x54, dst, nds, src, VEX_SIMD_66, vector_len);
+  } else {
+    emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_66, vector_len, true);
+  }
+}
+
+void Assembler::vandps(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
-  emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_NONE, vector256);
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_FV;
+    input_size_in_bits = EVEX_32bit;
+  }
+  emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_NONE, vector_len,
+                 (VM_Version::supports_avx512dq() == false));
 }
 
 void Assembler::xorpd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0x57, dst, src, VEX_SIMD_66);
+  if (VM_Version::supports_evex() && VM_Version::supports_avx512dq()) {
+    emit_simd_arith_q(0x57, dst, src, VEX_SIMD_66);
+  } else {
+    emit_simd_arith(0x57, dst, src, VEX_SIMD_66, false, true);
+  }
 }
 
 void Assembler::xorps(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_simd_arith(0x57, dst, src, VEX_SIMD_NONE);
+  emit_simd_arith(0x57, dst, src, VEX_SIMD_NONE,
+                  false, (VM_Version::supports_avx512dq() == false));
 }
 
 void Assembler::xorpd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0x57, dst, src, VEX_SIMD_66);
+  if (VM_Version::supports_evex() && VM_Version::supports_avx512dq()) {
+    tuple_type = EVEX_FV;
+    input_size_in_bits = EVEX_64bit;
+    emit_simd_arith_q(0x57, dst, src, VEX_SIMD_66);
+  } else {
+    emit_simd_arith(0x57, dst, src, VEX_SIMD_66, false, true);
+  }
 }
 
 void Assembler::xorps(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  emit_simd_arith(0x57, dst, src, VEX_SIMD_NONE);
-}
-
-void Assembler::vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_FV;
+    input_size_in_bits = EVEX_32bit;
+  }
+  emit_simd_arith(0x57, dst, src, VEX_SIMD_NONE, false,
+                  (VM_Version::supports_avx512dq() == false));
+}
+
+void Assembler::vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
-  emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_66, vector256);
-}
-
-void Assembler::vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  if (VM_Version::supports_evex() && VM_Version::supports_avx512dq()) {
+    emit_vex_arith_q(0x57, dst, nds, src, VEX_SIMD_66, vector_len);
+  } else {
+    emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_66, vector_len, true);
+  }
+}
+
+void Assembler::vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
-  emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_NONE, vector256);
-}
-
-void Assembler::vxorpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_NONE, vector_len,
+                 (VM_Version::supports_avx512dq() == false));
+}
+
+void Assembler::vxorpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
-  emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_66, vector256);
-}
-
-void Assembler::vxorps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
+  if (VM_Version::supports_evex() && VM_Version::supports_avx512dq()) {
+    tuple_type = EVEX_FV;
+    input_size_in_bits = EVEX_64bit;
+    emit_vex_arith_q(0x57, dst, nds, src, VEX_SIMD_66, vector_len);
+  } else {
+    emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_66, vector_len, true);
+  }
+}
+
+void Assembler::vxorps(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
   assert(VM_Version::supports_avx(), "");
-  emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_NONE, vector256);
-}
-
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_FV;
+    input_size_in_bits = EVEX_32bit;
+  }
+  emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_NONE, vector_len,
+                 (VM_Version::supports_avx512dq() == false));
+}
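
The extra boolean threaded through emit_simd_arith / emit_vex_arith in the logical-op emitters above, and the avx512bw checks in the integer emitters that follow, decide whether an instruction may use its EVEX form at all: the FP logical instructions only gained EVEX encodings with AVX512DQ, and byte/word integer instructions only with AVX512BW, so without the feature they must stay in the legacy VEX form (at most 256-bit vectors and registers xmm0-xmm15). A small standalone sketch of that decision, with illustrative names rather than the HotSpot API:

#include <cstdio>

// Whether a packed operation may be emitted with a 512-bit EVEX encoding,
// given the CPU features the surrounding emitters test for.
static bool can_emit_evex_512(bool is_byte_or_word_int,   // vpaddb/vpaddw/vpmullw/psllw/...
                              bool is_fp_logical,         // andps/andpd/xorps/xorpd/...
                              bool evex, bool avx512bw, bool avx512dq) {
  if (!evex) {
    return false;                  // no AVX-512 support at all
  }
  if (is_byte_or_word_int && !avx512bw) {
    return false;                  // byte/word integer forms need AVX512BW
  }
  if (is_fp_logical && !avx512dq) {
    return false;                  // FP logical forms need AVX512DQ
  }
  return true;
}

int main() {
  // On an AVX-512F-only part, vxorps must keep its VEX form while vpaddd can use EVEX.
  std::printf("vxorps: %d\n", can_emit_evex_512(false, true,  true, false, false));
  std::printf("vpaddd: %d\n", can_emit_evex_512(false, false, true, false, false));
  return 0;
}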
 
 // Integer vector arithmetic
-void Assembler::vphaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
-  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
-  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_38);
+void Assembler::vphaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  assert((VM_Version::supports_avx() && (vector_len == 0)) ||
+         VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
+  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector_len,
+                                     VEX_OPCODE_0F_38, true, false);
   emit_int8(0x01);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
-void Assembler::vphaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
-  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
-  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_38);
+void Assembler::vphaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  assert((VM_Version::supports_avx() && (vector_len == 0)) ||
+         VM_Version::supports_avx2(), "256-bit integer vectors require AVX2");
+  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector_len,
+                                     VEX_OPCODE_0F_38, true, false);
   emit_int8(0x02);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -3390,61 +4150,89 @@
 
 void Assembler::paddq(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0xD4, dst, src, VEX_SIMD_66);
+  if (VM_Version::supports_evex()) {
+    emit_simd_arith_q(0xD4, dst, src, VEX_SIMD_66);
+  } else {
+    emit_simd_arith(0xD4, dst, src, VEX_SIMD_66);
+  }
 }
 
 void Assembler::phaddw(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse3(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, false,
+                                      VEX_OPCODE_0F_38, false, AVX_128bit, true);
   emit_int8(0x01);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::phaddd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse3(), ""));
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, false,
+                                      VEX_OPCODE_0F_38, false, AVX_128bit, true);
   emit_int8(0x02);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
-void Assembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
-  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
-  emit_vex_arith(0xFC, dst, nds, src, VEX_SIMD_66, vector256);
-}
-
-void Assembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
-  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
-  emit_vex_arith(0xFD, dst, nds, src, VEX_SIMD_66, vector256);
-}
-
-void Assembler::vpaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
-  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
-  emit_vex_arith(0xFE, dst, nds, src, VEX_SIMD_66, vector256);
-}
-
-void Assembler::vpaddq(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
-  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
-  emit_vex_arith(0xD4, dst, nds, src, VEX_SIMD_66, vector256);
-}
-
-void Assembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
-  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
-  emit_vex_arith(0xFC, dst, nds, src, VEX_SIMD_66, vector256);
-}
-
-void Assembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
-  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
-  emit_vex_arith(0xFD, dst, nds, src, VEX_SIMD_66, vector256);
-}
-
-void Assembler::vpaddd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
-  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
-  emit_vex_arith(0xFE, dst, nds, src, VEX_SIMD_66, vector256);
-}
-
-void Assembler::vpaddq(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
-  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
-  emit_vex_arith(0xD4, dst, nds, src, VEX_SIMD_66, vector256);
+void Assembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  assert(UseAVX > 0, "requires some form of AVX");
+  emit_vex_arith(0xFC, dst, nds, src, VEX_SIMD_66, vector_len,
+                 (VM_Version::supports_avx512bw() == false));
+}
+
+void Assembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  assert(UseAVX > 0, "requires some form of AVX");
+  emit_vex_arith(0xFD, dst, nds, src, VEX_SIMD_66, vector_len,
+                 (VM_Version::supports_avx512bw() == false));
+}
+
+void Assembler::vpaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  assert(UseAVX > 0, "requires some form of AVX");
+  emit_vex_arith(0xFE, dst, nds, src, VEX_SIMD_66, vector_len);
+}
+
+void Assembler::vpaddq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  assert(UseAVX > 0, "requires some form of AVX");
+  if (VM_Version::supports_evex()) {
+    emit_vex_arith_q(0xD4, dst, nds, src, VEX_SIMD_66, vector_len);
+  } else {
+    emit_vex_arith(0xD4, dst, nds, src, VEX_SIMD_66, vector_len);
+  }
+}
+
+void Assembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  assert(UseAVX > 0, "requires some form of AVX");
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_FVM;
+  }
+  emit_vex_arith(0xFC, dst, nds, src, VEX_SIMD_66, vector_len);
+}
+
+void Assembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  assert(UseAVX > 0, "requires some form of AVX");
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_FVM;
+  }
+  emit_vex_arith(0xFD, dst, nds, src, VEX_SIMD_66, vector_len);
+}
+
+void Assembler::vpaddd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  assert(UseAVX > 0, "requires some form of AVX");
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_FV;
+    input_size_in_bits = EVEX_32bit;
+  }
+  emit_vex_arith(0xFE, dst, nds, src, VEX_SIMD_66, vector_len);
+}
+
+void Assembler::vpaddq(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  assert(UseAVX > 0, "requires some form of AVX");
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_FV;
+    input_size_in_bits = EVEX_64bit;
+    emit_vex_arith_q(0xD4, dst, nds, src, VEX_SIMD_66, vector_len);
+  } else {
+    emit_vex_arith(0xD4, dst, nds, src, VEX_SIMD_66, vector_len);
+  }
 }
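
The signature change running through this file replaces the old two-valued vector256 flag with an integer vector_len, so a single emitter can target xmm, ymm or zmm operands. A minimal sketch of how a caller might derive that argument from a vector size in bytes; the helper and enum names are illustrative, and only the three-way split mirrors the AVX_128bit / AVX_256bit / AVX_512bit constants used above:

#include <cassert>

// Three encodable vector lengths, mirroring the constants used by the emitters above.
enum VectorLen { kAVX128 = 0, kAVX256 = 1, kAVX512 = 2 };

static VectorLen vector_len_for_size(int size_in_bytes) {
  assert(size_in_bytes > 0 && size_in_bytes <= 64);
  if (size_in_bytes <= 16) return kAVX128;  // xmm
  if (size_in_bytes <= 32) return kAVX256;  // ymm
  return kAVX512;                           // zmm, EVEX only
}

int main() {
  return (vector_len_for_size(64) == kAVX512 &&
          vector_len_for_size(16) == kAVX128) ? 0 : 1;
}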
 
 void Assembler::psubb(XMMRegister dst, XMMRegister src) {
@@ -3464,84 +4252,149 @@
 
 void Assembler::psubq(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0xFB, dst, src, VEX_SIMD_66);
-}
-
-void Assembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
-  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
-  emit_vex_arith(0xF8, dst, nds, src, VEX_SIMD_66, vector256);
-}
-
-void Assembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
-  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
-  emit_vex_arith(0xF9, dst, nds, src, VEX_SIMD_66, vector256);
-}
-
-void Assembler::vpsubd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
-  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
-  emit_vex_arith(0xFA, dst, nds, src, VEX_SIMD_66, vector256);
-}
-
-void Assembler::vpsubq(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
-  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
-  emit_vex_arith(0xFB, dst, nds, src, VEX_SIMD_66, vector256);
-}
-
-void Assembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
-  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
-  emit_vex_arith(0xF8, dst, nds, src, VEX_SIMD_66, vector256);
-}
-
-void Assembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
-  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
-  emit_vex_arith(0xF9, dst, nds, src, VEX_SIMD_66, vector256);
-}
-
-void Assembler::vpsubd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
-  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
-  emit_vex_arith(0xFA, dst, nds, src, VEX_SIMD_66, vector256);
-}
-
-void Assembler::vpsubq(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
-  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
-  emit_vex_arith(0xFB, dst, nds, src, VEX_SIMD_66, vector256);
+  if (VM_Version::supports_evex()) {
+    emit_simd_arith_q(0xFB, dst, src, VEX_SIMD_66);
+  } else {
+    emit_simd_arith(0xFB, dst, src, VEX_SIMD_66);
+  }
+}
+
+void Assembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  assert(UseAVX > 0, "requires some form of AVX");
+  emit_vex_arith(0xF8, dst, nds, src, VEX_SIMD_66, vector_len,
+                 (VM_Version::supports_avx512bw() == false));
+}
+
+void Assembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  assert(UseAVX > 0, "requires some form of AVX");
+  emit_vex_arith(0xF9, dst, nds, src, VEX_SIMD_66, vector_len,
+                 (VM_Version::supports_avx512bw() == false));
+}
+
+void Assembler::vpsubd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  assert(UseAVX > 0, "requires some form of AVX");
+  emit_vex_arith(0xFA, dst, nds, src, VEX_SIMD_66, vector_len);
+}
+
+void Assembler::vpsubq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  assert(UseAVX > 0, "requires some form of AVX");
+  if (VM_Version::supports_evex()) {
+    emit_vex_arith_q(0xFB, dst, nds, src, VEX_SIMD_66, vector_len);
+  } else {
+    emit_vex_arith(0xFB, dst, nds, src, VEX_SIMD_66, vector_len);
+  }
+}
+
+void Assembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  assert(UseAVX > 0, "requires some form of AVX");
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_FVM;
+  }
+  emit_vex_arith(0xF8, dst, nds, src, VEX_SIMD_66, vector_len,
+                 (VM_Version::supports_avx512bw() == false));
+}
+
+void Assembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  assert(UseAVX > 0, "requires some form of AVX");
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_FVM;
+  }
+  emit_vex_arith(0xF9, dst, nds, src, VEX_SIMD_66, vector_len,
+                 (VM_Version::supports_avx512bw() == false));
+}
+
+void Assembler::vpsubd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  assert(UseAVX > 0, "requires some form of AVX");
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_FV;
+    input_size_in_bits = EVEX_32bit;
+  }
+  emit_vex_arith(0xFA, dst, nds, src, VEX_SIMD_66, vector_len);
+}
+
+void Assembler::vpsubq(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  assert(UseAVX > 0, "requires some form of AVX");
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_FV;
+    input_size_in_bits = EVEX_64bit;
+    emit_vex_arith_q(0xFB, dst, nds, src, VEX_SIMD_66, vector_len);
+  } else {
+    emit_vex_arith(0xFB, dst, nds, src, VEX_SIMD_66, vector_len);
+  }
 }
 
 void Assembler::pmullw(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0xD5, dst, src, VEX_SIMD_66);
+  emit_simd_arith(0xD5, dst, src, VEX_SIMD_66,
+                  (VM_Version::supports_avx512bw() == false));
 }
 
 void Assembler::pmulld(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_sse4_1(), "");
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66,
+                                      false, VEX_OPCODE_0F_38);
+  emit_int8(0x40);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  assert(UseAVX > 0, "requires some form of AVX");
+  emit_vex_arith(0xD5, dst, nds, src, VEX_SIMD_66, vector_len,
+                 (VM_Version::supports_avx512bw() == false));
+}
+
+void Assembler::vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  assert(UseAVX > 0, "requires some form of AVX");
+  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66,
+                                     vector_len, VEX_OPCODE_0F_38);
   emit_int8(0x40);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
-void Assembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
-  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
-  emit_vex_arith(0xD5, dst, nds, src, VEX_SIMD_66, vector256);
-}
-
-void Assembler::vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
-  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
-  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_38);
+void Assembler::vpmullq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  assert(UseAVX > 2, "requires AVX-512");
+  int src_enc = src->encoding();
+  int dst_enc = dst->encoding();
+  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
+  int encode = vex_prefix_and_encode(dst_enc, nds_enc, src_enc, VEX_SIMD_66,
+                                     VEX_OPCODE_0F_38, true, vector_len, false, false);
   emit_int8(0x40);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
-void Assembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
-  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
-  emit_vex_arith(0xD5, dst, nds, src, VEX_SIMD_66, vector256);
-}
-
-void Assembler::vpmulld(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
-  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+void Assembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  assert(UseAVX > 0, "requires some form of AVX");
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_FVM;
+  }
+  emit_vex_arith(0xD5, dst, nds, src, VEX_SIMD_66, vector_len);
+}
+
+void Assembler::vpmulld(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  assert(UseAVX > 0, "requires some form of AVX");
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_FV;
+    input_size_in_bits = EVEX_32bit;
+  }
   InstructionMark im(this);
   int dst_enc = dst->encoding();
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-  vex_prefix(src, nds_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector256);
+  vex_prefix(src, nds_enc, dst_enc, VEX_SIMD_66,
+             VEX_OPCODE_0F_38, false, vector_len);
+  emit_int8(0x40);
+  emit_operand(dst, src);
+}
+
+void Assembler::vpmullq(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  assert(UseAVX > 2, "requires AVX-512");
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_FV;
+    input_size_in_bits = EVEX_64bit;
+  }
+  InstructionMark im(this);
+  int dst_enc = dst->encoding();
+  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
+  vex_prefix(src, nds_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, true, vector_len);
   emit_int8(0x40);
   emit_operand(dst, src);
 }
@@ -3550,7 +4403,8 @@
 void Assembler::psllw(XMMRegister dst, int shift) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   // XMM6 is for /6 encoding: 66 0F 71 /6 ib
-  int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66);
+  int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66, false, VEX_OPCODE_0F,
+                                      false, AVX_128bit, (VM_Version::supports_avx512bw() == false));
   emit_int8(0x71);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(shift & 0xFF);
@@ -3559,7 +4413,7 @@
 void Assembler::pslld(XMMRegister dst, int shift) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   // XMM6 is for /6 encoding: 66 0F 72 /6 ib
-  int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66);
+  int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66, false);
   emit_int8(0x72);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(shift & 0xFF);
@@ -3568,7 +4422,7 @@
 void Assembler::psllq(XMMRegister dst, int shift) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   // XMM6 is for /6 encoding: 66 0F 73 /6 ib
-  int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66);
+  int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66, false, VEX_OPCODE_0F, true);
   emit_int8(0x73);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(shift & 0xFF);
@@ -3576,7 +4430,8 @@
 
 void Assembler::psllw(XMMRegister dst, XMMRegister shift) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0xF1, dst, shift, VEX_SIMD_66);
+  emit_simd_arith(0xF1, dst, shift, VEX_SIMD_66, false,
+                  (VM_Version::supports_avx512bw() == false));
 }
 
 void Assembler::pslld(XMMRegister dst, XMMRegister shift) {
@@ -3586,50 +4441,65 @@
 
 void Assembler::psllq(XMMRegister dst, XMMRegister shift) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0xF3, dst, shift, VEX_SIMD_66);
-}
-
-void Assembler::vpsllw(XMMRegister dst, XMMRegister src, int shift, bool vector256) {
-  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  if (VM_Version::supports_evex()) {
+    emit_simd_arith_q(0xF3, dst, shift, VEX_SIMD_66);
+  } else {
+    emit_simd_arith(0xF3, dst, shift, VEX_SIMD_66);
+  }
+}
+
+void Assembler::vpsllw(XMMRegister dst, XMMRegister src, int shift, int vector_len) {
+  assert(UseAVX > 0, "requires some form of AVX");
   // XMM6 is for /6 encoding: 66 0F 71 /6 ib
-  emit_vex_arith(0x71, xmm6, dst, src, VEX_SIMD_66, vector256);
+  emit_vex_arith(0x71, xmm6, dst, src, VEX_SIMD_66, vector_len,
+                 (VM_Version::supports_avx512bw() == false));
   emit_int8(shift & 0xFF);
 }
 
-void Assembler::vpslld(XMMRegister dst, XMMRegister src, int shift, bool vector256) {
-  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+void Assembler::vpslld(XMMRegister dst, XMMRegister src, int shift, int vector_len) {
+  assert(UseAVX > 0, "requires some form of AVX");
   // XMM6 is for /6 encoding: 66 0F 72 /6 ib
-  emit_vex_arith(0x72, xmm6, dst, src, VEX_SIMD_66, vector256);
+  emit_vex_arith(0x72, xmm6, dst, src, VEX_SIMD_66, vector_len);
   emit_int8(shift & 0xFF);
 }
 
-void Assembler::vpsllq(XMMRegister dst, XMMRegister src, int shift, bool vector256) {
-  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+void Assembler::vpsllq(XMMRegister dst, XMMRegister src, int shift, int vector_len) {
+  assert(UseAVX > 0, "requires some form of AVX");
   // XMM6 is for /6 encoding: 66 0F 73 /6 ib
-  emit_vex_arith(0x73, xmm6, dst, src, VEX_SIMD_66, vector256);
+  if (VM_Version::supports_evex()) {
+    emit_vex_arith_q(0x73, xmm6, dst, src, VEX_SIMD_66, vector_len);
+  } else {
+    emit_vex_arith(0x73, xmm6, dst, src, VEX_SIMD_66, vector_len);
+  }
   emit_int8(shift & 0xFF);
 }
 
-void Assembler::vpsllw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) {
-  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
-  emit_vex_arith(0xF1, dst, src, shift, VEX_SIMD_66, vector256);
-}
-
-void Assembler::vpslld(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) {
-  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
-  emit_vex_arith(0xF2, dst, src, shift, VEX_SIMD_66, vector256);
-}
-
-void Assembler::vpsllq(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) {
-  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
-  emit_vex_arith(0xF3, dst, src, shift, VEX_SIMD_66, vector256);
+void Assembler::vpsllw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) {
+  assert(UseAVX > 0, "requires some form of AVX");
+  emit_vex_arith(0xF1, dst, src, shift, VEX_SIMD_66, vector_len,
+                 (VM_Version::supports_avx512bw() == false));
+}
+
+void Assembler::vpslld(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) {
+  assert(UseAVX > 0, "requires some form of AVX");
+  emit_vex_arith(0xF2, dst, src, shift, VEX_SIMD_66, vector_len);
+}
+
+void Assembler::vpsllq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) {
+  assert(UseAVX > 0, "requires some form of AVX");
+  if (VM_Version::supports_evex()) {
+    emit_vex_arith_q(0xF3, dst, src, shift, VEX_SIMD_66, vector_len);
+  } else {
+    emit_vex_arith(0xF3, dst, src, shift, VEX_SIMD_66, vector_len);
+  }
 }
 
 // Shift packed integers logically right by specified number of bits.
 void Assembler::psrlw(XMMRegister dst, int shift) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   // XMM2 is for /2 encoding: 66 0F 71 /2 ib
-  int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66);
+  int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66, false, VEX_OPCODE_0F,
+                                      (VM_Version::supports_avx512bw() == false));
   emit_int8(0x71);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(shift & 0xFF);
@@ -3638,7 +4508,7 @@
 void Assembler::psrld(XMMRegister dst, int shift) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   // XMM2 is for /2 encoding: 66 0F 72 /2 ib
-  int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66);
+  int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66, false);
   emit_int8(0x72);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(shift & 0xFF);
@@ -3649,7 +4519,12 @@
   // Shifts the 128-bit value in an xmm register right by the specified number of bytes.
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   // XMM2 is for /2 encoding: 66 0F 73 /2 ib
-  int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66);
+  int encode = 0;
+  if (VM_Version::supports_evex() && VM_Version::supports_avx512bw()) {
+    encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66, true, VEX_OPCODE_0F, false);
+  } else {
+    encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66, false, VEX_OPCODE_0F, true);
+  }
   emit_int8(0x73);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(shift & 0xFF);
@@ -3657,7 +4532,8 @@
 
 void Assembler::psrlw(XMMRegister dst, XMMRegister shift) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0xD1, dst, shift, VEX_SIMD_66);
+  emit_simd_arith(0xD1, dst, shift, VEX_SIMD_66, false,
+                  (VM_Version::supports_avx512bw() == false));
 }
 
 void Assembler::psrld(XMMRegister dst, XMMRegister shift) {
@@ -3667,50 +4543,65 @@
 
 void Assembler::psrlq(XMMRegister dst, XMMRegister shift) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0xD3, dst, shift, VEX_SIMD_66);
-}
-
-void Assembler::vpsrlw(XMMRegister dst, XMMRegister src, int shift, bool vector256) {
-  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  if (VM_Version::supports_evex()) {
+    emit_simd_arith_q(0xD3, dst, shift, VEX_SIMD_66);
+  } else {
+    emit_simd_arith(0xD3, dst, shift, VEX_SIMD_66);
+  }
+}
+
+void Assembler::vpsrlw(XMMRegister dst, XMMRegister src, int shift, int vector_len) {
+  assert(UseAVX > 0, "requires some form of AVX");
   // XMM2 is for /2 encoding: 66 0F 71 /2 ib
-  emit_vex_arith(0x71, xmm2, dst, src, VEX_SIMD_66, vector256);
+  emit_vex_arith(0x71, xmm2, dst, src, VEX_SIMD_66, vector_len,
+                 (VM_Version::supports_avx512bw() == false));
   emit_int8(shift & 0xFF);
 }
 
-void Assembler::vpsrld(XMMRegister dst, XMMRegister src, int shift, bool vector256) {
-  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+void Assembler::vpsrld(XMMRegister dst, XMMRegister src, int shift, int vector_len) {
+  assert(UseAVX > 0, "requires some form of AVX");
   // XMM2 is for /2 encoding: 66 0F 72 /2 ib
-  emit_vex_arith(0x72, xmm2, dst, src, VEX_SIMD_66, vector256);
+  emit_vex_arith(0x72, xmm2, dst, src, VEX_SIMD_66, vector_len);
   emit_int8(shift & 0xFF);
 }
 
-void Assembler::vpsrlq(XMMRegister dst, XMMRegister src, int shift, bool vector256) {
-  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+void Assembler::vpsrlq(XMMRegister dst, XMMRegister src, int shift, int vector_len) {
+  assert(UseAVX > 0, "requires some form of AVX");
   // XMM2 is for /2 encoding: 66 0F 73 /2 ib
-  emit_vex_arith(0x73, xmm2, dst, src, VEX_SIMD_66, vector256);
+  if (VM_Version::supports_evex()) {
+    emit_vex_arith_q(0x73, xmm2, dst, src, VEX_SIMD_66, vector_len);
+  } else {
+    emit_vex_arith(0x73, xmm2, dst, src, VEX_SIMD_66, vector_len);
+  }
   emit_int8(shift & 0xFF);
 }
 
-void Assembler::vpsrlw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) {
-  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
-  emit_vex_arith(0xD1, dst, src, shift, VEX_SIMD_66, vector256);
-}
-
-void Assembler::vpsrld(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) {
-  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
-  emit_vex_arith(0xD2, dst, src, shift, VEX_SIMD_66, vector256);
-}
-
-void Assembler::vpsrlq(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) {
-  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
-  emit_vex_arith(0xD3, dst, src, shift, VEX_SIMD_66, vector256);
+void Assembler::vpsrlw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) {
+  assert(UseAVX > 0, "requires some form of AVX");
+  emit_vex_arith(0xD1, dst, src, shift, VEX_SIMD_66, vector_len,
+                 (VM_Version::supports_avx512bw() == false));
+}
+
+void Assembler::vpsrld(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) {
+  assert(UseAVX > 0, "requires some form of AVX");
+  emit_vex_arith(0xD2, dst, src, shift, VEX_SIMD_66, vector_len);
+}
+
+void Assembler::vpsrlq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) {
+  assert(UseAVX > 0, "requires some form of AVX");
+  if (VM_Version::supports_evex()) {
+    emit_vex_arith_q(0xD3, dst, src, shift, VEX_SIMD_66, vector_len);
+  } else {
+    emit_vex_arith(0xD3, dst, src, shift, VEX_SIMD_66, vector_len);
+  }
 }
 
 // Shift packed integers arithmetically right by specified number of bits.
 void Assembler::psraw(XMMRegister dst, int shift) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   // XMM4 is for /4 encoding: 66 0F 71 /4 ib
-  int encode = simd_prefix_and_encode(xmm4, dst, dst, VEX_SIMD_66);
+  int encode = simd_prefix_and_encode(xmm4, dst, dst, VEX_SIMD_66, false, VEX_OPCODE_0F,
+                                      (VM_Version::supports_avx512bw() == false));
   emit_int8(0x71);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(shift & 0xFF);
@@ -3719,7 +4610,7 @@
 void Assembler::psrad(XMMRegister dst, int shift) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   // XMM4 is for /4 encoding: 66 0F 72 /4 ib
-  int encode = simd_prefix_and_encode(xmm4, dst, dst, VEX_SIMD_66);
+  int encode = simd_prefix_and_encode(xmm4, dst, dst, VEX_SIMD_66, false);
   emit_int8(0x72);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(shift & 0xFF);
@@ -3727,7 +4618,8 @@
 
 void Assembler::psraw(XMMRegister dst, XMMRegister shift) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  emit_simd_arith(0xE1, dst, shift, VEX_SIMD_66);
+  emit_simd_arith(0xE1, dst, shift, VEX_SIMD_66,
+                  (VM_Version::supports_avx512bw() == false));
 }
 
 void Assembler::psrad(XMMRegister dst, XMMRegister shift) {
@@ -3735,28 +4627,30 @@
   emit_simd_arith(0xE2, dst, shift, VEX_SIMD_66);
 }
 
-void Assembler::vpsraw(XMMRegister dst, XMMRegister src, int shift, bool vector256) {
-  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+void Assembler::vpsraw(XMMRegister dst, XMMRegister src, int shift, int vector_len) {
+  assert(UseAVX > 0, "requires some form of AVX");
   // XMM4 is for /4 encoding: 66 0F 71 /4 ib
-  emit_vex_arith(0x71, xmm4, dst, src, VEX_SIMD_66, vector256);
+  emit_vex_arith(0x71, xmm4, dst, src, VEX_SIMD_66, vector_len,
+                 (VM_Version::supports_avx512bw() == false));
   emit_int8(shift & 0xFF);
 }
 
-void Assembler::vpsrad(XMMRegister dst, XMMRegister src, int shift, bool vector256) {
-  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+void Assembler::vpsrad(XMMRegister dst, XMMRegister src, int shift, int vector_len) {
+  assert(UseAVX > 0, "requires some form of AVX");
   // XMM4 is for /4 encoding: 66 0F 72 /4 ib
-  emit_vex_arith(0x72, xmm4, dst, src, VEX_SIMD_66, vector256);
+  emit_vex_arith(0x72, xmm4, dst, src, VEX_SIMD_66, vector_len);
   emit_int8(shift & 0xFF);
 }
 
-void Assembler::vpsraw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) {
-  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
-  emit_vex_arith(0xE1, dst, src, shift, VEX_SIMD_66, vector256);
-}
-
-void Assembler::vpsrad(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) {
-  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
-  emit_vex_arith(0xE2, dst, src, shift, VEX_SIMD_66, vector256);
+void Assembler::vpsraw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) {
+  assert(UseAVX > 0, "requires some form of AVX");
+  emit_vex_arith(0xE1, dst, src, shift, VEX_SIMD_66, vector_len,
+                 (VM_Version::supports_avx512bw() == false));
+}
+
+void Assembler::vpsrad(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) {
+  assert(UseAVX > 0, "requires some form of AVX");
+  emit_vex_arith(0xE2, dst, src, shift, VEX_SIMD_66, vector_len);
 }
 
 
@@ -3766,14 +4660,18 @@
   emit_simd_arith(0xDB, dst, src, VEX_SIMD_66);
 }
 
-void Assembler::vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
-  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
-  emit_vex_arith(0xDB, dst, nds, src, VEX_SIMD_66, vector256);
-}
-
-void Assembler::vpand(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
-  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
-  emit_vex_arith(0xDB, dst, nds, src, VEX_SIMD_66, vector256);
+void Assembler::vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  assert(UseAVX > 0, "requires some form of AVX");
+  emit_vex_arith(0xDB, dst, nds, src, VEX_SIMD_66, vector_len);
+}
+
+void Assembler::vpand(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  assert(UseAVX > 0, "requires some form of AVX");
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_FV;
+    input_size_in_bits = EVEX_32bit;
+  }
+  emit_vex_arith(0xDB, dst, nds, src, VEX_SIMD_66, vector_len);
 }
 
 void Assembler::por(XMMRegister dst, XMMRegister src) {
@@ -3781,14 +4679,18 @@
   emit_simd_arith(0xEB, dst, src, VEX_SIMD_66);
 }
 
-void Assembler::vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
-  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
-  emit_vex_arith(0xEB, dst, nds, src, VEX_SIMD_66, vector256);
-}
-
-void Assembler::vpor(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
-  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
-  emit_vex_arith(0xEB, dst, nds, src, VEX_SIMD_66, vector256);
+void Assembler::vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  assert(UseAVX > 0, "requires some form of AVX");
+  emit_vex_arith(0xEB, dst, nds, src, VEX_SIMD_66, vector_len);
+}
+
+void Assembler::vpor(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  assert(UseAVX > 0, "requires some form of AVX");
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_FV;
+    input_size_in_bits = EVEX_32bit;
+  }
+  emit_vex_arith(0xEB, dst, nds, src, VEX_SIMD_66, vector_len);
 }
 
 void Assembler::pxor(XMMRegister dst, XMMRegister src) {
@@ -3796,21 +4698,25 @@
   emit_simd_arith(0xEF, dst, src, VEX_SIMD_66);
 }
 
-void Assembler::vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
-  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
-  emit_vex_arith(0xEF, dst, nds, src, VEX_SIMD_66, vector256);
-}
-
-void Assembler::vpxor(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
-  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
-  emit_vex_arith(0xEF, dst, nds, src, VEX_SIMD_66, vector256);
+void Assembler::vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  assert(UseAVX > 0, "requires some form of AVX");
+  emit_vex_arith(0xEF, dst, nds, src, VEX_SIMD_66, vector_len);
+}
+
+void Assembler::vpxor(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+  assert(UseAVX > 0, "requires some form of AVX");
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_FV;
+    input_size_in_bits = EVEX_32bit;
+  }
+  emit_vex_arith(0xEF, dst, nds, src, VEX_SIMD_66, vector_len);
 }
 
 
 void Assembler::vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
-  bool vector256 = true;
-  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_3A);
+  int vector_len = AVX_256bit;
+  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_3A);
   emit_int8(0x18);
   emit_int8((unsigned char)(0xC0 | encode));
   // 0x00 - insert into lower 128 bits
@@ -3818,14 +4724,51 @@
   emit_int8(0x01);
 }
 
+void Assembler::vinsertf64x4h(XMMRegister dst, XMMRegister nds, XMMRegister src) {
+  assert(VM_Version::supports_evex(), "");
+  int vector_len = AVX_512bit;
+  int src_enc = src->encoding();
+  int dst_enc = dst->encoding();
+  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
+  int encode = vex_prefix_and_encode(dst_enc, nds_enc, src_enc, VEX_SIMD_66,
+                                     VEX_OPCODE_0F_3A, true, vector_len, false, false);
+  emit_int8(0x1A);
+  emit_int8((unsigned char)(0xC0 | encode));
+  // 0x00 - insert into lower 256 bits
+  // 0x01 - insert into upper 256 bits
+  emit_int8(0x01);
+}
+
+void Assembler::vinsertf64x4h(XMMRegister dst, Address src) {
+  assert(VM_Version::supports_avx(), "");
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_T4;
+    input_size_in_bits = EVEX_64bit;
+  }
+  InstructionMark im(this);
+  int vector_len = AVX_512bit;
+  assert(dst != xnoreg, "sanity");
+  int dst_enc = dst->encoding();
+  // swap src<->dst for encoding
+  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, true, vector_len);
+  emit_int8(0x1A);
+  emit_operand(dst, src);
+  // 0x01 - insert into upper 256 bits
+  emit_int8(0x01);
+}
+
 void Assembler::vinsertf128h(XMMRegister dst, Address src) {
   assert(VM_Version::supports_avx(), "");
-  InstructionMark im(this);
-  bool vector256 = true;
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_T4;
+    input_size_in_bits = EVEX_32bit;
+  }
+  InstructionMark im(this);
+  int vector_len = AVX_256bit;
   assert(dst != xnoreg, "sanity");
   int dst_enc = dst->encoding();
   // swap src<->dst for encoding
-  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector256);
+  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector_len);
   emit_int8(0x18);
   emit_operand(dst, src);
   // 0x01 - insert into upper 128 bits
@@ -3834,8 +4777,8 @@
 
 void Assembler::vextractf128h(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
-  bool vector256 = true;
-  int encode = vex_prefix_and_encode(src, xnoreg, dst, VEX_SIMD_66, vector256, VEX_OPCODE_0F_3A);
+  int vector_len = AVX_256bit;
+  int encode = vex_prefix_and_encode(src, xnoreg, dst, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_3A);
   emit_int8(0x19);
   emit_int8((unsigned char)(0xC0 | encode));
   // 0x00 - extract from lower 128 bits
@@ -3845,11 +4788,15 @@
 
 void Assembler::vextractf128h(Address dst, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
-  InstructionMark im(this);
-  bool vector256 = true;
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_T4;
+    input_size_in_bits = EVEX_32bit;
+  }
+  InstructionMark im(this);
+  int vector_len = AVX_256bit;
   assert(src != xnoreg, "sanity");
   int src_enc = src->encoding();
-  vex_prefix(dst, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector256);
+  vex_prefix(dst, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector_len);
   emit_int8(0x19);
   emit_operand(src, dst);
   // 0x01 - extract from upper 128 bits
@@ -3858,8 +4805,8 @@
 
 void Assembler::vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx2(), "");
-  bool vector256 = true;
-  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_3A);
+  int vector_len = AVX_256bit;
+  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_3A);
   emit_int8(0x38);
   emit_int8((unsigned char)(0xC0 | encode));
   // 0x00 - insert into lower 128 bits
@@ -3867,38 +4814,169 @@
   emit_int8(0x01);
 }
 
+void Assembler::vinserti64x4h(XMMRegister dst, XMMRegister nds, XMMRegister src) {
+  assert(VM_Version::supports_evex(), "");
+  int vector_len = AVX_512bit;
+  int src_enc = src->encoding();
+  int dst_enc = dst->encoding();
+  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
+  int encode = vex_prefix_and_encode(dst_enc, nds_enc, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A,
+                                     VM_Version::supports_avx512dq(), vector_len, false, false);
+  emit_int8(0x38);
+  emit_int8((unsigned char)(0xC0 | encode));
+  // 0x00 - insert into lower 256 bits
+  // 0x01 - insert into upper 256 bits
+  emit_int8(0x01);
+}
+
 void Assembler::vinserti128h(XMMRegister dst, Address src) {
   assert(VM_Version::supports_avx2(), "");
-  InstructionMark im(this);
-  bool vector256 = true;
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_T4;
+    input_size_in_bits = EVEX_32bit;
+  }
+  InstructionMark im(this);
+  int vector_len = AVX_256bit;
   assert(dst != xnoreg, "sanity");
   int dst_enc = dst->encoding();
   // swap src<->dst for encoding
-  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector256);
+  vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector_len);
   emit_int8(0x38);
   emit_operand(dst, src);
   // 0x01 - insert into upper 128 bits
   emit_int8(0x01);
 }
 
+void Assembler::vextracti128h(XMMRegister dst, XMMRegister src) {
+  assert(VM_Version::supports_avx(), "");
+  int vector_len = AVX_256bit;
+  int encode = vex_prefix_and_encode(src, xnoreg, dst, VEX_SIMD_66, vector_len, VEX_OPCODE_0F_3A);
+  emit_int8(0x39);
+  emit_int8((unsigned char)(0xC0 | encode));
+  // 0x00 - extract from lower 128 bits
+  // 0x01 - extract from upper 128 bits
+  emit_int8(0x01);
+}
+
 void Assembler::vextracti128h(Address dst, XMMRegister src) {
   assert(VM_Version::supports_avx2(), "");
-  InstructionMark im(this);
-  bool vector256 = true;
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_T4;
+    input_size_in_bits = EVEX_32bit;
+  }
+  InstructionMark im(this);
+  int vector_len = AVX_256bit;
   assert(src != xnoreg, "sanity");
   int src_enc = src->encoding();
-  vex_prefix(dst, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector256);
+  vex_prefix(dst, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A, false, vector_len);
   emit_int8(0x39);
   emit_operand(src, dst);
   // 0x01 - extract from upper 128 bits
   emit_int8(0x01);
 }
 
+void Assembler::vextracti64x4h(XMMRegister dst, XMMRegister src) {
+  assert(VM_Version::supports_evex(), "");
+  int vector_len = AVX_512bit;
+  int src_enc = src->encoding();
+  int dst_enc = dst->encoding();
+  int encode = vex_prefix_and_encode(src_enc, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A,
+                                     true, vector_len, false, false);
+  emit_int8(0x3B);
+  emit_int8((unsigned char)(0xC0 | encode));
+  // 0x01 - extract from upper 256 bits
+  emit_int8(0x01);
+}
+
+void Assembler::vextracti64x2h(XMMRegister dst, XMMRegister src, int value) {
+  assert(VM_Version::supports_evex(), "");
+  int vector_len = AVX_512bit;
+  int src_enc = src->encoding();
+  int dst_enc = dst->encoding();
+  int encode = vex_prefix_and_encode(src_enc, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A,
+                                     VM_Version::supports_avx512dq(), vector_len, false, false);
+  emit_int8(0x39);
+  emit_int8((unsigned char)(0xC0 | encode));
+  // 0x01 - extract from bits 255:128
+  // 0x02 - extract from bits 383:256
+  // 0x03 - extract from bits 511:384
+  emit_int8(value & 0x3);
+}
+
+void Assembler::vextractf64x4h(XMMRegister dst, XMMRegister src) {
+  assert(VM_Version::supports_evex(), "");
+  int vector_len = AVX_512bit;
+  int src_enc = src->encoding();
+  int dst_enc = dst->encoding();
+  int encode = vex_prefix_and_encode(src_enc, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A,
+                                     VM_Version::supports_avx512dq(), vector_len, false, false);
+  emit_int8(0x1B);
+  emit_int8((unsigned char)(0xC0 | encode));
+  // 0x01 - extract from upper 256 bits
+  emit_int8(0x01);
+}
+
+void Assembler::vextractf64x4h(Address dst, XMMRegister src) {
+  assert(VM_Version::supports_avx2(), "");
+  tuple_type = EVEX_T4;
+  input_size_in_bits = EVEX_64bit;
+  InstructionMark im(this);
+  int vector_len = AVX_512bit;
+  assert(src != xnoreg, "sanity");
+  int src_enc = src->encoding();
+  vex_prefix(dst, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A,
+             VM_Version::supports_avx512dq(), vector_len);
+  emit_int8(0x1B);
+  emit_operand(src, dst);
+  // 0x01 - extract from upper 256 bits
+  emit_int8(0x01);
+}
+
+void Assembler::vextractf32x4h(XMMRegister dst, XMMRegister src, int value) {
+  assert(VM_Version::supports_evex(), "");
+  int vector_len = AVX_512bit;
+  int src_enc = src->encoding();
+  int dst_enc = dst->encoding();
+  int encode = vex_prefix_and_encode(src_enc, 0, dst_enc, VEX_SIMD_66,
+                                     VEX_OPCODE_0F_3A, false, vector_len, false, false);
+  emit_int8(0x19);
+  emit_int8((unsigned char)(0xC0 | encode));
+  // 0x01 - extract from bits 255:128
+  // 0x02 - extract from bits 383:256
+  // 0x03 - extract from bits 511:384
+  emit_int8(value & 0x3);
+}
+
+void Assembler::vextractf64x2h(XMMRegister dst, XMMRegister src, int value) {
+  assert(VM_Version::supports_evex(), "");
+  int vector_len = AVX_512bit;
+  int src_enc = src->encoding();
+  int dst_enc = dst->encoding();
+  int encode = vex_prefix_and_encode(src_enc, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_3A,
+                                     VM_Version::supports_avx512dq(), vector_len, false, false);
+  emit_int8(0x19);
+  emit_int8((unsigned char)(0xC0 | encode));
+  // 0x01 - extract from bits 255:128
+  // 0x02 - extract from bits 383:256
+  // 0x03 - extract from bits 511:384
+  emit_int8(value & 0x3);
+}
+
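
The 128-bit extract helpers above mask the lane selector down to its low two bits; as a self-contained illustration (not part of the changeset), the lane arithmetic is simply:

  // Lane k of a 512-bit source covers bits [128*k, 128*k + 127]; only the
  // low two bits of the selector matter, matching the 'value & 0x3' above.
  static inline int extract128_lane_low_bit(int value) {
    return 128 * (value & 0x3);   // 0, 128, 256 or 384
  }
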
 // duplicate 4-bytes integer data from src into 8 locations in dest
 void Assembler::vpbroadcastd(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_avx2(), "");
-  bool vector256 = true;
-  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_38);
+  int vector_len = AVX_256bit;
+  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66,
+                                     vector_len, VEX_OPCODE_0F_38, false);
+  emit_int8(0x58);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+// duplicate 4-byte integer data from src into multiple locations in dest (count depends on vector_len)
+void Assembler::evpbroadcastd(XMMRegister dst, XMMRegister src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66,
+                                     vector_len, VEX_OPCODE_0F_38, false);
   emit_int8(0x58);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -3906,7 +4984,8 @@
 // Carry-Less Multiplication Quadword
 void Assembler::pclmulqdq(XMMRegister dst, XMMRegister src, int mask) {
   assert(VM_Version::supports_clmul(), "");
-  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, false,
+                                      VEX_OPCODE_0F_3A, false, AVX_128bit, true);
   emit_int8(0x44);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8((unsigned char)mask);
@@ -3915,8 +4994,9 @@
 // Carry-Less Multiplication Quadword
 void Assembler::vpclmulqdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int mask) {
   assert(VM_Version::supports_avx() && VM_Version::supports_clmul(), "");
-  bool vector256 = false;
-  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_3A);
+  int vector_len = AVX_128bit;
+  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66,
+                                     vector_len, VEX_OPCODE_0F_3A, true);
   emit_int8(0x44);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8((unsigned char)mask);
@@ -3924,8 +5004,11 @@
 
 void Assembler::vzeroupper() {
   assert(VM_Version::supports_avx(), "");
-  (void)vex_prefix_and_encode(xmm0, xmm0, xmm0, VEX_SIMD_NONE);
-  emit_int8(0x77);
+  if (UseAVX < 3)
+  {
+    (void)vex_prefix_and_encode(xmm0, xmm0, xmm0, VEX_SIMD_NONE);
+    emit_int8(0x77);
+  }
 }
 
 
@@ -4442,7 +5525,7 @@
 }
 
 
-void Assembler::vex_prefix(bool vex_r, bool vex_b, bool vex_x, bool vex_w, int nds_enc, VexSimdPrefix pre, VexOpcode opc, bool vector256) {
+void Assembler::vex_prefix(bool vex_r, bool vex_b, bool vex_x, bool vex_w, int nds_enc, VexSimdPrefix pre, VexOpcode opc, int vector_len) {
   if (vex_b || vex_x || vex_w || (opc == VEX_OPCODE_0F_38) || (opc == VEX_OPCODE_0F_3A)) {
     prefix(VEX_3bytes);
 
@@ -4452,7 +5535,7 @@
     emit_int8(byte1);
 
     int byte2 = ((~nds_enc) & 0xf) << 3;
-    byte2 |= (vex_w ? VEX_W : 0) | (vector256 ? 4 : 0) | pre;
+    byte2 |= (vex_w ? VEX_W : 0) | ((vector_len > 0) ? 4 : 0) | pre;
     emit_int8(byte2);
   } else {
     prefix(VEX_2bytes);
@@ -4460,89 +5543,237 @@
     int byte1 = vex_r ? VEX_R : 0;
     byte1 = (~byte1) & 0x80;
     byte1 |= ((~nds_enc) & 0xf) << 3;
-    byte1 |= (vector256 ? 4 : 0) | pre;
+    byte1 |= ((vector_len > 0 ) ? 4 : 0) | pre;
     emit_int8(byte1);
   }
 }
 
-void Assembler::vex_prefix(Address adr, int nds_enc, int xreg_enc, VexSimdPrefix pre, VexOpcode opc, bool vex_w, bool vector256){
+// This is a 4-byte encoding
+void Assembler::evex_prefix(bool vex_r, bool vex_b, bool vex_x, bool vex_w, bool evex_r, bool evex_v,
+                            int nds_enc, VexSimdPrefix pre, VexOpcode opc,
+                            bool is_extended_context, bool is_merge_context,
+                            int vector_len, bool no_mask_reg ){
+  // EVEX 0x62 prefix
+  prefix(EVEX_4bytes);
+  evex_encoding = (vex_w ? VEX_W : 0) | (evex_r ? EVEX_Rb : 0);
+
+  // P0: byte 2, layout RXBR`00mm
+  // (bits are assembled positively here, then inverted below)
+  int byte2 = (vex_r ? VEX_R : 0) | (vex_x ? VEX_X : 0) | (vex_b ? VEX_B : 0) | (evex_r ? EVEX_Rb : 0);
+  byte2 = (~byte2) & 0xF0;
+  // confine opc opcode extensions in mm bits to lower two bits
+  // of form {0F, 0F_38, 0F_3A}
+  byte2 |= opc;
+  emit_int8(byte2);
+
+  // P1: byte 3 as Wvvvv1pp
+  int byte3 = ((~nds_enc) & 0xf) << 3;
+  // p[10] is always 1
+  byte3 |= EVEX_F;
+  byte3 |= (vex_w & 1) << 7;
+  // confine pre opcode extensions in pp bits to lower two bits
+  // of form {66, F3, F2}
+  byte3 |= pre;
+  emit_int8(byte3);
+
+  // P2: byte 4 as zL'Lbv'aaa
+  int byte4 = (no_mask_reg) ? 0 : 1; // aaa: mask register selector in the low 3 bits (k1 is hard-coded for now)
+  // EVEX.v` for extending EVEX.vvvv or VIDX
+  byte4 |= (evex_v ? 0: EVEX_V);
+  // third EVEX.b for broadcast actions
+  byte4 |= (is_extended_context ? EVEX_Rb : 0);
+  // fourth EVEX.L'L for vector length : 0 is 128, 1 is 256, 2 is 512, currently we do not support 1024
+  byte4 |= ((vector_len) & 0x3) << 5;
+  // last is EVEX.z for zero/merge actions
+  byte4 |= (is_merge_context ? EVEX_Z : 0);
+  emit_int8(byte4);
+}
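
For readers tracking the bit layout, the three payload bytes built above can be restated as a compact standalone sketch. The VEX_R/X/B values (0x80/0x40/0x20) are assumed from the usual VEX bit definitions; the EVEX_* constants are the ones declared in the header later in this changeset.

  // Illustrative only, not HotSpot code: P0/P1/P2 of the 0x62-prefixed EVEX form.
  static void evex_payload(bool vex_r, bool vex_x, bool vex_b, bool vex_w,
                           bool evex_r, bool evex_v, int nds_enc, int pre, int opc,
                           bool is_extended_context, bool is_merge_context,
                           int vector_len, bool no_mask_reg, unsigned char out[3]) {
    int p0 = (vex_r ? 0x80 : 0) | (vex_x ? 0x40 : 0) | (vex_b ? 0x20 : 0) | (evex_r ? 0x10 : 0);
    out[0] = (unsigned char)(((~p0) & 0xF0) | opc);                    // P0: R X B R' 0 0 m m
    out[1] = (unsigned char)((((~nds_enc) & 0xf) << 3) | 0x04 |        // P1: W vvvv 1 pp
                             ((vex_w ? 1 : 0) << 7) | pre);
    out[2] = (unsigned char)((no_mask_reg ? 0 : 1)                     // P2: aaa (k1 hard-coded)
                             | (evex_v ? 0 : 0x08)                     //     V'
                             | (is_extended_context ? 0x10 : 0)        //     b
                             | ((vector_len & 0x3) << 5)               //     L'L
                             | (is_merge_context ? 0x80 : 0));         //     z
  }
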
+
+void Assembler::vex_prefix(Address adr, int nds_enc, int xreg_enc, VexSimdPrefix pre,
+                           VexOpcode opc, bool vex_w, int vector_len, bool legacy_mode, bool no_mask_reg) {
   bool vex_r = (xreg_enc >= 8);
   bool vex_b = adr.base_needs_rex();
   bool vex_x = adr.index_needs_rex();
-  vex_prefix(vex_r, vex_b, vex_x, vex_w, nds_enc, pre, opc, vector256);
-}
-
-int Assembler::vex_prefix_and_encode(int dst_enc, int nds_enc, int src_enc, VexSimdPrefix pre, VexOpcode opc, bool vex_w, bool vector256) {
+  avx_vector_len = vector_len;
+
+  // if AVX512VL is not supported, revert to AVX encoding for vector lengths smaller than AVX_512bit
+  if (VM_Version::supports_avx512vl() == false) {
+    switch (vector_len) {
+    case AVX_128bit:
+    case AVX_256bit:
+      legacy_mode = true;
+      break;
+    }
+  }
+
+  if ((UseAVX > 2) && (legacy_mode == false))
+  {
+    bool evex_r = (xreg_enc >= 16);
+    bool evex_v = (nds_enc >= 16);
+    is_evex_instruction = true;
+    evex_prefix(vex_r, vex_b, vex_x, vex_w, evex_r, evex_v, nds_enc, pre, opc, false, false, vector_len, no_mask_reg);
+  } else {
+    vex_prefix(vex_r, vex_b, vex_x, vex_w, nds_enc, pre, opc, vector_len);
+  }
+}
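
The EVEX-versus-VEX choice made above (and repeated in the register-to-register form that follows) boils down to a few lines; a minimal sketch, assuming UseAVX > 2 means EVEX-capable hardware:

  // Without AVX512VL the EVEX form cannot encode 128- or 256-bit vector
  // lengths, so those operands fall back to the legacy VEX path.
  static bool use_evex(int use_avx, bool legacy_mode, bool has_avx512vl, int vector_len) {
    if (!has_avx512vl && (vector_len == 0 /* AVX_128bit */ || vector_len == 1 /* AVX_256bit */)) {
      legacy_mode = true;
    }
    return (use_avx > 2) && !legacy_mode;
  }
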
+
+int Assembler::vex_prefix_and_encode(int dst_enc, int nds_enc, int src_enc, VexSimdPrefix pre, VexOpcode opc,
+                                     bool vex_w, int vector_len, bool legacy_mode, bool no_mask_reg ) {
   bool vex_r = (dst_enc >= 8);
   bool vex_b = (src_enc >= 8);
   bool vex_x = false;
-  vex_prefix(vex_r, vex_b, vex_x, vex_w, nds_enc, pre, opc, vector256);
+  avx_vector_len = vector_len;
+
+  // if AVX512VL is not supported, revert to AVX encoding for vector lengths smaller than AVX_512bit
+  if (VM_Version::supports_avx512vl() == false) {
+    switch (vector_len) {
+    case AVX_128bit:
+    case AVX_256bit:
+      legacy_mode = true;
+      break;
+    }
+  }
+
+  if ((UseAVX > 2) && (legacy_mode == false))
+  {
+    bool evex_r = (dst_enc >= 16);
+    bool evex_v = (nds_enc >= 16);
+    // can use vex_x as bank extender on rm encoding
+    vex_x = (src_enc >= 16);
+    evex_prefix(vex_r, vex_b, vex_x, vex_w, evex_r, evex_v, nds_enc, pre, opc, false, false, vector_len, no_mask_reg);
+  } else {
+    vex_prefix(vex_r, vex_b, vex_x, vex_w, nds_enc, pre, opc, vector_len);
+  }
+
+  // return modrm byte components for operands
   return (((dst_enc & 7) << 3) | (src_enc & 7));
 }
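
The value returned above carries only the low three bits of each register number; callers OR it with 0xC0 to form the register-register ModRM byte, since the higher bits of extended registers already live in the VEX/EVEX prefix. A one-line sketch:

  static unsigned char modrm_reg_reg(int dst_enc, int src_enc) {
    // mod = 11 (register direct), reg = dst[2:0], rm = src[2:0]
    return (unsigned char)(0xC0 | ((dst_enc & 7) << 3) | (src_enc & 7));
  }
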
 
 
-void Assembler::simd_prefix(XMMRegister xreg, XMMRegister nds, Address adr, VexSimdPrefix pre, VexOpcode opc, bool rex_w, bool vector256) {
+void Assembler::simd_prefix(XMMRegister xreg, XMMRegister nds, Address adr, VexSimdPrefix pre,
+                            bool no_mask_reg, VexOpcode opc, bool rex_w, int vector_len, bool legacy_mode) {
   if (UseAVX > 0) {
     int xreg_enc = xreg->encoding();
     int  nds_enc = nds->is_valid() ? nds->encoding() : 0;
-    vex_prefix(adr, nds_enc, xreg_enc, pre, opc, rex_w, vector256);
+    vex_prefix(adr, nds_enc, xreg_enc, pre, opc, rex_w, vector_len, legacy_mode, no_mask_reg);
   } else {
     assert((nds == xreg) || (nds == xnoreg), "wrong sse encoding");
     rex_prefix(adr, xreg, pre, opc, rex_w);
   }
 }
 
-int Assembler::simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src, VexSimdPrefix pre, VexOpcode opc, bool rex_w, bool vector256) {
+int Assembler::simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src, VexSimdPrefix pre,
+                                      bool no_mask_reg, VexOpcode opc, bool rex_w, int vector_len, bool legacy_mode) {
   int dst_enc = dst->encoding();
   int src_enc = src->encoding();
   if (UseAVX > 0) {
     int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-    return vex_prefix_and_encode(dst_enc, nds_enc, src_enc, pre, opc, rex_w, vector256);
+    return vex_prefix_and_encode(dst_enc, nds_enc, src_enc, pre, opc, rex_w, vector_len, legacy_mode, no_mask_reg);
   } else {
     assert((nds == dst) || (nds == src) || (nds == xnoreg), "wrong sse encoding");
     return rex_prefix_and_encode(dst_enc, src_enc, pre, opc, rex_w);
   }
 }
 
-void Assembler::emit_simd_arith(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre) {
-  InstructionMark im(this);
-  simd_prefix(dst, dst, src, pre);
+int Assembler::kreg_prefix_and_encode(KRegister dst, KRegister nds, KRegister src, VexSimdPrefix pre,
+                                      bool no_mask_reg, VexOpcode opc, bool rex_w, int vector_len) {
+  int dst_enc = dst->encoding();
+  int src_enc = src->encoding();
+  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
+  return vex_prefix_and_encode(dst_enc, nds_enc, src_enc, pre, opc, rex_w, vector_len, true, no_mask_reg);
+}
+
+int Assembler::kreg_prefix_and_encode(KRegister dst, KRegister nds, Register src, VexSimdPrefix pre,
+                                      bool no_mask_reg, VexOpcode opc, bool rex_w, int vector_len) {
+  int dst_enc = dst->encoding();
+  int src_enc = src->encoding();
+  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
+  return vex_prefix_and_encode(dst_enc, nds_enc, src_enc, pre, opc, rex_w, vector_len, true, no_mask_reg);
+}
+
+void Assembler::emit_simd_arith(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre, bool no_mask_reg, bool legacy_mode) {
+  InstructionMark im(this);
+  simd_prefix(dst, dst, src, pre, no_mask_reg, VEX_OPCODE_0F, false, AVX_128bit, legacy_mode);
   emit_int8(opcode);
   emit_operand(dst, src);
 }
 
-void Assembler::emit_simd_arith(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre) {
-  int encode = simd_prefix_and_encode(dst, dst, src, pre);
+void Assembler::emit_simd_arith_q(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre, bool no_mask_reg) {
+  InstructionMark im(this);
+  simd_prefix_q(dst, dst, src, pre, no_mask_reg);
+  emit_int8(opcode);
+  emit_operand(dst, src);
+}
+
+void Assembler::emit_simd_arith(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg, bool legacy_mode) {
+  int encode = simd_prefix_and_encode(dst, dst, src, pre, no_mask_reg, VEX_OPCODE_0F, false, AVX_128bit, legacy_mode);
+  emit_int8(opcode);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::emit_simd_arith_q(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg) {
+  int encode = simd_prefix_and_encode(dst, dst, src, pre, no_mask_reg, VEX_OPCODE_0F, true, AVX_128bit);
   emit_int8(opcode);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 // Versions with no second source register (non-destructive source).
-void Assembler::emit_simd_arith_nonds(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre) {
-  InstructionMark im(this);
-  simd_prefix(dst, xnoreg, src, pre);
+void Assembler::emit_simd_arith_nonds(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre, bool opNoRegMask) {
+  InstructionMark im(this);
+  simd_prefix(dst, xnoreg, src, pre, opNoRegMask);
   emit_int8(opcode);
   emit_operand(dst, src);
 }
 
-void Assembler::emit_simd_arith_nonds(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre) {
-  int encode = simd_prefix_and_encode(dst, xnoreg, src, pre);
+void Assembler::emit_simd_arith_nonds_q(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre, bool opNoRegMask) {
+  InstructionMark im(this);
+  simd_prefix_q(dst, xnoreg, src, pre, opNoRegMask);
+  emit_int8(opcode);
+  emit_operand(dst, src);
+}
+
+void Assembler::emit_simd_arith_nonds(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg, bool legacy_mode) {
+  int encode = simd_prefix_and_encode(dst, xnoreg, src, pre, no_mask_reg, VEX_OPCODE_0F, legacy_mode, AVX_128bit);
+  emit_int8(opcode);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::emit_simd_arith_nonds_q(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg) {
+  int encode = simd_prefix_and_encode(dst, xnoreg, src, pre, no_mask_reg, VEX_OPCODE_0F, true, AVX_128bit);
   emit_int8(opcode);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 // 3-operands AVX instructions
-void Assembler::emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds,
-                               Address src, VexSimdPrefix pre, bool vector256) {
-  InstructionMark im(this);
-  vex_prefix(dst, nds, src, pre, vector256);
+void Assembler::emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds, Address src,
+                               VexSimdPrefix pre, int vector_len, bool no_mask_reg, bool legacy_mode) {
+  InstructionMark im(this);
+  vex_prefix(dst, nds, src, pre, vector_len, no_mask_reg, legacy_mode);
+  emit_int8(opcode);
+  emit_operand(dst, src);
+}
+
+void Assembler::emit_vex_arith_q(int opcode, XMMRegister dst, XMMRegister nds,
+                                 Address src, VexSimdPrefix pre, int vector_len, bool no_mask_reg) {
+  InstructionMark im(this);
+  vex_prefix_q(dst, nds, src, pre, vector_len, no_mask_reg);
   emit_int8(opcode);
   emit_operand(dst, src);
 }
 
-void Assembler::emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds,
-                               XMMRegister src, VexSimdPrefix pre, bool vector256) {
-  int encode = vex_prefix_and_encode(dst, nds, src, pre, vector256);
+void Assembler::emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src,
+                               VexSimdPrefix pre, int vector_len, bool no_mask_reg, bool legacy_mode) {
+  int encode = vex_prefix_and_encode(dst, nds, src, pre, vector_len, VEX_OPCODE_0F, false, no_mask_reg);
+  emit_int8(opcode);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::emit_vex_arith_q(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src,
+                                 VexSimdPrefix pre, int vector_len, bool no_mask_reg) {
+  int src_enc = src->encoding();
+  int dst_enc = dst->encoding();
+  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
+  int encode = vex_prefix_and_encode(dst_enc, nds_enc, src_enc, pre, VEX_OPCODE_0F, true, vector_len, false, no_mask_reg);
   emit_int8(opcode);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -5040,6 +6271,10 @@
 }
 
 void Assembler::andnq(Register dst, Register src1, Address src2) {
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_T1S;
+    input_size_in_bits = EVEX_64bit;
+  }
   InstructionMark im(this);
   assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
   vex_prefix_0F38_q(dst, src1, src2);
@@ -5181,44 +6416,52 @@
 
 void Assembler::cvtsi2sdq(XMMRegister dst, Register src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F2);
+  int encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F2, true);
   emit_int8(0x2A);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::cvtsi2sdq(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionMark im(this);
-  simd_prefix_q(dst, dst, src, VEX_SIMD_F2);
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_T1S;
+    input_size_in_bits = EVEX_32bit;
+  }
+  InstructionMark im(this);
+  simd_prefix_q(dst, dst, src, VEX_SIMD_F2, true);
   emit_int8(0x2A);
   emit_operand(dst, src);
 }
 
 void Assembler::cvtsi2ssq(XMMRegister dst, Register src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  int encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F3);
+  int encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F3, true);
   emit_int8(0x2A);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::cvtsi2ssq(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  InstructionMark im(this);
-  simd_prefix_q(dst, dst, src, VEX_SIMD_F3);
+  if (VM_Version::supports_evex()) {
+    tuple_type = EVEX_T1S;
+    input_size_in_bits = EVEX_32bit;
+  }
+  InstructionMark im(this);
+  simd_prefix_q(dst, dst, src, VEX_SIMD_F3, true);
   emit_int8(0x2A);
   emit_operand(dst, src);
 }
 
 void Assembler::cvttsd2siq(Register dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_F2);
+  int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, true);
   emit_int8(0x2C);
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
 void Assembler::cvttss2siq(Register dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_F3);
+  int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_F3, VEX_OPCODE_0F, true);
   emit_int8(0x2C);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -5387,7 +6630,7 @@
 void Assembler::movdq(XMMRegister dst, Register src) {
   // table D-1 says MMX/SSE2
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_66);
+  int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_66, true);
   emit_int8(0x6E);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -5396,7 +6639,7 @@
   // table D-1 says MMX/SSE2
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   // swap src/dst to get correct prefix
-  int encode = simd_prefix_and_encode_q(src, dst, VEX_SIMD_66);
+  int encode = simd_prefix_and_encode_q(src, dst, VEX_SIMD_66, true);
   emit_int8(0x7E);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -5529,7 +6772,8 @@
 
 void Assembler::mulxq(Register dst1, Register dst2, Register src) {
   assert(VM_Version::supports_bmi2(), "bit manipulation instructions not supported");
-  int encode = vex_prefix_and_encode(dst1->encoding(), dst2->encoding(), src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F_38, true, false);
+  int encode = vex_prefix_and_encode(dst1->encoding(), dst2->encoding(), src->encoding(),
+                                     VEX_SIMD_F2, VEX_OPCODE_0F_38, true, AVX_128bit, true, false);
   emit_int8((unsigned char)0xF6);
   emit_int8((unsigned char)(0xC0 | encode));
 }
@@ -5678,7 +6922,8 @@
 
 void Assembler::rorxq(Register dst, Register src, int imm8) {
   assert(VM_Version::supports_bmi2(), "bit manipulation instructions not supported");
-  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F_3A, true, false);
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F2,
+                                     VEX_OPCODE_0F_3A, true, AVX_128bit, true, false);
   emit_int8((unsigned char)0xF0);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(imm8);
--- a/hotspot/src/cpu/x86/vm/assembler_x86.hpp	Thu May 07 15:34:45 2015 -0700
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.hpp	Fri May 08 11:49:20 2015 -0700
@@ -438,7 +438,7 @@
 
 };
 
-const int FPUStateSizeInWords = NOT_LP64(27) LP64_ONLY( 512 / wordSize);
+const int FPUStateSizeInWords = NOT_LP64(27) LP64_ONLY( 512*2 / wordSize);
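
A quick check of the new constant, assuming LP64 with wordSize == 8: the save area grows from 512/8 == 64 words to 512*2/8 == 128 words, presumably to leave room for the wider AVX-512 register state.

  static_assert(512 * 2 / 8 == 128, "doubled FPU/vector save area is 128 words on LP64");
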
 
 // The Intel x86/Amd64 Assembler: Pure assembler doing NO optimizations on the instruction
 // level (e.g. mov rax, 0 is not translated into xor rax, rax!); i.e., what you write
@@ -503,7 +503,8 @@
     REX_WRXB   = 0x4F,
 
     VEX_3bytes = 0xC4,
-    VEX_2bytes = 0xC5
+    VEX_2bytes = 0xC5,
+    EVEX_4bytes = 0x62
   };
 
   enum VexPrefix {
@@ -513,6 +514,14 @@
     VEX_W = 0x80
   };
 
+  enum EvexPrefix {
+    EVEX_F  = 0x04,
+    EVEX_V  = 0x08,
+    EVEX_Rb = 0x10,
+    EVEX_X  = 0x40,
+    EVEX_Z  = 0x80
+  };
+
   enum VexSimdPrefix {
     VEX_SIMD_NONE = 0x0,
     VEX_SIMD_66   = 0x1,
@@ -527,6 +536,37 @@
     VEX_OPCODE_0F_3A = 0x3
   };
 
+  enum AvxVectorLen {
+    AVX_128bit = 0x0,
+    AVX_256bit = 0x1,
+    AVX_512bit = 0x2,
+    AVX_NoVec  = 0x4
+  };
+
+  enum EvexTupleType {
+    EVEX_FV   = 0,
+    EVEX_HV   = 4,
+    EVEX_FVM  = 6,
+    EVEX_T1S  = 7,
+    EVEX_T1F  = 11,
+    EVEX_T2   = 13,
+    EVEX_T4   = 15,
+    EVEX_T8   = 17,
+    EVEX_HVM  = 18,
+    EVEX_QVM  = 19,
+    EVEX_OVM  = 20,
+    EVEX_M128 = 21,
+    EVEX_DUP  = 22,
+    EVEX_ETUP = 23
+  };
+
+  enum EvexInputSizeInBits {
+    EVEX_8bit  = 0,
+    EVEX_16bit = 1,
+    EVEX_32bit = 2,
+    EVEX_64bit = 3
+  };
+
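
Both enums above are effectively log2 encodings; a pair of illustrative helpers (not part of the header) makes the mapping explicit:

  // AVX_128bit..AVX_512bit select 128 << n bits (AVX_NoVec is a sentinel and
  // is excluded); EVEX_8bit..EVEX_64bit select 1 << n bytes per element.
  static int vector_len_in_bits(int avx_vector_len)   { return 128 << avx_vector_len; }  // 128, 256, 512
  static int evex_input_size_in_bytes(int input_size) { return 1 << input_size; }        // 1, 2, 4, 8
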
   enum WhichOperand {
     // input to locate_operand, and format code for relocations
     imm_operand  = 0,            // embedded 32-bit|64-bit immediate operand
@@ -554,6 +594,11 @@
 
 private:
 
+  int evex_encoding;
+  int input_size_in_bits;
+  int avx_vector_len;
+  int tuple_type;
+  bool is_evex_instruction;
 
   // 64bit prefixes
   int prefix_and_encode(int reg_enc, bool byteinst = false);
@@ -580,108 +625,143 @@
 
   void vex_prefix(bool vex_r, bool vex_b, bool vex_x, bool vex_w,
                   int nds_enc, VexSimdPrefix pre, VexOpcode opc,
-                  bool vector256);
+                  int vector_len);
+
+  void evex_prefix(bool vex_r, bool vex_b, bool vex_x, bool vex_w, bool evex_r, bool evex_v,
+                   int nds_enc, VexSimdPrefix pre, VexOpcode opc,
+                   bool is_extended_context, bool is_merge_context,
+                   int vector_len, bool no_mask_reg );
 
   void vex_prefix(Address adr, int nds_enc, int xreg_enc,
                   VexSimdPrefix pre, VexOpcode opc,
-                  bool vex_w, bool vector256);
+                  bool vex_w, int vector_len,
+                  bool legacy_mode = false, bool no_mask_reg = false);
 
   void vex_prefix(XMMRegister dst, XMMRegister nds, Address src,
-                  VexSimdPrefix pre, bool vector256 = false) {
+                  VexSimdPrefix pre, int vector_len = AVX_128bit,
+                  bool no_mask_reg = false, bool legacy_mode = false) {
     int dst_enc = dst->encoding();
     int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-    vex_prefix(src, nds_enc, dst_enc, pre, VEX_OPCODE_0F, false, vector256);
+    vex_prefix(src, nds_enc, dst_enc, pre, VEX_OPCODE_0F, false, vector_len, legacy_mode, no_mask_reg);
   }
 
-  void vex_prefix_0F38(Register dst, Register nds, Address src) {
-    bool vex_w = false;
-    bool vector256 = false;
-    vex_prefix(src, nds->encoding(), dst->encoding(),
-               VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector256);
+  void vex_prefix_q(XMMRegister dst, XMMRegister nds, Address src,
+                    VexSimdPrefix pre, int vector_len = AVX_128bit,
+                    bool no_mask_reg = false) {
+    int dst_enc = dst->encoding();
+    int nds_enc = nds->is_valid() ? nds->encoding() : 0;
+    vex_prefix(src, nds_enc, dst_enc, pre, VEX_OPCODE_0F, true, vector_len, false, no_mask_reg);
   }
 
-  void vex_prefix_0F38_q(Register dst, Register nds, Address src) {
+  void vex_prefix_0F38(Register dst, Register nds, Address src, bool no_mask_reg = false) {
+    bool vex_w = false;
+    int vector_len = AVX_128bit;
+    vex_prefix(src, nds->encoding(), dst->encoding(),
+               VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w,
+               vector_len, no_mask_reg);
+  }
+
+  void vex_prefix_0F38_q(Register dst, Register nds, Address src, bool no_mask_reg = false) {
     bool vex_w = true;
-    bool vector256 = false;
+    int vector_len = AVX_128bit;
     vex_prefix(src, nds->encoding(), dst->encoding(),
-               VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector256);
+               VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w,
+               vector_len, no_mask_reg);
   }
   int  vex_prefix_and_encode(int dst_enc, int nds_enc, int src_enc,
                              VexSimdPrefix pre, VexOpcode opc,
-                             bool vex_w, bool vector256);
+                             bool vex_w, int vector_len,
+                             bool legacy_mode, bool no_mask_reg);
 
-  int  vex_prefix_0F38_and_encode(Register dst, Register nds, Register src) {
+  int  vex_prefix_0F38_and_encode(Register dst, Register nds, Register src, bool no_mask_reg = false) {
     bool vex_w = false;
-    bool vector256 = false;
+    int vector_len = AVX_128bit;
     return vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(),
-                                 VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector256);
+                                 VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector_len,
+                                 false, no_mask_reg);
   }
-  int  vex_prefix_0F38_and_encode_q(Register dst, Register nds, Register src) {
+  int  vex_prefix_0F38_and_encode_q(Register dst, Register nds, Register src, bool no_mask_reg = false) {
     bool vex_w = true;
-    bool vector256 = false;
+    int vector_len = AVX_128bit;
     return vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(),
-                                 VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector256);
+                                 VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector_len,
+                                 false, no_mask_reg);
   }
   int  vex_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src,
-                             VexSimdPrefix pre, bool vector256 = false,
-                             VexOpcode opc = VEX_OPCODE_0F) {
+                             VexSimdPrefix pre, int vector_len = AVX_128bit,
+                             VexOpcode opc = VEX_OPCODE_0F, bool legacy_mode = false,
+                             bool no_mask_reg = false) {
     int src_enc = src->encoding();
     int dst_enc = dst->encoding();
     int nds_enc = nds->is_valid() ? nds->encoding() : 0;
-    return vex_prefix_and_encode(dst_enc, nds_enc, src_enc, pre, opc, false, vector256);
+    return vex_prefix_and_encode(dst_enc, nds_enc, src_enc, pre, opc, false, vector_len, legacy_mode, no_mask_reg);
   }
 
   void simd_prefix(XMMRegister xreg, XMMRegister nds, Address adr,
-                   VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F,
-                   bool rex_w = false, bool vector256 = false);
+                   VexSimdPrefix pre, bool no_mask_reg, VexOpcode opc = VEX_OPCODE_0F,
+                   bool rex_w = false, int vector_len = AVX_128bit, bool legacy_mode = false);
 
-  void simd_prefix(XMMRegister dst, Address src,
-                   VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F) {
-    simd_prefix(dst, xnoreg, src, pre, opc);
+  void simd_prefix(XMMRegister dst, Address src, VexSimdPrefix pre,
+                   bool no_mask_reg, VexOpcode opc = VEX_OPCODE_0F) {
+    simd_prefix(dst, xnoreg, src, pre, no_mask_reg, opc);
   }
 
-  void simd_prefix(Address dst, XMMRegister src, VexSimdPrefix pre) {
-    simd_prefix(src, dst, pre);
+  void simd_prefix(Address dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg) {
+    simd_prefix(src, dst, pre, no_mask_reg);
   }
   void simd_prefix_q(XMMRegister dst, XMMRegister nds, Address src,
-                     VexSimdPrefix pre) {
+                     VexSimdPrefix pre, bool no_mask_reg = false) {
     bool rex_w = true;
-    simd_prefix(dst, nds, src, pre, VEX_OPCODE_0F, rex_w);
+    simd_prefix(dst, nds, src, pre, no_mask_reg, VEX_OPCODE_0F, rex_w);
   }
 
   int simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src,
-                             VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F,
-                             bool rex_w = false, bool vector256 = false);
+                             VexSimdPrefix pre, bool no_mask_reg,
+                             VexOpcode opc = VEX_OPCODE_0F,
+                             bool rex_w = false, int vector_len = AVX_128bit,
+                             bool legacy_mode = false);
+
+  int kreg_prefix_and_encode(KRegister dst, KRegister nds, KRegister src,
+                             VexSimdPrefix pre, bool no_mask_reg,
+                             VexOpcode opc = VEX_OPCODE_0F,
+                             bool rex_w = false, int vector_len = AVX_128bit);
+
+  int kreg_prefix_and_encode(KRegister dst, KRegister nds, Register src,
+                             VexSimdPrefix pre, bool no_mask_reg,
+                             VexOpcode opc = VEX_OPCODE_0F,
+                             bool rex_w = false, int vector_len = AVX_128bit);
 
   // Move/convert 32-bit integer value.
   int simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, Register src,
-                             VexSimdPrefix pre) {
+                             VexSimdPrefix pre, bool no_mask_reg) {
     // It is OK to cast from Register to XMMRegister to pass argument here
     // since only encoding is used in simd_prefix_and_encode() and number of
     // Gen and Xmm registers are the same.
-    return simd_prefix_and_encode(dst, nds, as_XMMRegister(src->encoding()), pre);
+    return simd_prefix_and_encode(dst, nds, as_XMMRegister(src->encoding()), pre, no_mask_reg, VEX_OPCODE_0F);
   }
-  int simd_prefix_and_encode(XMMRegister dst, Register src, VexSimdPrefix pre) {
-    return simd_prefix_and_encode(dst, xnoreg, src, pre);
+  int simd_prefix_and_encode(XMMRegister dst, Register src, VexSimdPrefix pre, bool no_mask_reg) {
+    return simd_prefix_and_encode(dst, xnoreg, src, pre, no_mask_reg);
   }
   int simd_prefix_and_encode(Register dst, XMMRegister src,
-                             VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F) {
-    return simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, pre, opc);
+                             VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F,
+                             bool no_mask_reg = false) {
+    return simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, pre, no_mask_reg, opc);
   }
 
   // Move/convert 64-bit integer value.
   int simd_prefix_and_encode_q(XMMRegister dst, XMMRegister nds, Register src,
-                               VexSimdPrefix pre) {
+                               VexSimdPrefix pre, bool no_mask_reg = false) {
     bool rex_w = true;
-    return simd_prefix_and_encode(dst, nds, as_XMMRegister(src->encoding()), pre, VEX_OPCODE_0F, rex_w);
+    return simd_prefix_and_encode(dst, nds, as_XMMRegister(src->encoding()), pre, no_mask_reg, VEX_OPCODE_0F, rex_w);
   }
-  int simd_prefix_and_encode_q(XMMRegister dst, Register src, VexSimdPrefix pre) {
-    return simd_prefix_and_encode_q(dst, xnoreg, src, pre);
+  int simd_prefix_and_encode_q(XMMRegister dst, Register src, VexSimdPrefix pre, bool no_mask_reg) {
+    return simd_prefix_and_encode_q(dst, xnoreg, src, pre, no_mask_reg);
   }
   int simd_prefix_and_encode_q(Register dst, XMMRegister src,
-                             VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F) {
+                               VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F,
+                               bool no_mask_reg = false) {
     bool rex_w = true;
-    return simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, pre, opc, rex_w);
+    return simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, pre, no_mask_reg, opc, rex_w);
   }
 
   // Helper functions for groups of instructions
@@ -692,14 +772,28 @@
   void emit_arith_imm32(int op1, int op2, Register dst, int32_t imm32);
   void emit_arith(int op1, int op2, Register dst, Register src);
 
-  void emit_simd_arith(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre);
-  void emit_simd_arith(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre);
-  void emit_simd_arith_nonds(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre);
-  void emit_simd_arith_nonds(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre);
+  void emit_simd_arith(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre, bool no_mask_reg = false, bool legacy_mode = false);
+  void emit_simd_arith_q(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre, bool no_mask_reg = false);
+  void emit_simd_arith(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg = false, bool legacy_mode = false);
+  void emit_simd_arith_q(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg = false);
+  void emit_simd_arith_nonds(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre, bool no_mask_reg = false);
+  void emit_simd_arith_nonds_q(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre, bool no_mask_reg = false);
+  void emit_simd_arith_nonds(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg = false, bool legacy_mode = false);
+  void emit_simd_arith_nonds_q(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg = false);
   void emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds,
-                      Address src, VexSimdPrefix pre, bool vector256);
+                      Address src, VexSimdPrefix pre, int vector_len,
+                      bool no_mask_reg = false, bool legacy_mode = false);
+  void emit_vex_arith_q(int opcode, XMMRegister dst, XMMRegister nds,
+                        Address src, VexSimdPrefix pre, int vector_len,
+                        bool no_mask_reg = false);
   void emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds,
-                      XMMRegister src, VexSimdPrefix pre, bool vector256);
+                      XMMRegister src, VexSimdPrefix pre, int vector_len,
+                      bool no_mask_reg = false, bool legacy_mode = false);
+  void emit_vex_arith_q(int opcode, XMMRegister dst, XMMRegister nds,
+                        XMMRegister src, VexSimdPrefix pre, int vector_len,
+                        bool no_mask_reg = false);
+
+  bool emit_compressed_disp_byte(int &disp);
 
   void emit_operand(Register reg,
                     Register base, Register index, Address::ScaleFactor scale,
@@ -825,7 +919,9 @@
   public:
 
   // Creation
-  Assembler(CodeBuffer* code) : AbstractAssembler(code) {}
+  Assembler(CodeBuffer* code) : AbstractAssembler(code) {
+    init_attributes();
+  }
 
   // Decoding
   static address locate_operand(address inst, WhichOperand which);
@@ -833,11 +929,21 @@
 
   // Utilities
   static bool is_polling_page_far() NOT_LP64({ return false;});
+  static bool query_compressed_disp_byte(int disp, bool is_evex_inst, int vector_len,
+                                         int cur_tuple_type, int in_size_in_bits, int cur_encoding);
 
   // Generic instructions
   // Does 32bit or 64bit as needed for the platform. In some sense these
   // belong in macro assembler but there is no need for both varieties to exist
 
+  void init_attributes(void) {
+    evex_encoding = 0;
+    input_size_in_bits = 0;
+    avx_vector_len = AVX_NoVec;
+    tuple_type = EVEX_ETUP;
+    is_evex_instruction = false;
+  }
+
   void lea(Register dst, Address src);
 
   void mov(Register dst, Register src);
@@ -1338,6 +1444,12 @@
   void movb(Address dst, int imm8);
   void movb(Register dst, Address src);
 
+  void kmovq(KRegister dst, KRegister src);
+  void kmovql(KRegister dst, Register src);
+  void kmovdl(KRegister dst, Register src);
+  void kmovq(Address dst, KRegister src);
+  void kmovq(KRegister dst, Address src);
+
   void movdl(XMMRegister dst, Register src);
   void movdl(Register dst, XMMRegister src);
   void movdl(XMMRegister dst, Address src);
@@ -1361,6 +1473,11 @@
   void vmovdqu(XMMRegister dst, Address src);
   void vmovdqu(XMMRegister dst, XMMRegister src);
 
+  // Move Unaligned 512bit Vector
+  void evmovdqu(Address dst, XMMRegister src, int vector_len);
+  void evmovdqu(XMMRegister dst, Address src, int vector_len);
+  void evmovdqu(XMMRegister dst, XMMRegister src, int vector_len);
+
   // Move lower 64bit to high 64bit in 128bit register
   void movlhps(XMMRegister dst, XMMRegister src);
 
@@ -1486,10 +1603,10 @@
   // Pack with unsigned saturation
   void packuswb(XMMRegister dst, XMMRegister src);
   void packuswb(XMMRegister dst, Address src);
-  void vpackuswb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vpackuswb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
 
   // Pemutation of 64bit words
-  void vpermq(XMMRegister dst, XMMRegister src, int imm8, bool vector256);
+  void vpermq(XMMRegister dst, XMMRegister src, int imm8, int vector_len);
 
   void pause();
 
@@ -1734,54 +1851,54 @@
   // Add Packed Floating-Point Values
   void addpd(XMMRegister dst, XMMRegister src);
   void addps(XMMRegister dst, XMMRegister src);
-  void vaddpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
-  void vaddps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
-  void vaddpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
-  void vaddps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+  void vaddpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+  void vaddps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+  void vaddpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
+  void vaddps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
 
   // Subtract Packed Floating-Point Values
   void subpd(XMMRegister dst, XMMRegister src);
   void subps(XMMRegister dst, XMMRegister src);
-  void vsubpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
-  void vsubps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
-  void vsubpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
-  void vsubps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+  void vsubpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+  void vsubps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+  void vsubpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
+  void vsubps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
 
   // Multiply Packed Floating-Point Values
   void mulpd(XMMRegister dst, XMMRegister src);
   void mulps(XMMRegister dst, XMMRegister src);
-  void vmulpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
-  void vmulps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
-  void vmulpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
-  void vmulps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+  void vmulpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+  void vmulps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+  void vmulpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
+  void vmulps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
 
   // Divide Packed Floating-Point Values
   void divpd(XMMRegister dst, XMMRegister src);
   void divps(XMMRegister dst, XMMRegister src);
-  void vdivpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
-  void vdivps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
-  void vdivpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
-  void vdivps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+  void vdivpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+  void vdivps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+  void vdivpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
+  void vdivps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
 
   // Bitwise Logical AND of Packed Floating-Point Values
   void andpd(XMMRegister dst, XMMRegister src);
   void andps(XMMRegister dst, XMMRegister src);
-  void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
-  void vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
-  void vandpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
-  void vandps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+  void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+  void vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+  void vandpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
+  void vandps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
 
   // Bitwise Logical XOR of Packed Floating-Point Values
   void xorpd(XMMRegister dst, XMMRegister src);
   void xorps(XMMRegister dst, XMMRegister src);
-  void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
-  void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
-  void vxorpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
-  void vxorps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+  void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+  void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+  void vxorpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
+  void vxorps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
 
   // Add horizontal packed integers
-  void vphaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
-  void vphaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vphaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+  void vphaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
   void phaddw(XMMRegister dst, XMMRegister src);
   void phaddd(XMMRegister dst, XMMRegister src);
 
@@ -1790,36 +1907,38 @@
   void paddw(XMMRegister dst, XMMRegister src);
   void paddd(XMMRegister dst, XMMRegister src);
   void paddq(XMMRegister dst, XMMRegister src);
-  void vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
-  void vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
-  void vpaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
-  void vpaddq(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
-  void vpaddb(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
-  void vpaddw(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
-  void vpaddd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
-  void vpaddq(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+  void vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+  void vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+  void vpaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+  void vpaddq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+  void vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
+  void vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
+  void vpaddd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
+  void vpaddq(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
 
   // Sub packed integers
   void psubb(XMMRegister dst, XMMRegister src);
   void psubw(XMMRegister dst, XMMRegister src);
   void psubd(XMMRegister dst, XMMRegister src);
   void psubq(XMMRegister dst, XMMRegister src);
-  void vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
-  void vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
-  void vpsubd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
-  void vpsubq(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
-  void vpsubb(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
-  void vpsubw(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
-  void vpsubd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
-  void vpsubq(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+  void vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+  void vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+  void vpsubd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+  void vpsubq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+  void vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
+  void vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
+  void vpsubd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
+  void vpsubq(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
 
   // Multiply packed integers (only shorts and ints)
   void pmullw(XMMRegister dst, XMMRegister src);
   void pmulld(XMMRegister dst, XMMRegister src);
-  void vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
-  void vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
-  void vpmullw(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
-  void vpmulld(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+  void vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+  void vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+  void vpmullq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+  void vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
+  void vpmulld(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
+  void vpmullq(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
 
   // Shift left packed integers
   void psllw(XMMRegister dst, int shift);
@@ -1828,12 +1947,12 @@
   void psllw(XMMRegister dst, XMMRegister shift);
   void pslld(XMMRegister dst, XMMRegister shift);
   void psllq(XMMRegister dst, XMMRegister shift);
-  void vpsllw(XMMRegister dst, XMMRegister src, int shift, bool vector256);
-  void vpslld(XMMRegister dst, XMMRegister src, int shift, bool vector256);
-  void vpsllq(XMMRegister dst, XMMRegister src, int shift, bool vector256);
-  void vpsllw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
-  void vpslld(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
-  void vpsllq(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
+  void vpsllw(XMMRegister dst, XMMRegister src, int shift, int vector_len);
+  void vpslld(XMMRegister dst, XMMRegister src, int shift, int vector_len);
+  void vpsllq(XMMRegister dst, XMMRegister src, int shift, int vector_len);
+  void vpsllw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
+  void vpslld(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
+  void vpsllq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
 
   // Logical shift right packed integers
   void psrlw(XMMRegister dst, int shift);
@@ -1842,42 +1961,43 @@
   void psrlw(XMMRegister dst, XMMRegister shift);
   void psrld(XMMRegister dst, XMMRegister shift);
   void psrlq(XMMRegister dst, XMMRegister shift);
-  void vpsrlw(XMMRegister dst, XMMRegister src, int shift, bool vector256);
-  void vpsrld(XMMRegister dst, XMMRegister src, int shift, bool vector256);
-  void vpsrlq(XMMRegister dst, XMMRegister src, int shift, bool vector256);
-  void vpsrlw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
-  void vpsrld(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
-  void vpsrlq(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
+  void vpsrlw(XMMRegister dst, XMMRegister src, int shift, int vector_len);
+  void vpsrld(XMMRegister dst, XMMRegister src, int shift, int vector_len);
+  void vpsrlq(XMMRegister dst, XMMRegister src, int shift, int vector_len);
+  void vpsrlw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
+  void vpsrld(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
+  void vpsrlq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
 
   // Arithmetic shift right packed integers (only shorts and ints, no instructions for longs)
   void psraw(XMMRegister dst, int shift);
   void psrad(XMMRegister dst, int shift);
   void psraw(XMMRegister dst, XMMRegister shift);
   void psrad(XMMRegister dst, XMMRegister shift);
-  void vpsraw(XMMRegister dst, XMMRegister src, int shift, bool vector256);
-  void vpsrad(XMMRegister dst, XMMRegister src, int shift, bool vector256);
-  void vpsraw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
-  void vpsrad(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
+  void vpsraw(XMMRegister dst, XMMRegister src, int shift, int vector_len);
+  void vpsrad(XMMRegister dst, XMMRegister src, int shift, int vector_len);
+  void vpsraw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
+  void vpsrad(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
 
   // And packed integers
   void pand(XMMRegister dst, XMMRegister src);
-  void vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
-  void vpand(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+  void vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+  void vpand(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
 
   // Or packed integers
   void por(XMMRegister dst, XMMRegister src);
-  void vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
-  void vpor(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+  void vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+  void vpor(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
 
   // Xor packed integers
   void pxor(XMMRegister dst, XMMRegister src);
-  void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
-  void vpxor(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+  void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+  void vpxor(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
 
   // Copy low 128bit into high 128bit of YMM registers.
   void vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src);
   void vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src);
   void vextractf128h(XMMRegister dst, XMMRegister src);
+  void vextracti128h(XMMRegister dst, XMMRegister src);
 
   // Load/store high 128bit of YMM registers which does not destroy other half.
   void vinsertf128h(XMMRegister dst, Address src);
@@ -1885,9 +2005,25 @@
   void vextractf128h(Address dst, XMMRegister src);
   void vextracti128h(Address dst, XMMRegister src);
 
+  // Copy low 256bit into high 256bit of ZMM registers.
+  void vinserti64x4h(XMMRegister dst, XMMRegister nds, XMMRegister src);
+  void vinsertf64x4h(XMMRegister dst, XMMRegister nds, XMMRegister src);
+  void vextracti64x4h(XMMRegister dst, XMMRegister src);
+  void vextractf64x4h(XMMRegister dst, XMMRegister src);
+  void vextractf64x4h(Address dst, XMMRegister src);
+  void vinsertf64x4h(XMMRegister dst, Address src);
+
+  // Copy targeted 128bit segments of the ZMM registers
+  void vextracti64x2h(XMMRegister dst, XMMRegister src, int value);
+  void vextractf64x2h(XMMRegister dst, XMMRegister src, int value);
+  void vextractf32x4h(XMMRegister dst, XMMRegister src, int value);
+
   // duplicate 4-bytes integer data from src into 8 locations in dest
   void vpbroadcastd(XMMRegister dst, XMMRegister src);
 
+  // duplicate 4-bytes integer data from src across the dest vector sized by vector_len
+  void evpbroadcastd(XMMRegister dst, XMMRegister src, int vector_len);
+
   // Carry-Less Multiplication Quadword
   void pclmulqdq(XMMRegister dst, XMMRegister src, int mask);
   void vpclmulqdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int mask);
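The declarations above replace the old boolean vector256 flag with an integer vector_len code, which is what lets the same entry points emit 128-, 256- and 512-bit encodings. A minimal standalone sketch of the mapping, assuming the enum values mirror the Assembler::AVX_128bit/AVX_256bit/AVX_512bit constants referenced at the call sites later in this change:

  // Sketch only, not HotSpot code; the enum values are assumptions based on
  // the Assembler::AVX_*bit names used elsewhere in this patch.
  #include <cassert>

  enum AvxVectorLen { AVX_128bit = 0, AVX_256bit = 1, AVX_512bit = 2 };

  // Old call sites passed vector256 = true/false; new ones pass a length code.
  static inline int vector_len_from_vector256(bool vector256) {
    return vector256 ? AVX_256bit : AVX_128bit;
  }

  int main() {
    assert(vector_len_from_vector256(false) == AVX_128bit); // was "false /* vector256 */"
    assert(vector_len_from_vector256(true)  == AVX_256bit); // was "true /* vector256 */"
    // AVX_512bit has no boolean equivalent; it is reachable only through the new API.
    return 0;
  }
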
--- a/hotspot/src/cpu/x86/vm/c1_FrameMap_x86.cpp	Thu May 07 15:34:45 2015 -0700
+++ b/hotspot/src/cpu/x86/vm/c1_FrameMap_x86.cpp	Fri May 08 11:49:20 2015 -0700
@@ -233,13 +233,30 @@
   _xmm_regs[13]  = xmm13;
   _xmm_regs[14]  = xmm14;
   _xmm_regs[15]  = xmm15;
+  _xmm_regs[16]  = xmm16;
+  _xmm_regs[17]  = xmm17;
+  _xmm_regs[18]  = xmm18;
+  _xmm_regs[19]  = xmm19;
+  _xmm_regs[20]  = xmm20;
+  _xmm_regs[21]  = xmm21;
+  _xmm_regs[22]  = xmm22;
+  _xmm_regs[23]  = xmm23;
+  _xmm_regs[24]  = xmm24;
+  _xmm_regs[25]  = xmm25;
+  _xmm_regs[26]  = xmm26;
+  _xmm_regs[27]  = xmm27;
+  _xmm_regs[28]  = xmm28;
+  _xmm_regs[29]  = xmm29;
+  _xmm_regs[30]  = xmm30;
+  _xmm_regs[31]  = xmm31;
 #endif // _LP64
 
   for (int i = 0; i < 8; i++) {
     _caller_save_fpu_regs[i] = LIR_OprFact::single_fpu(i);
   }
 
-  for (int i = 0; i < nof_caller_save_xmm_regs ; i++) {
+  int num_caller_save_xmm_regs = get_num_caller_save_xmms();
+  for (int i = 0; i < num_caller_save_xmm_regs; i++) {
     _caller_save_xmm_regs[i] = LIR_OprFact::single_xmm(i);
   }
 
--- a/hotspot/src/cpu/x86/vm/c1_FrameMap_x86.hpp	Thu May 07 15:34:45 2015 -0700
+++ b/hotspot/src/cpu/x86/vm/c1_FrameMap_x86.hpp	Fri May 08 11:49:20 2015 -0700
@@ -152,6 +152,16 @@
     return range;
   }
 
+  static int get_num_caller_save_xmms(void) {
+    int num_caller_save_xmm_regs = nof_caller_save_xmm_regs;
+#ifdef _LP64
+    if (UseAVX < 3) {
+      num_caller_save_xmm_regs = num_caller_save_xmm_regs / 2;
+    }
+#endif
+    return num_caller_save_xmm_regs;
+  }
+
   static int nof_caller_save_cpu_regs() { return adjust_reg_range(pd_nof_caller_save_cpu_regs_frame_map); }
   static int last_cpu_reg()             { return adjust_reg_range(pd_last_cpu_reg);  }
   static int last_byte_reg()            { return adjust_reg_range(pd_last_byte_reg); }
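get_num_caller_save_xmms() above is the single switch point that halves the visible XMM file when the CPU (or -XX:UseAVX) does not provide EVEX. A standalone sketch of the same arithmetic, with the 32-register LP64 table size taken as an assumption:

  // Hedged sketch; nof_caller_save_xmm_regs and UseAVX stand in for the real
  // FrameMap constant and VM flag.
  static const int nof_caller_save_xmm_regs = 32; // assumed LP64 value with AVX-512
  static int UseAVX = 2;                          // e.g. an AVX2-only machine

  static int get_num_caller_save_xmms() {
    int n = nof_caller_save_xmm_regs;
    if (UseAVX < 3) {
      n = n / 2;   // only xmm0..xmm15 exist without EVEX
    }
    return n;
  }

  int main() {
    return get_num_caller_save_xmms() == 16 ? 0 : 1; // 16 on AVX/AVX2, 32 with EVEX
  }
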
--- a/hotspot/src/cpu/x86/vm/c1_LinearScan_x86.hpp	Thu May 07 15:34:45 2015 -0700
+++ b/hotspot/src/cpu/x86/vm/c1_LinearScan_x86.hpp	Fri May 08 11:49:20 2015 -0700
@@ -85,8 +85,9 @@
           tty->print_cr("killing XMMs for trig");
         }
 #endif
+        int num_caller_save_xmm_regs = FrameMap::get_num_caller_save_xmms();
         int op_id = op->id();
-        for (int xmm = 0; xmm < FrameMap::nof_caller_save_xmm_regs; xmm++) {
+        for (int xmm = 0; xmm < num_caller_save_xmm_regs; xmm++) {
           LIR_Opr opr = FrameMap::caller_save_xmm_reg_at(xmm);
           add_temp(reg_num(opr), op_id, noUse, T_ILLEGAL);
         }
@@ -100,6 +101,10 @@
 // Implementation of LinearScanWalker
 
 inline bool LinearScanWalker::pd_init_regs_for_alloc(Interval* cur) {
+  int last_xmm_reg = pd_last_xmm_reg;
+  if (UseAVX < 3) {
+    last_xmm_reg = pd_first_xmm_reg + (pd_nof_xmm_regs_frame_map / 2) - 1;
+  }
   if (allocator()->gen()->is_vreg_flag_set(cur->reg_num(), LIRGenerator::byte_reg)) {
     assert(cur->type() != T_FLOAT && cur->type() != T_DOUBLE, "cpu regs only");
     _first_reg = pd_first_byte_reg;
@@ -107,7 +112,7 @@
     return true;
   } else if ((UseSSE >= 1 && cur->type() == T_FLOAT) || (UseSSE >= 2 && cur->type() == T_DOUBLE)) {
     _first_reg = pd_first_xmm_reg;
-    _last_reg = pd_last_xmm_reg;
+    _last_reg = last_xmm_reg;
     return true;
   }
 
--- a/hotspot/src/cpu/x86/vm/c1_Runtime1_x86.cpp	Thu May 07 15:34:45 2015 -0700
+++ b/hotspot/src/cpu/x86/vm/c1_Runtime1_x86.cpp	Fri May 08 11:49:20 2015 -0700
@@ -323,7 +323,7 @@
   LP64_ONLY(num_rt_args = 0);
   LP64_ONLY(assert((reg_save_frame_size * VMRegImpl::stack_slot_size) % 16 == 0, "must be 16 byte aligned");)
   int frame_size_in_slots = reg_save_frame_size + num_rt_args; // args + thread
-  sasm->set_frame_size(frame_size_in_slots / VMRegImpl::slots_per_word );
+  sasm->set_frame_size(frame_size_in_slots / VMRegImpl::slots_per_word);
 
   // record saved value locations in an OopMap
   // locations are offsets from sp after runtime call; num_rt_args is number of arguments in call, including thread
@@ -362,6 +362,13 @@
   map->set_callee_saved(VMRegImpl::stack2reg(r15H_off + num_rt_args), r15->as_VMReg()->next());
 #endif // _LP64
 
+  int xmm_bypass_limit = FrameMap::nof_xmm_regs;
+#ifdef _LP64
+  if (UseAVX < 3) {
+    xmm_bypass_limit = xmm_bypass_limit / 2;
+  }
+#endif
+
   if (save_fpu_registers) {
     if (UseSSE < 2) {
       int fpu_off = float_regs_as_doubles_off;
@@ -380,11 +387,13 @@
     if (UseSSE >= 2) {
       int xmm_off = xmm_regs_as_doubles_off;
       for (int n = 0; n < FrameMap::nof_xmm_regs; n++) {
-        VMReg xmm_name_0 = as_XMMRegister(n)->as_VMReg();
-        map->set_callee_saved(VMRegImpl::stack2reg(xmm_off +     num_rt_args), xmm_name_0);
-        // %%% This is really a waste but we'll keep things as they were for now
-        if (true) {
-          map->set_callee_saved(VMRegImpl::stack2reg(xmm_off + 1 + num_rt_args), xmm_name_0->next());
+        if (n < xmm_bypass_limit) {
+          VMReg xmm_name_0 = as_XMMRegister(n)->as_VMReg();
+          map->set_callee_saved(VMRegImpl::stack2reg(xmm_off + num_rt_args), xmm_name_0);
+          // %%% This is really a waste but we'll keep things as they were for now
+          if (true) {
+            map->set_callee_saved(VMRegImpl::stack2reg(xmm_off + 1 + num_rt_args), xmm_name_0->next());
+          }
         }
         xmm_off += 2;
       }
@@ -393,8 +402,10 @@
     } else if (UseSSE == 1) {
       int xmm_off = xmm_regs_as_doubles_off;
       for (int n = 0; n < FrameMap::nof_xmm_regs; n++) {
-        VMReg xmm_name_0 = as_XMMRegister(n)->as_VMReg();
-        map->set_callee_saved(VMRegImpl::stack2reg(xmm_off +     num_rt_args), xmm_name_0);
+        if (n < xmm_bypass_limit) {
+          VMReg xmm_name_0 = as_XMMRegister(n)->as_VMReg();
+          map->set_callee_saved(VMRegImpl::stack2reg(xmm_off + num_rt_args), xmm_name_0);
+        }
         xmm_off += 2;
       }
       assert(xmm_off == float_regs_as_doubles_off, "incorrect number of xmm registers");
@@ -474,6 +485,24 @@
       __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 104), xmm13);
       __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 112), xmm14);
       __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 120), xmm15);
+      if (UseAVX > 2) {
+        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 128), xmm16);
+        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 136), xmm17);
+        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 144), xmm18);
+        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 152), xmm19);
+        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 160), xmm20);
+        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 168), xmm21);
+        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 176), xmm22);
+        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 184), xmm23);
+        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 192), xmm24);
+        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 200), xmm25);
+        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 208), xmm26);
+        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 216), xmm27);
+        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 224), xmm28);
+        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 232), xmm29);
+        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 240), xmm30);
+        __ movdbl(Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 248), xmm31);
+      }
 #endif // _LP64
     } else if (UseSSE == 1) {
       // save XMM registers as float because double not supported without SSE2
@@ -516,6 +545,24 @@
       __ movdbl(xmm13, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 104));
       __ movdbl(xmm14, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 112));
       __ movdbl(xmm15, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 120));
+      if (UseAVX > 2) {
+        __ movdbl(xmm16, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 128));
+        __ movdbl(xmm17, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 136));
+        __ movdbl(xmm18, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 144));
+        __ movdbl(xmm19, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 152));
+        __ movdbl(xmm20, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 160));
+        __ movdbl(xmm21, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 168));
+        __ movdbl(xmm22, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 176));
+        __ movdbl(xmm23, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 184));
+        __ movdbl(xmm24, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 192));
+        __ movdbl(xmm25, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 200));
+        __ movdbl(xmm26, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 208));
+        __ movdbl(xmm27, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 216));
+        __ movdbl(xmm28, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 224));
+        __ movdbl(xmm29, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 232));
+        __ movdbl(xmm30, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 240));
+        __ movdbl(xmm31, Address(rsp, xmm_regs_as_doubles_off * VMRegImpl::stack_slot_size + 248));
+      }
 #endif // _LP64
     } else if (UseSSE == 1) {
       // restore XMM registers
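The new UseAVX > 2 blocks extend the existing save/restore pattern: each register is spilled as one 8-byte double at xmm_regs_as_doubles_off * stack_slot_size + 8 * n, which is where the 128..248 offsets for xmm16..xmm31 come from. A small sketch of that arithmetic, assuming 4-byte stack slots and an illustrative base slot:

  #include <cstdio>

  int main() {
    const int slot_size = 4;               // assumed VMRegImpl::stack_slot_size
    const int xmm_regs_as_doubles_off = 0; // placeholder base slot, not the real value
    for (int n = 16; n < 32; n++) {
      int off = xmm_regs_as_doubles_off * slot_size + 8 * n;
      printf("xmm%d -> base + %d\n", n, off); // 128, 136, ..., 248 as in the code above
    }
    return 0;
  }
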
--- a/hotspot/src/cpu/x86/vm/c2_init_x86.cpp	Thu May 07 15:34:45 2015 -0700
+++ b/hotspot/src/cpu/x86/vm/c2_init_x86.cpp	Fri May 08 11:49:20 2015 -0700
@@ -25,6 +25,7 @@
 #include "precompiled.hpp"
 #include "opto/compile.hpp"
 #include "opto/node.hpp"
+#include "opto/optoreg.hpp"
 
 // processor dependent initialization for i486
 
@@ -37,4 +38,24 @@
     ConditionalMoveLimit = 0;
   }
 #endif // AMD64
+
+  if (UseAVX < 3) {
+    int delta = XMMRegisterImpl::max_slots_per_register * XMMRegisterImpl::number_of_registers;
+    int bottom = ConcreteRegisterImpl::max_fpr;
+    int top = bottom + delta;
+    int middle = bottom + (delta / 2);
+    int xmm_slots = XMMRegisterImpl::max_slots_per_register;
+    int lower = xmm_slots / 2;
+    // Mark as bad every register slot that cannot be reached when AVX is below 3; the array holds all slots
+    // Note: vm2opto is allocated to ConcreteRegisterImpl::number_of_registers
+    for (int i = bottom; i < middle; i += xmm_slots) {
+      for (OptoReg::Name j = OptoReg::Name(i + lower); j<OptoReg::Name(i + xmm_slots); j = OptoReg::add(j, 1)) {
+        OptoReg::invalidate(j);
+      }
+    }
+    // Mark the upper zmm bank and all the mask registers as bad in this case
+    for (OptoReg::Name i = OptoReg::Name(middle); i<OptoReg::Name(_last_Mach_Reg - 1); i = OptoReg::add(i, 1)) {
+      OptoReg::invalidate(i);
+    }
+  }
 }
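The invalidation loops above shrink C2's register view when EVEX is absent: with 16 slots per XMM register (512 bits in 32-bit slots), xmm0..xmm15 keep only their low 8 slots (the YMM-visible half), and everything from xmm16 upward, including the mask bank, is dropped. A standalone simulation of the same pattern with illustrative bounds:

  #include <cstdio>
  #include <vector>

  int main() {
    const int xmm_slots = 16;                 // 512 bits / 32-bit slots
    const int nregs     = 32;
    const int delta     = xmm_slots * nregs;
    const int bottom    = 0;                  // stand-in for ConcreteRegisterImpl::max_fpr
    const int middle    = bottom + delta / 2; // end of xmm0..xmm15's slots
    const int lower     = xmm_slots / 2;      // low 8 slots = the YMM-visible half

    std::vector<bool> valid(delta, true);
    for (int i = bottom; i < middle; i += xmm_slots)      // xmm0..xmm15: drop upper half
      for (int j = i + lower; j < i + xmm_slots; j++)
        valid[j] = false;
    for (int i = middle; i < delta; i++)                  // xmm16..xmm31 (and, in the
      valid[i] = false;                                   // real code, k0..k7): drop all

    int live = 0;
    for (int i = 0; i < delta; i++) live += valid[i] ? 1 : 0;
    printf("usable xmm slots without EVEX: %d\n", live);  // 16 regs * 8 slots = 128
    return 0;
  }
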
--- a/hotspot/src/cpu/x86/vm/frame_x86.hpp	Thu May 07 15:34:45 2015 -0700
+++ b/hotspot/src/cpu/x86/vm/frame_x86.hpp	Fri May 08 11:49:20 2015 -0700
@@ -125,7 +125,7 @@
     // Entry frames
 #ifdef AMD64
 #ifdef _WIN64
-    entry_frame_after_call_words                     =  28,
+    entry_frame_after_call_words                     =  60,
     entry_frame_call_wrapper_offset                  =  2,
 
     arg_reg_save_area_bytes                          = 32 // Register argument save area
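The 28 -> 60 bump follows from the Win64 call stub now spilling xmm6..xmm31 instead of xmm6..xmm15, at two 8-byte words per register; the non-XMM part of the frame is unchanged. A quick check of that arithmetic (the 8-word remainder is derived here, not read off the sources):

  #include <cassert>

  int main() {
    const int old_xmm_words = 2 * (15 - 6 + 1);   // xmm6..xmm15 -> 20 words
    const int new_xmm_words = 2 * (31 - 6 + 1);   // xmm6..xmm31 -> 52 words
    const int other_words   = 28 - old_xmm_words; // 8 words for the rest of the frame
    assert(other_words + old_xmm_words == 28);
    assert(other_words + new_xmm_words == 60);    // matches the new constant above
    return 0;
  }
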
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp	Thu May 07 15:34:45 2015 -0700
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp	Fri May 08 11:49:20 2015 -0700
@@ -3996,21 +3996,21 @@
   }
 }
 
-void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
+void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
   if (reachable(src)) {
-    vandpd(dst, nds, as_Address(src), vector256);
+    vandpd(dst, nds, as_Address(src), vector_len);
   } else {
     lea(rscratch1, src);
-    vandpd(dst, nds, Address(rscratch1, 0), vector256);
-  }
-}
-
-void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
+    vandpd(dst, nds, Address(rscratch1, 0), vector_len);
+  }
+}
+
+void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
   if (reachable(src)) {
-    vandps(dst, nds, as_Address(src), vector256);
+    vandps(dst, nds, as_Address(src), vector_len);
   } else {
     lea(rscratch1, src);
-    vandps(dst, nds, Address(rscratch1, 0), vector256);
+    vandps(dst, nds, Address(rscratch1, 0), vector_len);
   }
 }
 
@@ -4068,21 +4068,21 @@
   }
 }
 
-void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
+void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
   if (reachable(src)) {
-    vxorpd(dst, nds, as_Address(src), vector256);
+    vxorpd(dst, nds, as_Address(src), vector_len);
   } else {
     lea(rscratch1, src);
-    vxorpd(dst, nds, Address(rscratch1, 0), vector256);
-  }
-}
-
-void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
+    vxorpd(dst, nds, Address(rscratch1, 0), vector_len);
+  }
+}
+
+void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
   if (reachable(src)) {
-    vxorps(dst, nds, as_Address(src), vector256);
+    vxorps(dst, nds, as_Address(src), vector_len);
   } else {
     lea(rscratch1, src);
-    vxorps(dst, nds, Address(rscratch1, 0), vector256);
+    vxorps(dst, nds, Address(rscratch1, 0), vector_len);
   }
 }
 
@@ -4561,6 +4561,14 @@
     movflt(Address(rsp,off++*sizeof(jdouble)),xmm6);
     movflt(Address(rsp,off++*sizeof(jdouble)),xmm7);
   } else if (UseSSE >= 2)  {
+    if (UseAVX > 2) {
+      movl(rbx, 0xffff);
+#ifdef _LP64
+      kmovql(k1, rbx);
+#else
+      kmovdl(k1, rbx);
+#endif
+    }
 #ifdef COMPILER2
     if (MaxVectorSize > 16) {
       assert(UseAVX > 0, "256bit vectors are supported only with AVX");
@@ -7063,8 +7071,39 @@
     {
       assert( UseSSE >= 2, "supported cpu only" );
       Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
+      if (UseAVX > 2) {
+        movl(rtmp, 0xffff);
+#ifdef _LP64
+        kmovql(k1, rtmp);
+#else
+        kmovdl(k1, rtmp);
+#endif
+      }
       movdl(xtmp, value);
-      if (UseAVX >= 2 && UseUnalignedLoadStores) {
+      if (UseAVX > 2 && UseUnalignedLoadStores) {
+        // Fill 64-byte chunks
+        Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
+        evpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
+
+        subl(count, 16 << shift);
+        jcc(Assembler::less, L_check_fill_32_bytes);
+        align(16);
+
+        BIND(L_fill_64_bytes_loop);
+        evmovdqu(Address(to, 0), xtmp, Assembler::AVX_512bit);
+        addptr(to, 64);
+        subl(count, 16 << shift);
+        jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
+
+        BIND(L_check_fill_32_bytes);
+        addl(count, 8 << shift);
+        jccb(Assembler::less, L_check_fill_8_bytes);
+        evmovdqu(Address(to, 0), xtmp, Assembler::AVX_256bit);
+        addptr(to, 32);
+        subl(count, 8 << shift);
+
+        BIND(L_check_fill_8_bytes);
+      } else if (UseAVX == 2 && UseUnalignedLoadStores) {
         // Fill 64-byte chunks
         Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
         vpbroadcastd(xtmp, xtmp);
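The new UseAVX > 2 branch broadcasts the fill value into a 512-bit register and stores 64 bytes per iteration before falling back to the narrower tails. A rough user-level analogue (not the JIT output) written with AVX-512 intrinsics; it assumes an AVX-512F capable compiler and CPU (-mavx512f):

  #include <immintrin.h>
  #include <cstdint>
  #include <cstdio>

  void fill32(int32_t* to, int32_t value, int count) {
    __m512i v = _mm512_set1_epi32(value);        // like evpbroadcastd xtmp, xtmp
    int i = 0;
    for (; i + 16 <= count; i += 16)             // like L_fill_64_bytes_loop
      _mm512_storeu_si512((void*)(to + i), v);   // like evmovdqu [to], xtmp (512-bit)
    for (; i < count; i++)                       // scalar tail, standing in for the
      to[i] = value;                             // 32-/8-byte fixups in the stub
  }

  int main() {
    int32_t buf[100];
    fill32(buf, 7, 100);
    printf("%d %d\n", buf[0], buf[99]);          // 7 7
    return 0;
  }
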
@@ -7200,11 +7239,11 @@
       bind(L_copy_32_chars);
       vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
       vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
-      vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector256 */ true);
+      vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
       vptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in  vector
       jccb(Assembler::notZero, L_copy_32_chars_exit);
-      vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector256 */ true);
-      vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector256 */ true);
+      vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
+      vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1);
       vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);
 
       bind(L_chars_32_check);
@@ -7227,13 +7266,13 @@
       vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32));
       vptest(tmp2Reg, tmp1Reg);
       jccb(Assembler::notZero, L_copy_16_chars_exit);
-      vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector256 */ true);
-      vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector256 */ true);
+      vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector_len */ 1);
+      vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector_len */ 1);
     } else {
       if (UseAVX > 0) {
         movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
         movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
-        vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector256 */ false);
+        vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 0);
       } else {
         movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
         por(tmp2Reg, tmp3Reg);
@@ -7776,7 +7815,7 @@
   if (UseAVX > 0) {
     vpclmulhdq(xtmp, xK, xcrc); // [123:64]
     vpclmulldq(xcrc, xK, xcrc); // [63:0]
-    vpxor(xcrc, xcrc, Address(buf, offset), false /* vector256 */);
+    vpxor(xcrc, xcrc, Address(buf, offset), 0 /* vector_len */);
     pxor(xcrc, xtmp);
   } else {
     movdqa(xtmp, xcrc);
@@ -7920,7 +7959,7 @@
   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()));
   if (UseAVX > 0) {
     vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
-    vpand(xmm3, xmm0, xmm2, false /* vector256 */);
+    vpand(xmm3, xmm0, xmm2, 0 /* vector_len */);
     vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
   } else {
     movdqa(xmm2, xmm0);
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp	Thu May 07 15:34:45 2015 -0700
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp	Fri May 08 11:49:20 2015 -0700
@@ -1024,13 +1024,13 @@
   void vaddss(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vaddss(dst, nds, src); }
   void vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src);
 
-  void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { Assembler::vandpd(dst, nds, src, vector256); }
-  void vandpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256)     { Assembler::vandpd(dst, nds, src, vector256); }
-  void vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256);
+  void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vandpd(dst, nds, src, vector_len); }
+  void vandpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len)     { Assembler::vandpd(dst, nds, src, vector_len); }
+  void vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len);
 
-  void vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { Assembler::vandps(dst, nds, src, vector256); }
-  void vandps(XMMRegister dst, XMMRegister nds, Address src, bool vector256)     { Assembler::vandps(dst, nds, src, vector256); }
-  void vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256);
+  void vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vandps(dst, nds, src, vector_len); }
+  void vandps(XMMRegister dst, XMMRegister nds, Address src, int vector_len)     { Assembler::vandps(dst, nds, src, vector_len); }
+  void vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len);
 
   void vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vdivsd(dst, nds, src); }
   void vdivsd(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vdivsd(dst, nds, src); }
@@ -1058,25 +1058,25 @@
 
   // AVX Vector instructions
 
-  void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { Assembler::vxorpd(dst, nds, src, vector256); }
-  void vxorpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { Assembler::vxorpd(dst, nds, src, vector256); }
-  void vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256);
+  void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vxorpd(dst, nds, src, vector_len); }
+  void vxorpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { Assembler::vxorpd(dst, nds, src, vector_len); }
+  void vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len);
 
-  void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { Assembler::vxorps(dst, nds, src, vector256); }
-  void vxorps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) { Assembler::vxorps(dst, nds, src, vector256); }
-  void vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256);
+  void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vxorps(dst, nds, src, vector_len); }
+  void vxorps(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { Assembler::vxorps(dst, nds, src, vector_len); }
+  void vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len);
 
-  void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
-    if (UseAVX > 1 || !vector256) // vpxor 256 bit is available only in AVX2
-      Assembler::vpxor(dst, nds, src, vector256);
+  void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+    if (UseAVX > 1 || (vector_len < 1)) // vpxor 256 bit is available only in AVX2
+      Assembler::vpxor(dst, nds, src, vector_len);
     else
-      Assembler::vxorpd(dst, nds, src, vector256);
+      Assembler::vxorpd(dst, nds, src, vector_len);
   }
-  void vpxor(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
-    if (UseAVX > 1 || !vector256) // vpxor 256 bit is available only in AVX2
-      Assembler::vpxor(dst, nds, src, vector256);
+  void vpxor(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
+    if (UseAVX > 1 || (vector_len < 1)) // vpxor 256 bit is available only in AVX2
+      Assembler::vpxor(dst, nds, src, vector_len);
     else
-      Assembler::vxorpd(dst, nds, src, vector256);
+      Assembler::vxorpd(dst, nds, src, vector_len);
   }
 
   // Simple version for AVX2 256bit vectors
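The vpxor wrappers above keep the old fallback: a 256-bit integer vpxor requires AVX2, so with plain AVX the macro assembler substitutes vxorpd, which clears the same bits; the check is now on vector_len rather than the boolean. A tiny sketch of the dispatch rule with illustrative names:

  #include <cstdio>

  enum { AVX_128bit = 0, AVX_256bit = 1 };

  const char* pick_xor(int UseAVX, int vector_len) {
    if (UseAVX > 1 || vector_len < 1)  // 128-bit form, or AVX2+ available
      return "vpxor";
    return "vxorpd";                   // AVX1 fallback for 256-bit operands
  }

  int main() {
    printf("%s\n", pick_xor(1, AVX_256bit)); // vxorpd
    printf("%s\n", pick_xor(2, AVX_256bit)); // vpxor
    printf("%s\n", pick_xor(1, AVX_128bit)); // vpxor
    return 0;
  }
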
--- a/hotspot/src/cpu/x86/vm/register_definitions_x86.cpp	Thu May 07 15:34:45 2015 -0700
+++ b/hotspot/src/cpu/x86/vm/register_definitions_x86.cpp	Fri May 08 11:49:20 2015 -0700
@@ -68,6 +68,22 @@
 REGISTER_DEFINITION(XMMRegister, xmm13);
 REGISTER_DEFINITION(XMMRegister, xmm14);
 REGISTER_DEFINITION(XMMRegister, xmm15);
+REGISTER_DEFINITION(XMMRegister, xmm16);
+REGISTER_DEFINITION(XMMRegister, xmm17);
+REGISTER_DEFINITION(XMMRegister, xmm18);
+REGISTER_DEFINITION(XMMRegister, xmm19);
+REGISTER_DEFINITION(XMMRegister, xmm20);
+REGISTER_DEFINITION(XMMRegister, xmm21);
+REGISTER_DEFINITION(XMMRegister, xmm22);
+REGISTER_DEFINITION(XMMRegister, xmm23);
+REGISTER_DEFINITION(XMMRegister, xmm24);
+REGISTER_DEFINITION(XMMRegister, xmm25);
+REGISTER_DEFINITION(XMMRegister, xmm26);
+REGISTER_DEFINITION(XMMRegister, xmm27);
+REGISTER_DEFINITION(XMMRegister, xmm28);
+REGISTER_DEFINITION(XMMRegister, xmm29);
+REGISTER_DEFINITION(XMMRegister, xmm30);
+REGISTER_DEFINITION(XMMRegister, xmm31);
 
 REGISTER_DEFINITION(Register, c_rarg0);
 REGISTER_DEFINITION(Register, c_rarg1);
@@ -123,5 +139,15 @@
 REGISTER_DEFINITION(MMXRegister, mmx6 );
 REGISTER_DEFINITION(MMXRegister, mmx7 );
 
+REGISTER_DEFINITION(KRegister, knoreg);
+REGISTER_DEFINITION(KRegister, k0);
+REGISTER_DEFINITION(KRegister, k1);
+REGISTER_DEFINITION(KRegister, k2);
+REGISTER_DEFINITION(KRegister, k3);
+REGISTER_DEFINITION(KRegister, k4);
+REGISTER_DEFINITION(KRegister, k5);
+REGISTER_DEFINITION(KRegister, k6);
+REGISTER_DEFINITION(KRegister, k7);
+
 // JSR 292
 REGISTER_DEFINITION(Register, rbp_mh_SP_save);
--- a/hotspot/src/cpu/x86/vm/register_x86.cpp	Thu May 07 15:34:45 2015 -0700
+++ b/hotspot/src/cpu/x86/vm/register_x86.cpp	Fri May 08 11:49:20 2015 -0700
@@ -31,11 +31,13 @@
 const int ConcreteRegisterImpl::max_gpr = RegisterImpl::number_of_registers << 1;
 #endif // AMD64
 
+const int ConcreteRegisterImpl::max_fpr = ConcreteRegisterImpl::max_gpr +
+    2 * FloatRegisterImpl::number_of_registers;
+const int ConcreteRegisterImpl::max_xmm = ConcreteRegisterImpl::max_fpr +
+    XMMRegisterImpl::max_slots_per_register * XMMRegisterImpl::number_of_registers;
+const int ConcreteRegisterImpl::max_kpr = ConcreteRegisterImpl::max_xmm +
+    KRegisterImpl::max_slots_per_register * KRegisterImpl::number_of_registers;
 
-const int ConcreteRegisterImpl::max_fpr = ConcreteRegisterImpl::max_gpr +
-                                                                 2 * FloatRegisterImpl::number_of_registers;
-const int ConcreteRegisterImpl::max_xmm = ConcreteRegisterImpl::max_fpr +
-                                                                 8 * XMMRegisterImpl::number_of_registers;
 const char* RegisterImpl::name() const {
   const char* names[number_of_registers] = {
 #ifndef AMD64
@@ -59,8 +61,17 @@
   const char* names[number_of_registers] = {
     "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7"
 #ifdef AMD64
-    ,"xmm8",  "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
+    ,"xmm8",   "xmm9",  "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
+    ,"xmm16",  "xmm17", "xmm18", "xmm19", "xmm20", "xmm21", "xmm22", "xmm23"
+    ,"xmm24",  "xmm25", "xmm26", "xmm27", "xmm28", "xmm29", "xmm30", "xmm31"
 #endif // AMD64
   };
   return is_valid() ? names[encoding()] : "xnoreg";
 }
+
+const char* KRegisterImpl::name() const {
+  const char* names[number_of_registers] = {
+    "k0", "k1", "k2", "k3", "k4", "k5", "k6", "k7"
+  };
+  return is_valid() ? names[encoding()] : "knoreg";
+}
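max_kpr and the re-derived max_xmm carve the enlarged VMReg space into contiguous slot ranges: GPRs, x87, 32 XMM registers at 16 slots each, then the 8 opmask registers. A sketch of the slot budget using the AMD64 counts as assumptions:

  #include <cstdio>

  int main() {
    const int max_gpr = 16 * 2;            // 16 GPRs, 2 slots each (assumed AMD64 count)
    const int max_fpr = max_gpr + 2 * 8;   // 8 x87 registers, 2 slots each
    const int max_xmm = max_fpr + 16 * 32; // 32 XMM registers, 16 slots (512 bits) each
    const int max_kpr = max_xmm + 1 * 8;   // 8 opmask registers, 1 slot each
    printf("max_gpr=%d max_fpr=%d max_xmm=%d max_kpr=%d\n",
           max_gpr, max_fpr, max_xmm, max_kpr);
    return 0;
  }
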
--- a/hotspot/src/cpu/x86/vm/register_x86.hpp	Thu May 07 15:34:45 2015 -0700
+++ b/hotspot/src/cpu/x86/vm/register_x86.hpp	Fri May 08 11:49:20 2015 -0700
@@ -45,10 +45,12 @@
   enum {
 #ifndef AMD64
     number_of_registers      = 8,
-    number_of_byte_registers = 4
+    number_of_byte_registers = 4,
+    max_slots_per_register   = 1
 #else
     number_of_registers      = 16,
-    number_of_byte_registers = 16
+    number_of_byte_registers = 16,
+    max_slots_per_register   = 1
 #endif // AMD64
   };
 
@@ -143,9 +145,11 @@
  public:
   enum {
 #ifndef AMD64
-    number_of_registers = 8
+    number_of_registers = 8,
+    max_slots_per_register = 16   // 512-bit
 #else
-    number_of_registers = 16
+    number_of_registers = 32,
+    max_slots_per_register = 16   // 512-bit
 #endif // AMD64
   };
 
@@ -183,6 +187,22 @@
 CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm13,    (13));
 CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm14,    (14));
 CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm15,    (15));
+CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm16,    (16));
+CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm17,    (17));
+CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm18,    (18));
+CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm19,    (19));
+CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm20,    (20));
+CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm21,    (21));
+CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm22,    (22));
+CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm23,    (23));
+CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm24,    (24));
+CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm25,    (25));
+CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm26,    (26));
+CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm27,    (27));
+CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm28,    (28));
+CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm29,    (29));
+CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm30,    (30));
+CONSTANT_REGISTER_DECLARATION(XMMRegister, xmm31,    (31));
 #endif // AMD64
 
 // Only used by the 32bit stubGenerator. These can't be described by vmreg and hence
@@ -200,6 +220,46 @@
 CONSTANT_REGISTER_DECLARATION(MMXRegister, mmx6 , ( 6));
 CONSTANT_REGISTER_DECLARATION(MMXRegister, mmx7 , ( 7));
 
+// Use KRegister as shortcut
+class KRegisterImpl;
+typedef KRegisterImpl* KRegister;
+
+inline KRegister as_KRegister(int encoding) {
+  return (KRegister)(intptr_t)encoding;
+}
+
+// The implementation of AVX-512 opmask (K) registers for the x86 architecture
+class KRegisterImpl : public AbstractRegisterImpl {
+public:
+  enum {
+    number_of_registers = 8,
+    max_slots_per_register = 1
+  };
+
+  // construction
+  friend KRegister as_KRegister(int encoding);
+
+  inline VMReg as_VMReg();
+
+  // derived registers, offsets, and addresses
+  KRegister successor() const                          { return as_KRegister(encoding() + 1); }
+
+  // accessors
+  int   encoding() const                          { assert(is_valid(), err_msg("invalid register (%d)", (int)(intptr_t)this)); return (intptr_t)this; }
+  bool  is_valid() const                          { return 0 <= (intptr_t)this && (intptr_t)this < number_of_registers; }
+  const char* name() const;
+};
+
+// The Mask registers, for AVX3 enabled and up chips
+CONSTANT_REGISTER_DECLARATION(KRegister, knoreg, (-1));
+CONSTANT_REGISTER_DECLARATION(KRegister, k0, (0));
+CONSTANT_REGISTER_DECLARATION(KRegister, k1, (1));
+CONSTANT_REGISTER_DECLARATION(KRegister, k2, (2));
+CONSTANT_REGISTER_DECLARATION(KRegister, k3, (3));
+CONSTANT_REGISTER_DECLARATION(KRegister, k4, (4));
+CONSTANT_REGISTER_DECLARATION(KRegister, k5, (5));
+CONSTANT_REGISTER_DECLARATION(KRegister, k6, (6));
+CONSTANT_REGISTER_DECLARATION(KRegister, k7, (7));
 
 // Need to know the total number of registers of all sorts for SharedInfo.
 // Define a class that exports it.
@@ -211,18 +271,20 @@
   // There is no requirement that any ordering here matches any ordering c2 gives
   // it's optoregs.
 
-    number_of_registers =      RegisterImpl::number_of_registers +
+    number_of_registers = RegisterImpl::number_of_registers +
 #ifdef AMD64
-                               RegisterImpl::number_of_registers +  // "H" half of a 64bit register
+      RegisterImpl::number_of_registers +  // "H" half of a 64bit register
 #endif // AMD64
-                           2 * FloatRegisterImpl::number_of_registers +
-                           8 * XMMRegisterImpl::number_of_registers +
-                           1 // eflags
+      2 * FloatRegisterImpl::number_of_registers +
+      XMMRegisterImpl::max_slots_per_register * XMMRegisterImpl::number_of_registers +
+      KRegisterImpl::number_of_registers + // mask registers
+      1 // eflags
   };
 
   static const int max_gpr;
   static const int max_fpr;
   static const int max_xmm;
+  static const int max_kpr;
 
 };
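KRegisterImpl follows the existing register-implementation idiom: the pointer value is the encoding, so no KRegisterImpl object ever exists and is_valid() is just a range check on the pointer bits. A minimal standalone illustration of that trick (names are made up; like the real code, it never dereferences the pointer):

  #include <cassert>
  #include <cstdint>

  class KRegSketch;
  typedef KRegSketch* KReg;

  inline KReg as_kreg(int encoding) { return (KReg)(intptr_t)encoding; }

  class KRegSketch {
   public:
    enum { number_of_registers = 8 };
    int  encoding() const  { return (int)(intptr_t)this; }
    bool is_valid() const  { intptr_t v = (intptr_t)this;
                             return 0 <= v && v < number_of_registers; }
    KReg successor() const { return as_kreg(encoding() + 1); }
  };

  int main() {
    KReg k1 = as_kreg(1);
    assert(k1->is_valid() && k1->encoding() == 1);
    assert(!as_kreg(-1)->is_valid());            // knoreg-style sentinel
    assert(k1->successor()->encoding() == 2);
    return 0;
  }
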
 
--- a/hotspot/src/cpu/x86/vm/sharedRuntime_x86_32.cpp	Thu May 07 15:34:45 2015 -0700
+++ b/hotspot/src/cpu/x86/vm/sharedRuntime_x86_32.cpp	Fri May 08 11:49:20 2015 -0700
@@ -117,9 +117,9 @@
   int vect_words = 0;
 #ifdef COMPILER2
   if (save_vectors) {
-    assert(UseAVX > 0, "256bit vectors are supported only with AVX");
-    assert(MaxVectorSize == 32, "only 256bit vectors are supported now");
-    // Save upper half of YMM registes
+    assert(UseAVX > 0, "512bit vectors are supported only with EVEX");
+    assert(MaxVectorSize == 64, "only 512bit vectors are supported now");
+    // Save upper half of ZMM/YMM registers:
     vect_words = 8 * 16 / wordSize;
     additional_frame_words += vect_words;
   }
@@ -216,6 +216,17 @@
     __ vextractf128h(Address(rsp, 80),xmm5);
     __ vextractf128h(Address(rsp, 96),xmm6);
     __ vextractf128h(Address(rsp,112),xmm7);
+    if (UseAVX > 2) {
+      __ subptr(rsp, 256); // Save upper half of ZMM registers
+      __ vextractf64x4h(Address(rsp, 0), xmm0);
+      __ vextractf64x4h(Address(rsp, 32), xmm1);
+      __ vextractf64x4h(Address(rsp, 64), xmm2);
+      __ vextractf64x4h(Address(rsp, 96), xmm3);
+      __ vextractf64x4h(Address(rsp, 128), xmm4);
+      __ vextractf64x4h(Address(rsp, 160), xmm5);
+      __ vextractf64x4h(Address(rsp, 192), xmm6);
+      __ vextractf64x4h(Address(rsp, 224), xmm7);
+    }
   }
 
   // Set an oopmap for the call site.  This oopmap will map all
@@ -283,8 +294,8 @@
   int additional_frame_bytes = 0;
 #ifdef COMPILER2
   if (restore_vectors) {
-    assert(UseAVX > 0, "256bit vectors are supported only with AVX");
-    assert(MaxVectorSize == 32, "only 256bit vectors are supported now");
+    assert(UseAVX > 0, "512bit vectors are supported only with EVEX");
+    assert(MaxVectorSize == 64, "only 512bit vectors are supported now");
     additional_frame_bytes = 128;
   }
 #else
@@ -324,6 +335,18 @@
     __ vinsertf128h(xmm6, Address(rsp, 96));
     __ vinsertf128h(xmm7, Address(rsp,112));
     __ addptr(rsp, additional_frame_bytes);
+    if (UseAVX > 2) {
+      additional_frame_bytes = 256;
+      __ vinsertf64x4h(xmm0, Address(rsp, 0));
+      __ vinsertf64x4h(xmm1, Address(rsp, 32));
+      __ vinsertf64x4h(xmm2, Address(rsp, 64));
+      __ vinsertf64x4h(xmm3, Address(rsp, 96));
+      __ vinsertf64x4h(xmm4, Address(rsp, 128));
+      __ vinsertf64x4h(xmm5, Address(rsp, 160));
+      __ vinsertf64x4h(xmm6, Address(rsp, 192));
+      __ vinsertf64x4h(xmm7, Address(rsp, 224));
+      __ addptr(rsp, additional_frame_bytes);
+    }
   }
   __ pop_FPU_state();
   __ addptr(rsp, FPU_regs_live*wordSize); // Pop FPU registers
--- a/hotspot/src/cpu/x86/vm/sharedRuntime_x86_64.cpp	Thu May 07 15:34:45 2015 -0700
+++ b/hotspot/src/cpu/x86/vm/sharedRuntime_x86_64.cpp	Fri May 08 11:49:20 2015 -0700
@@ -86,7 +86,23 @@
     DEF_XMM_OFFS(13),
     DEF_XMM_OFFS(14),
     DEF_XMM_OFFS(15),
-    fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
+    DEF_XMM_OFFS(16),
+    DEF_XMM_OFFS(17),
+    DEF_XMM_OFFS(18),
+    DEF_XMM_OFFS(19),
+    DEF_XMM_OFFS(20),
+    DEF_XMM_OFFS(21),
+    DEF_XMM_OFFS(22),
+    DEF_XMM_OFFS(23),
+    DEF_XMM_OFFS(24),
+    DEF_XMM_OFFS(25),
+    DEF_XMM_OFFS(26),
+    DEF_XMM_OFFS(27),
+    DEF_XMM_OFFS(28),
+    DEF_XMM_OFFS(29),
+    DEF_XMM_OFFS(30),
+    DEF_XMM_OFFS(31),
+    fpu_state_end = fpu_state_off + ((FPUStateSizeInWords - 1)*wordSize / BytesPerInt),
     fpu_stateH_end,
     r15_off, r15H_off,
     r14_off, r14H_off,
@@ -136,13 +152,21 @@
 
 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors) {
   int vect_words = 0;
+  int num_xmm_regs = 16;
+  if (UseAVX > 2) {
+    num_xmm_regs = 32;
+  }
 #ifdef COMPILER2
   if (save_vectors) {
-    assert(UseAVX > 0, "256bit vectors are supported only with AVX");
-    assert(MaxVectorSize == 32, "only 256bit vectors are supported now");
-    // Save upper half of YMM registes
-    vect_words = 16 * 16 / wordSize;
+    assert(UseAVX > 0, "512bit vectors are supported only with EVEX");
+    assert(MaxVectorSize == 64, "only 512bit vectors are supported now");
+    // Save upper half of YMM registers
+    vect_words = 16 * num_xmm_regs / wordSize;
     additional_frame_words += vect_words;
+    if (UseAVX > 2) {
+      // Save upper half of ZMM registers as well
+      additional_frame_words += vect_words;
+    }
   }
 #else
   assert(!save_vectors, "vectors are generated only by C2");
@@ -150,7 +174,7 @@
 
   // Always make the frame size 16-byte aligned
   int frame_size_in_bytes = round_to(additional_frame_words*wordSize +
-                                     reg_save_size*BytesPerInt, 16);
+                                     reg_save_size*BytesPerInt, num_xmm_regs);
   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
   // The caller will allocate additional_frame_words
@@ -169,24 +193,77 @@
   __ push_CPU_state(); // Push a multiple of 16 bytes
 
   if (vect_words > 0) {
-    assert(vect_words*wordSize == 256, "");
-    __ subptr(rsp, 256); // Save upper half of YMM registes
-    __ vextractf128h(Address(rsp,  0),xmm0);
-    __ vextractf128h(Address(rsp, 16),xmm1);
-    __ vextractf128h(Address(rsp, 32),xmm2);
-    __ vextractf128h(Address(rsp, 48),xmm3);
-    __ vextractf128h(Address(rsp, 64),xmm4);
-    __ vextractf128h(Address(rsp, 80),xmm5);
-    __ vextractf128h(Address(rsp, 96),xmm6);
-    __ vextractf128h(Address(rsp,112),xmm7);
-    __ vextractf128h(Address(rsp,128),xmm8);
-    __ vextractf128h(Address(rsp,144),xmm9);
-    __ vextractf128h(Address(rsp,160),xmm10);
-    __ vextractf128h(Address(rsp,176),xmm11);
-    __ vextractf128h(Address(rsp,192),xmm12);
-    __ vextractf128h(Address(rsp,208),xmm13);
-    __ vextractf128h(Address(rsp,224),xmm14);
-    __ vextractf128h(Address(rsp,240),xmm15);
+    assert(vect_words*wordSize >= 256, "");
+    __ subptr(rsp, 256); // Save upper half of YMM registers (0..15)
+    __ vextractf128h(Address(rsp, 0), xmm0);
+    __ vextractf128h(Address(rsp, 16), xmm1);
+    __ vextractf128h(Address(rsp, 32), xmm2);
+    __ vextractf128h(Address(rsp, 48), xmm3);
+    __ vextractf128h(Address(rsp, 64), xmm4);
+    __ vextractf128h(Address(rsp, 80), xmm5);
+    __ vextractf128h(Address(rsp, 96), xmm6);
+    __ vextractf128h(Address(rsp, 112), xmm7);
+    __ vextractf128h(Address(rsp, 128), xmm8);
+    __ vextractf128h(Address(rsp, 144), xmm9);
+    __ vextractf128h(Address(rsp, 160), xmm10);
+    __ vextractf128h(Address(rsp, 176), xmm11);
+    __ vextractf128h(Address(rsp, 192), xmm12);
+    __ vextractf128h(Address(rsp, 208), xmm13);
+    __ vextractf128h(Address(rsp, 224), xmm14);
+    __ vextractf128h(Address(rsp, 240), xmm15);
+    if (UseAVX > 2) {
+      __ subptr(rsp, 256); // Save upper half of YMM registers (16..31)
+      __ vextractf128h(Address(rsp, 0), xmm16);
+      __ vextractf128h(Address(rsp, 16), xmm17);
+      __ vextractf128h(Address(rsp, 32), xmm18);
+      __ vextractf128h(Address(rsp, 48), xmm19);
+      __ vextractf128h(Address(rsp, 64), xmm20);
+      __ vextractf128h(Address(rsp, 80), xmm21);
+      __ vextractf128h(Address(rsp, 96), xmm22);
+      __ vextractf128h(Address(rsp, 112), xmm23);
+      __ vextractf128h(Address(rsp, 128), xmm24);
+      __ vextractf128h(Address(rsp, 144), xmm25);
+      __ vextractf128h(Address(rsp, 160), xmm26);
+      __ vextractf128h(Address(rsp, 176), xmm27);
+      __ vextractf128h(Address(rsp, 192), xmm28);
+      __ vextractf128h(Address(rsp, 208), xmm29);
+      __ vextractf128h(Address(rsp, 224), xmm30);
+      __ vextractf128h(Address(rsp, 240), xmm31);
+      // Now handle the ZMM registers (0..31)
+      __ subptr(rsp, 1024); // Save upper half of ZMM registers
+      __ vextractf64x4h(Address(rsp, 0), xmm0);
+      __ vextractf64x4h(Address(rsp, 32), xmm1);
+      __ vextractf64x4h(Address(rsp, 64), xmm2);
+      __ vextractf64x4h(Address(rsp, 96), xmm3);
+      __ vextractf64x4h(Address(rsp, 128), xmm4);
+      __ vextractf64x4h(Address(rsp, 160), xmm5);
+      __ vextractf64x4h(Address(rsp, 192), xmm6);
+      __ vextractf64x4h(Address(rsp, 224), xmm7);
+      __ vextractf64x4h(Address(rsp, 256), xmm8);
+      __ vextractf64x4h(Address(rsp, 288), xmm9);
+      __ vextractf64x4h(Address(rsp, 320), xmm10);
+      __ vextractf64x4h(Address(rsp, 352), xmm11);
+      __ vextractf64x4h(Address(rsp, 384), xmm12);
+      __ vextractf64x4h(Address(rsp, 416), xmm13);
+      __ vextractf64x4h(Address(rsp, 448), xmm14);
+      __ vextractf64x4h(Address(rsp, 480), xmm15);
+      __ vextractf64x4h(Address(rsp, 512), xmm16);
+      __ vextractf64x4h(Address(rsp, 544), xmm17);
+      __ vextractf64x4h(Address(rsp, 576), xmm18);
+      __ vextractf64x4h(Address(rsp, 608), xmm19);
+      __ vextractf64x4h(Address(rsp, 640), xmm20);
+      __ vextractf64x4h(Address(rsp, 672), xmm21);
+      __ vextractf64x4h(Address(rsp, 704), xmm22);
+      __ vextractf64x4h(Address(rsp, 736), xmm23);
+      __ vextractf64x4h(Address(rsp, 768), xmm24);
+      __ vextractf64x4h(Address(rsp, 800), xmm25);
+      __ vextractf64x4h(Address(rsp, 832), xmm26);
+      __ vextractf64x4h(Address(rsp, 864), xmm27);
+      __ vextractf64x4h(Address(rsp, 896), xmm28);
+      __ vextractf64x4h(Address(rsp, 928), xmm29);
+      __ vextractf64x4h(Address(rsp, 960), xmm30);
+      __ vextractf64x4h(Address(rsp, 992), xmm31);
+    }
   }
   if (frame::arg_reg_save_area_bytes != 0) {
     // Allocate argument register save area
@@ -235,6 +312,24 @@
   map->set_callee_saved(STACK_OFFSET(xmm13_off), xmm13->as_VMReg());
   map->set_callee_saved(STACK_OFFSET(xmm14_off), xmm14->as_VMReg());
   map->set_callee_saved(STACK_OFFSET(xmm15_off), xmm15->as_VMReg());
+  if (UseAVX > 2) {
+    map->set_callee_saved(STACK_OFFSET(xmm16_off), xmm16->as_VMReg());
+    map->set_callee_saved(STACK_OFFSET(xmm17_off), xmm17->as_VMReg());
+    map->set_callee_saved(STACK_OFFSET(xmm18_off), xmm18->as_VMReg());
+    map->set_callee_saved(STACK_OFFSET(xmm19_off), xmm19->as_VMReg());
+    map->set_callee_saved(STACK_OFFSET(xmm20_off), xmm20->as_VMReg());
+    map->set_callee_saved(STACK_OFFSET(xmm21_off), xmm21->as_VMReg());
+    map->set_callee_saved(STACK_OFFSET(xmm22_off), xmm22->as_VMReg());
+    map->set_callee_saved(STACK_OFFSET(xmm23_off), xmm23->as_VMReg());
+    map->set_callee_saved(STACK_OFFSET(xmm24_off), xmm24->as_VMReg());
+    map->set_callee_saved(STACK_OFFSET(xmm25_off), xmm25->as_VMReg());
+    map->set_callee_saved(STACK_OFFSET(xmm26_off), xmm26->as_VMReg());
+    map->set_callee_saved(STACK_OFFSET(xmm27_off), xmm27->as_VMReg());
+    map->set_callee_saved(STACK_OFFSET(xmm28_off), xmm28->as_VMReg());
+    map->set_callee_saved(STACK_OFFSET(xmm29_off), xmm29->as_VMReg());
+    map->set_callee_saved(STACK_OFFSET(xmm30_off), xmm30->as_VMReg());
+    map->set_callee_saved(STACK_OFFSET(xmm31_off), xmm31->as_VMReg());
+  }
 
   // %%% These should all be a waste but we'll keep things as they were for now
   if (true) {
@@ -269,6 +364,24 @@
     map->set_callee_saved(STACK_OFFSET(xmm13H_off), xmm13->as_VMReg()->next());
     map->set_callee_saved(STACK_OFFSET(xmm14H_off), xmm14->as_VMReg()->next());
     map->set_callee_saved(STACK_OFFSET(xmm15H_off), xmm15->as_VMReg()->next());
+    if (UseAVX > 2) {
+      map->set_callee_saved(STACK_OFFSET(xmm16H_off), xmm16->as_VMReg());
+      map->set_callee_saved(STACK_OFFSET(xmm17H_off), xmm17->as_VMReg());
+      map->set_callee_saved(STACK_OFFSET(xmm18H_off), xmm18->as_VMReg());
+      map->set_callee_saved(STACK_OFFSET(xmm19H_off), xmm19->as_VMReg());
+      map->set_callee_saved(STACK_OFFSET(xmm20H_off), xmm20->as_VMReg());
+      map->set_callee_saved(STACK_OFFSET(xmm21H_off), xmm21->as_VMReg());
+      map->set_callee_saved(STACK_OFFSET(xmm22H_off), xmm22->as_VMReg());
+      map->set_callee_saved(STACK_OFFSET(xmm23H_off), xmm23->as_VMReg());
+      map->set_callee_saved(STACK_OFFSET(xmm24H_off), xmm24->as_VMReg());
+      map->set_callee_saved(STACK_OFFSET(xmm25H_off), xmm25->as_VMReg());
+      map->set_callee_saved(STACK_OFFSET(xmm26H_off), xmm26->as_VMReg());
+      map->set_callee_saved(STACK_OFFSET(xmm27H_off), xmm27->as_VMReg());
+      map->set_callee_saved(STACK_OFFSET(xmm28H_off), xmm28->as_VMReg());
+      map->set_callee_saved(STACK_OFFSET(xmm29H_off), xmm29->as_VMReg());
+      map->set_callee_saved(STACK_OFFSET(xmm30H_off), xmm30->as_VMReg());
+      map->set_callee_saved(STACK_OFFSET(xmm31H_off), xmm31->as_VMReg());
+    }
   }
 
   return map;
@@ -281,9 +394,9 @@
   }
 #ifdef COMPILER2
   if (restore_vectors) {
-    // Restore upper half of YMM registes.
-    assert(UseAVX > 0, "256bit vectors are supported only with AVX");
-    assert(MaxVectorSize == 32, "only 256bit vectors are supported now");
+    // Restore upper half of YMM registers (0..15)
+    assert(UseAVX > 0, "512bit vectors are supported only with EVEX");
+    assert(MaxVectorSize == 64, "only 512bit vectors are supported now");
     __ vinsertf128h(xmm0, Address(rsp,  0));
     __ vinsertf128h(xmm1, Address(rsp, 16));
     __ vinsertf128h(xmm2, Address(rsp, 32));
@@ -301,6 +414,60 @@
     __ vinsertf128h(xmm14, Address(rsp,224));
     __ vinsertf128h(xmm15, Address(rsp,240));
     __ addptr(rsp, 256);
+    if (UseAVX > 2) {
+      // Restore upper half of YMM registers (16..31)
+      __ vinsertf128h(xmm16, Address(rsp,  0));
+      __ vinsertf128h(xmm17, Address(rsp, 16));
+      __ vinsertf128h(xmm18, Address(rsp, 32));
+      __ vinsertf128h(xmm19, Address(rsp, 48));
+      __ vinsertf128h(xmm20, Address(rsp, 64));
+      __ vinsertf128h(xmm21, Address(rsp, 80));
+      __ vinsertf128h(xmm22, Address(rsp, 96));
+      __ vinsertf128h(xmm23, Address(rsp,112));
+      __ vinsertf128h(xmm24, Address(rsp,128));
+      __ vinsertf128h(xmm25, Address(rsp,144));
+      __ vinsertf128h(xmm26, Address(rsp,160));
+      __ vinsertf128h(xmm27, Address(rsp,176));
+      __ vinsertf128h(xmm28, Address(rsp,192));
+      __ vinsertf128h(xmm29, Address(rsp,208));
+      __ vinsertf128h(xmm30, Address(rsp,224));
+      __ vinsertf128h(xmm31, Address(rsp,240));
+      __ addptr(rsp, 256);
+      // Restore upper half of ZMM registers.
+      __ vinsertf64x4h(xmm0, Address(rsp, 0));
+      __ vinsertf64x4h(xmm1, Address(rsp, 32));
+      __ vinsertf64x4h(xmm2, Address(rsp, 64));
+      __ vinsertf64x4h(xmm3, Address(rsp, 96));
+      __ vinsertf64x4h(xmm4, Address(rsp, 128));
+      __ vinsertf64x4h(xmm5, Address(rsp, 160));
+      __ vinsertf64x4h(xmm6, Address(rsp, 192));
+      __ vinsertf64x4h(xmm7, Address(rsp, 224));
+      __ vinsertf64x4h(xmm8, Address(rsp, 256));
+      __ vinsertf64x4h(xmm9, Address(rsp, 288));
+      __ vinsertf64x4h(xmm10, Address(rsp, 320));
+      __ vinsertf64x4h(xmm11, Address(rsp, 352));
+      __ vinsertf64x4h(xmm12, Address(rsp, 384));
+      __ vinsertf64x4h(xmm13, Address(rsp, 416));
+      __ vinsertf64x4h(xmm14, Address(rsp, 448));
+      __ vinsertf64x4h(xmm15, Address(rsp, 480));
+      __ vinsertf64x4h(xmm16, Address(rsp, 512));
+      __ vinsertf64x4h(xmm17, Address(rsp, 544));
+      __ vinsertf64x4h(xmm18, Address(rsp, 576));
+      __ vinsertf64x4h(xmm19, Address(rsp, 608));
+      __ vinsertf64x4h(xmm20, Address(rsp, 640));
+      __ vinsertf64x4h(xmm21, Address(rsp, 672));
+      __ vinsertf64x4h(xmm22, Address(rsp, 704));
+      __ vinsertf64x4h(xmm23, Address(rsp, 736));
+      __ vinsertf64x4h(xmm24, Address(rsp, 768));
+      __ vinsertf64x4h(xmm25, Address(rsp, 800));
+      __ vinsertf64x4h(xmm26, Address(rsp, 832));
+      __ vinsertf64x4h(xmm27, Address(rsp, 864));
+      __ vinsertf64x4h(xmm28, Address(rsp, 896));
+      __ vinsertf64x4h(xmm29, Address(rsp, 928));
+      __ vinsertf64x4h(xmm30, Address(rsp, 960));
+      __ vinsertf64x4h(xmm31, Address(rsp, 992));
+      __ subptr(rsp, 1024);
+    }
   }
 #else
   assert(!restore_vectors, "vectors are generated only by C2");
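When UseAVX > 2 the 64-bit RegisterSaver pushes three extra areas: 256 bytes of YMM upper halves for xmm0..xmm15, 256 bytes for xmm16..xmm31, and 1024 bytes of ZMM upper halves for all 32 registers at a 32-byte stride. A quick consistency check of those sizes and of the last offset used above:

  #include <cassert>

  int main() {
    const int ymm_high = 16;            // bytes per register, bits 128..255
    const int zmm_high = 32;            // bytes per register, bits 256..511
    assert(16 * ymm_high == 256);       // xmm0..xmm15 block
    assert(16 * ymm_high == 256);       // xmm16..xmm31 block
    assert(32 * zmm_high == 1024);      // zmm0..zmm31 block
    assert(31 * zmm_high == 992);       // offset of xmm31's upper half above
    return 0;
  }
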
--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Thu May 07 15:34:45 2015 -0700
+++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Fri May 08 11:49:20 2015 -0700
@@ -166,6 +166,13 @@
     __ movptr(saved_rdi, rdi);
     __ movptr(saved_rsi, rsi);
     __ movptr(saved_rbx, rbx);
+
+    // provide initial value for required masks
+    if (UseAVX > 2) {
+      __ movl(rbx, 0xffff);
+      __ kmovdl(k1, rbx);
+    }
+
     // save and initialize %mxcsr
     if (sse_save) {
       Label skip_ldmx;
@@ -794,7 +801,10 @@
   __ BIND(L_copy_64_bytes_loop);
 
     if (UseUnalignedLoadStores) {
-      if (UseAVX >= 2) {
+      if (UseAVX > 2) {
+        __ evmovdqu(xmm0, Address(from, 0), Assembler::AVX_512bit);
+        __ evmovdqu(Address(from, to_from, Address::times_1, 0), xmm0, Assembler::AVX_512bit);
+      } else if (UseAVX == 2) {
         __ vmovdqu(xmm0, Address(from,  0));
         __ vmovdqu(Address(from, to_from, Address::times_1,  0), xmm0);
         __ vmovdqu(xmm1, Address(from, 32));
@@ -833,7 +843,7 @@
     __ subl(qword_count, 8);
     __ jcc(Assembler::greaterEqual, L_copy_64_bytes_loop);
 
-    if (UseUnalignedLoadStores && (UseAVX >= 2)) {
+    if (UseUnalignedLoadStores && (UseAVX == 2)) {
       // clean upper bits of YMM registers
       __ vpxor(xmm0, xmm0);
       __ vpxor(xmm1, xmm1);
--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Thu May 07 15:34:45 2015 -0700
+++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Fri May 08 11:49:20 2015 -0700
@@ -137,8 +137,10 @@
   //     [ return_from_Java     ] <--- rsp
   //     [ argument word n      ]
   //      ...
-  // -28 [ argument word 1      ]
-  // -27 [ saved xmm15          ] <--- rsp_after_call
+  // -60 [ argument word 1      ]
+  // -59 [ saved xmm31          ] <--- rsp_after_call
+  //     [ saved xmm16-xmm30    ] (EVEX enabled, else the space is blank)
+  // -27 [ saved xmm15          ]
   //     [ saved xmm7-xmm14     ]
   //  -9 [ saved xmm6           ] (each xmm register takes 2 slots)
   //  -7 [ saved r15            ]
@@ -166,7 +168,7 @@
   enum call_stub_layout {
 #ifdef _WIN64
     xmm_save_first     = 6,  // save from xmm6
-    xmm_save_last      = 15, // to xmm15
+    xmm_save_last      = 31, // to xmm31
     xmm_save_base      = -9,
     rsp_after_call_off = xmm_save_base - 2 * (xmm_save_last - xmm_save_first), // -27
     r15_off            = -7,
@@ -262,9 +264,19 @@
     __ movptr(r13_save, r13);
     __ movptr(r14_save, r14);
     __ movptr(r15_save, r15);
+    if (UseAVX > 2) {
+      __ movl(rbx, 0xffff);
+      __ kmovql(k1, rbx);
+    }
 #ifdef _WIN64
-    for (int i = 6; i <= 15; i++) {
-      __ movdqu(xmm_save(i), as_XMMRegister(i));
+    if (UseAVX > 2) {
+      for (int i = 6; i <= 31; i++) {
+        __ movdqu(xmm_save(i), as_XMMRegister(i));
+      }
+    } else {
+      for (int i = 6; i <= 15; i++) {
+        __ movdqu(xmm_save(i), as_XMMRegister(i));
+      }
     }
 
     const Address rdi_save(rbp, rdi_off * wordSize);
@@ -1318,7 +1330,10 @@
       Label L_end;
       // Copy 64-bytes per iteration
       __ BIND(L_loop);
-      if (UseAVX >= 2) {
+      if (UseAVX > 2) {
+        __ evmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56), Assembler::AVX_512bit);
+        __ evmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0, Assembler::AVX_512bit);
+      } else if (UseAVX == 2) {
         __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
         __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
         __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
@@ -1395,7 +1410,10 @@
       Label L_end;
       // Copy 64-bytes per iteration
       __ BIND(L_loop);
-      if (UseAVX >= 2) {
+      if (UseAVX > 2) {
+        __ evmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32), Assembler::AVX_512bit);
+        __ evmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0, Assembler::AVX_512bit);
+      } else if (UseAVX == 2) {
         __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
         __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
         __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
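Several of the new UseAVX > 2 paths start by loading 0xffff into k1: a 512-bit vector of 32-bit elements has 16 lanes, so an all-ones 16-bit mask makes masked EVEX operations behave like full-width ones. A hedged intrinsics analogue of that setup, assuming an AVX-512F capable toolchain and CPU:

  #include <immintrin.h>
  #include <cstdio>

  int main() {
    __mmask16 k1 = 0xffff;                  // like movl(rbx, 0xffff); kmovql(k1, rbx)
    __m512i v = _mm512_set1_epi32(42);
    int out[16] = {0};
    _mm512_mask_storeu_epi32(out, k1, v);   // all 16 lanes written
    printf("%d %d\n", out[0], out[15]);     // 42 42
    return 0;
  }
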
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.cpp	Thu May 07 15:34:45 2015 -0700
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.cpp	Fri May 08 11:49:20 2015 -0700
@@ -35,7 +35,7 @@
 int VM_Version::_cpu;
 int VM_Version::_model;
 int VM_Version::_stepping;
-int VM_Version::_cpuFeatures;
+uint64_t VM_Version::_cpuFeatures;
 const char*           VM_Version::_features_str = "";
 VM_Version::CpuidInfo VM_Version::_cpuid_info   = { 0, };
 
@@ -45,7 +45,7 @@
 address VM_Version::_cpuinfo_cont_addr = 0;
 
 static BufferBlob* stub_blob;
-static const int stub_size = 600;
+static const int stub_size = 1000;
 
 extern "C" {
   typedef void (*get_cpu_info_stub_t)(void*);
@@ -60,15 +60,16 @@
 
   address generate_get_cpu_info() {
     // Flags to test CPU type.
-    const uint32_t HS_EFL_AC           = 0x40000;
-    const uint32_t HS_EFL_ID           = 0x200000;
+    const uint32_t HS_EFL_AC = 0x40000;
+    const uint32_t HS_EFL_ID = 0x200000;
     // Values for when we don't have a CPUID instruction.
     const int      CPU_FAMILY_SHIFT = 8;
-    const uint32_t CPU_FAMILY_386   = (3 << CPU_FAMILY_SHIFT);
-    const uint32_t CPU_FAMILY_486   = (4 << CPU_FAMILY_SHIFT);
+    const uint32_t CPU_FAMILY_386 = (3 << CPU_FAMILY_SHIFT);
+    const uint32_t CPU_FAMILY_486 = (4 << CPU_FAMILY_SHIFT);
 
     Label detect_486, cpu486, detect_586, std_cpuid1, std_cpuid4;
-    Label sef_cpuid, ext_cpuid, ext_cpuid1, ext_cpuid5, ext_cpuid7, done;
+    Label sef_cpuid, ext_cpuid, ext_cpuid1, ext_cpuid5, ext_cpuid7, done, wrapup;
+    Label legacy_setup, save_restore_except, legacy_save_restore, start_simd_check;
 
     StubCodeMark mark(this, "VM_Version", "get_cpu_info_stub");
 #   define __ _masm->
@@ -241,53 +242,6 @@
     __ movl(Address(rsi, 0), rax);
     __ movl(Address(rsi, 4), rdx);
 
-    __ andl(rax, 0x6); // xcr0 bits sse | ymm
-    __ cmpl(rax, 0x6);
-    __ jccb(Assembler::notEqual, sef_cpuid); // jump if AVX is not supported
-
-    //
-    // Some OSs have a bug when upper 128bits of YMM
-    // registers are not restored after a signal processing.
-    // Generate SEGV here (reference through NULL)
-    // and check upper YMM bits after it.
-    //
-    VM_Version::set_avx_cpuFeatures(); // Enable temporary to pass asserts
-    intx saved_useavx = UseAVX;
-    intx saved_usesse = UseSSE;
-    UseAVX = 1;
-    UseSSE = 2;
-
-    // load value into all 32 bytes of ymm7 register
-    __ movl(rcx, VM_Version::ymm_test_value());
-
-    __ movdl(xmm0, rcx);
-    __ pshufd(xmm0, xmm0, 0x00);
-    __ vinsertf128h(xmm0, xmm0, xmm0);
-    __ vmovdqu(xmm7, xmm0);
-#ifdef _LP64
-    __ vmovdqu(xmm8,  xmm0);
-    __ vmovdqu(xmm15, xmm0);
-#endif
-
-    __ xorl(rsi, rsi);
-    VM_Version::set_cpuinfo_segv_addr( __ pc() );
-    // Generate SEGV
-    __ movl(rax, Address(rsi, 0));
-
-    VM_Version::set_cpuinfo_cont_addr( __ pc() );
-    // Returns here after signal. Save xmm0 to check it later.
-    __ lea(rsi, Address(rbp, in_bytes(VM_Version::ymm_save_offset())));
-    __ vmovdqu(Address(rsi,  0), xmm0);
-    __ vmovdqu(Address(rsi, 32), xmm7);
-#ifdef _LP64
-    __ vmovdqu(Address(rsi, 64), xmm8);
-    __ vmovdqu(Address(rsi, 96), xmm15);
-#endif
-
-    VM_Version::clean_cpuFeatures();
-    UseAVX = saved_useavx;
-    UseSSE = saved_usesse;
-
     //
     // cpuid(0x7) Structured Extended Features
     //
@@ -364,9 +318,143 @@
     __ movl(Address(rsi,12), rdx);
 
     //
-    // return
+    // Check if OS has enabled XGETBV instruction to access XCR0
+    // (OSXSAVE feature flag) and CPU supports AVX
+    //
+    __ lea(rsi, Address(rbp, in_bytes(VM_Version::std_cpuid1_offset())));
+    __ movl(rcx, 0x18000000); // cpuid1 bits osxsave | avx
+    __ andl(rcx, Address(rsi, 8)); // cpuid1 bits osxsave | avx
+    __ cmpl(rcx, 0x18000000);
+    __ jccb(Assembler::notEqual, done); // jump if AVX is not supported
+
+    __ movl(rax, 0x6);
+    __ andl(rax, Address(rbp, in_bytes(VM_Version::xem_xcr0_offset()))); // xcr0 bits sse | ymm
+    __ cmpl(rax, 0x6);
+    __ jccb(Assembler::equal, start_simd_check); // jump if AVX is supported
+
+    // we need to bridge farther than imm8, so we use this island as a thunk
+    __ bind(done);
+    __ jmp(wrapup);
+
+    __ bind(start_simd_check);
+    //
+    // Some OSs have a bug when upper 128/256bits of YMM/ZMM
+    // registers are not restored after a signal processing.
+    // Generate SEGV here (reference through NULL)
+    // and check upper YMM/ZMM bits after it.
     //
-    __ bind(done);
+    intx saved_useavx = UseAVX;
+    intx saved_usesse = UseSSE;
+    // check _cpuid_info.sef_cpuid7_ebx.bits.avx512f
+    __ lea(rsi, Address(rbp, in_bytes(VM_Version::sef_cpuid7_offset())));
+    __ movl(rax, 0x10000);
+    __ andl(rax, Address(rsi, 4)); // sef_cpuid7_ebx avx512f bit
+    __ cmpl(rax, 0x10000);
+    __ jccb(Assembler::notEqual, legacy_setup); // jump if EVEX is not supported
+    // check _cpuid_info.xem_xcr0_eax.bits.opmask
+    // check _cpuid_info.xem_xcr0_eax.bits.zmm512
+    // check _cpuid_info.xem_xcr0_eax.bits.zmm32
+    __ movl(rax, 0xE0);
+    __ andl(rax, Address(rbp, in_bytes(VM_Version::xem_xcr0_offset()))); // xcr0 bits opmask | zmm512 | zmm32
+    __ cmpl(rax, 0xE0);
+    __ jccb(Assembler::notEqual, legacy_setup); // jump if EVEX is not supported
+
+    // EVEX setup: run in lowest evex mode
+    VM_Version::set_evex_cpuFeatures(); // Enable temporary to pass asserts
+    UseAVX = 3;
+    UseSSE = 2;
+    // load value into all 64 bytes of zmm7 register
+    __ movl(rcx, VM_Version::ymm_test_value());
+    __ movdl(xmm0, rcx);
+    __ movl(rcx, 0xffff);
+#ifdef _LP64
+    __ kmovql(k1, rcx);
+#else
+    __ kmovdl(k1, rcx);
+#endif
+    __ evpbroadcastd(xmm0, xmm0, Assembler::AVX_512bit);
+    __ evmovdqu(xmm7, xmm0, Assembler::AVX_512bit);
+#ifdef _LP64
+    __ evmovdqu(xmm8, xmm0, Assembler::AVX_512bit);
+    __ evmovdqu(xmm31, xmm0, Assembler::AVX_512bit);
+#endif
+    VM_Version::clean_cpuFeatures();
+    __ jmp(save_restore_except);
+
+    __ bind(legacy_setup);
+    // AVX setup
+    VM_Version::set_avx_cpuFeatures(); // Enable temporary to pass asserts
+    UseAVX = 1;
+    UseSSE = 2;
+    // load value into all 32 bytes of ymm7 register
+    __ movl(rcx, VM_Version::ymm_test_value());
+
+    __ movdl(xmm0, rcx);
+    __ pshufd(xmm0, xmm0, 0x00);
+    __ vinsertf128h(xmm0, xmm0, xmm0);
+    __ vmovdqu(xmm7, xmm0);
+#ifdef _LP64
+    __ vmovdqu(xmm8, xmm0);
+    __ vmovdqu(xmm15, xmm0);
+#endif
+    VM_Version::clean_cpuFeatures();
+
+    __ bind(save_restore_except);
+    __ xorl(rsi, rsi);
+    VM_Version::set_cpuinfo_segv_addr(__ pc());
+    // Generate SEGV
+    __ movl(rax, Address(rsi, 0));
+
+    VM_Version::set_cpuinfo_cont_addr(__ pc());
+    // Returns here after signal. Save xmm0 to check it later.
+
+    // check _cpuid_info.sef_cpuid7_ebx.bits.avx512f
+    __ lea(rsi, Address(rbp, in_bytes(VM_Version::sef_cpuid7_offset())));
+    __ movl(rax, 0x10000);
+    __ andl(rax, Address(rsi, 4));
+    __ cmpl(rax, 0x10000);
+    __ jccb(Assembler::notEqual, legacy_save_restore);
+    // check _cpuid_info.xem_xcr0_eax.bits.opmask
+    // check _cpuid_info.xem_xcr0_eax.bits.zmm512
+    // check _cpuid_info.xem_xcr0_eax.bits.zmm32
+    __ movl(rax, 0xE0);
+    __ andl(rax, Address(rbp, in_bytes(VM_Version::xem_xcr0_offset()))); // xcr0 bits opmask | zmm512 | zmm32
+    __ cmpl(rax, 0xE0);
+    __ jccb(Assembler::notEqual, legacy_save_restore);
+
+    // EVEX check: run in lowest evex mode
+    VM_Version::set_evex_cpuFeatures(); // Enable temporary to pass asserts
+    UseAVX = 3;
+    UseSSE = 2;
+    __ lea(rsi, Address(rbp, in_bytes(VM_Version::zmm_save_offset())));
+    __ evmovdqu(Address(rsi, 0), xmm0, Assembler::AVX_512bit);
+    __ evmovdqu(Address(rsi, 64), xmm7, Assembler::AVX_512bit);
+#ifdef _LP64
+    __ evmovdqu(Address(rsi, 128), xmm8, Assembler::AVX_512bit);
+    __ evmovdqu(Address(rsi, 192), xmm31, Assembler::AVX_512bit);
+#endif
+    VM_Version::clean_cpuFeatures();
+    UseAVX = saved_useavx;
+    UseSSE = saved_usesse;
+    __ jmp(wrapup);
+
+    __ bind(legacy_save_restore);
+    // AVX check
+    VM_Version::set_avx_cpuFeatures(); // Enable temporary to pass asserts
+    UseAVX = 1;
+    UseSSE = 2;
+    __ lea(rsi, Address(rbp, in_bytes(VM_Version::ymm_save_offset())));
+    __ vmovdqu(Address(rsi, 0), xmm0);
+    __ vmovdqu(Address(rsi, 32), xmm7);
+#ifdef _LP64
+    __ vmovdqu(Address(rsi, 64), xmm8);
+    __ vmovdqu(Address(rsi, 96), xmm15);
+#endif
+    VM_Version::clean_cpuFeatures();
+    UseAVX = saved_useavx;
+    UseSSE = saved_usesse;
+
+    __ bind(wrapup);
     __ popf();
     __ pop(rsi);
     __ pop(rbx);
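
The detection sequence in the hunk above is a chain of bit tests on the cached CPUID and XCR0 words. A hedged C++ restatement using the same literal masks (0x18000000, 0x6, 0x10000, 0xE0); the function, enum and parameter names are invented for illustration, and the raw CPUID/XGETBV values are assumed to be read elsewhere:

    #include <cstdint>

    // cpuid1_ecx = CPUID.1:ECX, cpuid7_ebx = CPUID.(EAX=7,ECX=0):EBX, xcr0 = XGETBV(0).
    enum class SimdCheck { None, AvxYmm, EvexZmm };

    static SimdCheck classify_simd(uint32_t cpuid1_ecx, uint32_t cpuid7_ebx, uint64_t xcr0) {
      if ((cpuid1_ecx & 0x18000000) != 0x18000000) return SimdCheck::None;  // OSXSAVE | AVX
      if ((xcr0 & 0x6) != 0x6)                     return SimdCheck::None;  // SSE | YMM state enabled by the OS
      if ((cpuid7_ebx & 0x10000) != 0 &&                                    // AVX512F reported
          (xcr0 & 0xE0) == 0xE0)                                            // opmask | zmm512 | zmm32 enabled
        return SimdCheck::EvexZmm;                                          // EVEX setup / zmm_save path
      return SimdCheck::AvxYmm;                                             // legacy AVX / ymm_save path
    }
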
@@ -459,6 +547,29 @@
   if (UseSSE < 1)
     _cpuFeatures &= ~CPU_SSE;
 
+  // first try initial setting and detect what we can support
+  if (UseAVX > 0) {
+    if (UseAVX > 2 && supports_evex()) {
+      UseAVX = 3;
+    } else if (UseAVX > 1 && supports_avx2()) {
+      UseAVX = 2;
+    } else if (UseAVX > 0 && supports_avx()) {
+      UseAVX = 1;
+    } else {
+      UseAVX = 0;
+    }
+  } else if (UseAVX < 0) {
+    UseAVX = 0;
+  }
+
+  if (UseAVX < 3) {
+    _cpuFeatures &= ~CPU_AVX512F;
+    _cpuFeatures &= ~CPU_AVX512DQ;
+    _cpuFeatures &= ~CPU_AVX512CD;
+    _cpuFeatures &= ~CPU_AVX512BW;
+    _cpuFeatures &= ~CPU_AVX512VL;
+  }
+
   if (UseAVX < 2)
     _cpuFeatures &= ~CPU_AVX2;
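
Once UseAVX is clamped this way, any AVX-512 feature bits are stripped below level 3 and the AVX2 bit below level 2. A small stand-alone sketch of the clamping cascade (the helper name and parameters are invented; the has_* flags stand for the supports_* predicates used in this file):

    // Illustrative clamp: e.g. -XX:UseAVX=3 on an AVX2-only part ends up as 2.
    static int clamp_use_avx(int requested, bool has_evex, bool has_avx2, bool has_avx) {
      if (requested <= 0)             return 0;
      if (requested > 2 && has_evex)  return 3;
      if (requested > 1 && has_avx2)  return 2;
      if (has_avx)                    return 1;
      return 0;
    }
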
 
@@ -474,7 +585,7 @@
   }
 
   char buf[256];
-  jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
+  jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
                cores_per_cpu(), threads_per_core(),
                cpu_family(), _model, _stepping,
                (supports_cmov() ? ", cmov" : ""),
@@ -504,7 +615,8 @@
                (supports_tscinv() ? ", tscinv": ""),
                (supports_bmi1() ? ", bmi1" : ""),
                (supports_bmi2() ? ", bmi2" : ""),
-               (supports_adx() ? ", adx" : ""));
+               (supports_adx() ? ", adx" : ""),
+               (supports_evex() ? ", evex" : ""));
   _features_str = os::strdup(buf);
 
   // UseSSE is set to the smaller of what hardware supports and what
@@ -521,13 +633,6 @@
   if (!supports_sse ()) // Drop to 0 if no SSE  support
     UseSSE = 0;
 
-  if (UseAVX > 2) UseAVX=2;
-  if (UseAVX < 0) UseAVX=0;
-  if (!supports_avx2()) // Drop to 1 if no AVX2 support
-    UseAVX = MIN2((intx)1,UseAVX);
-  if (!supports_avx ()) // Drop to 0 if no AVX  support
-    UseAVX = 0;
-
   // Use AES instructions if available.
   if (supports_aes()) {
     if (FLAG_IS_DEFAULT(UseAES)) {
@@ -598,7 +703,8 @@
       if ((_model == CPU_MODEL_HASWELL_E3) ||
           (_model == CPU_MODEL_HASWELL_E7 && _stepping < 3) ||
           (_model == CPU_MODEL_BROADWELL  && _stepping < 4)) {
-        if (!UnlockExperimentalVMOptions) {
+        // currently a collision between SKL and HSW_E3
+        if (!UnlockExperimentalVMOptions && UseAVX < 3) {
           vm_exit_during_initialization("UseRTMLocking is only available as experimental option on this platform. It must be enabled via -XX:+UnlockExperimentalVMOptions flag.");
         } else {
           warning("UseRTMLocking is only available as experimental option on this platform.");
@@ -651,10 +757,10 @@
   if (MaxVectorSize > 0) {
     if (!is_power_of_2(MaxVectorSize)) {
       warning("MaxVectorSize must be a power of 2");
-      FLAG_SET_DEFAULT(MaxVectorSize, 32);
+      FLAG_SET_DEFAULT(MaxVectorSize, 64);
     }
-    if (MaxVectorSize > 32) {
-      FLAG_SET_DEFAULT(MaxVectorSize, 32);
+    if (MaxVectorSize > 64) {
+      FLAG_SET_DEFAULT(MaxVectorSize, 64);
     }
     if (MaxVectorSize > 16 && (UseAVX == 0 || !os_supports_avx_vectors())) {
       // 32 bytes vectors (in YMM) are only supported with AVX+
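
With 512-bit registers available, the MaxVectorSize ceiling moves from 32 to 64 bytes. A hedged sketch of the clamping as plain C++; the names are illustrative, and the final branch is cut off in the hunk above, so its fallback is marked as an assumption:

    // Illustrative: clamp a requested vector width (in bytes) under AVX-512.
    static int clamp_max_vector_size(int requested, bool avx_usable) {
      auto is_power_of_2 = [](int v) { return v > 0 && (v & (v - 1)) == 0; };
      int size = is_power_of_2(requested) ? requested : 64;  // default to the new 64-byte cap
      if (size > 64) size = 64;                              // AVX-512 ceiling
      if (size > 16 && !avx_usable) size = 16;               // assumed fallback; body not shown above
      return size;
    }
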
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.hpp	Thu May 07 15:34:45 2015 -0700
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.hpp	Fri May 08 11:49:20 2015 -0700
@@ -208,20 +208,33 @@
                    bmi2 : 1,
                    erms : 1,
                         : 1,
-                   rtm  : 1,
-                        : 7,
-                   adx  : 1,
-                        : 12;
+                    rtm : 1,
+                        : 4,
+                avx512f : 1,
+               avx512dq : 1,
+                        : 1,
+                    adx : 1,
+                        : 6,
+               avx512pf : 1,
+               avx512er : 1,
+               avx512cd : 1,
+                        : 1,
+               avx512bw : 1,
+               avx512vl : 1;
     } bits;
   };
 
   union XemXcr0Eax {
     uint32_t value;
     struct {
-      uint32_t x87 : 1,
-               sse : 1,
-               ymm : 1,
-                   : 29;
+      uint32_t x87    : 1,
+               sse    : 1,
+               ymm    : 1,
+                      : 2,
+               opmask : 1,
+               zmm512 : 1,
+                zmm32 : 1,
+                      : 24;
     } bits;
   };
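
The widened XemXcr0Eax layout is where the stub's 0xE0 mask comes from: opmask is bit 5, zmm512 (upper halves of ZMM0-15) is bit 6, and zmm32 (ZMM16-31) is bit 7. A tiny check of that arithmetic:

    #include <cstdint>

    constexpr uint32_t XCR0_SSE    = 1u << 1;
    constexpr uint32_t XCR0_YMM    = 1u << 2;
    constexpr uint32_t XCR0_OPMASK = 1u << 5;
    constexpr uint32_t XCR0_ZMM512 = 1u << 6;  // upper 256 bits of ZMM0-15
    constexpr uint32_t XCR0_ZMM32  = 1u << 7;  // ZMM16-31

    static_assert((XCR0_SSE | XCR0_YMM) == 0x6, "mask used for the AVX check");
    static_assert((XCR0_OPMASK | XCR0_ZMM512 | XCR0_ZMM32) == 0xE0, "mask used for the EVEX check");
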
 
@@ -229,43 +242,51 @@
   static int _cpu;
   static int _model;
   static int _stepping;
-  static int _cpuFeatures;     // features returned by the "cpuid" instruction
-                               // 0 if this instruction is not available
+  static uint64_t _cpuFeatures; // features returned by the "cpuid" instruction
+                                // 0 if this instruction is not available
   static const char* _features_str;
 
   static address   _cpuinfo_segv_addr; // address of instruction which causes SEGV
   static address   _cpuinfo_cont_addr; // address of instruction after the one which causes SEGV
 
   enum {
-    CPU_CX8    = (1 << 0), // next bits are from cpuid 1 (EDX)
-    CPU_CMOV   = (1 << 1),
-    CPU_FXSR   = (1 << 2),
-    CPU_HT     = (1 << 3),
-    CPU_MMX    = (1 << 4),
-    CPU_3DNOW_PREFETCH  = (1 << 5), // Processor supports 3dnow prefetch and prefetchw instructions
-                                    // may not necessarily support other 3dnow instructions
-    CPU_SSE    = (1 << 6),
-    CPU_SSE2   = (1 << 7),
-    CPU_SSE3   = (1 << 8), // SSE3 comes from cpuid 1 (ECX)
-    CPU_SSSE3  = (1 << 9),
-    CPU_SSE4A  = (1 << 10),
-    CPU_SSE4_1 = (1 << 11),
-    CPU_SSE4_2 = (1 << 12),
-    CPU_POPCNT = (1 << 13),
-    CPU_LZCNT  = (1 << 14),
-    CPU_TSC    = (1 << 15),
-    CPU_TSCINV = (1 << 16),
-    CPU_AVX    = (1 << 17),
-    CPU_AVX2   = (1 << 18),
-    CPU_AES    = (1 << 19),
-    CPU_ERMS   = (1 << 20), // enhanced 'rep movsb/stosb' instructions
-    CPU_CLMUL  = (1 << 21), // carryless multiply for CRC
-    CPU_BMI1   = (1 << 22),
-    CPU_BMI2   = (1 << 23),
-    CPU_RTM    = (1 << 24),  // Restricted Transactional Memory instructions
-    CPU_ADX    = (1 << 25)
+    CPU_CX8      = (1 << 0), // next bits are from cpuid 1 (EDX)
+    CPU_CMOV     = (1 << 1),
+    CPU_FXSR     = (1 << 2),
+    CPU_HT       = (1 << 3),
+    CPU_MMX      = (1 << 4),
+    CPU_3DNOW_PREFETCH = (1 << 5), // Processor supports 3dnow prefetch and prefetchw instructions
+                                   // may not necessarily support other 3dnow instructions
+    CPU_SSE      = (1 << 6),
+    CPU_SSE2     = (1 << 7),
+    CPU_SSE3     = (1 << 8),  // SSE3 comes from cpuid 1 (ECX)
+    CPU_SSSE3    = (1 << 9),
+    CPU_SSE4A    = (1 << 10),
+    CPU_SSE4_1   = (1 << 11),
+    CPU_SSE4_2   = (1 << 12),
+    CPU_POPCNT   = (1 << 13),
+    CPU_LZCNT    = (1 << 14),
+    CPU_TSC      = (1 << 15),
+    CPU_TSCINV   = (1 << 16),
+    CPU_AVX      = (1 << 17),
+    CPU_AVX2     = (1 << 18),
+    CPU_AES      = (1 << 19),
+    CPU_ERMS     = (1 << 20), // enhanced 'rep movsb/stosb' instructions
+    CPU_CLMUL    = (1 << 21), // carryless multiply for CRC
+    CPU_BMI1     = (1 << 22),
+    CPU_BMI2     = (1 << 23),
+    CPU_RTM      = (1 << 24), // Restricted Transactional Memory instructions
+    CPU_ADX      = (1 << 25),
+    CPU_AVX512F  = (1 << 26), // AVX 512bit foundation instructions
+    CPU_AVX512DQ = (1 << 27),
+    CPU_AVX512PF = (1 << 28),
+    CPU_AVX512ER = (1 << 29),
+    CPU_AVX512CD = (1 << 30),
+    CPU_AVX512BW = (1 << 31)
   } cpuFeatureFlags;
 
+#define CPU_AVX512VL 0x100000000 // EVEX instructions with smaller vector length : enums are limited to 32bit
+
   enum {
     // AMD
     CPU_FAMILY_AMD_11H       = 0x11,
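
The feature-flag changes in the hunk above also explain the uint64_t widening: CPU_AVX512BW already occupies bit 31, so CPU_AVX512VL has to live at bit 32 as a 64-bit macro, and _cpuFeatures / feature_flags() grow to match. A minimal sketch of the widened flag word (constants restated with explicit 64-bit literals, illustrative only):

    #include <cstdint>

    constexpr uint64_t CPU_AVX512F_BIT  = UINT64_C(1) << 26;
    constexpr uint64_t CPU_AVX512BW_BIT = UINT64_C(1) << 31;  // last bit that fits a 32-bit enum
    constexpr uint64_t CPU_AVX512VL_BIT = UINT64_C(1) << 32;  // 0x100000000, needs 64-bit storage

    static bool has_avx512vl(uint64_t cpu_features) {
      return (cpu_features & CPU_AVX512VL_BIT) != 0;
    }
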
@@ -282,7 +303,8 @@
     CPU_MODEL_IVYBRIDGE_EP   = 0x3a,
     CPU_MODEL_HASWELL_E3     = 0x3c,
     CPU_MODEL_HASWELL_E7     = 0x3f,
-    CPU_MODEL_BROADWELL      = 0x3d
+    CPU_MODEL_BROADWELL      = 0x3d,
+    CPU_MODEL_SKYLAKE        = CPU_MODEL_HASWELL_E3
   } cpuExtendedFamily;
 
   // cpuid information block.  All info derived from executing cpuid with
@@ -376,6 +398,9 @@
 
     // Space to save ymm registers after signal handle
     int          ymm_save[8*4]; // Save ymm0, ymm7, ymm8, ymm15
+
+    // Space to save zmm registers after signal handle
+    int          zmm_save[16*4]; // Save zmm0, zmm7, zmm8, zmm31
   };
 
   // The actual cpuid info block
@@ -404,8 +429,8 @@
     return result;
   }
 
-  static uint32_t feature_flags() {
-    uint32_t result = 0;
+  static uint64_t feature_flags() {
+    uint64_t result = 0;
     if (_cpuid_info.std_cpuid1_edx.bits.cmpxchg8 != 0)
       result |= CPU_CX8;
     if (_cpuid_info.std_cpuid1_edx.bits.cmov != 0)
@@ -440,6 +465,24 @@
       result |= CPU_AVX;
       if (_cpuid_info.sef_cpuid7_ebx.bits.avx2 != 0)
         result |= CPU_AVX2;
+      if (_cpuid_info.sef_cpuid7_ebx.bits.avx512f != 0 &&
+          _cpuid_info.xem_xcr0_eax.bits.opmask != 0 &&
+          _cpuid_info.xem_xcr0_eax.bits.zmm512 != 0 &&
+          _cpuid_info.xem_xcr0_eax.bits.zmm32 != 0) {
+        result |= CPU_AVX512F;
+        if (_cpuid_info.sef_cpuid7_ebx.bits.avx512cd != 0)
+          result |= CPU_AVX512CD;
+        if (_cpuid_info.sef_cpuid7_ebx.bits.avx512dq != 0)
+          result |= CPU_AVX512DQ;
+        if (_cpuid_info.sef_cpuid7_ebx.bits.avx512pf != 0)
+          result |= CPU_AVX512PF;
+        if (_cpuid_info.sef_cpuid7_ebx.bits.avx512er != 0)
+          result |= CPU_AVX512ER;
+        if (_cpuid_info.sef_cpuid7_ebx.bits.avx512bw != 0)
+          result |= CPU_AVX512BW;
+        if (_cpuid_info.sef_cpuid7_ebx.bits.avx512vl != 0)
+          result |= CPU_AVX512VL;
+      }
     }
     if(_cpuid_info.sef_cpuid7_ebx.bits.bmi1 != 0)
       result |= CPU_BMI1;
@@ -484,18 +527,31 @@
   }
 
   static bool os_supports_avx_vectors() {
-    if (!supports_avx()) {
-      return false;
-    }
-    // Verify that OS save/restore all bits of AVX registers
-    // during signal processing.
-    int nreg = 2 LP64_ONLY(+2);
-    for (int i = 0; i < 8 * nreg; i++) { // 32 bytes per ymm register
-      if (_cpuid_info.ymm_save[i] != ymm_test_value()) {
-        return false;
+    bool retVal = false;
+    if (supports_evex()) {
+      // Verify that OS save/restore all bits of EVEX registers
+      // during signal processing.
+      int nreg = 2 LP64_ONLY(+2);
+      retVal = true;
+      for (int i = 0; i < 16 * nreg; i++) { // 64 bytes per zmm register
+        if (_cpuid_info.zmm_save[i] != ymm_test_value()) {
+          retVal = false;
+          break;
+        }
+      }
+    } else if (supports_avx()) {
+      // Verify that OS save/restore all bits of AVX registers
+      // during signal processing.
+      int nreg = 2 LP64_ONLY(+2);
+      retVal = true;
+      for (int i = 0; i < 8 * nreg; i++) { // 32 bytes per ymm register
+        if (_cpuid_info.ymm_save[i] != ymm_test_value()) {
+          retVal = false;
+          break;
+        }
       }
     }
-    return true;
+    return retVal;
   }
 
   static void get_processor_features();
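
The reworked check walks the register images dumped after the deliberate SEGV: on EVEX hardware it scans zmm_save (sixteen 32-bit slots per register), otherwise ymm_save (eight slots), over 2 registers plus 2 more on LP64, and any slot that no longer holds ymm_test_value() means the OS dropped SIMD state across the signal. A compact sketch of that scan (names mirror the fields above; illustrative):

    // Verify that every saved 32-bit slot still carries the test pattern.
    static bool saved_registers_intact(const int* save_area, int slots_per_reg, int nreg,
                                       int test_value /* e.g. 0xCAFEBABE */) {
      for (int i = 0; i < slots_per_reg * nreg; i++) {
        if (save_area[i] != test_value) return false;  // OS lost state across the signal
      }
      return true;
    }

    // Usage, following the code above (nreg = 2 LP64_ONLY(+2)):
    //   EVEX: saved_registers_intact(zmm_save, 16, nreg, ymm_test_value())
    //   AVX:  saved_registers_intact(ymm_save,  8, nreg, ymm_test_value())
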
@@ -515,6 +571,7 @@
   static ByteSize tpl_cpuidB2_offset() { return byte_offset_of(CpuidInfo, tpl_cpuidB2_eax); }
   static ByteSize xem_xcr0_offset() { return byte_offset_of(CpuidInfo, xem_xcr0_eax); }
   static ByteSize ymm_save_offset() { return byte_offset_of(CpuidInfo, ymm_save); }
+  static ByteSize zmm_save_offset() { return byte_offset_of(CpuidInfo, zmm_save); }
 
   // The value used to check ymm register after signal handle
   static int ymm_test_value()    { return 0xCAFEBABE; }
@@ -527,6 +584,7 @@
 
   static void clean_cpuFeatures()   { _cpuFeatures = 0; }
   static void set_avx_cpuFeatures() { _cpuFeatures = (CPU_SSE | CPU_SSE2 | CPU_AVX); }
+  static void set_evex_cpuFeatures() { _cpuFeatures = (CPU_AVX512F | CPU_SSE | CPU_SSE2); }
 
 
   // Initialization
@@ -636,7 +694,14 @@
   static bool supports_rtm()      { return (_cpuFeatures & CPU_RTM) != 0; }
   static bool supports_bmi1()     { return (_cpuFeatures & CPU_BMI1) != 0; }
   static bool supports_bmi2()     { return (_cpuFeatures & CPU_BMI2) != 0; }
-  static bool supports_adx()     { return (_cpuFeatures & CPU_ADX) != 0; }
+  static bool supports_adx()      { return (_cpuFeatures & CPU_ADX) != 0; }
+  static bool supports_evex()     { return (_cpuFeatures & CPU_AVX512F) != 0; }
+  static bool supports_avx512dq() { return (_cpuFeatures & CPU_AVX512DQ) != 0; }
+  static bool supports_avx512pf() { return (_cpuFeatures & CPU_AVX512PF) != 0; }
+  static bool supports_avx512er() { return (_cpuFeatures & CPU_AVX512ER) != 0; }
+  static bool supports_avx512cd() { return (_cpuFeatures & CPU_AVX512CD) != 0; }
+  static bool supports_avx512bw() { return (_cpuFeatures & CPU_AVX512BW) != 0; }
+  static bool supports_avx512vl() { return (_cpuFeatures & CPU_AVX512VL) != 0; }
   // Intel features
   static bool is_intel_family_core() { return is_intel() &&
                                        extended_cpu_family() == CPU_FAMILY_INTEL_CORE; }
--- a/hotspot/src/cpu/x86/vm/vmreg_x86.cpp	Thu May 07 15:34:45 2015 -0700
+++ b/hotspot/src/cpu/x86/vm/vmreg_x86.cpp	Fri May 08 11:49:20 2015 -0700
@@ -47,13 +47,22 @@
   }
 
   XMMRegister xreg = ::as_XMMRegister(0);
-  for ( ; i < ConcreteRegisterImpl::max_xmm ; ) {
-    for (int j = 0 ; j < 8 ; j++) {
+  for (; i < ConcreteRegisterImpl::max_xmm;) {
+    for (int j = 0 ; j < XMMRegisterImpl::max_slots_per_register ; j++) {
       regName[i++] = xreg->name();
     }
     xreg = xreg->successor();
   }
+
+  KRegister kreg = ::as_KRegister(0);
+  for (; i < ConcreteRegisterImpl::max_kpr;) {
+    for (int j = 0; j < KRegisterImpl::max_slots_per_register; j++) {
+      regName[i++] = kreg->name();
+    }
+    kreg = kreg->successor();
+  }
+
   for ( ; i < ConcreteRegisterImpl::number_of_registers ; i ++ ) {
-    regName[i] = "NON-GPR-FPR-XMM";
+    regName[i] = "NON-GPR-FPR-XMM-KREG";
   }
 }
--- a/hotspot/src/cpu/x86/vm/vmreg_x86.hpp	Thu May 07 15:34:45 2015 -0700
+++ b/hotspot/src/cpu/x86/vm/vmreg_x86.hpp	Fri May 08 11:49:20 2015 -0700
@@ -36,7 +36,24 @@
 }
 
 inline bool is_XMMRegister() {
-  return value() >= ConcreteRegisterImpl::max_fpr && value() < ConcreteRegisterImpl::max_xmm;
+  int uarch_max_xmm = ConcreteRegisterImpl::max_xmm;
+
+#ifdef _LP64
+  if (UseAVX < 3) {
+    int half_xmm = (XMMRegisterImpl::max_slots_per_register * XMMRegisterImpl::number_of_registers) / 2;
+    uarch_max_xmm -= half_xmm;
+  }
+#endif
+
+  return (value() >= ConcreteRegisterImpl::max_fpr && value() < uarch_max_xmm);
+}
+
+inline bool is_KRegister() {
+  if (UseAVX > 2) {
+    return value() >= ConcreteRegisterImpl::max_xmm && value() < ConcreteRegisterImpl::max_kpr;
+  } else {
+    return false;
+  }
 }
 
 inline Register as_Register() {
@@ -59,7 +76,13 @@
 inline XMMRegister as_XMMRegister() {
   assert( is_XMMRegister() && is_even(value()), "must be" );
   // Yuk
-  return ::as_XMMRegister((value() - ConcreteRegisterImpl::max_fpr) >> 3);
+  return ::as_XMMRegister((value() - ConcreteRegisterImpl::max_fpr) >> 4);
+}
+
+inline KRegister as_KRegister() {
+  assert(is_KRegister(), "must be");
+  // Yuk
+  return ::as_KRegister((value() - ConcreteRegisterImpl::max_xmm));
 }
 
 inline   bool is_concrete() {
--- a/hotspot/src/cpu/x86/vm/vmreg_x86.inline.hpp	Thu May 07 15:34:45 2015 -0700
+++ b/hotspot/src/cpu/x86/vm/vmreg_x86.inline.hpp	Fri May 08 11:49:20 2015 -0700
@@ -39,7 +39,11 @@
 }
 
 inline VMReg XMMRegisterImpl::as_VMReg() {
-  return VMRegImpl::as_VMReg((encoding() << 3) + ConcreteRegisterImpl::max_fpr);
+  return VMRegImpl::as_VMReg((encoding() << 4) + ConcreteRegisterImpl::max_fpr);
+}
+
+inline VMReg KRegisterImpl::as_VMReg() {
+  return VMRegImpl::as_VMReg(encoding() + ConcreteRegisterImpl::max_xmm);
 }
 
 #endif // CPU_X86_VM_VMREG_X86_INLINE_HPP
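
The encoding shift from <<3 to <<4 reflects 16 VMReg slots per XMM register instead of 8; as_XMMRegister() shifts right by 4 to recover the register index, and K registers take one slot each starting at max_xmm. A small round-trip sketch under those assumptions (constants are illustrative; max_fpr stands for ConcreteRegisterImpl::max_fpr):

    // Illustrative VMReg <-> XMM index round trip with 16 slots per register.
    constexpr int kSlotsPerXmm = 16;        // was 8 before AVX-512, hence <<3 -> <<4

    static int xmm_to_vmreg(int xmm_index, int max_fpr) {
      return (xmm_index << 4) + max_fpr;    // XMMRegisterImpl::as_VMReg()
    }

    static int vmreg_to_xmm(int vmreg_value, int max_fpr) {
      return (vmreg_value - max_fpr) >> 4;  // VMRegImpl::as_XMMRegister()
    }

    static_assert(kSlotsPerXmm == (1 << 4), "shift amount matches the slot count");
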
--- a/hotspot/src/cpu/x86/vm/x86.ad	Thu May 07 15:34:45 2015 -0700
+++ b/hotspot/src/cpu/x86/vm/x86.ad	Fri May 08 11:49:20 2015 -0700
@@ -59,15 +59,19 @@
 //
 // The encoding number is the actual bit-pattern placed into the opcodes.
 
-// XMM registers.  256-bit registers or 8 words each, labeled (a)-h.
+// XMM registers.  512-bit registers or 16 words each, labeled (a)-p.
 // Word a in each register holds a Float, words ab hold a Double.
 // The whole registers are used in SSE4.2 version intrinsics,
 // array copy stubs and superword operations (see UseSSE42Intrinsics,
 // UseXMMForArrayCopy and UseSuperword flags).
-// XMM8-XMM15 must be encoded with REX (VEX for UseAVX).
+// For pre-EVEX architectures:
+//      XMM8-XMM15 must be encoded with REX (VEX for UseAVX).
+// For EVEX-enabled architectures:
+//      XMM8-XMM31 must be encoded with REX (EVEX for UseAVX).
+//
 // Linux ABI:   No register preserved across function calls
 //              XMM0-XMM7 might hold parameters
-// Windows ABI: XMM6-XMM15 preserved across function calls
+// Windows ABI: XMM6-XMM31 preserved across function calls
 //              XMM0-XMM3 might hold parameters
 
 reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
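
For the ADL tables that follow, each 512-bit register now contributes sixteen 32-bit slots (suffixes a through p) instead of eight, and on 64-bit EVEX targets the file enumerates XMM0 through XMM31. A throwaway check of that arithmetic (illustrative constants only):

    constexpr int kBitsPerSlot     = 32;                       // one Op_RegF slot
    constexpr int kZmmBits         = 512;
    constexpr int kSlotsPerZmm     = kZmmBits / kBitsPerSlot;  // 16 -> suffixes a..p
    constexpr int kXmmRegsWithEvex = 32;                       // XMM0..XMM31 on LP64 + AVX-512

    static_assert(kSlotsPerZmm == 16, "sixteen reg_def lines per register");
    static_assert(kXmmRegsWithEvex * kSlotsPerZmm == 512, "total XMM reg_def slots");
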
@@ -78,6 +82,14 @@
 reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
 reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
 reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
+reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
+reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
+reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
+reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
+reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
+reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
+reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
+reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));
 
 reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
 reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
@@ -87,6 +99,14 @@
 reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
 reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
 reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
+reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
+reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
+reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
+reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
+reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
+reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
+reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
+reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));
 
 reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
 reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
@@ -96,6 +116,14 @@
 reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
 reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
 reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
+reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
+reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
+reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
+reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
+reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
+reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
+reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
+reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));
 
 reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
 reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
@@ -105,6 +133,14 @@
 reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
 reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
 reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
+reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
+reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
+reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
+reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
+reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
+reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
+reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
+reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));
 
 reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
 reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
@@ -114,6 +150,14 @@
 reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
 reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
 reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
+reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
+reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
+reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
+reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
+reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
+reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
+reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
+reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));
 
 reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
 reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
@@ -123,6 +167,14 @@
 reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
 reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
 reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
+reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
+reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
+reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
+reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
+reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
+reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
+reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
+reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));
 
 #ifdef _WIN64
 
@@ -134,6 +186,14 @@
 reg_def XMM6f( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(5));
 reg_def XMM6g( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(6));
 reg_def XMM6h( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(7));
+reg_def XMM6i( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(8));
+reg_def XMM6j( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(9));
+reg_def XMM6k( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(10));
+reg_def XMM6l( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(11));
+reg_def XMM6m( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(12));
+reg_def XMM6n( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(13));
+reg_def XMM6o( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(14));
+reg_def XMM6p( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(15));
 
 reg_def XMM7 ( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg());
 reg_def XMM7b( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(1));
@@ -143,6 +203,14 @@
 reg_def XMM7f( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(5));
 reg_def XMM7g( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(6));
 reg_def XMM7h( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(7));
+reg_def XMM7i( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(8));
+reg_def XMM7j( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(9));
+reg_def XMM7k( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(10));
+reg_def XMM7l( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(11));
+reg_def XMM7m( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(12));
+reg_def XMM7n( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(13));
+reg_def XMM7o( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(14));
+reg_def XMM7p( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(15));
 
 reg_def XMM8 ( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg());
 reg_def XMM8b( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(1));
@@ -152,6 +220,14 @@
 reg_def XMM8f( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(5));
 reg_def XMM8g( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(6));
 reg_def XMM8h( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(7));
+reg_def XMM8i( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(8));
+reg_def XMM8j( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(9));
+reg_def XMM8k( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(10));
+reg_def XMM8l( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(11));
+reg_def XMM8m( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(12));
+reg_def XMM8n( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(13));
+reg_def XMM8o( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(14));
+reg_def XMM8p( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(15));
 
 reg_def XMM9 ( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg());
 reg_def XMM9b( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(1));
@@ -161,6 +237,14 @@
 reg_def XMM9f( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(5));
 reg_def XMM9g( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(6));
 reg_def XMM9h( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(7));
+reg_def XMM9i( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(8));
+reg_def XMM9j( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(9));
+reg_def XMM9k( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(10));
+reg_def XMM9l( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(11));
+reg_def XMM9m( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(12));
+reg_def XMM9n( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(13));
+reg_def XMM9o( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(14));
+reg_def XMM9p( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(15));
 
 reg_def XMM10 ( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg());
 reg_def XMM10b( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(1));
@@ -170,6 +254,14 @@
 reg_def XMM10f( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(5));
 reg_def XMM10g( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(6));
 reg_def XMM10h( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(7));
+reg_def XMM10i( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(8));
+reg_def XMM10j( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(9));
+reg_def XMM10k( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(10));
+reg_def XMM10l( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(11));
+reg_def XMM10m( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(12));
+reg_def XMM10n( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(13));
+reg_def XMM10o( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(14));
+reg_def XMM10p( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(15));
 
 reg_def XMM11 ( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg());
 reg_def XMM11b( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(1));
@@ -179,6 +271,14 @@
 reg_def XMM11f( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(5));
 reg_def XMM11g( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(6));
 reg_def XMM11h( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(7));
+reg_def XMM11i( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(8));
+reg_def XMM11j( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(9));
+reg_def XMM11k( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(10));
+reg_def XMM11l( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(11));
+reg_def XMM11m( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(12));
+reg_def XMM11n( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(13));
+reg_def XMM11o( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(14));
+reg_def XMM11p( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(15));
 
 reg_def XMM12 ( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg());
 reg_def XMM12b( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(1));
@@ -188,6 +288,14 @@
 reg_def XMM12f( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(5));
 reg_def XMM12g( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(6));
 reg_def XMM12h( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(7));
+reg_def XMM12i( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(8));
+reg_def XMM12j( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(9));
+reg_def XMM12k( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(10));
+reg_def XMM12l( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(11));
+reg_def XMM12m( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(12));
+reg_def XMM12n( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(13));
+reg_def XMM12o( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(14));
+reg_def XMM12p( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(15));
 
 reg_def XMM13 ( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg());
 reg_def XMM13b( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(1));
@@ -197,6 +305,14 @@
 reg_def XMM13f( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(5));
 reg_def XMM13g( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(6));
 reg_def XMM13h( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(7));
+reg_def XMM13i( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(8));
+reg_def XMM13j( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(9));
+reg_def XMM13k( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(10));
+reg_def XMM13l( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(11));
+reg_def XMM13m( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(12));
+reg_def XMM13n( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(13));
+reg_def XMM13o( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(14));
+reg_def XMM13p( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(15));
 
 reg_def XMM14 ( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg());
 reg_def XMM14b( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(1));
@@ -206,6 +322,14 @@
 reg_def XMM14f( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(5));
 reg_def XMM14g( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(6));
 reg_def XMM14h( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(7));
+reg_def XMM14i( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(8));
+reg_def XMM14j( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(9));
+reg_def XMM14k( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(10));
+reg_def XMM14l( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(11));
+reg_def XMM14m( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(12));
+reg_def XMM14n( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(13));
+reg_def XMM14o( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(14));
+reg_def XMM14p( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(15));
 
 reg_def XMM15 ( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg());
 reg_def XMM15b( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(1));
@@ -215,6 +339,285 @@
 reg_def XMM15f( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(5));
 reg_def XMM15g( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(6));
 reg_def XMM15h( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(7));
+reg_def XMM15i( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(8));
+reg_def XMM15j( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(9));
+reg_def XMM15k( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(10));
+reg_def XMM15l( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(11));
+reg_def XMM15m( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(12));
+reg_def XMM15n( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(13));
+reg_def XMM15o( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(14));
+reg_def XMM15p( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(15));
+
+reg_def XMM16 ( SOC, SOE, Op_RegF, 16, xmm16->as_VMReg());
+reg_def XMM16b( SOC, SOE, Op_RegF, 16, xmm16->as_VMReg()->next(1));
+reg_def XMM16c( SOC, SOE, Op_RegF, 16, xmm16->as_VMReg()->next(2));
+reg_def XMM16d( SOC, SOE, Op_RegF, 16, xmm16->as_VMReg()->next(3));
+reg_def XMM16e( SOC, SOE, Op_RegF, 16, xmm16->as_VMReg()->next(4));
+reg_def XMM16f( SOC, SOE, Op_RegF, 16, xmm16->as_VMReg()->next(5));
+reg_def XMM16g( SOC, SOE, Op_RegF, 16, xmm16->as_VMReg()->next(6));
+reg_def XMM16h( SOC, SOE, Op_RegF, 16, xmm16->as_VMReg()->next(7));
+reg_def XMM16i( SOC, SOE, Op_RegF, 16, xmm16->as_VMReg()->next(8));
+reg_def XMM16j( SOC, SOE, Op_RegF, 16, xmm16->as_VMReg()->next(9));
+reg_def XMM16k( SOC, SOE, Op_RegF, 16, xmm16->as_VMReg()->next(10));
+reg_def XMM16l( SOC, SOE, Op_RegF, 16, xmm16->as_VMReg()->next(11));
+reg_def XMM16m( SOC, SOE, Op_RegF, 16, xmm16->as_VMReg()->next(12));
+reg_def XMM16n( SOC, SOE, Op_RegF, 16, xmm16->as_VMReg()->next(13));
+reg_def XMM16o( SOC, SOE, Op_RegF, 16, xmm16->as_VMReg()->next(14));
+reg_def XMM16p( SOC, SOE, Op_RegF, 16, xmm16->as_VMReg()->next(15));
+
+reg_def XMM17 ( SOC, SOE, Op_RegF, 17, xmm17->as_VMReg());
+reg_def XMM17b( SOC, SOE, Op_RegF, 17, xmm17->as_VMReg()->next(1));
+reg_def XMM17c( SOC, SOE, Op_RegF, 17, xmm17->as_VMReg()->next(2));
+reg_def XMM17d( SOC, SOE, Op_RegF, 17, xmm17->as_VMReg()->next(3));
+reg_def XMM17e( SOC, SOE, Op_RegF, 17, xmm17->as_VMReg()->next(4));
+reg_def XMM17f( SOC, SOE, Op_RegF, 17, xmm17->as_VMReg()->next(5));
+reg_def XMM17g( SOC, SOE, Op_RegF, 17, xmm17->as_VMReg()->next(6));
+reg_def XMM17h( SOC, SOE, Op_RegF, 17, xmm17->as_VMReg()->next(7));
+reg_def XMM17i( SOC, SOE, Op_RegF, 17, xmm17->as_VMReg()->next(8));
+reg_def XMM17j( SOC, SOE, Op_RegF, 17, xmm17->as_VMReg()->next(9));
+reg_def XMM17k( SOC, SOE, Op_RegF, 17, xmm17->as_VMReg()->next(10));
+reg_def XMM17l( SOC, SOE, Op_RegF, 17, xmm17->as_VMReg()->next(11));
+reg_def XMM17m( SOC, SOE, Op_RegF, 17, xmm17->as_VMReg()->next(12));
+reg_def XMM17n( SOC, SOE, Op_RegF, 17, xmm17->as_VMReg()->next(13));
+reg_def XMM17o( SOC, SOE, Op_RegF, 17, xmm17->as_VMReg()->next(14));
+reg_def XMM17p( SOC, SOE, Op_RegF, 17, xmm17->as_VMReg()->next(15));
+
+reg_def XMM18 ( SOC, SOE, Op_RegF, 18, xmm18->as_VMReg());
+reg_def XMM18b( SOC, SOE, Op_RegF, 18, xmm18->as_VMReg()->next(1));
+reg_def XMM18c( SOC, SOE, Op_RegF, 18, xmm18->as_VMReg()->next(2));
+reg_def XMM18d( SOC, SOE, Op_RegF, 18, xmm18->as_VMReg()->next(3));
+reg_def XMM18e( SOC, SOE, Op_RegF, 18, xmm18->as_VMReg()->next(4));
+reg_def XMM18f( SOC, SOE, Op_RegF, 18, xmm18->as_VMReg()->next(5));
+reg_def XMM18g( SOC, SOE, Op_RegF, 18, xmm18->as_VMReg()->next(6));
+reg_def XMM18h( SOC, SOE, Op_RegF, 18, xmm18->as_VMReg()->next(7));
+reg_def XMM18i( SOC, SOE, Op_RegF, 18, xmm18->as_VMReg()->next(8));
+reg_def XMM18j( SOC, SOE, Op_RegF, 18, xmm18->as_VMReg()->next(9));
+reg_def XMM18k( SOC, SOE, Op_RegF, 18, xmm18->as_VMReg()->next(10));
+reg_def XMM18l( SOC, SOE, Op_RegF, 18, xmm18->as_VMReg()->next(11));
+reg_def XMM18m( SOC, SOE, Op_RegF, 18, xmm18->as_VMReg()->next(12));
+reg_def XMM18n( SOC, SOE, Op_RegF, 18, xmm18->as_VMReg()->next(13));
+reg_def XMM18o( SOC, SOE, Op_RegF, 18, xmm18->as_VMReg()->next(14));
+reg_def XMM18p( SOC, SOE, Op_RegF, 18, xmm18->as_VMReg()->next(15));
+
+reg_def XMM19 ( SOC, SOE, Op_RegF, 19, xmm19->as_VMReg());
+reg_def XMM19b( SOC, SOE, Op_RegF, 19, xmm19->as_VMReg()->next(1));
+reg_def XMM19c( SOC, SOE, Op_RegF, 19, xmm19->as_VMReg()->next(2));
+reg_def XMM19d( SOC, SOE, Op_RegF, 19, xmm19->as_VMReg()->next(3));
+reg_def XMM19e( SOC, SOE, Op_RegF, 19, xmm19->as_VMReg()->next(4));
+reg_def XMM19f( SOC, SOE, Op_RegF, 19, xmm19->as_VMReg()->next(5));
+reg_def XMM19g( SOC, SOE, Op_RegF, 19, xmm19->as_VMReg()->next(6));
+reg_def XMM19h( SOC, SOE, Op_RegF, 19, xmm19->as_VMReg()->next(7));
+reg_def XMM19i( SOC, SOE, Op_RegF, 19, xmm19->as_VMReg()->next(8));
+reg_def XMM19j( SOC, SOE, Op_RegF, 19, xmm19->as_VMReg()->next(9));
+reg_def XMM19k( SOC, SOE, Op_RegF, 19, xmm19->as_VMReg()->next(10));
+reg_def XMM19l( SOC, SOE, Op_RegF, 19, xmm19->as_VMReg()->next(11));
+reg_def XMM19m( SOC, SOE, Op_RegF, 19, xmm19->as_VMReg()->next(12));
+reg_def XMM19n( SOC, SOE, Op_RegF, 19, xmm19->as_VMReg()->next(13));
+reg_def XMM19o( SOC, SOE, Op_RegF, 19, xmm19->as_VMReg()->next(14));
+reg_def XMM19p( SOC, SOE, Op_RegF, 19, xmm19->as_VMReg()->next(15));
+
+reg_def XMM20 ( SOC, SOE, Op_RegF, 20, xmm20->as_VMReg());
+reg_def XMM20b( SOC, SOE, Op_RegF, 20, xmm20->as_VMReg()->next(1));
+reg_def XMM20c( SOC, SOE, Op_RegF, 20, xmm20->as_VMReg()->next(2));
+reg_def XMM20d( SOC, SOE, Op_RegF, 20, xmm20->as_VMReg()->next(3));
+reg_def XMM20e( SOC, SOE, Op_RegF, 20, xmm20->as_VMReg()->next(4));
+reg_def XMM20f( SOC, SOE, Op_RegF, 20, xmm20->as_VMReg()->next(5));
+reg_def XMM20g( SOC, SOE, Op_RegF, 20, xmm20->as_VMReg()->next(6));
+reg_def XMM20h( SOC, SOE, Op_RegF, 20, xmm20->as_VMReg()->next(7));
+reg_def XMM20i( SOC, SOE, Op_RegF, 20, xmm20->as_VMReg()->next(8));
+reg_def XMM20j( SOC, SOE, Op_RegF, 20, xmm20->as_VMReg()->next(9));
+reg_def XMM20k( SOC, SOE, Op_RegF, 20, xmm20->as_VMReg()->next(10));
+reg_def XMM20l( SOC, SOE, Op_RegF, 20, xmm20->as_VMReg()->next(11));
+reg_def XMM20m( SOC, SOE, Op_RegF, 20, xmm20->as_VMReg()->next(12));
+reg_def XMM20n( SOC, SOE, Op_RegF, 20, xmm20->as_VMReg()->next(13));
+reg_def XMM20o( SOC, SOE, Op_RegF, 20, xmm20->as_VMReg()->next(14));
+reg_def XMM20p( SOC, SOE, Op_RegF, 20, xmm20->as_VMReg()->next(15));
+
+reg_def XMM21 ( SOC, SOE, Op_RegF, 21, xmm21->as_VMReg());
+reg_def XMM21b( SOC, SOE, Op_RegF, 21, xmm21->as_VMReg()->next(1));
+reg_def XMM21c( SOC, SOE, Op_RegF, 21, xmm21->as_VMReg()->next(2));
+reg_def XMM21d( SOC, SOE, Op_RegF, 21, xmm21->as_VMReg()->next(3));
+reg_def XMM21e( SOC, SOE, Op_RegF, 21, xmm21->as_VMReg()->next(4));
+reg_def XMM21f( SOC, SOE, Op_RegF, 21, xmm21->as_VMReg()->next(5));
+reg_def XMM21g( SOC, SOE, Op_RegF, 21, xmm21->as_VMReg()->next(6));
+reg_def XMM21h( SOC, SOE, Op_RegF, 21, xmm21->as_VMReg()->next(7));
+reg_def XMM21i( SOC, SOE, Op_RegF, 21, xmm21->as_VMReg()->next(8));
+reg_def XMM21j( SOC, SOE, Op_RegF, 21, xmm21->as_VMReg()->next(9));
+reg_def XMM21k( SOC, SOE, Op_RegF, 21, xmm21->as_VMReg()->next(10));
+reg_def XMM21l( SOC, SOE, Op_RegF, 21, xmm21->as_VMReg()->next(11));
+reg_def XMM21m( SOC, SOE, Op_RegF, 21, xmm21->as_VMReg()->next(12));
+reg_def XMM21n( SOC, SOE, Op_RegF, 21, xmm21->as_VMReg()->next(13));
+reg_def XMM21o( SOC, SOE, Op_RegF, 21, xmm21->as_VMReg()->next(14));
+reg_def XMM21p( SOC, SOE, Op_RegF, 21, xmm21->as_VMReg()->next(15));
+
+reg_def XMM22 ( SOC, SOE, Op_RegF, 22, xmm22->as_VMReg());
+reg_def XMM22b( SOC, SOE, Op_RegF, 22, xmm22->as_VMReg()->next(1));
+reg_def XMM22c( SOC, SOE, Op_RegF, 22, xmm22->as_VMReg()->next(2));
+reg_def XMM22d( SOC, SOE, Op_RegF, 22, xmm22->as_VMReg()->next(3));
+reg_def XMM22e( SOC, SOE, Op_RegF, 22, xmm22->as_VMReg()->next(4));
+reg_def XMM22f( SOC, SOE, Op_RegF, 22, xmm22->as_VMReg()->next(5));
+reg_def XMM22g( SOC, SOE, Op_RegF, 22, xmm22->as_VMReg()->next(6));
+reg_def XMM22h( SOC, SOE, Op_RegF, 22, xmm22->as_VMReg()->next(7));
+reg_def XMM22i( SOC, SOE, Op_RegF, 22, xmm22->as_VMReg()->next(8));
+reg_def XMM22j( SOC, SOE, Op_RegF, 22, xmm22->as_VMReg()->next(9));
+reg_def XMM22k( SOC, SOE, Op_RegF, 22, xmm22->as_VMReg()->next(10));
+reg_def XMM22l( SOC, SOE, Op_RegF, 22, xmm22->as_VMReg()->next(11));
+reg_def XMM22m( SOC, SOE, Op_RegF, 22, xmm22->as_VMReg()->next(12));
+reg_def XMM22n( SOC, SOE, Op_RegF, 22, xmm22->as_VMReg()->next(13));
+reg_def XMM22o( SOC, SOE, Op_RegF, 22, xmm22->as_VMReg()->next(14));
+reg_def XMM22p( SOC, SOE, Op_RegF, 22, xmm22->as_VMReg()->next(15));
+
+reg_def XMM23 ( SOC, SOE, Op_RegF, 23, xmm23->as_VMReg());
+reg_def XMM23b( SOC, SOE, Op_RegF, 23, xmm23->as_VMReg()->next(1));
+reg_def XMM23c( SOC, SOE, Op_RegF, 23, xmm23->as_VMReg()->next(2));
+reg_def XMM23d( SOC, SOE, Op_RegF, 23, xmm23->as_VMReg()->next(3));
+reg_def XMM23e( SOC, SOE, Op_RegF, 23, xmm23->as_VMReg()->next(4));
+reg_def XMM23f( SOC, SOE, Op_RegF, 23, xmm23->as_VMReg()->next(5));
+reg_def XMM23g( SOC, SOE, Op_RegF, 23, xmm23->as_VMReg()->next(6));
+reg_def XMM23h( SOC, SOE, Op_RegF, 23, xmm23->as_VMReg()->next(7));
+reg_def XMM23i( SOC, SOE, Op_RegF, 23, xmm23->as_VMReg()->next(8));
+reg_def XMM23j( SOC, SOE, Op_RegF, 23, xmm23->as_VMReg()->next(9));
+reg_def XMM23k( SOC, SOE, Op_RegF, 23, xmm23->as_VMReg()->next(10));
+reg_def XMM23l( SOC, SOE, Op_RegF, 23, xmm23->as_VMReg()->next(11));
+reg_def XMM23m( SOC, SOE, Op_RegF, 23, xmm23->as_VMReg()->next(12));
+reg_def XMM23n( SOC, SOE, Op_RegF, 23, xmm23->as_VMReg()->next(13));
+reg_def XMM23o( SOC, SOE, Op_RegF, 23, xmm23->as_VMReg()->next(14));
+reg_def XMM23p( SOC, SOE, Op_RegF, 23, xmm23->as_VMReg()->next(15));
+
+reg_def XMM24 ( SOC, SOE, Op_RegF, 24, xmm24->as_VMReg());
+reg_def XMM24b( SOC, SOE, Op_RegF, 24, xmm24->as_VMReg()->next(1));
+reg_def XMM24c( SOC, SOE, Op_RegF, 24, xmm24->as_VMReg()->next(2));
+reg_def XMM24d( SOC, SOE, Op_RegF, 24, xmm24->as_VMReg()->next(3));
+reg_def XMM24e( SOC, SOE, Op_RegF, 24, xmm24->as_VMReg()->next(4));
+reg_def XMM24f( SOC, SOE, Op_RegF, 24, xmm24->as_VMReg()->next(5));
+reg_def XMM24g( SOC, SOE, Op_RegF, 24, xmm24->as_VMReg()->next(6));
+reg_def XMM24h( SOC, SOE, Op_RegF, 24, xmm24->as_VMReg()->next(7));
+reg_def XMM24i( SOC, SOE, Op_RegF, 24, xmm24->as_VMReg()->next(8));
+reg_def XMM24j( SOC, SOE, Op_RegF, 24, xmm24->as_VMReg()->next(9));
+reg_def XMM24k( SOC, SOE, Op_RegF, 24, xmm24->as_VMReg()->next(10));
+reg_def XMM24l( SOC, SOE, Op_RegF, 24, xmm24->as_VMReg()->next(11));
+reg_def XMM24m( SOC, SOE, Op_RegF, 24, xmm24->as_VMReg()->next(12));
+reg_def XMM24n( SOC, SOE, Op_RegF, 24, xmm24->as_VMReg()->next(13));
+reg_def XMM24o( SOC, SOE, Op_RegF, 24, xmm24->as_VMReg()->next(14));
+reg_def XMM24p( SOC, SOE, Op_RegF, 24, xmm24->as_VMReg()->next(15));
+
+reg_def XMM25 ( SOC, SOE, Op_RegF, 25, xmm25->as_VMReg());
+reg_def XMM25b( SOC, SOE, Op_RegF, 25, xmm25->as_VMReg()->next(1));
+reg_def XMM25c( SOC, SOE, Op_RegF, 25, xmm25->as_VMReg()->next(2));
+reg_def XMM25d( SOC, SOE, Op_RegF, 25, xmm25->as_VMReg()->next(3));
+reg_def XMM25e( SOC, SOE, Op_RegF, 25, xmm25->as_VMReg()->next(4));
+reg_def XMM25f( SOC, SOE, Op_RegF, 25, xmm25->as_VMReg()->next(5));
+reg_def XMM25g( SOC, SOE, Op_RegF, 25, xmm25->as_VMReg()->next(6));
+reg_def XMM25h( SOC, SOE, Op_RegF, 25, xmm25->as_VMReg()->next(7));
+reg_def XMM25i( SOC, SOE, Op_RegF, 25, xmm25->as_VMReg()->next(8));
+reg_def XMM25j( SOC, SOE, Op_RegF, 25, xmm25->as_VMReg()->next(9));
+reg_def XMM25k( SOC, SOE, Op_RegF, 25, xmm25->as_VMReg()->next(10));
+reg_def XMM25l( SOC, SOE, Op_RegF, 25, xmm25->as_VMReg()->next(11));
+reg_def XMM25m( SOC, SOE, Op_RegF, 25, xmm25->as_VMReg()->next(12));
+reg_def XMM25n( SOC, SOE, Op_RegF, 25, xmm25->as_VMReg()->next(13));
+reg_def XMM25o( SOC, SOE, Op_RegF, 25, xmm25->as_VMReg()->next(14));
+reg_def XMM25p( SOC, SOE, Op_RegF, 25, xmm25->as_VMReg()->next(15));
+
+reg_def XMM26 ( SOC, SOE, Op_RegF, 26, xmm26->as_VMReg());
+reg_def XMM26b( SOC, SOE, Op_RegF, 26, xmm26->as_VMReg()->next(1));
+reg_def XMM26c( SOC, SOE, Op_RegF, 26, xmm26->as_VMReg()->next(2));
+reg_def XMM26d( SOC, SOE, Op_RegF, 26, xmm26->as_VMReg()->next(3));
+reg_def XMM26e( SOC, SOE, Op_RegF, 26, xmm26->as_VMReg()->next(4));
+reg_def XMM26f( SOC, SOE, Op_RegF, 26, xmm26->as_VMReg()->next(5));
+reg_def XMM26g( SOC, SOE, Op_RegF, 26, xmm26->as_VMReg()->next(6));
+reg_def XMM26h( SOC, SOE, Op_RegF, 26, xmm26->as_VMReg()->next(7));
+reg_def XMM26i( SOC, SOE, Op_RegF, 26, xmm26->as_VMReg()->next(8));
+reg_def XMM26j( SOC, SOE, Op_RegF, 26, xmm26->as_VMReg()->next(9));
+reg_def XMM26k( SOC, SOE, Op_RegF, 26, xmm26->as_VMReg()->next(10));
+reg_def XMM26l( SOC, SOE, Op_RegF, 26, xmm26->as_VMReg()->next(11));
+reg_def XMM26m( SOC, SOE, Op_RegF, 26, xmm26->as_VMReg()->next(12));
+reg_def XMM26n( SOC, SOE, Op_RegF, 26, xmm26->as_VMReg()->next(13));
+reg_def XMM26o( SOC, SOE, Op_RegF, 26, xmm26->as_VMReg()->next(14));
+reg_def XMM26p( SOC, SOE, Op_RegF, 26, xmm26->as_VMReg()->next(15));
+
+reg_def XMM27 ( SOC, SOE, Op_RegF, 27, xmm27->as_VMReg());
+reg_def XMM27b( SOC, SOE, Op_RegF, 27, xmm27->as_VMReg()->next(1));
+reg_def XMM27c( SOC, SOE, Op_RegF, 27, xmm27->as_VMReg()->next(2));
+reg_def XMM27d( SOC, SOE, Op_RegF, 27, xmm27->as_VMReg()->next(3));
+reg_def XMM27e( SOC, SOE, Op_RegF, 27, xmm27->as_VMReg()->next(4));
+reg_def XMM27f( SOC, SOE, Op_RegF, 27, xmm27->as_VMReg()->next(5));
+reg_def XMM27g( SOC, SOE, Op_RegF, 27, xmm27->as_VMReg()->next(6));
+reg_def XMM27h( SOC, SOE, Op_RegF, 27, xmm27->as_VMReg()->next(7));
+reg_def XMM27i( SOC, SOE, Op_RegF, 27, xmm27->as_VMReg()->next(8));
+reg_def XMM27j( SOC, SOE, Op_RegF, 27, xmm27->as_VMReg()->next(9));
+reg_def XMM27k( SOC, SOE, Op_RegF, 27, xmm27->as_VMReg()->next(10));
+reg_def XMM27l( SOC, SOE, Op_RegF, 27, xmm27->as_VMReg()->next(11));
+reg_def XMM27m( SOC, SOE, Op_RegF, 27, xmm27->as_VMReg()->next(12));
+reg_def XMM27n( SOC, SOE, Op_RegF, 27, xmm27->as_VMReg()->next(13));
+reg_def XMM27o( SOC, SOE, Op_RegF, 27, xmm27->as_VMReg()->next(14));
+reg_def XMM27p( SOC, SOE, Op_RegF, 27, xmm27->as_VMReg()->next(15));
+
+reg_def XMM28 ( SOC, SOE, Op_RegF, 28, xmm28->as_VMReg());
+reg_def XMM28b( SOC, SOE, Op_RegF, 28, xmm28->as_VMReg()->next(1));
+reg_def XMM28c( SOC, SOE, Op_RegF, 28, xmm28->as_VMReg()->next(2));
+reg_def XMM28d( SOC, SOE, Op_RegF, 28, xmm28->as_VMReg()->next(3));
+reg_def XMM28e( SOC, SOE, Op_RegF, 28, xmm28->as_VMReg()->next(4));
+reg_def XMM28f( SOC, SOE, Op_RegF, 28, xmm28->as_VMReg()->next(5));
+reg_def XMM28g( SOC, SOE, Op_RegF, 28, xmm28->as_VMReg()->next(6));
+reg_def XMM28h( SOC, SOE, Op_RegF, 28, xmm28->as_VMReg()->next(7));
+reg_def XMM28i( SOC, SOE, Op_RegF, 28, xmm28->as_VMReg()->next(8));
+reg_def XMM28j( SOC, SOE, Op_RegF, 28, xmm28->as_VMReg()->next(9));
+reg_def XMM28k( SOC, SOE, Op_RegF, 28, xmm28->as_VMReg()->next(10));
+reg_def XMM28l( SOC, SOE, Op_RegF, 28, xmm28->as_VMReg()->next(11));
+reg_def XMM28m( SOC, SOE, Op_RegF, 28, xmm28->as_VMReg()->next(12));
+reg_def XMM28n( SOC, SOE, Op_RegF, 28, xmm28->as_VMReg()->next(13));
+reg_def XMM28o( SOC, SOE, Op_RegF, 28, xmm28->as_VMReg()->next(14));
+reg_def XMM28p( SOC, SOE, Op_RegF, 28, xmm28->as_VMReg()->next(15));
+
+reg_def XMM29 ( SOC, SOE, Op_RegF, 29, xmm29->as_VMReg());
+reg_def XMM29b( SOC, SOE, Op_RegF, 29, xmm29->as_VMReg()->next(1));
+reg_def XMM29c( SOC, SOE, Op_RegF, 29, xmm29->as_VMReg()->next(2));
+reg_def XMM29d( SOC, SOE, Op_RegF, 29, xmm29->as_VMReg()->next(3));
+reg_def XMM29e( SOC, SOE, Op_RegF, 29, xmm29->as_VMReg()->next(4));
+reg_def XMM29f( SOC, SOE, Op_RegF, 29, xmm29->as_VMReg()->next(5));
+reg_def XMM29g( SOC, SOE, Op_RegF, 29, xmm29->as_VMReg()->next(6));
+reg_def XMM29h( SOC, SOE, Op_RegF, 29, xmm29->as_VMReg()->next(7));
+reg_def XMM29i( SOC, SOE, Op_RegF, 29, xmm29->as_VMReg()->next(8));
+reg_def XMM29j( SOC, SOE, Op_RegF, 29, xmm29->as_VMReg()->next(9));
+reg_def XMM29k( SOC, SOE, Op_RegF, 29, xmm29->as_VMReg()->next(10));
+reg_def XMM29l( SOC, SOE, Op_RegF, 29, xmm29->as_VMReg()->next(11));
+reg_def XMM29m( SOC, SOE, Op_RegF, 29, xmm29->as_VMReg()->next(12));
+reg_def XMM29n( SOC, SOE, Op_RegF, 29, xmm29->as_VMReg()->next(13));
+reg_def XMM29o( SOC, SOE, Op_RegF, 29, xmm29->as_VMReg()->next(14));
+reg_def XMM29p( SOC, SOE, Op_RegF, 29, xmm29->as_VMReg()->next(15));
+
+reg_def XMM30 ( SOC, SOE, Op_RegF, 30, xmm30->as_VMReg());
+reg_def XMM30b( SOC, SOE, Op_RegF, 30, xmm30->as_VMReg()->next(1));
+reg_def XMM30c( SOC, SOE, Op_RegF, 30, xmm30->as_VMReg()->next(2));
+reg_def XMM30d( SOC, SOE, Op_RegF, 30, xmm30->as_VMReg()->next(3));
+reg_def XMM30e( SOC, SOE, Op_RegF, 30, xmm30->as_VMReg()->next(4));
+reg_def XMM30f( SOC, SOE, Op_RegF, 30, xmm30->as_VMReg()->next(5));
+reg_def XMM30g( SOC, SOE, Op_RegF, 30, xmm30->as_VMReg()->next(6));
+reg_def XMM30h( SOC, SOE, Op_RegF, 30, xmm30->as_VMReg()->next(7));
+reg_def XMM30i( SOC, SOE, Op_RegF, 30, xmm30->as_VMReg()->next(8));
+reg_def XMM30j( SOC, SOE, Op_RegF, 30, xmm30->as_VMReg()->next(9));
+reg_def XMM30k( SOC, SOE, Op_RegF, 30, xmm30->as_VMReg()->next(10));
+reg_def XMM30l( SOC, SOE, Op_RegF, 30, xmm30->as_VMReg()->next(11));
+reg_def XMM30m( SOC, SOE, Op_RegF, 30, xmm30->as_VMReg()->next(12));
+reg_def XMM30n( SOC, SOE, Op_RegF, 30, xmm30->as_VMReg()->next(13));
+reg_def XMM30o( SOC, SOE, Op_RegF, 30, xmm30->as_VMReg()->next(14));
+reg_def XMM30p( SOC, SOE, Op_RegF, 30, xmm30->as_VMReg()->next(15));
+
+reg_def XMM31 ( SOC, SOE, Op_RegF, 31, xmm31->as_VMReg());
+reg_def XMM31b( SOC, SOE, Op_RegF, 31, xmm31->as_VMReg()->next(1));
+reg_def XMM31c( SOC, SOE, Op_RegF, 31, xmm31->as_VMReg()->next(2));
+reg_def XMM31d( SOC, SOE, Op_RegF, 31, xmm31->as_VMReg()->next(3));
+reg_def XMM31e( SOC, SOE, Op_RegF, 31, xmm31->as_VMReg()->next(4));
+reg_def XMM31f( SOC, SOE, Op_RegF, 31, xmm31->as_VMReg()->next(5));
+reg_def XMM31g( SOC, SOE, Op_RegF, 31, xmm31->as_VMReg()->next(6));
+reg_def XMM31h( SOC, SOE, Op_RegF, 31, xmm31->as_VMReg()->next(7));
+reg_def XMM31i( SOC, SOE, Op_RegF, 31, xmm31->as_VMReg()->next(8));
+reg_def XMM31j( SOC, SOE, Op_RegF, 31, xmm31->as_VMReg()->next(9));
+reg_def XMM31k( SOC, SOE, Op_RegF, 31, xmm31->as_VMReg()->next(10));
+reg_def XMM31l( SOC, SOE, Op_RegF, 31, xmm31->as_VMReg()->next(11));
+reg_def XMM31m( SOC, SOE, Op_RegF, 31, xmm31->as_VMReg()->next(12));
+reg_def XMM31n( SOC, SOE, Op_RegF, 31, xmm31->as_VMReg()->next(13));
+reg_def XMM31o( SOC, SOE, Op_RegF, 31, xmm31->as_VMReg()->next(14));
+reg_def XMM31p( SOC, SOE, Op_RegF, 31, xmm31->as_VMReg()->next(15));
 
 #else // _WIN64
 
@@ -226,6 +629,14 @@
 reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
 reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
 reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
+reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
+reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
+reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
+reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
+reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
+reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
+reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
+reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));
 
 reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
 reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
@@ -235,6 +646,14 @@
 reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
 reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
 reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
+reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
+reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
+reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
+reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
+reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
+reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
+reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
+reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));
 
 #ifdef _LP64
 
@@ -246,6 +665,14 @@
 reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
 reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
 reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
+reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
+reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
+reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
+reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
+reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
+reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
+reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
+reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));
 
 reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
 reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
@@ -255,6 +682,14 @@
 reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
 reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
 reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
+reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
+reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
+reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
+reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
+reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
+reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
+reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
+reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));
 
 reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
 reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
@@ -264,6 +699,14 @@
 reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
 reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
 reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
+reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
+reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
+reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
+reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
+reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
+reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
+reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
+reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));
 
 reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
 reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
@@ -273,6 +716,14 @@
 reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
 reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
 reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
+reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
+reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
+reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
+reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
+reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
+reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
+reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
+reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));
 
 reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
 reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
@@ -282,6 +733,14 @@
 reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
 reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
 reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
+reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
+reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
+reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
+reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
+reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
+reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
+reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
+reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));
 
 reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
 reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
@@ -291,6 +750,14 @@
 reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
 reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
 reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
+reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
+reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
+reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
+reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
+reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
+reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
+reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
+reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));
 
 reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
 reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
@@ -300,6 +767,14 @@
 reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
 reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
 reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
+reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
+reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
+reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
+reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
+reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
+reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
+reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
+reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));
 
 reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
 reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
@@ -309,6 +784,286 @@
 reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
 reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
 reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
+reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
+reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
+reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
+reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
+reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
+reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
+reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
+reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));
+
+reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
+reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
+reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
+reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
+reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
+reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
+reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
+reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
+reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
+reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
+reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
+reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
+reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
+reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
+reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
+reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));
+
+reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
+reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
+reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
+reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
+reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
+reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
+reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
+reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
+reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
+reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
+reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
+reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
+reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
+reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
+reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
+reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));
+
+reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
+reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
+reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
+reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
+reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
+reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
+reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
+reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
+reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
+reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
+reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
+reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
+reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
+reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
+reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
+reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));
+
+reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
+reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
+reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
+reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
+reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
+reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
+reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
+reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
+reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
+reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
+reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
+reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
+reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
+reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
+reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
+reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));
+
+reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
+reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
+reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
+reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
+reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
+reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
+reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
+reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
+reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
+reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
+reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
+reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
+reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
+reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
+reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
+reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));
+
+reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
+reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
+reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
+reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
+reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
+reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
+reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
+reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
+reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
+reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
+reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
+reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
+reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
+reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
+reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
+reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));
+
+reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
+reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
+reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
+reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
+reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
+reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
+reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
+reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
+reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
+reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
+reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
+reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
+reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
+reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
+reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
+reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));
+
+reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
+reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
+reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
+reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
+reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
+reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
+reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
+reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
+reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
+reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
+reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
+reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
+reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
+reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
+reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
+reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));
+
+reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
+reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
+reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
+reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
+reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
+reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
+reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
+reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
+reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
+reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
+reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
+reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
+reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
+reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
+reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
+reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));
+
+reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
+reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
+reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
+reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
+reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
+reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
+reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
+reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
+reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
+reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
+reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
+reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
+reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
+reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
+reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
+reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));
+
+reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
+reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
+reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
+reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
+reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
+reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
+reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
+reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
+reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
+reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
+reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
+reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
+reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
+reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
+reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
+reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));
+
+reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
+reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
+reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
+reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
+reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
+reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
+reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
+reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
+reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
+reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
+reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
+reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
+reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
+reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
+reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
+reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));
+
+reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
+reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
+reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
+reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
+reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
+reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
+reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
+reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
+reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
+reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
+reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
+reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
+reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
+reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
+reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
+reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));
+
+reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
+reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
+reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
+reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
+reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
+reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
+reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
+reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
+reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
+reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
+reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
+reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
+reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
+reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
+reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
+reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));
+
+reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
+reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
+reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
+reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
+reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
+reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
+reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
+reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
+reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
+reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
+reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
+reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
+reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
+reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
+reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
+reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));
+
+reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
+reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
+reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
+reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
+reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
+reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
+reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
+reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
+reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
+reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
+reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
+reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
+reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
+reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
+reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
+reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));
 
 #endif // _LP64
 
@@ -320,25 +1075,41 @@
 reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
 #endif // _LP64
 
-alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
-                   XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
-                   XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
-                   XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
-                   XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
-                   XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
-                   XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
-                   XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
+alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
+                   XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
+                   XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
+                   XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
+                   XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
+                   XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
+                   XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
+                   XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
 #ifdef _LP64
-                  ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
-                   XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
-                   XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
-                   XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
-                   XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
-                   XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
-                   XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
-                   XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
+                  ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
+                   XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
+                   XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
+                   XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
+                   XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
+                   XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
+                   XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
+                   XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
+                  ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
+                   XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
+                   XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
+                   XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
+                   XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
+                   XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
+                   XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
+                   XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
+                   XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
+                   XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
+                   XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
+                   XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
+                   XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
+                   XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
+                   XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
+                   XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
 #endif
-                   );
+                      );
 
 // flags allocation class should be last.
 alloc_class chunk2(RFLAGS);
@@ -346,8 +1117,8 @@
 // Singleton class for condition codes
 reg_class int_flags(RFLAGS);
 
-// Class for all float registers
-reg_class float_reg(XMM0,
+// Class for pre evex float registers
+reg_class float_reg_legacy(XMM0,
                     XMM1,
                     XMM2,
                     XMM3,
@@ -367,8 +1138,47 @@
 #endif
                     );
 
-// Class for all double registers
-reg_class double_reg(XMM0,  XMM0b,
+// Class for evex float registers
+reg_class float_reg_evex(XMM0,
+                    XMM1,
+                    XMM2,
+                    XMM3,
+                    XMM4,
+                    XMM5,
+                    XMM6,
+                    XMM7
+#ifdef _LP64
+                   ,XMM8,
+                    XMM9,
+                    XMM10,
+                    XMM11,
+                    XMM12,
+                    XMM13,
+                    XMM14,
+                    XMM15,
+                    XMM16,
+                    XMM17,
+                    XMM18,
+                    XMM19,
+                    XMM20,
+                    XMM21,
+                    XMM22,
+                    XMM23,
+                    XMM24,
+                    XMM25,
+                    XMM26,
+                    XMM27,
+                    XMM28,
+                    XMM29,
+                    XMM30,
+                    XMM31
+#endif
+                    );
+
+reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
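For orientation, a minimal conceptual sketch of what a reg_class_dynamic declaration amounts to: the %{ ... %} predicate picks between the EVEX and legacy register sets once, and the allocator only ever sees the chosen mask. All names below are illustrative stand-ins, not HotSpot types and not the ADLC-generated code.

    // Conceptual sketch only; RegMaskSketch and supports_evex_cpu are hypothetical.
    #include <cstdint>

    struct RegMaskSketch {
      uint64_t bits[8];   // room for 32 XMM registers x 16 allocator slots
    };

    static RegMaskSketch float_reg_evex_mask;    // stands in for float_reg_evex   (xmm0..xmm31)
    static RegMaskSketch float_reg_legacy_mask;  // stands in for float_reg_legacy (xmm0..xmm15)
    static bool supports_evex_cpu;               // would come from CPUID probing in the real VM

    // reg_class_dynamic(first, second, condition): use 'first' when the
    // condition holds, otherwise fall back to 'second'.
    static const RegMaskSketch& float_reg_mask() {
      return supports_evex_cpu ? float_reg_evex_mask : float_reg_legacy_mask;
    }

    int main() {
      supports_evex_cpu = true;
      return (&float_reg_mask() == &float_reg_evex_mask) ? 0 : 1;
    }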
+
+// Class for pre evex double registers
+reg_class double_reg_legacy(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
@@ -388,8 +1198,47 @@
 #endif
                      );
 
-// Class for all 32bit vector registers
-reg_class vectors_reg(XMM0,
+// Class for evex double registers
+reg_class double_reg_evex(XMM0,  XMM0b,
+                     XMM1,  XMM1b,
+                     XMM2,  XMM2b,
+                     XMM3,  XMM3b,
+                     XMM4,  XMM4b,
+                     XMM5,  XMM5b,
+                     XMM6,  XMM6b,
+                     XMM7,  XMM7b
+#ifdef _LP64
+                    ,XMM8,  XMM8b,
+                     XMM9,  XMM9b,
+                     XMM10, XMM10b,
+                     XMM11, XMM11b,
+                     XMM12, XMM12b,
+                     XMM13, XMM13b,
+                     XMM14, XMM14b,
+                     XMM15, XMM15b,
+                     XMM16, XMM16b,
+                     XMM17, XMM17b,
+                     XMM18, XMM18b,
+                     XMM19, XMM19b,
+                     XMM20, XMM20b,
+                     XMM21, XMM21b,
+                     XMM22, XMM22b,
+                     XMM23, XMM23b,
+                     XMM24, XMM24b,
+                     XMM25, XMM25b,
+                     XMM26, XMM26b,
+                     XMM27, XMM27b,
+                     XMM28, XMM28b,
+                     XMM29, XMM29b,
+                     XMM30, XMM30b,
+                     XMM31, XMM31b
+#endif
+                     );
+
+reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
+
+// Class for pre evex 32bit vector registers
+reg_class vectors_reg_legacy(XMM0,
                       XMM1,
                       XMM2,
                       XMM3,
@@ -409,8 +1258,47 @@
 #endif
                       );
 
+// Class for evex 32bit vector registers
+reg_class vectors_reg_evex(XMM0,
+                      XMM1,
+                      XMM2,
+                      XMM3,
+                      XMM4,
+                      XMM5,
+                      XMM6,
+                      XMM7
+#ifdef _LP64
+                     ,XMM8,
+                      XMM9,
+                      XMM10,
+                      XMM11,
+                      XMM12,
+                      XMM13,
+                      XMM14,
+                      XMM15,
+                      XMM16,
+                      XMM17,
+                      XMM18,
+                      XMM19,
+                      XMM20,
+                      XMM21,
+                      XMM22,
+                      XMM23,
+                      XMM24,
+                      XMM25,
+                      XMM26,
+                      XMM27,
+                      XMM28,
+                      XMM29,
+                      XMM30,
+                      XMM31
+#endif
+                      );
+
+reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
+
 // Class for all 64bit vector registers
-reg_class vectord_reg(XMM0,  XMM0b,
+reg_class vectord_reg_legacy(XMM0,  XMM0b,
                       XMM1,  XMM1b,
                       XMM2,  XMM2b,
                       XMM3,  XMM3b,
@@ -430,8 +1318,47 @@
 #endif
                       );
 
+// Class for evex 64bit vector registers
+reg_class vectord_reg_evex(XMM0,  XMM0b,
+                      XMM1,  XMM1b,
+                      XMM2,  XMM2b,
+                      XMM3,  XMM3b,
+                      XMM4,  XMM4b,
+                      XMM5,  XMM5b,
+                      XMM6,  XMM6b,
+                      XMM7,  XMM7b
+#ifdef _LP64
+                     ,XMM8,  XMM8b,
+                      XMM9,  XMM9b,
+                      XMM10, XMM10b,
+                      XMM11, XMM11b,
+                      XMM12, XMM12b,
+                      XMM13, XMM13b,
+                      XMM14, XMM14b,
+                      XMM15, XMM15b,
+                      XMM16, XMM16b,
+                      XMM17, XMM17b,
+                      XMM18, XMM18b,
+                      XMM19, XMM19b,
+                      XMM20, XMM20b,
+                      XMM21, XMM21b,
+                      XMM22, XMM22b,
+                      XMM23, XMM23b,
+                      XMM24, XMM24b,
+                      XMM25, XMM25b,
+                      XMM26, XMM26b,
+                      XMM27, XMM27b,
+                      XMM28, XMM28b,
+                      XMM29, XMM29b,
+                      XMM30, XMM30b,
+                      XMM31, XMM31b
+#endif
+                      );
+
+reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
+
 // Class for all 128bit vector registers
-reg_class vectorx_reg(XMM0,  XMM0b,  XMM0c,  XMM0d,
+reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
                       XMM1,  XMM1b,  XMM1c,  XMM1d,
                       XMM2,  XMM2b,  XMM2c,  XMM2d,
                       XMM3,  XMM3b,  XMM3c,  XMM3d,
@@ -451,8 +1378,47 @@
 #endif
                       );
 
+// Class for evex 128bit vector registers
+reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
+                      XMM1,  XMM1b,  XMM1c,  XMM1d,
+                      XMM2,  XMM2b,  XMM2c,  XMM2d,
+                      XMM3,  XMM3b,  XMM3c,  XMM3d,
+                      XMM4,  XMM4b,  XMM4c,  XMM4d,
+                      XMM5,  XMM5b,  XMM5c,  XMM5d,
+                      XMM6,  XMM6b,  XMM6c,  XMM6d,
+                      XMM7,  XMM7b,  XMM7c,  XMM7d
+#ifdef _LP64
+                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,
+                      XMM9,  XMM9b,  XMM9c,  XMM9d,
+                      XMM10, XMM10b, XMM10c, XMM10d,
+                      XMM11, XMM11b, XMM11c, XMM11d,
+                      XMM12, XMM12b, XMM12c, XMM12d,
+                      XMM13, XMM13b, XMM13c, XMM13d,
+                      XMM14, XMM14b, XMM14c, XMM14d,
+                      XMM15, XMM15b, XMM15c, XMM15d,
+                      XMM16, XMM16b, XMM16c, XMM16d,
+                      XMM17, XMM17b, XMM17c, XMM17d,
+                      XMM18, XMM18b, XMM18c, XMM18d,
+                      XMM19, XMM19b, XMM19c, XMM19d,
+                      XMM20, XMM20b, XMM20c, XMM20d,
+                      XMM21, XMM21b, XMM21c, XMM21d,
+                      XMM22, XMM22b, XMM22c, XMM22d,
+                      XMM23, XMM23b, XMM23c, XMM23d,
+                      XMM24, XMM24b, XMM24c, XMM24d,
+                      XMM25, XMM25b, XMM25c, XMM25d,
+                      XMM26, XMM26b, XMM26c, XMM26d,
+                      XMM27, XMM27b, XMM27c, XMM27d,
+                      XMM28, XMM28b, XMM28c, XMM28d,
+                      XMM29, XMM29b, XMM29c, XMM29d,
+                      XMM30, XMM30b, XMM30c, XMM30d,
+                      XMM31, XMM31b, XMM31c, XMM31d
+#endif
+                      );
+
+reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
+
 // Class for all 256bit vector registers
-reg_class vectory_reg(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
+reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
@@ -472,6 +1438,82 @@
 #endif
                       );
 
+// Class for evex 256bit vector registers
+reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
+                      XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
+                      XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
+                      XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
+                      XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
+                      XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
+                      XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
+                      XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
+#ifdef _LP64
+                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
+                      XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
+                      XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
+                      XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
+                      XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
+                      XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
+                      XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
+                      XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
+                      XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
+                      XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
+                      XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
+                      XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
+                      XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
+                      XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
+                      XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
+                      XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
+                      XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
+                      XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
+                      XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
+                      XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
+                      XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
+                      XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
+                      XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
+                      XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h
+#endif
+                      );
+
+reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
+
+// Class for all 512bit vector registers
+reg_class vectorz_reg(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
+                      XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
+                      XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
+                      XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
+                      XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
+                      XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
+                      XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
+                      XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
+#ifdef _LP64
+                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
+                      XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
+                      XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
+                      XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
+                      XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
+                      XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
+                      XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
+                      XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
+                     ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
+                      XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
+                      XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
+                      XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
+                      XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
+                      XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
+                      XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
+                      XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
+                      XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
+                      XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
+                      XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
+                      XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
+                      XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
+                      XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
+                      XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
+                      XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
+#endif
+                      );
+
 %}
 
 
@@ -623,6 +1665,10 @@
       if ((UseSSE < 4) && (UseAVX < 1)) // only with SSE4_1 or AVX
         return false;
     break;
+    case Op_MulVL:
+    case Op_MulReductionVL:
+      if (VM_Version::supports_avx512dq() == false)
+        return false;
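+      // fall through to the UseAVX < 3 check below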
     case Op_AddReductionVL:
       if (UseAVX < 3) // only EVEX : vector connectivity becomes an issue here
         return false;
@@ -657,10 +1703,11 @@
   if (UseSSE < 2) return 0;
   // SSE2 supports 128bit vectors for all types.
   // AVX2 supports 256bit vectors for all types.
-  int size = (UseAVX > 1) ? 32 : 16;
+  // EVEX supports 512bit vectors for all types.
+  int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
-    size = 32;
+    size = (UseAVX > 2) ? 64 : 32;
   // Use flag to limit vector size.
   size = MIN2(size,(int)MaxVectorSize);
   // Minimum 2 values in vector (or 4 for bytes).
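A small standalone sketch of the size selection above, with plain ints standing in for the UseSSE/UseAVX/MaxVectorSize flags (the trailing minimum-length check is omitted); this is not VM code:

    #include <algorithm>
    #include <cassert>

    // Mirrors the logic above: (1 << UseAVX) * 8 yields 32 bytes for AVX2
    // (UseAVX == 2) and 64 bytes for EVEX (UseAVX == 3).
    static int max_vector_size_sketch(int UseSSE, int UseAVX, int MaxVectorSize, bool fp_type) {
      if (UseSSE < 2) return 0;
      int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
      if (UseAVX > 0 && fp_type)
        size = (UseAVX > 2) ? 64 : 32;   // AVX1 already allows 256-bit FP vectors
      return std::min(size, MaxVectorSize);
    }

    int main() {
      assert(max_vector_size_sketch(2, 2, 64, false) == 32);  // AVX2, integer types
      assert(max_vector_size_sketch(2, 3, 64, false) == 64);  // EVEX, integer types
      assert(max_vector_size_sketch(2, 1, 64, true)  == 32);  // AVX1, float/double
      assert(max_vector_size_sketch(2, 3, 32, false) == 32);  // capped by MaxVectorSize
      return 0;
    }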
@@ -702,6 +1749,7 @@
     case  8: return Op_VecD;
     case 16: return Op_VecX;
     case 32: return Op_VecY;
+    case 64: return Op_VecZ;
   }
   ShouldNotReachHere();
   return 0;
@@ -745,6 +1793,9 @@
     case Op_VecY:
       __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
       break;
+    case Op_VecZ:
+      __ evmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
+      break;
     default:
       ShouldNotReachHere();
     }
@@ -763,6 +1814,7 @@
       st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
       break;
     case Op_VecY:
+    case Op_VecZ:
       st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
       break;
     default:
@@ -771,7 +1823,7 @@
 #endif
   }
   // VEX_2bytes prefix is used if UseAVX > 0, and it takes the same 2 bytes as SIMD prefix.
-  return 4;
+  return (UseAVX > 2) ? 6 : 4;
 }
 
 static int vec_spill_helper(CodeBuffer *cbuf, bool do_size, bool is_load,
@@ -796,6 +1848,9 @@
       case Op_VecY:
         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
         break;
+      case Op_VecZ:
+        __ evmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
+        break;
       default:
         ShouldNotReachHere();
       }
@@ -813,13 +1868,16 @@
       case Op_VecY:
         __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
         break;
+      case Op_VecZ:
+        __ evmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
+        break;
       default:
         ShouldNotReachHere();
       }
     }
     int size = __ offset() - offset;
 #ifdef ASSERT
-    int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : 4);
+    int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : (UseAVX > 2) ? 6 : 4);
     // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
     assert(!do_size || size == (5+offset_size), "incorrect size calculation");
 #endif
@@ -838,6 +1896,7 @@
         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
         break;
       case Op_VecY:
+      case Op_VecZ:
         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
         break;
       default:
@@ -855,6 +1914,7 @@
         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
         break;
       case Op_VecY:
+      case Op_VecZ:
         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
         break;
       default:
@@ -863,7 +1923,7 @@
     }
 #endif
   }
-  int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : 4);
+  int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : (UseAVX > 2) ? 6 : 4);
   // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
   return 5+offset_size;
 }
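A sketch of the spill-size estimate above, under the assumption that the 5 models prefix + opcode + modrm + sib for a VEX-encoded vmovdqu off rsp, and that the 6 folds the two extra EVEX prefix bytes into the disp32 case; this is not VM code:

    #include <cassert>

    static int vec_spill_size_sketch(int UseAVX, int stack_offset) {
      int offset_size = (stack_offset == 0)   ? 0
                      : (stack_offset < 0x80) ? 1          // disp8
                      : (UseAVX > 2)          ? 6 : 4;     // disp32 (+2 EVEX prefix bytes)
      return 5 + offset_size;
    }

    int main() {
      assert(vec_spill_size_sketch(2, 0)     == 5);
      assert(vec_spill_size_sketch(2, 0x40)  == 6);
      assert(vec_spill_size_sketch(2, 0x100) == 9);
      assert(vec_spill_size_sketch(3, 0x100) == 11);
      return 0;
    }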
@@ -952,40 +2012,15 @@
 // in the ADLC because operands constitute user defined types which are used in
 // instruction definitions.
 
-// Vectors
-operand vecS() %{
-  constraint(ALLOC_IN_RC(vectors_reg));
-  match(VecS);
-
-  format %{ %}
-  interface(REG_INTER);
-%}
-
-operand vecD() %{
-  constraint(ALLOC_IN_RC(vectord_reg));
-  match(VecD);
+// This operand applies only to EVEX targets, so only one version is needed
+operand vecZ() %{
+  constraint(ALLOC_IN_RC(vectorz_reg));
+  match(VecZ);
 
   format %{ %}
   interface(REG_INTER);
 %}
 
-operand vecX() %{
-  constraint(ALLOC_IN_RC(vectorx_reg));
-  match(VecX);
-
-  format %{ %}
-  interface(REG_INTER);
-%}
-
-operand vecY() %{
-  constraint(ALLOC_IN_RC(vectory_reg));
-  match(VecY);
-
-  format %{ %}
-  interface(REG_INTER);
-%}
-
-
 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
 
 // ============================================================================
@@ -1586,9 +2621,9 @@
   ins_cost(150);
   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
   ins_encode %{
-    bool vector256 = false;
+    int vector_len = 0;
     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
-              ExternalAddress(float_signmask()), vector256);
+              ExternalAddress(float_signmask()), vector_len);
   %}
   ins_pipe(pipe_slow);
 %}
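Throughout these hunks the old bool vector256 argument becomes an int vector_len. A minimal sketch of that convention (the names below are illustrative; the real constants live in the assembler):

    #include <cassert>

    // 0 selects 128-bit, 1 selects 256-bit, 2 selects 512-bit operands.
    enum VectorLenSketch {
      VLEN_128 = 0,   // was 'bool vector256 = false'
      VLEN_256 = 1,   // was 'bool vector256 = true'
      VLEN_512 = 2    // new with EVEX; had no boolean equivalent
    };

    static int vector_len_to_bytes(int vector_len) {
      return 16 << vector_len;   // 0 -> 16, 1 -> 32, 2 -> 64 bytes
    }

    int main() {
      assert(vector_len_to_bytes(VLEN_512) == 64);
      return 0;
    }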
@@ -1612,9 +2647,9 @@
   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
             "# abs double by sign masking" %}
   ins_encode %{
-    bool vector256 = false;
+    int vector_len = 0;
     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
-              ExternalAddress(double_signmask()), vector256);
+              ExternalAddress(double_signmask()), vector_len);
   %}
   ins_pipe(pipe_slow);
 %}
@@ -1636,9 +2671,9 @@
   ins_cost(150);
   format %{ "vxorps  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
   ins_encode %{
-    bool vector256 = false;
+    int vector_len = 0;
     __ vxorps($dst$$XMMRegister, $src$$XMMRegister,
-              ExternalAddress(float_signflip()), vector256);
+              ExternalAddress(float_signflip()), vector_len);
   %}
   ins_pipe(pipe_slow);
 %}
@@ -1662,9 +2697,9 @@
   format %{ "vxorpd  $dst, $src, [0x8000000000000000]\t"
             "# neg double by sign flipping" %}
   ins_encode %{
-    bool vector256 = false;
+    int vector_len = 0;
     __ vxorpd($dst$$XMMRegister, $src$$XMMRegister,
-              ExternalAddress(double_signflip()), vector256);
+              ExternalAddress(double_signflip()), vector_len);
   %}
   ins_pipe(pipe_slow);
 %}
@@ -1739,7 +2774,6 @@
   ins_pipe(pipe_slow);
 %}
 
-
 // ====================VECTOR INSTRUCTIONS=====================================
 
 // Load vectors (4 bytes long)
@@ -1790,6 +2824,19 @@
   ins_pipe( pipe_slow );
 %}
 
+// Load vectors (64 bytes long)
+instruct loadV64(vecZ dst, memory mem) %{
+  predicate(n->as_LoadVector()->memory_size() == 64);
+  match(Set dst (LoadVector mem));
+  ins_cost(125);
+  format %{ "vmovdqu $dst k0,$mem\t! load vector (64 bytes)" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ evmovdqu($dst$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 // Store vectors
 instruct storeV4(memory mem, vecS src) %{
   predicate(n->as_StoreVector()->memory_size() == 4);
@@ -1835,6 +2882,18 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct storeV64(memory mem, vecZ src) %{
+  predicate(n->as_StoreVector()->memory_size() == 64);
+  match(Set mem (StoreVector mem src));
+  ins_cost(145);
+  format %{ "vmovdqu $mem k0,$src\t! store vector (64 bytes)" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ evmovdqu($mem$$Address, $src$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 // Replicate byte scalar to be vector
 instruct Repl4B(vecS dst, rRegI src) %{
   predicate(n->as_Vector()->length() == 4);
@@ -1898,6 +2957,26 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct Repl64B(vecZ dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 64);
+  match(Set dst (ReplicateB src));
+  format %{ "movd    $dst,$src\n\t"
+            "punpcklbw $dst,$dst\n\t"
+            "pshuflw $dst,$dst,0x00\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128h $dst,$dst,$dst\t! lower replicate32B\n\t"
+            "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate632B" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 // Replicate byte scalar immediate to be vector by loading from const table.
 instruct Repl4B_imm(vecS dst, immI con) %{
   predicate(n->as_Vector()->length() == 4);
@@ -1945,6 +3024,22 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct Repl64B_imm(vecZ dst, immI con) %{
+  predicate(n->as_Vector()->length() == 64);
+  match(Set dst (ReplicateB con));
+  format %{ "movq    $dst,[$constantaddress]\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128h $dst,$dst,$dst\t! lower replicate32B($con)\n\t"
+            "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate32B($con)" %}
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 // Replicate byte scalar zero to be vector
 instruct Repl4B_zero(vecS dst, immI0 zero) %{
   predicate(n->as_Vector()->length() == 4);
@@ -1982,8 +3077,20 @@
   format %{ "vpxor   $dst,$dst,$dst\t! replicate32B zero" %}
   ins_encode %{
     // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it).
-    bool vector256 = true;
-    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
+    int vector_len = 1;
+    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl64B_zero(vecZ dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 64);
+  match(Set dst (ReplicateB zero));
+  format %{ "vpxor   $dst k0,$dst,$dst\t! replicate64B zero" %}
+  ins_encode %{
+    // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
+    int vector_len = 2;
+    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
   %}
   ins_pipe( fpu_reg_reg );
 %}
@@ -2043,6 +3150,24 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct Repl32S(vecZ dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 32);
+  match(Set dst (ReplicateS src));
+  format %{ "movd    $dst,$src\n\t"
+            "pshuflw $dst,$dst,0x00\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128h $dst,$dst,$dst\t! lower replicate16S\n\t"
+            "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate16S" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 // Replicate char/short (2 byte) scalar immediate to be vector by loading from const table.
 instruct Repl2S_imm(vecS dst, immI con) %{
   predicate(n->as_Vector()->length() == 2);
@@ -2090,6 +3215,22 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct Repl32S_imm(vecZ dst, immI con) %{
+  predicate(n->as_Vector()->length() == 32);
+  match(Set dst (ReplicateS con));
+  format %{ "movq    $dst,[$constantaddress]\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128h $dst,$dst,$dst\t! lower replicate16S($con)\n\t"
+            "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate16S($con)" %}
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 // Replicate char/short (2 byte) scalar zero to be vector
 instruct Repl2S_zero(vecS dst, immI0 zero) %{
   predicate(n->as_Vector()->length() == 2);
@@ -2127,8 +3268,20 @@
   format %{ "vpxor   $dst,$dst,$dst\t! replicate16S zero" %}
   ins_encode %{
     // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it).
-    bool vector256 = true;
-    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
+    int vector_len = 1;
+    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl32S_zero(vecZ dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 32);
+  match(Set dst (ReplicateS zero));
+  format %{ "vpxor   $dst k0,$dst,$dst\t! replicate32S zero" %}
+  ins_encode %{
+    // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
+    int vector_len = 2;
+    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
   %}
   ins_pipe( fpu_reg_reg );
 %}
@@ -2172,6 +3325,22 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct Repl16I(vecZ dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 16);
+  match(Set dst (ReplicateI src));
+  format %{ "movd    $dst,$src\n\t"
+            "pshufd  $dst,$dst,0x00\n\t"
+            "vinserti128h $dst,$dst,$dst\t! lower replicate8I\n\t"
+            "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate8I" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 // Replicate integer (4 byte) scalar immediate to be vector by loading from const table.
 instruct Repl2I_imm(vecD dst, immI con) %{
   predicate(n->as_Vector()->length() == 2);
@@ -2209,6 +3378,22 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct Repl16I_imm(vecZ dst, immI con) %{
+  predicate(n->as_Vector()->length() == 16);
+  match(Set dst (ReplicateI con));
+  format %{ "movq    $dst,[$constantaddress]\t! replicate16I($con)\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128h $dst,$dst,$dst\n\t"
+            "vinserti64x4h $dst k0,$dst,$dst" %}
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 // Integer could be loaded into xmm register directly from memory.
 instruct Repl2I_mem(vecD dst, memory mem) %{
   predicate(n->as_Vector()->length() == 2);
@@ -2248,6 +3433,22 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct Repl16I_mem(vecZ dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 16);
+  match(Set dst (ReplicateI (LoadI mem)));
+  format %{ "movd    $dst,$mem\n\t"
+            "pshufd  $dst,$dst,0x00\n\t"
+            "vinserti128h $dst,$dst,$dst\t! lower replicate8I\n\t"
+            "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate8I" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $mem$$Address);
+    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 // Replicate integer (4 byte) scalar zero to be vector
 instruct Repl2I_zero(vecD dst, immI0 zero) %{
   predicate(n->as_Vector()->length() == 2);
@@ -2275,8 +3476,20 @@
   format %{ "vpxor   $dst,$dst,$dst\t! replicate8I zero" %}
   ins_encode %{
     // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it).
-    bool vector256 = true;
-    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
+    int vector_len = 1;
+    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl16I_zero(vecZ dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 16);
+  match(Set dst (ReplicateI zero));
+  format %{ "vpxor   $dst k0,$dst,$dst\t! replicate16I zero" %}
+  ins_encode %{
+    // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
+    int vector_len = 2;
+    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
   %}
   ins_pipe( fpu_reg_reg );
 %}
@@ -2308,6 +3521,22 @@
   %}
   ins_pipe( pipe_slow );
 %}
+
+instruct Repl8L(vecZ dst, rRegL src) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (ReplicateL src));
+  format %{ "movdq   $dst,$src\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128h $dst,$dst,$dst\t! lower replicate4L\n\t"
+            "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate4L" %}
+  ins_encode %{
+    __ movdq($dst$$XMMRegister, $src$$Register);
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
 #else // _LP64
 instruct Repl2L(vecX dst, eRegL src, regD tmp) %{
   predicate(n->as_Vector()->length() == 2);
@@ -2344,6 +3573,26 @@
   %}
   ins_pipe( pipe_slow );
 %}
+
+instruct Repl8L(vecZ dst, eRegL src, regD tmp) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (ReplicateL src));
+  effect(TEMP dst, USE src, TEMP tmp);
+  format %{ "movdl   $dst,$src.lo\n\t"
+            "movdl   $tmp,$src.hi\n\t"
+            "punpckldq $dst,$tmp\n\t"
+            "vinserti128h $dst,$dst,$dst\t! lower replicate4L\n\t"
+            "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate4L" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);
+    __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
+    __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
 #endif // _LP64
 
 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
@@ -2373,6 +3622,22 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct Repl8L_imm(vecZ dst, immL con) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (ReplicateL con));
+  format %{ "movq    $dst,[$constantaddress]\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128h $dst,$dst,$dst\t! lower replicate4L($con)\n\t"
+            "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate4L($con)" %}
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $constantaddress($con));
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 // Long could be loaded into xmm register directly from memory.
 instruct Repl2L_mem(vecX dst, memory mem) %{
   predicate(n->as_Vector()->length() == 2);
@@ -2400,6 +3665,22 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct Repl8L_mem(vecZ dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (ReplicateL (LoadL mem)));
+  format %{ "movq    $dst,$mem\n\t"
+            "punpcklqdq $dst,$dst\n\t"
+            "vinserti128h $dst,$dst,$dst\t! lower replicate4L\n\t"
+            "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate4L" %}
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $mem$$Address);
+    __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 // Replicate long (8 byte) scalar zero to be vector
 instruct Repl2L_zero(vecX dst, immL0 zero) %{
   predicate(n->as_Vector()->length() == 2);
@@ -2417,8 +3698,20 @@
   format %{ "vpxor   $dst,$dst,$dst\t! replicate4L zero" %}
   ins_encode %{
     // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it).
-    bool vector256 = true;
-    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
+    int vector_len = 1;
+    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl8L_zero(vecZ dst, immL0 zero) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (ReplicateL zero));
+  format %{ "vpxor   $dst k0,$dst,$dst\t! replicate8L zero" %}
+  ins_encode %{
+    // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
+    int vector_len = 2;
+    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
   %}
   ins_pipe( fpu_reg_reg );
 %}
@@ -2456,6 +3749,20 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct Repl16F(vecZ dst, regF src) %{
+  predicate(n->as_Vector()->length() == 16);
+  match(Set dst (ReplicateF src));
+  format %{ "pshufd  $dst,$src,0x00\n\t"
+            "vinsertf128h $dst,$dst,$dst\t! lower replicate8F\n\t"
+            "vinsertf64x4h $dst k0,$dst,$dst\t! lower replicate8F" %}
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinsertf64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 // Replicate float (4 byte) scalar zero to be vector
 instruct Repl2F_zero(vecD dst, immF0 zero) %{
   predicate(n->as_Vector()->length() == 2);
@@ -2482,8 +3789,19 @@
   match(Set dst (ReplicateF zero));
   format %{ "vxorps  $dst,$dst,$dst\t! replicate8F zero" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
+    int vector_len = 1;
+    __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl16F_zero(vecZ dst, immF0 zero) %{
+  predicate(n->as_Vector()->length() == 16);
+  match(Set dst (ReplicateF zero));
+  format %{ "vxorps  $dst k0,$dst,$dst\t! replicate16F zero" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
   %}
   ins_pipe( fpu_reg_reg );
 %}
@@ -2511,6 +3829,20 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct Repl8D(vecZ dst, regD src) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (ReplicateD src));
+  format %{ "pshufd  $dst,$src,0x44\n\t"
+            "vinsertf128h $dst,$dst,$dst\t! lower replicate4D\n\t"
+            "vinsertf64x4h $dst k0,$dst,$dst\t! upper replicate4D" %}
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinsertf64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 // Replicate double (8 byte) scalar zero to be vector
 instruct Repl2D_zero(vecX dst, immD0 zero) %{
   predicate(n->as_Vector()->length() == 2);
@@ -2527,8 +3859,19 @@
   match(Set dst (ReplicateD zero));
   format %{ "vxorpd  $dst,$dst,$dst,vect256\t! replicate4D zero" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
+    int vector_len = 1;
+    __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl8D_zero(vecZ dst, immD0 zero) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (ReplicateD zero));
+  format %{ "vxorpd  $dst k0,$dst,$dst,vect512\t! replicate8D zero" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
   %}
   ins_pipe( fpu_reg_reg );
 %}
@@ -2555,17 +3898,38 @@
 %}
 
 instruct rvadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
-  predicate(UseAVX > 0);
+  predicate(UseAVX > 0 && UseAVX < 3);
   match(Set dst (AddReductionVI src1 src2));
   effect(TEMP tmp, TEMP tmp2);
-  format %{ "vphaddd $tmp,$src2,$src2\n\t"
+  format %{ "vphaddd  $tmp,$src2,$src2\n\t"
+            "movd     $tmp2,$src1\n\t"
+            "vpaddd   $tmp2,$tmp2,$tmp\n\t"
+            "movd     $dst,$tmp2\t! add reduction2I" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
+    __ movdl($tmp2$$XMMRegister, $src1$$Register);
+    __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
+    __ movdl($dst$$Register, $tmp2$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rvadd2I_reduction_reg_evex(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
+  predicate(UseAVX > 2);
+  match(Set dst (AddReductionVI src1 src2));
+  effect(TEMP tmp, TEMP tmp2);
+  format %{ "pshufd  $tmp2,$src2,0x1\n\t"
+            "vpaddd  $tmp,$src2,$tmp2\n\t"
             "movd    $tmp2,$src1\n\t"
-            "vpaddd  $tmp2,$tmp2,$tmp\n\t"
+            "vpaddd  $tmp2,$tmp,$tmp2\n\t"
             "movd    $dst,$tmp2\t! add reduction2I" %}
   ins_encode %{
-    __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, false);
+    int vector_len = 0;
+    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
+    __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
     __ movdl($tmp2$$XMMRegister, $src1$$Register);
-    __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, false);
+    __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
     __ movdl($dst$$Register, $tmp2$$XMMRegister);
   %}
   ins_pipe( pipe_slow );
@@ -2593,47 +3957,203 @@
 %}
 
 instruct rvadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
-  predicate(UseAVX > 0);
+  predicate(UseAVX > 0 && UseAVX < 3);
   match(Set dst (AddReductionVI src1 src2));
   effect(TEMP tmp, TEMP tmp2);
-  format %{ "vphaddd $tmp,$src2,$src2\n\t"
-            "vphaddd $tmp,$tmp,$tmp2\n\t"
+  format %{ "vphaddd  $tmp,$src2,$src2\n\t"
+            "vphaddd  $tmp,$tmp,$tmp2\n\t"
+            "movd     $tmp2,$src1\n\t"
+            "vpaddd   $tmp2,$tmp2,$tmp\n\t"
+            "movd     $dst,$tmp2\t! add reduction4I" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
+    __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
+    __ movdl($tmp2$$XMMRegister, $src1$$Register);
+    __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
+    __ movdl($dst$$Register, $tmp2$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rvadd4I_reduction_reg_evex(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
+  predicate(UseAVX > 2);
+  match(Set dst (AddReductionVI src1 src2));
+  effect(TEMP tmp, TEMP tmp2);
+  format %{ "pshufd  $tmp2,$src2,0xE\n\t"
+            "vpaddd  $tmp,$src2,$tmp2\n\t"
+            "pshufd  $tmp2,$tmp,0x1\n\t"
+            "vpaddd  $tmp,$tmp,$tmp2\n\t"
             "movd    $tmp2,$src1\n\t"
-            "vpaddd  $tmp2,$tmp2,$tmp\n\t"
+            "vpaddd  $tmp2,$tmp,$tmp2\n\t"
             "movd    $dst,$tmp2\t! add reduction4I" %}
   ins_encode %{
-    __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, false);
-    __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false);
+    int vector_len = 0;
+    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
+    __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
+    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
+    __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
     __ movdl($tmp2$$XMMRegister, $src1$$Register);
-    __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, false);
+    __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
     __ movdl($dst$$Register, $tmp2$$XMMRegister);
   %}
   ins_pipe( pipe_slow );
 %}
 
 instruct rvadd8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
-  predicate(UseAVX > 0);
+  predicate(UseAVX > 0 && UseAVX < 3);
+  match(Set dst (AddReductionVI src1 src2));
+  effect(TEMP tmp, TEMP tmp2);
+  format %{ "vphaddd  $tmp,$src2,$src2\n\t"
+            "vphaddd  $tmp,$tmp,$tmp2\n\t"
+            "vextracti128  $tmp2,$tmp\n\t"
+            "vpaddd   $tmp,$tmp,$tmp2\n\t"
+            "movd     $tmp2,$src1\n\t"
+            "vpaddd   $tmp2,$tmp2,$tmp\n\t"
+            "movd     $dst,$tmp2\t! add reduction8I" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
+    __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
+    __ vextracti128h($tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
+    __ movdl($tmp2$$XMMRegister, $src1$$Register);
+    __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
+    __ movdl($dst$$Register, $tmp2$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rvadd8I_reduction_reg_evex(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
+  predicate(UseAVX > 2);
   match(Set dst (AddReductionVI src1 src2));
   effect(TEMP tmp, TEMP tmp2);
-  format %{ "vphaddd $tmp,$src2,$src2\n\t"
-            "vphaddd $tmp,$tmp,$tmp2\n\t"
-            "vextractf128  $tmp2,$tmp\n\t"
+  format %{ "vextracti128  $tmp,$src2\n\t"
+            "vpaddd  $tmp,$tmp,$src2\n\t"
+            "pshufd  $tmp2,$tmp,0xE\n\t"
+            "vpaddd  $tmp,$tmp,$tmp2\n\t"
+            "pshufd  $tmp2,$tmp,0x1\n\t"
+            "vpaddd  $tmp,$tmp,$tmp2\n\t"
+            "movd    $tmp2,$src1\n\t"
+            "vpaddd  $tmp2,$tmp,$tmp2\n\t"
+            "movd    $dst,$tmp2\t! add reduction8I" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vextracti128h($tmp$$XMMRegister, $src2$$XMMRegister);
+    __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
+    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
+    __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
+    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
+    __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
+    __ movdl($tmp2$$XMMRegister, $src1$$Register);
+    __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
+    __ movdl($dst$$Register, $tmp2$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rvadd16I_reduction_reg_evex(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
+  predicate(UseAVX > 2);
+  match(Set dst (AddReductionVI src1 src2));
+  effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
+  format %{ "vextracti64x4  $tmp3,$src2\n\t"
+            "vpaddd  $tmp3,$tmp3,$src2\n\t"
+            "vextracti128   $tmp,$tmp3\n\t"
+            "vpaddd  $tmp,$tmp,$tmp3\n\t"
+            "pshufd  $tmp2,$tmp,0xE\n\t"
+            "vpaddd  $tmp,$tmp,$tmp2\n\t"
+            "pshufd  $tmp2,$tmp,0x1\n\t"
             "vpaddd  $tmp,$tmp,$tmp2\n\t"
             "movd    $tmp2,$src1\n\t"
-            "vpaddd  $tmp2,$tmp2,$tmp\n\t"
-            "movd    $dst,$tmp2\t! add reduction8I" %}
-  ins_encode %{
-    __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, true);
-    __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, true);
-    __ vextractf128h($tmp2$$XMMRegister, $tmp$$XMMRegister);
-    __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false);
+            "vpaddd  $tmp2,$tmp,$tmp2\n\t"
+            "movd    $dst,$tmp2\t! mul reduction16I" %}
+  ins_encode %{
+    __ vextracti64x4h($tmp3$$XMMRegister, $src2$$XMMRegister);
+    __ vpaddd($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
+    __ vextracti128h($tmp$$XMMRegister, $tmp3$$XMMRegister);
+    __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
+    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
+    __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
+    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
+    __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
     __ movdl($tmp2$$XMMRegister, $src1$$Register);
-    __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, false);
+    __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
     __ movdl($dst$$Register, $tmp2$$XMMRegister);
   %}
   ins_pipe( pipe_slow );
 %}
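The AddReductionVI patterns above repeatedly extract and add halves of the vector until a single lane remains, then fold in the scalar input. The scalar model below shows the value they are expected to produce (src1 plus the sum of the int lanes); it is illustrative only, with num_lanes standing for the 2/4/8/16 lane counts handled above.

// Scalar model of AddReductionVI: dst = src1 + sum of the vector's int lanes.
// num_lanes is 2, 4, 8 or 16 depending on the matched vector size. Illustrative only.
static int add_reduction_vi(int src1, const int* src2, int num_lanes) {
  int acc = src1;
  for (int i = 0; i < num_lanes; i++) {
    acc += src2[i];
  }
  return acc;
}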
 
+#ifdef _LP64
+instruct rvadd2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, regF tmp, regF tmp2) %{
+  predicate(UseAVX > 2);
+  match(Set dst (AddReductionVL src1 src2));
+  effect(TEMP tmp, TEMP tmp2);
+  format %{ "pshufd  $tmp2,$src2,0xE\n\t"
+            "vpaddq  $tmp,$src2,$tmp2\n\t"
+            "movdq   $tmp2,$src1\n\t"
+            "vpaddq  $tmp2,$tmp,$tmp2\n\t"
+            "movdq   $dst,$tmp2\t! add reduction2L" %}
+  ins_encode %{
+    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
+    __ vpaddq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
+    __ movdq($tmp2$$XMMRegister, $src1$$Register);
+    __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
+    __ movdq($dst$$Register, $tmp2$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rvadd4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, regF tmp, regF tmp2) %{
+  predicate(UseAVX > 2);
+  match(Set dst (AddReductionVL src1 src2));
+  effect(TEMP tmp, TEMP tmp2);
+  format %{ "vextracti64x2  $tmp,$src2, 0x1\n\t"
+            "vpaddq  $tmp2,$tmp,$src2\n\t"
+            "pshufd  $tmp,$tmp2,0xE\n\t"
+            "vpaddq  $tmp2,$tmp2,$tmp\n\t"
+            "movdq   $tmp,$src1\n\t"
+            "vpaddq  $tmp2,$tmp2,$tmp\n\t"
+            "movdq   $dst,$tmp2\t! add reduction4L" %}
+  ins_encode %{
+    __ vextracti64x2h($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
+    __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
+    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
+    __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
+    __ movdq($tmp$$XMMRegister, $src1$$Register);
+    __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
+    __ movdq($dst$$Register, $tmp2$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rvadd8L_reduction_reg(rRegL dst, rRegL src1, vecZ src2, regF tmp, regF tmp2) %{
+  predicate(UseAVX > 2);
+  match(Set dst (AddReductionVL src1 src2));
+  effect(TEMP tmp, TEMP tmp2);
+  format %{ "vextracti64x4  $tmp2,$src2\n\t"
+            "vpaddq  $tmp2,$tmp2,$src2\n\t"
+            "vextracti128   $tmp,$tmp2\n\t"
+            "vpaddq  $tmp2,$tmp2,$tmp\n\t"
+            "pshufd  $tmp,$tmp2,0xE\n\t"
+            "vpaddq  $tmp2,$tmp2,$tmp\n\t"
+            "movdq   $tmp,$src1\n\t"
+            "vpaddq  $tmp2,$tmp2,$tmp\n\t"
+            "movdq   $dst,$tmp2\t! add reduction8L" %}
+  ins_encode %{
+    __ vextracti64x4h($tmp2$$XMMRegister, $src2$$XMMRegister);
+    __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
+    __ vextracti128h($tmp$$XMMRegister, $tmp2$$XMMRegister);
+    __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
+    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
+    __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
+    __ movdq($tmp$$XMMRegister, $src1$$Register);
+    __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
+    __ movdq($dst$$Register, $tmp2$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+#endif
+
 instruct rsadd2F_reduction_reg(regF dst, regF src1, vecD src2, regF tmp, regF tmp2) %{
   predicate(UseSSE >= 1 && UseAVX == 0);
   match(Set dst (AddReductionVF src1 src2));
@@ -2757,6 +4277,77 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct radd16F_reduction_reg(regF dst, regF src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
+  predicate(UseAVX > 2);
+  match(Set dst (AddReductionVF src1 src2));
+  effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
+  format %{ "vaddss  $tmp2,$src1,$src2\n\t"
+            "pshufd  $tmp,$src2,0x01\n\t"
+            "vaddss  $tmp2,$tmp2,$tmp\n\t"
+            "pshufd  $tmp,$src2,0x02\n\t"
+            "vaddss  $tmp2,$tmp2,$tmp\n\t"
+            "pshufd  $tmp,$src2,0x03\n\t"
+            "vaddss  $tmp2,$tmp2,$tmp\n\t"
+            "vextractf64x2  $tmp3,$src2, 0x1\n\t"
+            "vaddss  $tmp2,$tmp2,$tmp3\n\t"
+            "pshufd  $tmp,$tmp3,0x01\n\t"
+            "vaddss  $tmp2,$tmp2,$tmp\n\t"
+            "pshufd  $tmp,$tmp3,0x02\n\t"
+            "vaddss  $tmp2,$tmp2,$tmp\n\t"
+            "pshufd  $tmp,$tmp3,0x03\n\t"
+            "vaddss  $tmp2,$tmp2,$tmp\n\t"
+            "vextractf64x2  $tmp3,$src2, 0x2\n\t"
+            "vaddss  $tmp2,$tmp2,$tmp3\n\t"
+            "pshufd  $tmp,$tmp3,0x01\n\t"
+            "vaddss  $tmp2,$tmp2,$tmp\n\t"
+            "pshufd  $tmp,$tmp3,0x02\n\t"
+            "vaddss  $tmp2,$tmp2,$tmp\n\t"
+            "pshufd  $tmp,$tmp3,0x03\n\t"
+            "vaddss  $tmp2,$tmp2,$tmp\n\t"
+            "vextractf64x2  $tmp3,$src2, 0x3\n\t"
+            "vaddss  $tmp2,$tmp2,$tmp3\n\t"
+            "pshufd  $tmp,$tmp3,0x01\n\t"
+            "vaddss  $tmp2,$tmp2,$tmp\n\t"
+            "pshufd  $tmp,$tmp3,0x02\n\t"
+            "vaddss  $tmp2,$tmp2,$tmp\n\t"
+            "pshufd  $tmp,$tmp3,0x03\n\t"
+            "vaddss  $dst,$tmp2,$tmp\t! add reduction16F" %}
+  ins_encode %{
+    __ vaddss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
+    __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
+    __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
+    __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ vextractf32x4h($tmp3$$XMMRegister, $src2$$XMMRegister, 0x1);
+    __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x01);
+    __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x02);
+    __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x03);
+    __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ vextractf32x4h($tmp3$$XMMRegister, $src2$$XMMRegister, 0x2);
+    __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x01);
+    __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x02);
+    __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x03);
+    __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ vextractf32x4h($tmp3$$XMMRegister, $src2$$XMMRegister, 0x3);
+    __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x01);
+    __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x02);
+    __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x03);
+    __ vaddss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
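Unlike the integer reductions, the float reduction above accumulates strictly one lane at a time with vaddss, so its rounding matches a sequential scalar loop rather than a pairwise tree. A minimal model of that ordering is sketched below; it is illustrative only and assumes the same left-to-right association as the vaddss chain.

// Scalar model of AddReductionVF: lanes are folded in order starting from src1,
// matching the sequential vaddss chain above. Illustrative only.
static float add_reduction_vf(float src1, const float* src2, int num_lanes) {
  float acc = src1;
  for (int i = 0; i < num_lanes; i++) {
    acc += src2[i];  // same left-to-right association as the emitted code
  }
  return acc;
}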
 instruct rsadd2D_reduction_reg(regD dst, regD src1, vecX src2, regD tmp) %{
   predicate(UseSSE >= 1 && UseAVX == 0);
   match(Set dst (AddReductionVD src1 src2));
@@ -2812,6 +4403,45 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct rvadd8D_reduction_reg(regD dst, regD src1, vecZ src2, regD tmp, regD tmp2, regD tmp3) %{
+  predicate(UseAVX > 2);
+  match(Set dst (AddReductionVD src1 src2));
+  effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
+  format %{ "vaddsd  $tmp2,$src1,$src2\n\t"
+            "pshufd  $tmp,$src2,0xE\n\t"
+            "vaddsd  $tmp2,$tmp2,$tmp\n\t"
+            "vextractf64x2  $tmp3,$src2, 0x1\n\t"
+            "vaddsd  $tmp2,$tmp2,$tmp3\n\t"
+            "pshufd  $tmp,$tmp3,0xE\n\t"
+            "vaddsd  $tmp2,$tmp2,$tmp\n\t"
+            "vextractf64x2  $tmp3,$src2, 0x2\n\t"
+            "vaddsd  $tmp2,$tmp2,$tmp3\n\t"
+            "pshufd  $tmp,$tmp3,0xE\n\t"
+            "vaddsd  $tmp2,$tmp2,$tmp\n\t"
+            "vextractf64x2  $tmp3,$src2, 0x3\n\t"
+            "vaddsd  $tmp2,$tmp2,$tmp3\n\t"
+            "pshufd  $tmp,$tmp3,0xE\n\t"
+            "vaddsd  $dst,$tmp2,$tmp\t! add reduction8D" %}
+  ins_encode %{
+    __ vaddsd($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
+    __ vaddsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ vextractf64x2h($tmp3$$XMMRegister, $src2$$XMMRegister, 0x1);
+    __ vaddsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0xE);
+    __ vaddsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ vextractf64x2h($tmp3$$XMMRegister, $src2$$XMMRegister, 0x2);
+    __ vaddsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0xE);
+    __ vaddsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ vextractf64x2h($tmp3$$XMMRegister, $src2$$XMMRegister, 0x3);
+    __ vaddsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0xE);
+    __ vaddsd($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct rsmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
   predicate(UseSSE > 3 && UseAVX == 0);
   match(Set dst (MulReductionVI src1 src2));
@@ -2835,16 +4465,17 @@
   predicate(UseAVX > 0);
   match(Set dst (MulReductionVI src1 src2));
   effect(TEMP tmp, TEMP tmp2);
-  format %{ "pshufd  $tmp2,$src2,0x1\n\t"
-            "vpmulld $tmp,$src2,$tmp2\n\t"
-            "movd    $tmp2,$src1\n\t"
-            "vpmulld $tmp2,$tmp,$tmp2\n\t"
-            "movd    $dst,$tmp2\t! mul reduction2I" %}
-  ins_encode %{
+  format %{ "pshufd   $tmp2,$src2,0x1\n\t"
+            "vpmulld  $tmp,$src2,$tmp2\n\t"
+            "movd     $tmp2,$src1\n\t"
+            "vpmulld  $tmp2,$tmp,$tmp2\n\t"
+            "movd     $dst,$tmp2\t! mul reduction2I" %}
+  ins_encode %{
+    int vector_len = 0;
     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
-    __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, false);
+    __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
     __ movdl($tmp2$$XMMRegister, $src1$$Register);
-    __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false);
+    __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
     __ movdl($dst$$Register, $tmp2$$XMMRegister);
   %}
   ins_pipe( pipe_slow );
@@ -2877,20 +4508,21 @@
   predicate(UseAVX > 0);
   match(Set dst (MulReductionVI src1 src2));
   effect(TEMP tmp, TEMP tmp2);
-  format %{ "pshufd  $tmp2,$src2,0xE\n\t"
-            "vpmulld $tmp,$src2,$tmp2\n\t"
-            "pshufd  $tmp2,$tmp,0x1\n\t"
-            "vpmulld $tmp,$tmp,$tmp2\n\t"
-            "movd    $tmp2,$src1\n\t"
-            "vpmulld $tmp2,$tmp,$tmp2\n\t"
-            "movd    $dst,$tmp2\t! mul reduction4I" %}
-  ins_encode %{
+  format %{ "pshufd   $tmp2,$src2,0xE\n\t"
+            "vpmulld  $tmp,$src2,$tmp2\n\t"
+            "pshufd   $tmp2,$tmp,0x1\n\t"
+            "vpmulld  $tmp,$tmp,$tmp2\n\t"
+            "movd     $tmp2,$src1\n\t"
+            "vpmulld  $tmp2,$tmp,$tmp2\n\t"
+            "movd     $dst,$tmp2\t! mul reduction4I" %}
+  ins_encode %{
+    int vector_len = 0;
     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
-    __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, false);
+    __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
-    __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false);
+    __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
     __ movdl($tmp2$$XMMRegister, $src1$$Register);
-    __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false);
+    __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
     __ movdl($dst$$Register, $tmp2$$XMMRegister);
   %}
   ins_pipe( pipe_slow );
@@ -2900,30 +4532,133 @@
   predicate(UseAVX > 0);
   match(Set dst (MulReductionVI src1 src2));
   effect(TEMP tmp, TEMP tmp2);
-  format %{ "vextractf128  $tmp,$src2\n\t"
-            "vpmulld $tmp,$tmp,$src2\n\t"
-            "pshufd  $tmp2,$tmp,0xE\n\t"
-            "vpmulld $tmp,$tmp,$tmp2\n\t"
-            "pshufd  $tmp2,$tmp,0x1\n\t"
-            "vpmulld $tmp,$tmp,$tmp2\n\t"
-            "movd    $tmp2,$src1\n\t"
-            "vpmulld $tmp2,$tmp,$tmp2\n\t"
-            "movd    $dst,$tmp2\t! mul reduction8I" %}
-  ins_encode %{
-    __ vextractf128h($tmp$$XMMRegister, $src2$$XMMRegister);
-    __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, false);
+  format %{ "vextracti128  $tmp,$src2\n\t"
+            "vpmulld  $tmp,$tmp,$src2\n\t"
+            "pshufd   $tmp2,$tmp,0xE\n\t"
+            "vpmulld  $tmp,$tmp,$tmp2\n\t"
+            "pshufd   $tmp2,$tmp,0x1\n\t"
+            "vpmulld  $tmp,$tmp,$tmp2\n\t"
+            "movd     $tmp2,$src1\n\t"
+            "vpmulld  $tmp2,$tmp,$tmp2\n\t"
+            "movd     $dst,$tmp2\t! mul reduction8I" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vextracti128h($tmp$$XMMRegister, $src2$$XMMRegister);
+    __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
+    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
+    __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
+    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
+    __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
+    __ movdl($tmp2$$XMMRegister, $src1$$Register);
+    __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
+    __ movdl($dst$$Register, $tmp2$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rvmul16I_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
+  predicate(UseAVX > 2);
+  match(Set dst (MulReductionVI src1 src2));
+  effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
+  format %{ "vextracti64x4  $tmp3,$src2\n\t"
+            "vpmulld  $tmp3,$tmp3,$src2\n\t"
+            "vextracti128   $tmp,$tmp3\n\t"
+            "vpmulld  $tmp,$tmp,$src2\n\t"
+            "pshufd   $tmp2,$tmp,0xE\n\t"
+            "vpmulld  $tmp,$tmp,$tmp2\n\t"
+            "pshufd   $tmp2,$tmp,0x1\n\t"
+            "vpmulld  $tmp,$tmp,$tmp2\n\t"
+            "movd     $tmp2,$src1\n\t"
+            "vpmulld  $tmp2,$tmp,$tmp2\n\t"
+            "movd     $dst,$tmp2\t! mul reduction16I" %}
+  ins_encode %{
+    __ vextracti64x4h($tmp3$$XMMRegister, $src2$$XMMRegister);
+    __ vpmulld($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
+    __ vextracti128h($tmp$$XMMRegister, $tmp3$$XMMRegister);
+    __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
-    __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false);
+    __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
-    __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false);
+    __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
     __ movdl($tmp2$$XMMRegister, $src1$$Register);
-    __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false);
+    __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
     __ movdl($dst$$Register, $tmp2$$XMMRegister);
   %}
   ins_pipe( pipe_slow );
 %}
 
-instruct rsmul2F_reduction_reg(regF dst, regF src1, vecD src2, regF tmp, regF tmp2) %{
+#ifdef _LP64
+instruct rvmul2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, regF tmp, regF tmp2) %{
+  predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
+  match(Set dst (MulReductionVL src1 src2));
+  effect(TEMP tmp, TEMP tmp2);
+  format %{ "pshufd   $tmp2,$src2,0xE\n\t"
+            "vpmullq  $tmp,$src2,$tmp2\n\t"
+            "movdq    $tmp2,$src1\n\t"
+            "vpmullq  $tmp2,$tmp,$tmp2\n\t"
+            "movdq    $dst,$tmp2\t! mul reduction2L" %}
+  ins_encode %{
+    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
+    __ vpmullq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
+    __ movdq($tmp2$$XMMRegister, $src1$$Register);
+    __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
+    __ movdq($dst$$Register, $tmp2$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rvmul4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, regF tmp, regF tmp2) %{
+  predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
+  match(Set dst (MulReductionVL src1 src2));
+  effect(TEMP tmp, TEMP tmp2);
+  format %{ "vextracti64x2  $tmp,$src2, 0x1\n\t"
+            "vpmullq  $tmp2,$tmp,$src2\n\t"
+            "pshufd   $tmp,$tmp2,0xE\n\t"
+            "vpmullq  $tmp2,$tmp2,$tmp\n\t"
+            "movdq    $tmp,$src1\n\t"
+            "vpmullq  $tmp2,$tmp2,$tmp\n\t"
+            "movdq    $dst,$tmp2\t! mul reduction4L" %}
+  ins_encode %{
+    __ vextracti64x2h($tmp$$XMMRegister, $src2$$XMMRegister, 0x1);
+    __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
+    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
+    __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
+    __ movdq($tmp$$XMMRegister, $src1$$Register);
+    __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
+    __ movdq($dst$$Register, $tmp2$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rvmul8L_reduction_reg(rRegL dst, rRegL src1, vecZ src2, regF tmp, regF tmp2) %{
+  predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
+  match(Set dst (MulReductionVL src1 src2));
+  effect(TEMP tmp, TEMP tmp2);
+  format %{ "vextracti64x4  $tmp2,$src2\n\t"
+            "vpmullq  $tmp2,$tmp2,$src2\n\t"
+            "vextracti128   $tmp,$tmp2\n\t"
+            "vpmullq  $tmp2,$tmp2,$tmp\n\t"
+            "pshufd   $tmp,$tmp2,0xE\n\t"
+            "vpmullq  $tmp2,$tmp2,$tmp\n\t"
+            "movdq    $tmp,$src1\n\t"
+            "vpmullq  $tmp2,$tmp2,$tmp\n\t"
+            "movdq    $dst,$tmp2\t! mul reduction8L" %}
+  ins_encode %{
+    __ vextracti64x4h($tmp2$$XMMRegister, $src2$$XMMRegister);
+    __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
+    __ vextracti128h($tmp$$XMMRegister, $tmp2$$XMMRegister);
+    __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
+    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
+    __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
+    __ movdq($tmp$$XMMRegister, $src1$$Register);
+    __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
+    __ movdq($dst$$Register, $tmp2$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+#endif
+
+instruct rsmul2F_reduction(regF dst, regF src1, vecD src2, regF tmp, regF tmp2) %{
   predicate(UseSSE >= 1 && UseAVX == 0);
   match(Set dst (MulReductionVF src1 src2));
   effect(TEMP tmp, TEMP tmp2);
@@ -2931,7 +4666,7 @@
             "mulss   $tmp,$src2\n\t"
             "pshufd  $tmp2,$src2,0x01\n\t"
             "mulss   $tmp,$tmp2\n\t"
-            "movdqu  $dst,$tmp\t! add reduction2F" %}
+            "movdqu  $dst,$tmp\t! mul reduction2F" %}
   ins_encode %{
     __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister);
     __ mulss($tmp$$XMMRegister, $src2$$XMMRegister);
@@ -2948,7 +4683,7 @@
   effect(TEMP tmp, TEMP tmp2);
   format %{ "vmulss  $tmp2,$src1,$src2\n\t"
             "pshufd  $tmp,$src2,0x01\n\t"
-            "vmulss  $dst,$tmp2,$tmp\t! add reduction2F" %}
+            "vmulss  $dst,$tmp2,$tmp\t! mul reduction2F" %}
   ins_encode %{
     __ vmulss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
@@ -2969,7 +4704,7 @@
             "mulss   $tmp,$tmp2\n\t"
             "pshufd  $tmp2,$src2,0x03\n\t"
             "mulss   $tmp,$tmp2\n\t"
-            "movdqu  $dst,$tmp\t! add reduction4F" %}
+            "movdqu  $dst,$tmp\t! mul reduction4F" %}
   ins_encode %{
     __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister);
     __ mulss($tmp$$XMMRegister, $src2$$XMMRegister);
@@ -2994,7 +4729,7 @@
             "pshufd  $tmp,$src2,0x02\n\t"
             "vmulss  $tmp2,$tmp2,$tmp\n\t"
             "pshufd  $tmp,$src2,0x03\n\t"
-            "vmulss  $dst,$tmp2,$tmp\t! add reduction4F" %}
+            "vmulss  $dst,$tmp2,$tmp\t! mul reduction4F" %}
   ins_encode %{
     __ vmulss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
@@ -3046,6 +4781,77 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct rvmul16F_reduction_reg(regF dst, regF src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
+  predicate(UseAVX > 2);
+  match(Set dst (MulReductionVF src1 src2));
+  effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
+  format %{ "vmulss  $tmp2,$src1,$src2\n\t"
+            "pshufd  $tmp,$src2,0x01\n\t"
+            "vmulss  $tmp2,$tmp2,$tmp\n\t"
+            "pshufd  $tmp,$src2,0x02\n\t"
+            "vmulss  $tmp2,$tmp2,$tmp\n\t"
+            "pshufd  $tmp,$src2,0x03\n\t"
+            "vmulss  $tmp2,$tmp2,$tmp\n\t"
+            "vextractf32x4  $tmp3,$src2, 0x1\n\t"
+            "vmulss  $tmp2,$tmp2,$tmp3\n\t"
+            "pshufd  $tmp,$tmp3,0x01\n\t"
+            "vmulss  $tmp2,$tmp2,$tmp\n\t"
+            "pshufd  $tmp,$tmp3,0x02\n\t"
+            "vmulss  $tmp2,$tmp2,$tmp\n\t"
+            "pshufd  $tmp,$tmp3,0x03\n\t"
+            "vmulss  $tmp2,$tmp2,$tmp\n\t"
+            "vextractf32x4  $tmp3,$src2, 0x2\n\t"
+            "vmulss  $tmp2,$tmp2,$tmp3\n\t"
+            "pshufd  $tmp,$tmp3,0x01\n\t"
+            "vmulss  $tmp2,$tmp2,$tmp\n\t"
+            "pshufd  $tmp,$tmp3,0x02\n\t"
+            "vmulss  $tmp2,$tmp2,$tmp\n\t"
+            "pshufd  $tmp,$tmp3,0x03\n\t"
+            "vmulss  $tmp2,$tmp2,$tmp\n\t"
+            "vextractf32x4  $tmp3,$src2, 0x3\n\t"
+            "vmulss  $tmp2,$tmp2,$tmp3\n\t"
+            "pshufd  $tmp,$tmp3,0x01\n\t"
+            "vmulss  $tmp2,$tmp2,$tmp\n\t"
+            "pshufd  $tmp,$tmp3,0x02\n\t"
+            "vmulss  $tmp2,$tmp2,$tmp\n\t"
+            "pshufd  $tmp,$tmp3,0x03\n\t"
+            "vmulss  $dst,$tmp2,$tmp\t! mul reduction16F" %}
+  ins_encode %{
+    __ vmulss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
+    __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
+    __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
+    __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ vextractf32x4h($tmp3$$XMMRegister, $src2$$XMMRegister, 0x1);
+    __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x01);
+    __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x02);
+    __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x03);
+    __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ vextractf32x4h($tmp3$$XMMRegister, $src2$$XMMRegister, 0x2);
+    __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x01);
+    __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x02);
+    __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x03);
+    __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ vextractf32x4h($tmp3$$XMMRegister, $src2$$XMMRegister, 0x3);
+    __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x01);
+    __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x02);
+    __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x03);
+    __ vmulss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct rsmul2D_reduction_reg(regD dst, regD src1, vecX src2, regD tmp) %{
   predicate(UseSSE >= 1 && UseAVX == 0);
   match(Set dst (MulReductionVD src1 src2));
@@ -3053,7 +4859,7 @@
   format %{ "movdqu  $tmp,$src1\n\t"
             "mulsd   $tmp,$src2\n\t"
             "pshufd  $dst,$src2,0xE\n\t"
-            "mulsd   $dst,$tmp\t! add reduction2D" %}
+            "mulsd   $dst,$tmp\t! mul reduction2D" %}
   ins_encode %{
     __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister);
     __ mulsd($tmp$$XMMRegister, $src2$$XMMRegister);
@@ -3101,6 +4907,45 @@
   ins_pipe( pipe_slow );
 %}
 
+instruct rvmul8D_reduction_reg(regD dst, regD src1, vecZ src2, regD tmp, regD tmp2, regD tmp3) %{
+  predicate(UseAVX > 2);
+  match(Set dst (MulReductionVD src1 src2));
+  effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
+  format %{ "vmulsd  $tmp2,$src1,$src2\n\t"
+            "pshufd  $tmp,$src2,0xE\n\t"
+            "vmulsd  $tmp2,$tmp2,$tmp\n\t"
+            "vextractf64x2  $tmp3,$src2, 0x1\n\t"
+            "vmulsd  $tmp2,$tmp2,$tmp3\n\t"
+            "pshufd  $tmp,$src2,0xE\n\t"
+            "vmulsd  $tmp2,$tmp2,$tmp\n\t"
+            "vextractf64x2  $tmp3,$src2, 0x2\n\t"
+            "vmulsd  $tmp2,$tmp2,$tmp3\n\t"
+            "pshufd  $tmp,$tmp3,0xE\n\t"
+            "vmulsd  $tmp2,$tmp2,$tmp\n\t"
+            "vextractf64x2  $tmp3,$src2, 0x3\n\t"
+            "vmulsd  $tmp2,$tmp2,$tmp3\n\t"
+            "pshufd  $tmp,$tmp3,0xE\n\t"
+            "vmulsd  $dst,$tmp2,$tmp\t! mul reduction8D" %}
+  ins_encode %{
+    __ vmulsd($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
+    __ vmulsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ vextractf64x2h($tmp3$$XMMRegister, $src2$$XMMRegister, 0x1);
+    __ vmulsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0xE);
+    __ vmulsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ vextractf64x2h($tmp3$$XMMRegister, $src2$$XMMRegister, 0x2);
+    __ vmulsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0xE);
+    __ vmulsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ vextractf64x2h($tmp3$$XMMRegister, $src2$$XMMRegister, 0x3);
+    __ vmulsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0xE);
+    __ vmulsd($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 // ====================VECTOR ARITHMETIC=======================================
 
 // --------------------------------- ADD --------------------------------------
@@ -3121,8 +4966,8 @@
   match(Set dst (AddVB src1 src2));
   format %{ "vpaddb  $dst,$src1,$src2\t! add packed4B" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3142,8 +4987,8 @@
   match(Set dst (AddVB src1 src2));
   format %{ "vpaddb  $dst,$src1,$src2\t! add packed8B" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3163,8 +5008,8 @@
   match(Set dst (AddVB src1 src2));
   format %{ "vpaddb  $dst,$src1,$src2\t! add packed16B" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3174,8 +5019,8 @@
   match(Set dst (AddVB src (LoadVector mem)));
   format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+    int vector_len = 0;
+    __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3185,8 +5030,8 @@
   match(Set dst (AddVB src1 src2));
   format %{ "vpaddb  $dst,$src1,$src2\t! add packed32B" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 1;
+    __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3196,8 +5041,30 @@
   match(Set dst (AddVB src (LoadVector mem)));
   format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+    int vector_len = 1;
+    __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 64);
+  match(Set dst (AddVB src1 src2));
+  format %{ "vpaddb  $dst,$src1,$src2\t! add packed64B" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd64B_mem(vecZ dst, vecZ src, memory mem) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 64);
+  match(Set dst (AddVB src (LoadVector mem)));
+  format %{ "vpaddb  $dst,$src,$mem\t! add packed64B" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3218,8 +5085,8 @@
   match(Set dst (AddVS src1 src2));
   format %{ "vpaddw  $dst,$src1,$src2\t! add packed2S" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3239,8 +5106,8 @@
   match(Set dst (AddVS src1 src2));
   format %{ "vpaddw  $dst,$src1,$src2\t! add packed4S" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3260,8 +5127,8 @@
   match(Set dst (AddVS src1 src2));
   format %{ "vpaddw  $dst,$src1,$src2\t! add packed8S" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3271,8 +5138,8 @@
   match(Set dst (AddVS src (LoadVector mem)));
   format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+    int vector_len = 0;
+    __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3282,8 +5149,8 @@
   match(Set dst (AddVS src1 src2));
   format %{ "vpaddw  $dst,$src1,$src2\t! add packed16S" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 1;
+    __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3293,8 +5160,30 @@
   match(Set dst (AddVS src (LoadVector mem)));
   format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+    int vector_len = 1;
+    __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 32);
+  match(Set dst (AddVS src1 src2));
+  format %{ "vpaddw  $dst,$src1,$src2\t! add packed32S" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd32S_mem(vecZ dst, vecZ src, memory mem) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 32);
+  match(Set dst (AddVS src (LoadVector mem)));
+  format %{ "vpaddw  $dst,$src,$mem\t! add packed32S" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3315,8 +5204,8 @@
   match(Set dst (AddVI src1 src2));
   format %{ "vpaddd  $dst,$src1,$src2\t! add packed2I" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3336,8 +5225,8 @@
   match(Set dst (AddVI src1 src2));
   format %{ "vpaddd  $dst,$src1,$src2\t! add packed4I" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3347,8 +5236,8 @@
   match(Set dst (AddVI src (LoadVector mem)));
   format %{ "vpaddd  $dst,$src,$mem\t! add packed4I" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+    int vector_len = 0;
+    __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3358,8 +5247,8 @@
   match(Set dst (AddVI src1 src2));
   format %{ "vpaddd  $dst,$src1,$src2\t! add packed8I" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 1;
+    __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3369,8 +5258,30 @@
   match(Set dst (AddVI src (LoadVector mem)));
   format %{ "vpaddd  $dst,$src,$mem\t! add packed8I" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+    int vector_len = 1;
+    __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
+  match(Set dst (AddVI src1 src2));
+  format %{ "vpaddd  $dst,$src1,$src2\t! add packed16I" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd16I_mem(vecZ dst, vecZ src, memory mem) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
+  match(Set dst (AddVI src (LoadVector mem)));
+  format %{ "vpaddd  $dst,$src,$mem\t! add packed16I" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3391,8 +5302,8 @@
   match(Set dst (AddVL src1 src2));
   format %{ "vpaddq  $dst,$src1,$src2\t! add packed2L" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3402,8 +5313,8 @@
   match(Set dst (AddVL src (LoadVector mem)));
   format %{ "vpaddq  $dst,$src,$mem\t! add packed2L" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+    int vector_len = 0;
+    __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3413,8 +5324,8 @@
   match(Set dst (AddVL src1 src2));
   format %{ "vpaddq  $dst,$src1,$src2\t! add packed4L" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 1;
+    __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3424,8 +5335,30 @@
   match(Set dst (AddVL src (LoadVector mem)));
   format %{ "vpaddq  $dst,$src,$mem\t! add packed4L" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+    int vector_len = 1;
+    __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
+  match(Set dst (AddVL src1 src2));
+  format %{ "vpaddq  $dst,$src1,$src2\t! add packed8L" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd8L_mem(vecZ dst, vecZ src, memory mem) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
+  match(Set dst (AddVL src (LoadVector mem)));
+  format %{ "vpaddq  $dst,$src,$mem\t! add packed8L" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3446,8 +5379,8 @@
   match(Set dst (AddVF src1 src2));
   format %{ "vaddps  $dst,$src1,$src2\t! add packed2F" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3467,8 +5400,8 @@
   match(Set dst (AddVF src1 src2));
   format %{ "vaddps  $dst,$src1,$src2\t! add packed4F" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3478,8 +5411,8 @@
   match(Set dst (AddVF src (LoadVector mem)));
   format %{ "vaddps  $dst,$src,$mem\t! add packed4F" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+    int vector_len = 0;
+    __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3489,8 +5422,8 @@
   match(Set dst (AddVF src1 src2));
   format %{ "vaddps  $dst,$src1,$src2\t! add packed8F" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 1;
+    __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3500,8 +5433,30 @@
   match(Set dst (AddVF src (LoadVector mem)));
   format %{ "vaddps  $dst,$src,$mem\t! add packed8F" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+    int vector_len = 1;
+    __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
+  match(Set dst (AddVF src1 src2));
+  format %{ "vaddps  $dst,$src1,$src2\t! add packed16F" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd16F_mem(vecZ dst, vecZ src, memory mem) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
+  match(Set dst (AddVF src (LoadVector mem)));
+  format %{ "vaddps  $dst,$src,$mem\t! add packed16F" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3522,8 +5477,8 @@
   match(Set dst (AddVD src1 src2));
   format %{ "vaddpd  $dst,$src1,$src2\t! add packed2D" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3533,8 +5488,8 @@
   match(Set dst (AddVD src (LoadVector mem)));
   format %{ "vaddpd  $dst,$src,$mem\t! add packed2D" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+    int vector_len = 0;
+    __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3544,8 +5499,8 @@
   match(Set dst (AddVD src1 src2));
   format %{ "vaddpd  $dst,$src1,$src2\t! add packed4D" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 1;
+    __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3555,8 +5510,30 @@
   match(Set dst (AddVD src (LoadVector mem)));
   format %{ "vaddpd  $dst,$src,$mem\t! add packed4D" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+    int vector_len = 1;
+    __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
+  match(Set dst (AddVD src1 src2));
+  format %{ "vaddpd  $dst,$src1,$src2\t! add packed8D" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vadd8D_mem(vecZ dst, vecZ src, memory mem) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
+  match(Set dst (AddVD src (LoadVector mem)));
+  format %{ "vaddpd  $dst,$src,$mem\t! add packed8D" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3579,8 +5556,8 @@
   match(Set dst (SubVB src1 src2));
   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3600,8 +5577,8 @@
   match(Set dst (SubVB src1 src2));
   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3621,8 +5598,8 @@
   match(Set dst (SubVB src1 src2));
   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3632,8 +5609,8 @@
   match(Set dst (SubVB src (LoadVector mem)));
   format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+    int vector_len = 0;
+    __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3643,8 +5620,8 @@
   match(Set dst (SubVB src1 src2));
   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 1;
+    __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3654,8 +5631,30 @@
   match(Set dst (SubVB src (LoadVector mem)));
   format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+    int vector_len = 1;
+    __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsub64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 64);
+  match(Set dst (SubVB src1 src2));
+  format %{ "vpsubb  $dst,$src1,$src2\t! sub packed64B" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsub64B_mem(vecZ dst, vecZ src, memory mem) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 64);
+  match(Set dst (SubVB src (LoadVector mem)));
+  format %{ "vpsubb  $dst,$src,$mem\t! sub packed64B" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3676,8 +5675,8 @@
   match(Set dst (SubVS src1 src2));
   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3697,8 +5696,8 @@
   match(Set dst (SubVS src1 src2));
   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3718,8 +5717,8 @@
   match(Set dst (SubVS src1 src2));
   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3729,8 +5728,8 @@
   match(Set dst (SubVS src (LoadVector mem)));
   format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+    int vector_len = 0;
+    __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3740,8 +5739,8 @@
   match(Set dst (SubVS src1 src2));
   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 1;
+    __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3751,8 +5750,30 @@
   match(Set dst (SubVS src (LoadVector mem)));
   format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+    int vector_len = 1;
+    __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsub32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 32);
+  match(Set dst (SubVS src1 src2));
+  format %{ "vpsubw  $dst,$src1,$src2\t! sub packed32S" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsub32S_mem(vecZ dst, vecZ src, memory mem) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 32);
+  match(Set dst (SubVS src (LoadVector mem)));
+  format %{ "vpsubw  $dst,$src,$mem\t! sub packed32S" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3773,8 +5794,8 @@
   match(Set dst (SubVI src1 src2));
   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed2I" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3794,8 +5815,8 @@
   match(Set dst (SubVI src1 src2));
   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed4I" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3805,8 +5826,8 @@
   match(Set dst (SubVI src (LoadVector mem)));
   format %{ "vpsubd  $dst,$src,$mem\t! sub packed4I" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+    int vector_len = 0;
+    __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3816,8 +5837,8 @@
   match(Set dst (SubVI src1 src2));
   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed8I" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 1;
+    __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3827,8 +5848,30 @@
   match(Set dst (SubVI src (LoadVector mem)));
   format %{ "vpsubd  $dst,$src,$mem\t! sub packed8I" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+    int vector_len = 1;
+    __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsub16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
+  match(Set dst (SubVI src1 src2));
+  format %{ "vpsubd  $dst,$src1,$src2\t! sub packed16I" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsub16I_mem(vecZ dst, vecZ src, memory mem) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
+  match(Set dst (SubVI src (LoadVector mem)));
+  format %{ "vpsubd  $dst,$src,$mem\t! sub packed16I" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3849,8 +5892,8 @@
   match(Set dst (SubVL src1 src2));
   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed2L" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3860,8 +5903,8 @@
   match(Set dst (SubVL src (LoadVector mem)));
   format %{ "vpsubq  $dst,$src,$mem\t! sub packed2L" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+    int vector_len = 0;
+    __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3871,8 +5914,8 @@
   match(Set dst (SubVL src1 src2));
   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed4L" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 1;
+    __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3882,8 +5925,30 @@
   match(Set dst (SubVL src (LoadVector mem)));
   format %{ "vpsubq  $dst,$src,$mem\t! sub packed4L" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+    int vector_len = 1;
+    __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsub8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
+  match(Set dst (SubVL src1 src2));
+  format %{ "vpsubq  $dst,$src1,$src2\t! sub packed8L" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsub8L_mem(vecZ dst, vecZ src, memory mem) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
+  match(Set dst (SubVL src (LoadVector mem)));
+  format %{ "vpsubq  $dst,$src,$mem\t! sub packed8L" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3904,8 +5969,8 @@
   match(Set dst (SubVF src1 src2));
   format %{ "vsubps  $dst,$src1,$src2\t! sub packed2F" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3925,8 +5990,8 @@
   match(Set dst (SubVF src1 src2));
   format %{ "vsubps  $dst,$src1,$src2\t! sub packed4F" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3936,8 +6001,8 @@
   match(Set dst (SubVF src (LoadVector mem)));
   format %{ "vsubps  $dst,$src,$mem\t! sub packed4F" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+    int vector_len = 0;
+    __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3947,8 +6012,8 @@
   match(Set dst (SubVF src1 src2));
   format %{ "vsubps  $dst,$src1,$src2\t! sub packed8F" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 1;
+    __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3958,8 +6023,30 @@
   match(Set dst (SubVF src (LoadVector mem)));
   format %{ "vsubps  $dst,$src,$mem\t! sub packed8F" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+    int vector_len = 1;
+    __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsub16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
+  match(Set dst (SubVF src1 src2));
+  format %{ "vsubps  $dst,$src1,$src2\t! sub packed16F" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsub16F_mem(vecZ dst, vecZ src, memory mem) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
+  match(Set dst (SubVF src (LoadVector mem)));
+  format %{ "vsubps  $dst,$src,$mem\t! sub packed16F" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3980,8 +6067,8 @@
   match(Set dst (SubVD src1 src2));
   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed2D" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3991,8 +6078,8 @@
   match(Set dst (SubVD src (LoadVector mem)));
   format %{ "vsubpd  $dst,$src,$mem\t! sub packed2D" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+    int vector_len = 0;
+    __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4002,8 +6089,8 @@
   match(Set dst (SubVD src1 src2));
   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed4D" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 1;
+    __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4013,8 +6100,30 @@
   match(Set dst (SubVD src (LoadVector mem)));
   format %{ "vsubpd  $dst,$src,$mem\t! sub packed4D" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+    int vector_len = 1;
+    __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsub8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
+  match(Set dst (SubVD src1 src2));
+  format %{ "vsubpd  $dst,$src1,$src2\t! sub packed8D" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsub8D_mem(vecZ dst, vecZ src, memory mem) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
+  match(Set dst (SubVD src (LoadVector mem)));
+  format %{ "vsubpd  $dst,$src,$mem\t! sub packed8D" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4037,8 +6146,8 @@
   match(Set dst (MulVS src1 src2));
   format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4058,8 +6167,8 @@
   match(Set dst (MulVS src1 src2));
   format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4079,8 +6188,8 @@
   match(Set dst (MulVS src1 src2));
   format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4090,8 +6199,8 @@
   match(Set dst (MulVS src (LoadVector mem)));
   format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+    int vector_len = 0;
+    __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4101,8 +6210,8 @@
   match(Set dst (MulVS src1 src2));
   format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 1;
+    __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4112,8 +6221,30 @@
   match(Set dst (MulVS src (LoadVector mem)));
   format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+    int vector_len = 1;
+    __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmul32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 32);
+  match(Set dst (MulVS src1 src2));
+  format %{ "vpmullw $dst,$src1,$src2\t! mul packed32S" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmul32S_mem(vecZ dst, vecZ src, memory mem) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 32);
+  match(Set dst (MulVS src (LoadVector mem)));
+  format %{ "vpmullw $dst,$src,$mem\t! mul packed32S" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4134,8 +6265,19 @@
   match(Set dst (MulVI src1 src2));
   format %{ "vpmulld $dst,$src1,$src2\t! mul packed2I" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
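+// Packed long (64-bit) multiply maps to vpmullq, an AVX512DQ instruction,
+// so these MulVL patterns additionally require VM_Version::supports_avx512dq().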
+instruct vmul2L_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq());
+  match(Set dst (MulVL src1 src2));
+  format %{ "vpmullq $dst,$src1,$src2\t! mul packed2L" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4155,8 +6297,8 @@
   match(Set dst (MulVI src1 src2));
   format %{ "vpmulld $dst,$src1,$src2\t! mul packed4I" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4166,8 +6308,30 @@
   match(Set dst (MulVI src (LoadVector mem)));
   format %{ "vpmulld $dst,$src,$mem\t! mul packed4I" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+    int vector_len = 0;
+    __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmul4L_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && VM_Version::supports_avx512dq());
+  match(Set dst (MulVL src1 src2));
+  format %{ "vpmullq $dst,$src1,$src2\t! mul packed4L" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmul4L_mem(vecY dst, vecY src, memory mem) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && VM_Version::supports_avx512dq());
+  match(Set dst (MulVL src (LoadVector mem)));
+  format %{ "vpmullq $dst,$src,$mem\t! mul packed4L" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4177,8 +6341,30 @@
   match(Set dst (MulVI src1 src2));
   format %{ "vpmulld $dst,$src1,$src2\t! mul packed8I" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 1;
+    __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmul8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
+  match(Set dst (MulVL src1 src2));
+  format %{ "vpmullq $dst,$src1,$src2\t! mul packed8L" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmul16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
+  match(Set dst (MulVI src1 src2));
+  format %{ "vpmulld $dst,$src1,$src2\t! mul packed16I" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4188,8 +6374,30 @@
   match(Set dst (MulVI src (LoadVector mem)));
   format %{ "vpmulld $dst,$src,$mem\t! mul packed8I" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+    int vector_len = 1;
+    __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmul8L_mem(vecZ dst, vecZ src, memory mem) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
+  match(Set dst (MulVL src (LoadVector mem)));
+  format %{ "vpmullq $dst,$src,$mem\t! mul packed8L" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmul16I_mem(vecZ dst, vecZ src, memory mem) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
+  match(Set dst (MulVI src (LoadVector mem)));
+  format %{ "vpmulld $dst,$src,$mem\t! mul packed16I" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4210,8 +6418,8 @@
   match(Set dst (MulVF src1 src2));
   format %{ "vmulps  $dst,$src1,$src2\t! mul packed2F" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4231,8 +6439,8 @@
   match(Set dst (MulVF src1 src2));
   format %{ "vmulps  $dst,$src1,$src2\t! mul packed4F" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4242,8 +6450,8 @@
   match(Set dst (MulVF src (LoadVector mem)));
   format %{ "vmulps  $dst,$src,$mem\t! mul packed4F" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+    int vector_len = 0;
+    __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4253,8 +6461,8 @@
   match(Set dst (MulVF src1 src2));
   format %{ "vmulps  $dst,$src1,$src2\t! mul packed8F" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 1;
+    __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4264,8 +6472,30 @@
   match(Set dst (MulVF src (LoadVector mem)));
   format %{ "vmulps  $dst,$src,$mem\t! mul packed8F" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+    int vector_len = 1;
+    __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmul16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
+  match(Set dst (MulVF src1 src2));
+  format %{ "vmulps  $dst,$src1,$src2\t! mul packed16F" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmul16F_mem(vecZ dst, vecZ src, memory mem) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
+  match(Set dst (MulVF src (LoadVector mem)));
+  format %{ "vmulps  $dst,$src,$mem\t! mul packed16F" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4286,8 +6516,8 @@
   match(Set dst (MulVD src1 src2));
   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed2D" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4297,8 +6527,8 @@
   match(Set dst (MulVD src (LoadVector mem)));
   format %{ "vmulpd  $dst,$src,$mem\t! mul packed2D" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+    int vector_len = 0;
+    __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4308,8 +6538,8 @@
   match(Set dst (MulVD src1 src2));
   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed4D" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 1;
+    __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4319,8 +6549,30 @@
   match(Set dst (MulVD src (LoadVector mem)));
   format %{ "vmulpd  $dst,$src,$mem\t! mul packed4D" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+    int vector_len = 1;
+    __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmul8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
+  match(Set dst (MulVD src1 src2));
+  format %{ "vmulpd  $dst k0,$src1,$src2\t! mul packed8D" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmul8D_mem(vecZ dst, vecZ src, memory mem) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
+  match(Set dst (MulVD src (LoadVector mem)));
+  format %{ "vmulpd  $dst k0,$src,$mem\t! mul packed8D" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4343,8 +6595,8 @@
   match(Set dst (DivVF src1 src2));
   format %{ "vdivps  $dst,$src1,$src2\t! div packed2F" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4364,8 +6616,8 @@
   match(Set dst (DivVF src1 src2));
   format %{ "vdivps  $dst,$src1,$src2\t! div packed4F" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4375,8 +6627,8 @@
   match(Set dst (DivVF src (LoadVector mem)));
   format %{ "vdivps  $dst,$src,$mem\t! div packed4F" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+    int vector_len = 0;
+    __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4386,8 +6638,8 @@
   match(Set dst (DivVF src1 src2));
   format %{ "vdivps  $dst,$src1,$src2\t! div packed8F" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 1;
+    __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4397,8 +6649,30 @@
   match(Set dst (DivVF src (LoadVector mem)));
   format %{ "vdivps  $dst,$src,$mem\t! div packed8F" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+    int vector_len = 1;
+    __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vdiv16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
+  match(Set dst (DivVF src1 src2));
+  format %{ "vdivps  $dst,$src1,$src2\t! div packed16F" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vdiv16F_mem(vecZ dst, vecZ src, memory mem) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
+  match(Set dst (DivVF src (LoadVector mem)));
+  format %{ "vdivps  $dst,$src,$mem\t! div packed16F" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4419,8 +6693,8 @@
   match(Set dst (DivVD src1 src2));
   format %{ "vdivpd  $dst,$src1,$src2\t! div packed2D" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4430,8 +6704,8 @@
   match(Set dst (DivVD src (LoadVector mem)));
   format %{ "vdivpd  $dst,$src,$mem\t! div packed2D" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+    int vector_len = 0;
+    __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4441,8 +6715,8 @@
   match(Set dst (DivVD src1 src2));
   format %{ "vdivpd  $dst,$src1,$src2\t! div packed4D" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 1;
+    __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4452,8 +6726,30 @@
   match(Set dst (DivVD src (LoadVector mem)));
   format %{ "vdivpd  $dst,$src,$mem\t! div packed4D" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+    int vector_len = 1;
+    __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vdiv8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
+  match(Set dst (DivVD src1 src2));
+  format %{ "vdivpd  $dst,$src1,$src2\t! div packed8D" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vdiv8D_mem(vecZ dst, vecZ src, memory mem) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
+  match(Set dst (DivVD src (LoadVector mem)));
+  format %{ "vdivpd  $dst,$src,$mem\t! div packed8D" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4500,8 +6796,8 @@
   match(Set dst (LShiftVS src shift));
   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4511,8 +6807,8 @@
   match(Set dst (LShiftVS src shift));
   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+    int vector_len = 0;
+    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4542,8 +6838,8 @@
   match(Set dst (LShiftVS src shift));
   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4553,8 +6849,8 @@
   match(Set dst (LShiftVS src shift));
   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+    int vector_len = 0;
+    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4584,8 +6880,8 @@
   match(Set dst (LShiftVS src shift));
   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4595,8 +6891,8 @@
   match(Set dst (LShiftVS src shift));
   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+    int vector_len = 0;
+    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4606,8 +6902,8 @@
   match(Set dst (LShiftVS src shift));
   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+    int vector_len = 1;
+    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4617,8 +6913,30 @@
   match(Set dst (LShiftVS src shift));
   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+    int vector_len = 1;
+    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
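+// Shift-count operands: the register forms take the count in an XMM register
+// (vecS shift); the _imm forms encode an 8-bit immediate (immI8 shift).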
+instruct vsll32S_reg(vecZ dst, vecZ src, vecS shift) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 32);
+  match(Set dst (LShiftVS src shift));
+  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed32S" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 32);
+  match(Set dst (LShiftVS src shift));
+  format %{ "vpsllw  $dst,$src,$shift\t! left shift packed32S" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4649,8 +6967,8 @@
   match(Set dst (LShiftVI src shift));
   format %{ "vpslld  $dst,$src,$shift\t! left shift packed2I" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4660,8 +6978,8 @@
   match(Set dst (LShiftVI src shift));
   format %{ "vpslld  $dst,$src,$shift\t! left shift packed2I" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+    int vector_len = 0;
+    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4691,8 +7009,8 @@
   match(Set dst (LShiftVI src shift));
   format %{ "vpslld  $dst,$src,$shift\t! left shift packed4I" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4702,8 +7020,8 @@
   match(Set dst (LShiftVI src shift));
   format %{ "vpslld  $dst,$src,$shift\t! left shift packed4I" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+    int vector_len = 0;
+    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4713,8 +7031,8 @@
   match(Set dst (LShiftVI src shift));
   format %{ "vpslld  $dst,$src,$shift\t! left shift packed8I" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+    int vector_len = 1;
+    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4724,8 +7042,30 @@
   match(Set dst (LShiftVI src shift));
   format %{ "vpslld  $dst,$src,$shift\t! left shift packed8I" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+    int vector_len = 1;
+    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll16I_reg(vecZ dst, vecZ src, vecS shift) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
+  match(Set dst (LShiftVI src shift));
+  format %{ "vpslld  $dst,$src,$shift\t! left shift packed16I" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
+  match(Set dst (LShiftVI src shift));
+  format %{ "vpslld  $dst,$src,$shift\t! left shift packed16I" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4756,8 +7096,8 @@
   match(Set dst (LShiftVL src shift));
   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed2L" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4767,8 +7107,8 @@
   match(Set dst (LShiftVL src shift));
   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed2L" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+    int vector_len = 0;
+    __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4778,8 +7118,8 @@
   match(Set dst (LShiftVL src shift));
   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed4L" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+    int vector_len = 1;
+    __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4789,8 +7129,30 @@
   match(Set dst (LShiftVL src shift));
   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed4L" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+    int vector_len = 1;
+    __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll8L_reg(vecZ dst, vecZ src, vecS shift) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
+  match(Set dst (LShiftVL src shift));
+  format %{ "vpsllq  $dst,$src,$shift\t! left shift packed8L" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsll8L_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
+  match(Set dst (LShiftVL src shift));
+  format %{ "vpsllq  $dst,$src,$shift\t! left shift packed8L" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4827,8 +7189,8 @@
   match(Set dst (URShiftVS src shift));
   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4838,8 +7200,8 @@
   match(Set dst (URShiftVS src shift));
   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+    int vector_len = 0;
+    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4869,8 +7231,8 @@
   match(Set dst (URShiftVS src shift));
   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4880,8 +7242,8 @@
   match(Set dst (URShiftVS src shift));
   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+    int vector_len = 0;
+    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4911,8 +7273,8 @@
   match(Set dst (URShiftVS src shift));
   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4922,8 +7284,8 @@
   match(Set dst (URShiftVS src shift));
   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+    int vector_len = 0;
+    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4933,8 +7295,8 @@
   match(Set dst (URShiftVS src shift));
   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+    int vector_len = 1;
+    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4944,8 +7306,30 @@
   match(Set dst (URShiftVS src shift));
   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+    int vector_len = 1;
+    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsrl32S_reg(vecZ dst, vecZ src, vecS shift) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 32);
+  match(Set dst (URShiftVS src shift));
+  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed32S" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsrl32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 32);
+  match(Set dst (URShiftVS src shift));
+  format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed32S" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4976,8 +7360,8 @@
   match(Set dst (URShiftVI src shift));
   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed2I" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4987,8 +7371,8 @@
   match(Set dst (URShiftVI src shift));
   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed2I" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+    int vector_len = 0;
+    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -5018,8 +7402,8 @@
   match(Set dst (URShiftVI src shift));
   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed4I" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -5029,8 +7413,8 @@
   match(Set dst (URShiftVI src shift));
   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed4I" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+    int vector_len = 0;
+    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -5040,8 +7424,8 @@
   match(Set dst (URShiftVI src shift));
   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed8I" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+    int vector_len = 1;
+    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -5051,8 +7435,30 @@
   match(Set dst (URShiftVI src shift));
   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed8I" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+    int vector_len = 1;
+    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsrl16I_reg(vecZ dst, vecZ src, vecS shift) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
+  match(Set dst (URShiftVI src shift));
+  format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed16I" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsrl16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
+  match(Set dst (URShiftVI src shift));
+  format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed16I" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -5083,8 +7489,8 @@
   match(Set dst (URShiftVL src shift));
   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed2L" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -5094,8 +7500,8 @@
   match(Set dst (URShiftVL src shift));
   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed2L" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+    int vector_len = 0;
+    __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -5105,8 +7511,8 @@
   match(Set dst (URShiftVL src shift));
   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed4L" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+    int vector_len = 1;
+    __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -5116,8 +7522,30 @@
   match(Set dst (URShiftVL src shift));
   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed4L" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+    int vector_len = 1;
+    __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsrl8L_reg(vecZ dst, vecZ src, vecS shift) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
+  match(Set dst (URShiftVL src shift));
+  format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed8L" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsrl8L_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
+  match(Set dst (URShiftVL src shift));
+  format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed8L" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -5150,8 +7578,8 @@
   match(Set dst (RShiftVS src shift));
   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -5161,8 +7589,8 @@
   match(Set dst (RShiftVS src shift));
   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+    int vector_len = 0;
+    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -5192,8 +7620,8 @@
   match(Set dst (RShiftVS src shift));
   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -5203,8 +7631,8 @@
   match(Set dst (RShiftVS src shift));
   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+    int vector_len = 0;
+    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -5234,8 +7662,8 @@
   match(Set dst (RShiftVS src shift));
   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -5245,8 +7673,8 @@
   match(Set dst (RShiftVS src shift));
   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+    int vector_len = 0;
+    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -5256,8 +7684,8 @@
   match(Set dst (RShiftVS src shift));
   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+    int vector_len = 1;
+    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -5267,8 +7695,30 @@
   match(Set dst (RShiftVS src shift));
   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+    int vector_len = 1;
+    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsra32S_reg(vecZ dst, vecZ src, vecS shift) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 32);
+  match(Set dst (RShiftVS src shift));
+  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed32S" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsra32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 32);
+  match(Set dst (RShiftVS src shift));
+  format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed32S" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -5299,8 +7749,8 @@
   match(Set dst (RShiftVI src shift));
   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed2I" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -5310,8 +7760,8 @@
   match(Set dst (RShiftVI src shift));
   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed2I" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+    int vector_len = 0;
+    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -5341,8 +7791,8 @@
   match(Set dst (RShiftVI src shift));
   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed4I" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -5352,8 +7802,8 @@
   match(Set dst (RShiftVI src shift));
   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed4I" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+    int vector_len = 0;
+    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -5363,8 +7813,8 @@
   match(Set dst (RShiftVI src shift));
   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed8I" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
+    int vector_len = 1;
+    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -5374,8 +7824,30 @@
   match(Set dst (RShiftVI src shift));
   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed8I" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
+    int vector_len = 1;
+    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsra16I_reg(vecZ dst, vecZ src, vecS shift) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
+  match(Set dst (RShiftVI src shift));
+  format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed16I" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vsra16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
+  match(Set dst (RShiftVI src shift));
+  format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed16I" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -5400,8 +7872,8 @@
   match(Set dst (AndV src1 src2));
   format %{ "vpand   $dst,$src1,$src2\t! and vectors (4 bytes)" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -5421,8 +7893,8 @@
   match(Set dst (AndV src1 src2));
   format %{ "vpand   $dst,$src1,$src2\t! and vectors (8 bytes)" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -5442,8 +7914,8 @@
   match(Set dst (AndV src1 src2));
   format %{ "vpand   $dst,$src1,$src2\t! and vectors (16 bytes)" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -5453,8 +7925,8 @@
   match(Set dst (AndV src (LoadVector mem)));
   format %{ "vpand   $dst,$src,$mem\t! and vectors (16 bytes)" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+    int vector_len = 0;
+    __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -5464,8 +7936,8 @@
   match(Set dst (AndV src1 src2));
   format %{ "vpand   $dst,$src1,$src2\t! and vectors (32 bytes)" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 1;
+    __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -5475,8 +7947,30 @@
   match(Set dst (AndV src (LoadVector mem)));
   format %{ "vpand   $dst,$src,$mem\t! and vectors (32 bytes)" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+    int vector_len = 1;
+    __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vand64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
+  match(Set dst (AndV src1 src2));
+  format %{ "vpand   $dst,$src1,$src2\t! and vectors (64 bytes)" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vand64B_mem(vecZ dst, vecZ src, memory mem) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
+  match(Set dst (AndV src (LoadVector mem)));
+  format %{ "vpand   $dst,$src,$mem\t! and vectors (64 bytes)" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -5498,8 +7992,8 @@
   match(Set dst (OrV src1 src2));
   format %{ "vpor    $dst,$src1,$src2\t! or vectors (4 bytes)" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -5519,8 +8013,8 @@
   match(Set dst (OrV src1 src2));
   format %{ "vpor    $dst,$src1,$src2\t! or vectors (8 bytes)" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -5540,8 +8034,8 @@
   match(Set dst (OrV src1 src2));
   format %{ "vpor    $dst,$src1,$src2\t! or vectors (16 bytes)" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -5551,8 +8045,8 @@
   match(Set dst (OrV src (LoadVector mem)));
   format %{ "vpor    $dst,$src,$mem\t! or vectors (16 bytes)" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+    int vector_len = 0;
+    __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -5562,8 +8056,8 @@
   match(Set dst (OrV src1 src2));
   format %{ "vpor    $dst,$src1,$src2\t! or vectors (32 bytes)" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 1;
+    __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -5573,8 +8067,30 @@
   match(Set dst (OrV src (LoadVector mem)));
   format %{ "vpor    $dst,$src,$mem\t! or vectors (32 bytes)" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+    int vector_len = 1;
+    __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vor64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
+  match(Set dst (OrV src1 src2));
+  format %{ "vpor    $dst,$src1,$src2\t! or vectors (64 bytes)" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vor64B_mem(vecZ dst, vecZ src, memory mem) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
+  match(Set dst (OrV src (LoadVector mem)));
+  format %{ "vpor    $dst,$src,$mem\t! or vectors (64 bytes)" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -5596,8 +8112,8 @@
   match(Set dst (XorV src1 src2));
   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (4 bytes)" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -5617,8 +8133,8 @@
   match(Set dst (XorV src1 src2));
   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (8 bytes)" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -5638,8 +8154,8 @@
   match(Set dst (XorV src1 src2));
   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (16 bytes)" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 0;
+    __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -5649,8 +8165,8 @@
   match(Set dst (XorV src (LoadVector mem)));
   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (16 bytes)" %}
   ins_encode %{
-    bool vector256 = false;
-    __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
+    int vector_len = 0;
+    __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -5660,8 +8176,8 @@
   match(Set dst (XorV src1 src2));
   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (32 bytes)" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
+    int vector_len = 1;
+    __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -5671,9 +8187,31 @@
   match(Set dst (XorV src (LoadVector mem)));
   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (32 bytes)" %}
   ins_encode %{
-    bool vector256 = true;
-    __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
+    int vector_len = 1;
+    __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vxor64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
+  match(Set dst (XorV src1 src2));
+  format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (64 bytes)" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vxor64B_mem(vecZ dst, vecZ src, memory mem) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
+  match(Set dst (XorV src (LoadVector mem)));
+  format %{ "vpxor   $dst,$src,$mem\t! xor vectors (64 bytes)" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
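The encodings above replace the old two-state vector256 flag with a small integer so that a third, 512-bit case can be expressed. A minimal sketch of the convention the new rules assume (the helper name vector_len_for is illustrative and not part of the patch; the values appear to correspond to the assembler's AVX_128bit/AVX_256bit/AVX_512bit encodings):

// Sketch only: maps a vector width in bytes to the vector_len argument used above.
static int vector_len_for(int size_in_bytes) {
  switch (size_in_bytes) {
  case 16: return 0;   // 128-bit XMM operation (old vector256 == false)
  case 32: return 1;   // 256-bit YMM operation (old vector256 == true)
  case 64: return 2;   // 512-bit ZMM operation, only reachable with UseAVX > 2
  default: return -1;  // not a SIMD width these rules handle
  }
}

The vecZ rules therefore always pass 2, while the existing 128-bit and 256-bit rules keep their previous behaviour under the new spelling.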
--- a/hotspot/src/cpu/x86/vm/x86_32.ad	Thu May 07 15:34:45 2015 -0700
+++ b/hotspot/src/cpu/x86/vm/x86_32.ad	Fri May 08 11:49:20 2015 -0700
@@ -101,6 +101,17 @@
 reg_def FPR6H( SOC, SOC, Op_RegF, 6, as_FloatRegister(5)->as_VMReg()->next());
 reg_def FPR7L( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg());
 reg_def FPR7H( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next());
+//
+// Empty fill registers, which are never used, but supply alignment to xmm regs
+//
+reg_def FILL0( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next(2));
+reg_def FILL1( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next(3));
+reg_def FILL2( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next(4));
+reg_def FILL3( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next(5));
+reg_def FILL4( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next(6));
+reg_def FILL5( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next(7));
+reg_def FILL6( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next(8));
+reg_def FILL7( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next(9));
 
 // Specify priority of register selection within phases of register
 // allocation.  Highest priority is first.  A useful heuristic is to
@@ -112,7 +123,8 @@
 alloc_class chunk0( ECX,   EBX,   EBP,   EDI,   EAX,   EDX,   ESI, ESP,
                     FPR0L, FPR0H, FPR1L, FPR1H, FPR2L, FPR2H,
                     FPR3L, FPR3H, FPR4L, FPR4H, FPR5L, FPR5H,
-                    FPR6L, FPR6H, FPR7L, FPR7H );
+                    FPR6L, FPR6H, FPR7L, FPR7H,
+                    FILL0, FILL1, FILL2, FILL3, FILL4, FILL5, FILL6, FILL7);
 
 
 //----------Architecture Description Register Classes--------------------------
@@ -131,7 +143,7 @@
 // Class for all registers (excluding EBP)
 reg_class any_reg_no_ebp(EAX, EDX, EDI, ESI, ECX, EBX, ESP);
 // Dynamic register class that selects at runtime between register classes
-// any_reg and any_no_ebp_reg (depending on the value of the flag PreserveFramePointer). 
+// any_reg and any_no_ebp_reg (depending on the value of the flag PreserveFramePointer).
 // Equivalent to: return PreserveFramePointer ? any_no_ebp_reg : any_reg;
 reg_class_dynamic any_reg(any_reg_no_ebp, any_reg_with_ebp, %{ PreserveFramePointer %});
 
@@ -279,7 +291,9 @@
     size += 6; // fldcw
   }
   if (C->max_vector_size() > 16) {
-    size += 3; // vzeroupper
+    if (UseAVX <= 2) {
+      size += 3; // vzeroupper
+    }
   }
   return size;
 }
@@ -288,7 +302,7 @@
 //       from the start of the call to the point where the return address
 //       will point.
 int MachCallStaticJavaNode::ret_addr_offset() {
-  return 5 + pre_call_resets_size();  // 5 bytes from start of call to where return address points  
+  return 5 + pre_call_resets_size();  // 5 bytes from start of call to where return address points
 }
 
 int MachCallDynamicJavaNode::ret_addr_offset() {
@@ -767,6 +781,12 @@
 // Helper for XMM registers.  Extra opcode bits, limited syntax.
 static int impl_x_helper( CodeBuffer *cbuf, bool do_size, bool is_load,
                          int offset, int reg_lo, int reg_hi, int size, outputStream* st ) {
+  int in_size_in_bits = Assembler::EVEX_32bit;
+  int evex_encoding = 0;
+  if (reg_lo+1 == reg_hi) {
+    in_size_in_bits = Assembler::EVEX_64bit;
+    evex_encoding = Assembler::VEX_W;
+  }
   if (cbuf) {
     MacroAssembler _masm(cbuf);
     if (reg_lo+1 == reg_hi) { // double move?
@@ -799,7 +819,17 @@
     }
 #endif
   }
-  int offset_size = (offset == 0) ? 0 : ((offset <= 127) ? 1 : 4);
+  bool is_single_byte = false;
+  if ((UseAVX > 2) && (offset != 0)) {
+    is_single_byte = Assembler::query_compressed_disp_byte(offset, true, 0, Assembler::EVEX_T1S, in_size_in_bits, evex_encoding);
+  }
+  int offset_size = 0;
+  if (UseAVX > 2) {
+    offset_size = (offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
+  } else {
+    offset_size = (offset == 0) ? 0 : ((offset <= 127) ? 1 : 4);
+  }
+  size += (UseAVX > 2) ? 2 : 0; // Need an additional two bytes for EVEX
   // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
   return size+5+offset_size;
 }
@@ -835,8 +865,8 @@
 #endif
   }
   // VEX_2bytes prefix is used if UseAVX > 0, and it takes the same 2 bytes as SIMD prefix.
-  // Only MOVAPS SSE prefix uses 1 byte.
-  int sz = 4;
+  // Only MOVAPS SSE prefix uses 1 byte.  EVEX uses an additional 2 bytes.
+  int sz = (UseAVX > 2) ? 6 : 4;
   if (!(src_lo+1 == src_hi && dst_lo+1 == dst_hi) &&
       UseXmmRegToRegMoveAll && (UseAVX == 0)) sz = 3;
   return size + sz;
@@ -854,7 +884,7 @@
     st->print("movdl   %s, %s\t# spill", Matcher::regName[dst_lo], Matcher::regName[src_lo]);
 #endif
   }
-  return 4;
+  return (UseAVX > 2) ? 6 : 4;
 }
 
 
@@ -870,7 +900,7 @@
     st->print("movdl   %s, %s\t# spill", Matcher::regName[dst_lo], Matcher::regName[src_lo]);
 #endif
   }
-  return 4;
+  return (UseAVX > 2) ? 6 : 4;
 }
 
 static int impl_mov_helper( CodeBuffer *cbuf, bool do_size, int src, int dst, int size, outputStream* st ) {
@@ -941,9 +971,8 @@
     calc_size += 3+src_offset_size + 3+dst_offset_size;
     break;
   case Op_VecX:
-    calc_size = 6 + 6 + 5+src_offset_size + 5+dst_offset_size;
-    break;
   case Op_VecY:
+  case Op_VecZ:
     calc_size = 6 + 6 + 5+src_offset_size + 5+dst_offset_size;
     break;
   default:
@@ -974,6 +1003,12 @@
       __ vmovdqu(xmm0, Address(rsp, src_offset));
       __ vmovdqu(Address(rsp, dst_offset), xmm0);
       __ vmovdqu(xmm0, Address(rsp, -32));
+      break;
+    case Op_VecZ:
+      __ evmovdqu(Address(rsp, -64), xmm0, 2);
+      __ evmovdqu(xmm0, Address(rsp, src_offset), 2);
+      __ evmovdqu(Address(rsp, dst_offset), xmm0, 2);
+      __ evmovdqu(xmm0, Address(rsp, -64), 2);
       break;
     default:
       ShouldNotReachHere();
@@ -1009,6 +1043,13 @@
                 "vmovdqu [rsp + #%d], xmm0\n\t"
                 "vmovdqu xmm0, [rsp - #32]",
                 src_offset, dst_offset);
+      break;
+    case Op_VecZ:
+      st->print("vmovdqu [rsp - #64], xmm0\t# 512-bit mem-mem spill\n\t"
+                "vmovdqu xmm0, [rsp + #%d]\n\t"
+                "vmovdqu [rsp + #%d], xmm0\n\t"
+                "vmovdqu xmm0, [rsp - #64]",
+                src_offset, dst_offset);
       break;
     default:
       ShouldNotReachHere();
@@ -1042,7 +1082,7 @@
     uint ireg = ideal_reg();
     assert((src_first_rc != rc_int && dst_first_rc != rc_int), "sanity");
     assert((src_first_rc != rc_float && dst_first_rc != rc_float), "sanity");
-    assert((ireg == Op_VecS || ireg == Op_VecD || ireg == Op_VecX || ireg == Op_VecY), "sanity");
+    assert((ireg == Op_VecS || ireg == Op_VecD || ireg == Op_VecX || ireg == Op_VecY || ireg == Op_VecZ ), "sanity");
     if( src_first_rc == rc_stack && dst_first_rc == rc_stack ) {
       // mem -> mem
       int src_offset = ra_->reg2offset(src_first);
@@ -3998,7 +4038,7 @@
 // XMM Float register operands
 operand regF() %{
   predicate( UseSSE>=1 );
-  constraint(ALLOC_IN_RC(float_reg));
+  constraint(ALLOC_IN_RC(float_reg_legacy));
   match(RegF);
   format %{ %}
   interface(REG_INTER);
@@ -4007,12 +4047,45 @@
 // XMM Double register operands
 operand regD() %{
   predicate( UseSSE>=2 );
-  constraint(ALLOC_IN_RC(double_reg));
+  constraint(ALLOC_IN_RC(double_reg_legacy));
   match(RegD);
   format %{ %}
   interface(REG_INTER);
 %}
 
+// Vectors: note, we use legacy registers to avoid extra (unneeded in the 32-bit VM)
+// runtime code generation via reg_class_dynamic.
+operand vecS() %{
+  constraint(ALLOC_IN_RC(vectors_reg_legacy));
+  match(VecS);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand vecD() %{
+  constraint(ALLOC_IN_RC(vectord_reg_legacy));
+  match(VecD);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand vecX() %{
+  constraint(ALLOC_IN_RC(vectorx_reg_legacy));
+  match(VecX);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand vecY() %{
+  constraint(ALLOC_IN_RC(vectory_reg_legacy));
+  match(VecY);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
 
 //----------Memory Operands----------------------------------------------------
 // Direct Memory Operand
@@ -5020,11 +5093,11 @@
   match(Set dst (ReverseBytesUS dst));
   effect(KILL cr);
 
-  format %{ "BSWAP  $dst\n\t" 
+  format %{ "BSWAP  $dst\n\t"
             "SHR    $dst,16\n\t" %}
   ins_encode %{
     __ bswapl($dst$$Register);
-    __ shrl($dst$$Register, 16); 
+    __ shrl($dst$$Register, 16);
   %}
   ins_pipe( ialu_reg );
 %}
@@ -5033,11 +5106,11 @@
   match(Set dst (ReverseBytesS dst));
   effect(KILL cr);
 
-  format %{ "BSWAP  $dst\n\t" 
+  format %{ "BSWAP  $dst\n\t"
             "SAR    $dst,16\n\t" %}
   ins_encode %{
     __ bswapl($dst$$Register);
-    __ sarl($dst$$Register, 16); 
+    __ sarl($dst$$Register, 16);
   %}
   ins_pipe( ialu_reg );
 %}
@@ -6525,7 +6598,7 @@
   effect(KILL cr);
   ins_cost(400);
 
-  format %{ 
+  format %{
     $$template
     if (os::is_MP()) {
       $$emit$$"LOCK ADDL [ESP + #0], 0\t! membar_volatile"
@@ -8288,10 +8361,10 @@
 
 // Xor Register with Immediate -1
 instruct xorI_eReg_im1(rRegI dst, immI_M1 imm) %{
-  match(Set dst (XorI dst imm));  
+  match(Set dst (XorI dst imm));
 
   size(2);
-  format %{ "NOT    $dst" %}  
+  format %{ "NOT    $dst" %}
   ins_encode %{
      __ notl($dst$$Register);
   %}
@@ -8939,7 +9012,7 @@
 
 // Xor Long Register with Immediate -1
 instruct xorl_eReg_im1(eRegL dst, immL_M1 imm) %{
-  match(Set dst (XorL dst imm));  
+  match(Set dst (XorL dst imm));
   format %{ "NOT    $dst.lo\n\t"
             "NOT    $dst.hi" %}
   ins_encode %{
@@ -8994,7 +9067,7 @@
   effect(KILL cr);
   ins_cost(100);
   format %{ "ADD    $dst.lo,$dst.lo\n\t"
-            "ADC    $dst.hi,$dst.hi\n\t" 
+            "ADC    $dst.hi,$dst.hi\n\t"
             "ADD    $dst.lo,$dst.lo\n\t"
             "ADC    $dst.hi,$dst.hi" %}
   ins_encode %{
@@ -9013,9 +9086,9 @@
   effect(KILL cr);
   ins_cost(100);
   format %{ "ADD    $dst.lo,$dst.lo\n\t"
-            "ADC    $dst.hi,$dst.hi\n\t" 
+            "ADC    $dst.hi,$dst.hi\n\t"
             "ADD    $dst.lo,$dst.lo\n\t"
-            "ADC    $dst.hi,$dst.hi\n\t" 
+            "ADC    $dst.hi,$dst.hi\n\t"
             "ADD    $dst.lo,$dst.lo\n\t"
             "ADC    $dst.hi,$dst.hi" %}
   ins_encode %{
@@ -11168,7 +11241,6 @@
   ins_pipe( ialu_reg_reg );
 %}
 
-
 instruct MoveF2I_stack_reg(rRegI dst, stackSlotF src) %{
   match(Set dst (MoveF2I src));
   effect( DEF dst, USE src );
@@ -11400,7 +11472,7 @@
   format %{ "XOR    EAX,EAX\t# ClearArray:\n\t"
             "SHL    ECX,1\t# Convert doublewords to words\n\t"
             "REP STOS\t# store EAX into [EDI++] while ECX--" %}
-  ins_encode %{ 
+  ins_encode %{
     __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register);
   %}
   ins_pipe( pipe_slow );
@@ -11413,7 +11485,7 @@
   format %{ "XOR    EAX,EAX\t# ClearArray:\n\t"
             "SHL    ECX,3\t# Convert doublewords to bytes\n\t"
             "REP STOSB\t# store EAX into [EDI++] while ECX--" %}
-  ins_encode %{ 
+  ins_encode %{
     __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register);
   %}
   ins_pipe( pipe_slow );
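The impl_x_helper() change earlier in this file sizes XMM spill instructions differently once EVEX encodings are possible, because EVEX scales one-byte displacements by the operand's tuple size (the disp8*N rule) instead of accepting any value in -128..127. A rough sketch of the test Assembler::query_compressed_disp_byte() is being asked to perform (fits_in_evex_disp8 is an illustrative name, not HotSpot code):

// A displacement can use the short one-byte form only if it is a multiple of the
// tuple size N and the scaled value still fits in a signed 8-bit field.
static bool fits_in_evex_disp8(int offset, int tuple_size_N) {
  if (tuple_size_N == 0 || (offset % tuple_size_N) != 0) {
    return false;                          // falls back to the 4-byte displacement
  }
  int scaled = offset / tuple_size_N;
  return scaled >= -128 && scaled <= 127;
}

For the scalar tuple type passed here (EVEX_T1S) the scale is essentially the operand size in bytes, which is why in_size_in_bits and the Evex.w-derived encoding are handed to the query alongside the tuple type.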
--- a/hotspot/src/cpu/x86/vm/x86_64.ad	Thu May 07 15:34:45 2015 -0700
+++ b/hotspot/src/cpu/x86/vm/x86_64.ad	Fri May 08 11:49:20 2015 -0700
@@ -172,7 +172,7 @@
 // Class for all pointer registers (including RSP and RBP)
 reg_class any_reg_with_rbp(RAX, RAX_H,
                            RDX, RDX_H,
-                           RBP, RBP_H,               
+                           RBP, RBP_H,
                            RDI, RDI_H,
                            RSI, RSI_H,
                            RCX, RCX_H,
@@ -189,7 +189,7 @@
 
 // Class for all pointer registers (including RSP, but excluding RBP)
 reg_class any_reg_no_rbp(RAX, RAX_H,
-                         RDX, RDX_H,                
+                         RDX, RDX_H,
                          RDI, RDI_H,
                          RSI, RSI_H,
                          RCX, RCX_H,
@@ -205,10 +205,10 @@
                          R15, R15_H);
 
 // Dynamic register class that selects at runtime between register classes
-// any_reg_no_rbp and any_reg_with_rbp (depending on the value of the flag PreserveFramePointer). 
+// any_reg_no_rbp and any_reg_with_rbp (depending on the value of the flag PreserveFramePointer).
 // Equivalent to: return PreserveFramePointer ? any_reg_no_rbp : any_reg_with_rbp;
 reg_class_dynamic any_reg(any_reg_no_rbp, any_reg_with_rbp, %{ PreserveFramePointer %});
-                  
+
 // Class for all pointer registers (excluding RSP)
 reg_class ptr_reg_with_rbp(RAX, RAX_H,
                            RDX, RDX_H,
@@ -226,7 +226,7 @@
 
 // Class for all pointer registers (excluding RSP and RBP)
 reg_class ptr_reg_no_rbp(RAX, RAX_H,
-                         RDX, RDX_H,                         
+                         RDX, RDX_H,
                          RDI, RDI_H,
                          RSI, RSI_H,
                          RCX, RCX_H,
@@ -536,7 +536,11 @@
 #define __ _masm.
 
 static int clear_avx_size() {
-  return (Compile::current()->max_vector_size() > 16) ? 3 : 0;  // vzeroupper
+  if (UseAVX > 2) {
+    return 0; // vzeroupper is ignored
+  } else {
+    return (Compile::current()->max_vector_size() > 16) ? 3 : 0;  // vzeroupper
+  }
 }
 
 // !!!!! Special hack to get all types of calls to specify the byte offset
@@ -545,7 +549,7 @@
 int MachCallStaticJavaNode::ret_addr_offset()
 {
   int offset = 5; // 5 bytes from start of call to where return address points
-  offset += clear_avx_size();  
+  offset += clear_avx_size();
   return offset;
 }
 
@@ -860,7 +864,7 @@
     st->print("subq    rsp, #%d\t# Create frame",framesize);
     st->print("\n\t");
     framesize -= wordSize;
-    st->print("movq    [rsp + #%d], rbp\t# Save rbp",framesize);    
+    st->print("movq    [rsp + #%d], rbp\t# Save rbp",framesize);
     if (PreserveFramePointer) {
       st->print("\n\t");
       st->print("movq    rbp, [rsp + #%d]\t# Save the caller's SP into rbp", (framesize + wordSize));
@@ -1070,6 +1074,12 @@
       __ vmovdqu(xmm0, Address(rsp, src_offset));
       __ vmovdqu(Address(rsp, dst_offset), xmm0);
       __ vmovdqu(xmm0, Address(rsp, -32));
+      break;
+    case Op_VecZ:
+      __ evmovdqu(Address(rsp, -64), xmm0, 2);
+      __ evmovdqu(xmm0, Address(rsp, src_offset), 2);
+      __ evmovdqu(Address(rsp, dst_offset), xmm0, 2);
+      __ evmovdqu(xmm0, Address(rsp, -64), 2);
       break;
     default:
       ShouldNotReachHere();
@@ -1103,6 +1112,13 @@
                 "vmovdqu xmm0, [rsp - #32]",
                 src_offset, dst_offset);
       break;
+    case Op_VecZ:
+      st->print("vmovdqu [rsp - #64], xmm0\t# 512-bit mem-mem spill\n\t"
+                "vmovdqu xmm0, [rsp + #%d]\n\t"
+                "vmovdqu [rsp + #%d], xmm0\n\t"
+                "vmovdqu xmm0, [rsp - #64]",
+                src_offset, dst_offset);
+      break;
     default:
       ShouldNotReachHere();
     }
@@ -1136,7 +1152,7 @@
   if (bottom_type()->isa_vect() != NULL) {
     uint ireg = ideal_reg();
     assert((src_first_rc != rc_int && dst_first_rc != rc_int), "sanity");
-    assert((ireg == Op_VecS || ireg == Op_VecD || ireg == Op_VecX || ireg == Op_VecY), "sanity");
+    assert((ireg == Op_VecS || ireg == Op_VecD || ireg == Op_VecX || ireg == Op_VecY || ireg == Op_VecZ ), "sanity");
     if( src_first_rc == rc_stack && dst_first_rc == rc_stack ) {
       // mem -> mem
       int src_offset = ra_->reg2offset(src_first);
@@ -1573,7 +1589,7 @@
   return MachNode::size(ra_); // too many variables; just compute it
                               // the hard way
 }
- 
+
 
 //=============================================================================
 
@@ -2832,7 +2848,7 @@
       RAX_H_num     // Op_RegL
     };
     // Excluded flags and vector registers.
-    assert(ARRAY_SIZE(hi) == _last_machine_leaf - 5, "missing type");
+    assert(ARRAY_SIZE(hi) == _last_machine_leaf - 6, "missing type");
     return OptoRegPair(hi[ideal_reg], lo[ideal_reg]);
   %}
 %}
@@ -3335,7 +3351,7 @@
 // Pointer Register
 operand any_RegP()
 %{
-  constraint(ALLOC_IN_RC(any_reg));  
+  constraint(ALLOC_IN_RC(any_reg));
   match(RegP);
   match(rax_RegP);
   match(rbx_RegP);
@@ -3589,20 +3605,51 @@
 %}
 
 // Float register operands
-operand regF()
-%{
-  constraint(ALLOC_IN_RC(float_reg));
-  match(RegF);
+operand regF() %{
+  constraint(ALLOC_IN_RC(float_reg));
+  match(RegF);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+// Double register operands
+operand regD() %{
+  constraint(ALLOC_IN_RC(double_reg));
+  match(RegD);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+// Vectors
+operand vecS() %{
+  constraint(ALLOC_IN_RC(vectors_reg));
+  match(VecS);
 
   format %{ %}
   interface(REG_INTER);
 %}
 
-// Double register operands
-operand regD()
-%{
-  constraint(ALLOC_IN_RC(double_reg));
-  match(RegD);
+operand vecD() %{
+  constraint(ALLOC_IN_RC(vectord_reg));
+  match(VecD);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand vecX() %{
+  constraint(ALLOC_IN_RC(vectorx_reg));
+  match(VecX);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand vecY() %{
+  constraint(ALLOC_IN_RC(vectory_reg));
+  match(VecY);
 
   format %{ %}
   interface(REG_INTER);
@@ -4947,7 +4994,7 @@
 %}
 
 // Load Unsigned Integer into Long Register
-instruct loadUI2L(rRegL dst, memory mem, immL_32bits mask) 
+instruct loadUI2L(rRegL dst, memory mem, immL_32bits mask)
 %{
   match(Set dst (AndL (ConvI2L (LoadI mem)) mask));
 
@@ -10374,7 +10421,7 @@
 
   format %{ "xorq    rax, rax\t# ClearArray:\n\t"
             "rep     stosq\t# Store rax to *rdi++ while rcx--" %}
-  ins_encode %{ 
+  ins_encode %{
     __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register);
   %}
   ins_pipe(pipe_slow);
@@ -10389,7 +10436,7 @@
   format %{ "xorq    rax, rax\t# ClearArray:\n\t"
             "shlq    rcx,3\t# Convert doublewords to bytes\n\t"
             "rep     stosb\t# Store rax to *rdi++ while rcx--" %}
-  ins_encode %{ 
+  ins_encode %{
     __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register);
   %}
   ins_pipe( pipe_slow );
--- a/hotspot/src/share/vm/adlc/archDesc.cpp	Thu May 07 15:34:45 2015 -0700
+++ b/hotspot/src/share/vm/adlc/archDesc.cpp	Fri May 08 11:49:20 2015 -0700
@@ -929,6 +929,7 @@
     case 'D':  return "TypeVect::VECTD";
     case 'X':  return "TypeVect::VECTX";
     case 'Y':  return "TypeVect::VECTY";
+    case 'Z':  return "TypeVect::VECTZ";
     default:
       internal_err("Vector type %s with unrecognized type\n",idealOp);
     }
--- a/hotspot/src/share/vm/adlc/formssel.cpp	Thu May 07 15:34:45 2015 -0700
+++ b/hotspot/src/share/vm/adlc/formssel.cpp	Fri May 08 11:49:20 2015 -0700
@@ -3919,6 +3919,7 @@
          strcmp(opType,"VecD")==0 ||
          strcmp(opType,"VecX")==0 ||
          strcmp(opType,"VecY")==0 ||
+         strcmp(opType,"VecZ")==0 ||
          strcmp(opType,"Reg" )==0) ) {
       return 1;
     }
@@ -4048,6 +4049,7 @@
         strcmp(opType,"AddReductionVF")==0 ||
         strcmp(opType,"AddReductionVD")==0 ||
         strcmp(opType,"MulReductionVI")==0 ||
+        strcmp(opType,"MulReductionVL")==0 ||
         strcmp(opType,"MulReductionVF")==0 ||
         strcmp(opType,"MulReductionVD")==0 ||
         0 /* 0 to line up columns nicely */ )
@@ -4139,12 +4141,12 @@
   static const char *vector_list[] = {
     "AddVB","AddVS","AddVI","AddVL","AddVF","AddVD",
     "SubVB","SubVS","SubVI","SubVL","SubVF","SubVD",
-    "MulVS","MulVI","MulVF","MulVD",
+    "MulVS","MulVI","MulVL","MulVF","MulVD",
     "DivVF","DivVD",
     "AndV" ,"XorV" ,"OrV",
     "AddReductionVI", "AddReductionVL",
     "AddReductionVF", "AddReductionVD",
-    "MulReductionVI",
+    "MulReductionVI", "MulReductionVL",
     "MulReductionVF", "MulReductionVD",
     "LShiftCntV","RShiftCntV",
     "LShiftVB","LShiftVS","LShiftVI","LShiftVL",
--- a/hotspot/src/share/vm/c1/c1_LinearScan.cpp	Thu May 07 15:34:45 2015 -0700
+++ b/hotspot/src/share/vm/c1/c1_LinearScan.cpp	Fri May 08 11:49:20 2015 -0700
@@ -1290,7 +1290,8 @@
 #ifdef X86
     }
     if (UseSSE > 0) {
-      for (i = 0; i < FrameMap::nof_caller_save_xmm_regs; i++) {
+      int num_caller_save_xmm_regs = FrameMap::get_num_caller_save_xmms();
+      for (i = 0; i < num_caller_save_xmm_regs; i++) {
         LIR_Opr opr = FrameMap::caller_save_xmm_reg_at(i);
         assert(opr->is_valid() && opr->is_register(), "FrameMap should not return invalid operands");
         assert(reg_numHi(opr) == -1, "missing addition of range for hi-register");
@@ -2098,7 +2099,13 @@
       case T_FLOAT: {
 #ifdef X86
         if (UseSSE >= 1) {
-          assert(assigned_reg >= pd_first_xmm_reg && assigned_reg <= pd_last_xmm_reg, "no xmm register");
+          int last_xmm_reg = pd_last_xmm_reg;
+#ifdef _LP64
+          if (UseAVX < 3) {
+            last_xmm_reg = pd_first_xmm_reg + (pd_nof_xmm_regs_frame_map / 2) - 1;
+          }
+#endif
+          assert(assigned_reg >= pd_first_xmm_reg && assigned_reg <= last_xmm_reg, "no xmm register");
           assert(interval->assigned_regHi() == any_reg, "must not have hi register");
           return LIR_OprFact::single_xmm(assigned_reg - pd_first_xmm_reg);
         }
@@ -2112,7 +2119,13 @@
       case T_DOUBLE: {
 #ifdef X86
         if (UseSSE >= 2) {
-          assert(assigned_reg >= pd_first_xmm_reg && assigned_reg <= pd_last_xmm_reg, "no xmm register");
+          int last_xmm_reg = pd_last_xmm_reg;
+#ifdef _LP64
+          if (UseAVX < 3) {
+            last_xmm_reg = pd_first_xmm_reg + (pd_nof_xmm_regs_frame_map / 2) - 1;
+          }
+#endif
+          assert(assigned_reg >= pd_first_xmm_reg && assigned_reg <= last_xmm_reg, "no xmm register");
           assert(interval->assigned_regHi() == any_reg, "must not have hi register (double xmm values are stored in one register)");
           return LIR_OprFact::double_xmm(assigned_reg - pd_first_xmm_reg);
         }
@@ -3600,7 +3613,8 @@
       }
 
 #ifdef X86
-      for (j = 0; j < FrameMap::nof_caller_save_xmm_regs; j++) {
+      int num_caller_save_xmm_regs = FrameMap::get_num_caller_save_xmms();
+      for (j = 0; j < num_caller_save_xmm_regs; j++) {
         state_put(input_state, reg_num(FrameMap::caller_save_xmm_reg_at(j)), NULL);
       }
 #endif
@@ -4514,12 +4528,20 @@
   if (reg_num() < LIR_OprDesc::vreg_base) {
     type_name = "fixed";
     // need a temporary operand for fixed intervals because type() cannot be called
+#ifdef X86
+    int last_xmm_reg = pd_last_xmm_reg;
+#ifdef _LP64
+    if (UseAVX < 3) {
+      last_xmm_reg = pd_first_xmm_reg + (pd_nof_xmm_regs_frame_map / 2) - 1;
+    }
+#endif
+#endif
     if (assigned_reg() >= pd_first_cpu_reg && assigned_reg() <= pd_last_cpu_reg) {
       opr = LIR_OprFact::single_cpu(assigned_reg());
     } else if (assigned_reg() >= pd_first_fpu_reg && assigned_reg() <= pd_last_fpu_reg) {
       opr = LIR_OprFact::single_fpu(assigned_reg() - pd_first_fpu_reg);
 #ifdef X86
-    } else if (assigned_reg() >= pd_first_xmm_reg && assigned_reg() <= pd_last_xmm_reg) {
+    } else if (assigned_reg() >= pd_first_xmm_reg && assigned_reg() <= last_xmm_reg) {
       opr = LIR_OprFact::single_xmm(assigned_reg() - pd_first_xmm_reg);
 #endif
     } else {
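Both loops above now ask the frame map for the XMM register count instead of using a compile-time constant, since the count depends on the AVX level detected at startup. A hedged sketch of what FrameMap::get_num_caller_save_xmms() is expected to return (the concrete numbers are assumptions; its implementation is not part of this hunk):

// Assumption for illustration: 32 caller-saved XMM registers with EVEX (xmm0..xmm31),
// 16 on plain 64-bit SSE/AVX, and 8 in the 32-bit VM.
static int assumed_num_caller_save_xmms() {
#ifdef _LP64
  return (UseAVX > 2) ? 32 : 16;
#else
  return 8;
#endif
}

The last_xmm_reg adjustments in the other hunks follow the same idea: when UseAVX < 3 only the lower half of the frame map's XMM range corresponds to real registers.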
--- a/hotspot/src/share/vm/opto/c2_globals.hpp	Thu May 07 15:34:45 2015 -0700
+++ b/hotspot/src/share/vm/opto/c2_globals.hpp	Fri May 08 11:49:20 2015 -0700
@@ -96,7 +96,7 @@
   product(intx, MaxLoopPad, (OptoLoopAlignment-1),                          \
           "Align a loop if padding size in bytes is less or equal to this value") \
                                                                             \
-  product(intx, MaxVectorSize, 32,                                          \
+  product(intx, MaxVectorSize, 64,                                          \
           "Max vector size in bytes, "                                      \
           "actual size could be less depending on elements type")           \
                                                                             \
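Raising the default lets SuperWord use 64-byte (512-bit) vectors where the hardware allows it; the width actually used is still clamped by CPU feature detection, and it can be lowered on the command line with -XX:MaxVectorSize=<bytes>. A hedged sketch of that clamping (the function name is illustrative; MIN2 is HotSpot's min template):

// Sketch only: the effective vector width is the smaller of the flag value and what
// the detected CPU features support (e.g. 16, 32 or 64 bytes).
static intx effective_max_vector_size(intx cpu_supported_bytes) {
  return MIN2(MaxVectorSize, cpu_supported_bytes);
}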
--- a/hotspot/src/share/vm/opto/chaitin.cpp	Thu May 07 15:34:45 2015 -0700
+++ b/hotspot/src/share/vm/opto/chaitin.cpp	Fri May 08 11:49:20 2015 -0700
@@ -907,6 +907,13 @@
           lrg.set_num_regs(RegMask::SlotsPerVecY);
           lrg.set_reg_pressure(1);
           break;
+        case Op_VecZ:
+          assert(Matcher::vector_size_supported(T_FLOAT,RegMask::SlotsPerVecZ), "sanity");
+          assert(RegMask::num_registers(Op_VecZ) == RegMask::SlotsPerVecZ, "sanity");
+          assert(lrgmask.is_aligned_sets(RegMask::SlotsPerVecZ), "vector should be aligned");
+          lrg.set_num_regs(RegMask::SlotsPerVecZ);
+          lrg.set_reg_pressure(1);
+          break;
         default:
           ShouldNotReachHere();
         }
@@ -1514,7 +1521,7 @@
       int n_regs = lrg->num_regs();
       assert(!lrg->_is_vector || !lrg->_fat_proj, "sanity");
       if (n_regs == 1 || !lrg->_fat_proj) {
-        assert(!lrg->_is_vector || n_regs <= RegMask::SlotsPerVecY, "sanity");
+        assert(!lrg->_is_vector || n_regs <= RegMask::SlotsPerVecZ, "sanity");
         lrg->Clear();           // Clear the mask
         lrg->Insert(reg);       // Set regmask to match selected reg
         // For vectors and pairs, also insert the low bit of the pair
--- a/hotspot/src/share/vm/opto/chaitin.hpp	Thu May 07 15:34:45 2015 -0700
+++ b/hotspot/src/share/vm/opto/chaitin.hpp	Fri May 08 11:49:20 2015 -0700
@@ -141,7 +141,7 @@
 
   // Number of registers this live range uses when it colors
 private:
-  uint8_t _num_regs;            // 2 for Longs and Doubles, 1 for all else
+  uint16_t _num_regs;           // 2 for Longs and Doubles, 1 for all else
                                 // except _num_regs is kill count for fat_proj
 public:
   int num_regs() const { return _num_regs; }
@@ -150,7 +150,7 @@
 private:
   // Number of physical registers this live range uses when it colors
   // Architecture and register-set dependent
-  uint8_t _reg_pressure;
+  uint16_t _reg_pressure;
 public:
   void set_reg_pressure(int i)  { _reg_pressure = i; }
   int      reg_pressure() const { return _reg_pressure; }
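The widening from uint8_t is presumably needed because _num_regs doubles as a kill count for fat_proj live ranges, and with the EVEX register file that count can exceed 255. A back-of-the-envelope sketch (the 32-register count is an assumption based on xmm0..xmm31):

// Rough bound only: a fat_proj that kills the whole vector bank needs
// 32 registers * 16 mask slots = 512 entries, which overflows uint8_t
// but fits easily in uint16_t.
const int vector_regs      = 32;   // xmm0..xmm31 under EVEX (assumption)
const int slots_per_vecz   = 16;   // 64-byte register / 4-byte mask slot
const int worst_case_kills = vector_regs * slots_per_vecz;   // 512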
--- a/hotspot/src/share/vm/opto/classes.hpp	Thu May 07 15:34:45 2015 -0700
+++ b/hotspot/src/share/vm/opto/classes.hpp	Fri May 08 11:49:20 2015 -0700
@@ -282,6 +282,8 @@
 macro(MulVS)
 macro(MulVI)
 macro(MulReductionVI)
+macro(MulVL)
+macro(MulReductionVL)
 macro(MulVF)
 macro(MulReductionVF)
 macro(MulVD)
--- a/hotspot/src/share/vm/opto/compile.cpp	Thu May 07 15:34:45 2015 -0700
+++ b/hotspot/src/share/vm/opto/compile.cpp	Fri May 08 11:49:20 2015 -0700
@@ -3110,6 +3110,7 @@
   case Op_AddReductionVF:
   case Op_AddReductionVD:
   case Op_MulReductionVI:
+  case Op_MulReductionVL:
   case Op_MulReductionVF:
   case Op_MulReductionVD:
     break;
--- a/hotspot/src/share/vm/opto/matcher.cpp	Thu May 07 15:34:45 2015 -0700
+++ b/hotspot/src/share/vm/opto/matcher.cpp	Fri May 08 11:49:20 2015 -0700
@@ -83,6 +83,7 @@
   idealreg2spillmask  [Op_VecD] = NULL;
   idealreg2spillmask  [Op_VecX] = NULL;
   idealreg2spillmask  [Op_VecY] = NULL;
+  idealreg2spillmask  [Op_VecZ] = NULL;
 
   idealreg2debugmask  [Op_RegI] = NULL;
   idealreg2debugmask  [Op_RegN] = NULL;
@@ -94,6 +95,7 @@
   idealreg2debugmask  [Op_VecD] = NULL;
   idealreg2debugmask  [Op_VecX] = NULL;
   idealreg2debugmask  [Op_VecY] = NULL;
+  idealreg2debugmask  [Op_VecZ] = NULL;
 
   idealreg2mhdebugmask[Op_RegI] = NULL;
   idealreg2mhdebugmask[Op_RegN] = NULL;
@@ -105,6 +107,7 @@
   idealreg2mhdebugmask[Op_VecD] = NULL;
   idealreg2mhdebugmask[Op_VecX] = NULL;
   idealreg2mhdebugmask[Op_VecY] = NULL;
+  idealreg2mhdebugmask[Op_VecZ] = NULL;
 
   debug_only(_mem_node = NULL;)   // Ideal memory node consumed by mach node
 }
@@ -413,7 +416,7 @@
 void Matcher::init_first_stack_mask() {
 
   // Allocate storage for spill masks as masks for the appropriate load type.
-  RegMask *rms = (RegMask*)C->comp_arena()->Amalloc_D(sizeof(RegMask) * (3*6+4));
+  RegMask *rms = (RegMask*)C->comp_arena()->Amalloc_D(sizeof(RegMask) * (3*6+5));
 
   idealreg2spillmask  [Op_RegN] = &rms[0];
   idealreg2spillmask  [Op_RegI] = &rms[1];
@@ -440,6 +443,7 @@
   idealreg2spillmask  [Op_VecD] = &rms[19];
   idealreg2spillmask  [Op_VecX] = &rms[20];
   idealreg2spillmask  [Op_VecY] = &rms[21];
+  idealreg2spillmask  [Op_VecZ] = &rms[22];
 
   OptoReg::Name i;
 
@@ -524,6 +528,18 @@
     *idealreg2spillmask[Op_VecY] = *idealreg2regmask[Op_VecY];
      idealreg2spillmask[Op_VecY]->OR(aligned_stack_mask);
   }
+  if (Matcher::vector_size_supported(T_FLOAT,16)) {
+    // For VecZ we need enough alignment and 64 bytes (16 slots) for spills.
+    OptoReg::Name in = OptoReg::add(_in_arg_limit, -1);
+    for (int k = 1; (in >= init_in) && (k < RegMask::SlotsPerVecZ); k++) {
+      aligned_stack_mask.Remove(in);
+      in = OptoReg::add(in, -1);
+    }
+     aligned_stack_mask.clear_to_sets(RegMask::SlotsPerVecZ);
+     assert(aligned_stack_mask.is_AllStack(), "should be infinite stack");
+    *idealreg2spillmask[Op_VecZ] = *idealreg2regmask[Op_VecZ];
+     idealreg2spillmask[Op_VecZ]->OR(aligned_stack_mask);
+  }
    if (UseFPUForSpilling) {
      // This mask logic assumes that the spill operations are
      // symmetric and that the registers involved are the same size.
@@ -862,6 +878,10 @@
     MachNode *spillVectY = match_tree(new LoadVectorNode(NULL,mem,fp,atp,TypeVect::VECTY));
     idealreg2regmask[Op_VecY] = &spillVectY->out_RegMask();
   }
+  if (Matcher::vector_size_supported(T_FLOAT,16)) {
+    MachNode *spillVectZ = match_tree(new LoadVectorNode(NULL,mem,fp,atp,TypeVect::VECTZ));
+    idealreg2regmask[Op_VecZ] = &spillVectZ->out_RegMask();
+  }
 }
 
 #ifdef ASSERT
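The new VecZ block in init_first_stack_mask() reserves and aligns stack slots for 64-byte spills. A small sketch of the slot arithmetic it relies on (slots_for_vector is an illustrative helper, not a HotSpot function):

// One OptoReg mask slot covers 32 bits, so a 512-bit vector needs 64 / 4 = 16 adjacent
// slots; clear_to_sets(16) then trims the stack mask to 16-slot-aligned sets.
static int slots_for_vector(int vector_size_in_bytes) {
  const int bytes_per_slot = 4;                    // one 32-bit slot
  return vector_size_in_bytes / bytes_per_slot;    // 64-byte VecZ -> 16 == SlotsPerVecZ
}

The vector_size_supported(T_FLOAT, 16) guard mirrors the VecX/VecY blocks: sixteen 4-byte floats is exactly a 512-bit vector, so the block is only entered when VecZ registers exist.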
--- a/hotspot/src/share/vm/opto/opcodes.cpp	Thu May 07 15:34:45 2015 -0700
+++ b/hotspot/src/share/vm/opto/opcodes.cpp	Fri May 08 11:49:20 2015 -0700
@@ -42,6 +42,7 @@
   "VecD",
   "VecX",
   "VecY",
+  "VecZ",
   "_last_machine_leaf",
 #include "classes.hpp"
   "_last_class_name",
--- a/hotspot/src/share/vm/opto/opcodes.hpp	Thu May 07 15:34:45 2015 -0700
+++ b/hotspot/src/share/vm/opto/opcodes.hpp	Fri May 08 11:49:20 2015 -0700
@@ -40,6 +40,7 @@
   macro(VecD)                   // Machine vectord register
   macro(VecX)                   // Machine vectorx register
   macro(VecY)                   // Machine vectory register
+  macro(VecZ)                   // Machine vectorz register
   macro(RegFlags)               // Machine flags   register
   _last_machine_leaf,           // Split between regular opcodes and machine
 #include "classes.hpp"
--- a/hotspot/src/share/vm/opto/optoreg.hpp	Thu May 07 15:34:45 2015 -0700
+++ b/hotspot/src/share/vm/opto/optoreg.hpp	Fri May 08 11:49:20 2015 -0700
@@ -103,6 +103,10 @@
     return r - stack0();
   }
 
+  static void invalidate(Name n) {
+    vm2opto[n] = Bad;
+  }
+
   // convert a stack slot number into an OptoReg::Name
   static OptoReg::Name stack2reg( int idx) {
     return Name(stack0() + idx);
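The new invalidate() entry point simply maps a register back to Bad in the VMReg-to-OptoReg table. A hypothetical usage sketch (the register choice and the surrounding check are illustrative, not taken from this changeset):

// Hypothetical: drop a machine register the current CPU does not expose, so the
// register allocator never treats it as allocatable.
if (UseAVX < 3) {
  OptoReg::invalidate(OptoReg::as_OptoReg(as_XMMRegister(31)->as_VMReg()));  // no xmm31 without EVEX
}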
--- a/hotspot/src/share/vm/opto/output.cpp	Thu May 07 15:34:45 2015 -0700
+++ b/hotspot/src/share/vm/opto/output.cpp	Fri May 08 11:49:20 2015 -0700
@@ -1880,8 +1880,8 @@
   if (!do_scheduling())
     return;
 
-  // Scheduling code works only with pairs (8 bytes) maximum.
-  if (max_vector_size() > 8)
+  // Scheduling code works only with vectors of at most 16 bytes.
+  if (max_vector_size() > 16)
     return;
 
   TracePhase tp("isched", &timers[_t_instrSched]);
--- a/hotspot/src/share/vm/opto/regmask.cpp	Thu May 07 15:34:45 2015 -0700
+++ b/hotspot/src/share/vm/opto/regmask.cpp	Fri May 08 11:49:20 2015 -0700
@@ -114,11 +114,14 @@
 
 //=============================================================================
 bool RegMask::is_vector(uint ireg) {
-  return (ireg == Op_VecS || ireg == Op_VecD || ireg == Op_VecX || ireg == Op_VecY);
+  return (ireg == Op_VecS || ireg == Op_VecD ||
+          ireg == Op_VecX || ireg == Op_VecY || ireg == Op_VecZ );
 }
 
 int RegMask::num_registers(uint ireg) {
     switch(ireg) {
+      case Op_VecZ:
+        return 16;
       case Op_VecY:
         return 8;
       case Op_VecX:
@@ -233,7 +236,8 @@
   return true;
 }
 
-static int low_bits[3] = { 0x55555555, 0x11111111, 0x01010101 };
+// Only indices derived from power-of-2 set sizes (size >> 2) are accessed, so index 3 is filled in merely as a placeholder.
+static int low_bits[5] = { 0x55555555, 0x11111111, 0x01010101, 0x00000000, 0x00010001 };
 //------------------------------find_first_set---------------------------------
 // Find the lowest-numbered register set in the mask.  Return the
 // HIGHEST register number in the set, or BAD if no sets.
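
As the new comment notes, low_bits is indexed by size>>2, so only entries 0 (size 2), 1 (size 4), 2 (size 8) and 4 (size 16) are ever read, and entry 3 is padding. A small stand-alone illustration of which mask each supported set size selects (table values copied from the hunk above):

    #include <cstdio>

    // Copy of the widened table: each entry holds the lowest bit of every
    // aligned set of `size` bits within a 32-bit word; index 3 is never read.
    static const unsigned low_bits[5] = {
      0x55555555, 0x11111111, 0x01010101, 0x00000000, 0x00010001
    };

    int main() {
      for (int size = 2; size <= 16; size <<= 1) {
        printf("set size %2d -> low_bits[%d] = 0x%08x\n",
               size, size >> 2, low_bits[size >> 2]);
      }
      // size  2 -> index 0: bit 0 of each pair
      // size  4 -> index 1: bit 0 of each nibble
      // size  8 -> index 2: bit 0 of each byte
      // size 16 -> index 4: bit 0 of each half-word (new for VecZ)
      return 0;
    }
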
@@ -254,7 +258,7 @@
 // Clear out partial bits; leave only aligned adjacent bit pairs
 void RegMask::clear_to_sets(const int size) {
   if (size == 1) return;
-  assert(2 <= size && size <= 8, "update low bits table");
+  assert(2 <= size && size <= 16, "update low bits table");
   assert(is_power_of_2(size), "sanity");
   int low_bits_mask = low_bits[size>>2];
   for (int i = 0; i < RM_SIZE; i++) {
@@ -268,6 +272,9 @@
       sets |= (sets>>2);         // Smear 2 hi-bits into a set
       if (size > 4) {
         sets |= (sets>>4);       // Smear 4 hi-bits into a set
+        if (size > 8) {
+          sets |= (sets>>8);     // Smear 8 hi-bits into a set
+        }
       }
     }
     _A[i] = sets;
@@ -279,7 +286,7 @@
 // Smear out partial bits to aligned adjacent bit sets
 void RegMask::smear_to_sets(const int size) {
   if (size == 1) return;
-  assert(2 <= size && size <= 8, "update low bits table");
+  assert(2 <= size && size <= 16, "update low bits table");
   assert(is_power_of_2(size), "sanity");
   int low_bits_mask = low_bits[size>>2];
   for (int i = 0; i < RM_SIZE; i++) {
@@ -294,6 +301,9 @@
       sets |= (sets<<2);         // Smear 2 lo-bits into a set
       if (size > 4) {
         sets |= (sets<<4);       // Smear 4 lo-bits into a set
+        if (size > 8) {
+          sets |= (sets<<8);     // Smear 8 lo-bits into a set
+        }
       }
     }
     _A[i] = sets;
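
clear_to_sets and smear_to_sets build a full set out of marker bits with a cascade of shifts by 1, 2, 4 and now 8; the added (sets>>8) and (sets<<8) steps are what extend the scheme from 8-slot to 16-slot sets. A stand-alone demonstration of the smear direction for 16-slot sets in one 32-bit word (just the bit trick; the real method first masks against low_bits):

    #include <cstdio>

    // Given the low bit of each aligned 16-bit set, the shift cascade fills
    // the whole set; the final (sets << 8) is the step added for VecZ.
    static unsigned smear16(unsigned sets) {
      sets |= (sets << 1);   // 1 bit  -> 2 bits
      sets |= (sets << 2);   // 2 bits -> 4 bits
      sets |= (sets << 4);   // 4 bits -> 8 bits
      sets |= (sets << 8);   // 8 bits -> 16 bits
      return sets;
    }

    int main() {
      printf("0x%08x -> 0x%08x\n", 0x00010001u, smear16(0x00010001u)); // 0xffffffff
      printf("0x%08x -> 0x%08x\n", 0x00000001u, smear16(0x00000001u)); // 0x0000ffff
      return 0;
    }
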
@@ -304,7 +314,7 @@
 //------------------------------is_aligned_set--------------------------------
 bool RegMask::is_aligned_sets(const int size) const {
   if (size == 1) return true;
-  assert(2 <= size && size <= 8, "update low bits table");
+  assert(2 <= size && size <= 16, "update low bits table");
   assert(is_power_of_2(size), "sanity");
   int low_bits_mask = low_bits[size>>2];
   // Assert that the register mask contains only bit sets.
@@ -330,7 +340,7 @@
 // Works also for size 1.
 int RegMask::is_bound_set(const int size) const {
   if( is_AllStack() ) return false;
-  assert(1 <= size && size <= 8, "update low bits table");
+  assert(1 <= size && size <= 16, "update low bits table");
   int bit = -1;                 // Set to hold the one bit allowed
   for (int i = 0; i < RM_SIZE; i++) {
     if (_A[i] ) {               // Found some bits
@@ -346,10 +356,12 @@
         if (((-1) & ~(bit-1)) != _A[i])
           return false;         // Found many bits, so fail
         i++;                    // Skip iteration forward and check high part
-        // The lower 24 bits should be 0 since it is split case and size <= 8.
-        int set = bit>>24;
+        // The lower (32-size) bits should be 0 since this is the split case.
+        int clear_bit_size = 32-size;
+        int shift_back_size = 32-clear_bit_size;
+        int set = bit>>clear_bit_size;
         set = set & -set; // Remove sign extension.
-        set = (((set << size) - 1) >> 8);
+        set = (((set << size) - 1) >> shift_back_size);
         if (i >= RM_SIZE || _A[i] != set)
           return false; // Require expected low bits in next word
       }
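
The rewritten split case in is_bound_set generalizes the old hard-coded size <= 8 arithmetic: when a bound set starts near the top of a 32-bit word, the expected continuation bits in the next word are now derived from the set size instead of a constant shift by 8. A stand-alone re-derivation of that expected value, using the same arithmetic as the hunk above wrapped in a helper for illustration:

    #include <cstdio>

    // Expected contents of the next 32-bit word when a bound set of `size`
    // bits starts at `start_bit` of the current word and spills across the
    // word boundary (start_bit + size > 32). Mirrors the arithmetic above.
    static unsigned split_low_word(int size, int start_bit) {
      unsigned bit = 1u << start_bit;            // lone low bit of the set
      int clear_bit_size  = 32 - size;
      int shift_back_size = 32 - clear_bit_size; // == size
      unsigned set = bit >> clear_bit_size;
      set = set & (0u - set);                    // keep only the lowest bit
      return ((set << size) - 1u) >> shift_back_size;
    }

    int main() {
      // 16-slot (VecZ) set starting at slot 24: 8 slots here, 8 slots
      // expected at the bottom of the next word.
      printf("size 16, start 24 -> 0x%08x\n", split_low_word(16, 24)); // 0x000000ff
      // Old 8-slot (VecY) split still works: 4 + 4 slots.
      printf("size  8, start 28 -> 0x%08x\n", split_low_word(8, 28));  // 0x0000000f
      return 0;
    }
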
@@ -375,7 +387,7 @@
 //------------------------------Size-------------------------------------------
 // Compute size of register mask in bits
 uint RegMask::Size() const {
-  extern uint8_t bitsInByte[256];
+  extern uint8_t bitsInByte[512];
   uint sum = 0;
   for( int i = 0; i < RM_SIZE; i++ )
     sum +=
--- a/hotspot/src/share/vm/opto/regmask.hpp	Thu May 07 15:34:45 2015 -0700
+++ b/hotspot/src/share/vm/opto/regmask.hpp	Fri May 08 11:49:20 2015 -0700
@@ -98,7 +98,8 @@
          SlotsPerVecS = 1,
          SlotsPerVecD = 2,
          SlotsPerVecX = 4,
-         SlotsPerVecY = 8 };
+         SlotsPerVecY = 8,
+         SlotsPerVecZ = 16 };
 
   // A constructor only used by the ADLC output.  All mask fields are filled
   // in directly.  Calls to this look something like RM(1,2,3,4);
@@ -299,13 +300,13 @@
   static bool can_represent(OptoReg::Name reg) {
     // NOTE: -1 in computation reflects the usage of the last
     //       bit of the regmask as an infinite stack flag and
-    //       -7 is to keep mask aligned for largest value (VecY).
+    //       alignment headroom for the largest value (VecZ) is enforced in can_represent_arg below.
     return (int)reg < (int)(CHUNK_SIZE-1);
   }
   static bool can_represent_arg(OptoReg::Name reg) {
-    // NOTE: -SlotsPerVecY in computation reflects the need
-    //       to keep mask aligned for largest value (VecY).
-    return (int)reg < (int)(CHUNK_SIZE-SlotsPerVecY);
+    // NOTE: -SlotsPerVecZ in computation reflects the need
+    //       to keep mask aligned for largest value (VecZ).
+    return (int)reg < (int)(CHUNK_SIZE-SlotsPerVecZ);
   }
 };
 
--- a/hotspot/src/share/vm/opto/type.cpp	Thu May 07 15:34:45 2015 -0700
+++ b/hotspot/src/share/vm/opto/type.cpp	Fri May 08 11:49:20 2015 -0700
@@ -68,16 +68,19 @@
   { Bad,             T_ILLEGAL,    "vectord:",      false, Op_RegD,              relocInfo::none          },  // VectorD
   { Bad,             T_ILLEGAL,    "vectorx:",      false, 0,                    relocInfo::none          },  // VectorX
   { Bad,             T_ILLEGAL,    "vectory:",      false, 0,                    relocInfo::none          },  // VectorY
+  { Bad,             T_ILLEGAL,    "vectorz:",      false, 0,                    relocInfo::none          },  // VectorZ
 #elif defined(PPC64)
   { Bad,             T_ILLEGAL,    "vectors:",      false, 0,                    relocInfo::none          },  // VectorS
   { Bad,             T_ILLEGAL,    "vectord:",      false, Op_RegL,              relocInfo::none          },  // VectorD
   { Bad,             T_ILLEGAL,    "vectorx:",      false, 0,                    relocInfo::none          },  // VectorX
   { Bad,             T_ILLEGAL,    "vectory:",      false, 0,                    relocInfo::none          },  // VectorY
+  { Bad,             T_ILLEGAL,    "vectorz:",      false, 0,                    relocInfo::none          },  // VectorZ
 #else // all other
   { Bad,             T_ILLEGAL,    "vectors:",      false, Op_VecS,              relocInfo::none          },  // VectorS
   { Bad,             T_ILLEGAL,    "vectord:",      false, Op_VecD,              relocInfo::none          },  // VectorD
   { Bad,             T_ILLEGAL,    "vectorx:",      false, Op_VecX,              relocInfo::none          },  // VectorX
   { Bad,             T_ILLEGAL,    "vectory:",      false, Op_VecY,              relocInfo::none          },  // VectorY
+  { Bad,             T_ILLEGAL,    "vectorz:",      false, Op_VecZ,              relocInfo::none          },  // VectorZ
 #endif
   { Bad,             T_ADDRESS,    "anyptr:",       false, Op_RegP,              relocInfo::none          },  // AnyPtr
   { Bad,             T_ADDRESS,    "rawptr:",       false, Op_RegP,              relocInfo::none          },  // RawPtr
@@ -503,10 +506,14 @@
   if (Matcher::vector_size_supported(T_FLOAT,8)) {
     TypeVect::VECTY = TypeVect::make(T_FLOAT,8);
   }
+  if (Matcher::vector_size_supported(T_FLOAT,16)) {
+    TypeVect::VECTZ = TypeVect::make(T_FLOAT,16);
+  }
   mreg2type[Op_VecS] = TypeVect::VECTS;
   mreg2type[Op_VecD] = TypeVect::VECTD;
   mreg2type[Op_VecX] = TypeVect::VECTX;
   mreg2type[Op_VecY] = TypeVect::VECTY;
+  mreg2type[Op_VecZ] = TypeVect::VECTZ;
 
   // Restore working type arena.
   current->set_type_arena(save);
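
TypeVect::VECTZ is created only when the matcher reports 16-float (64-byte) vectors as supported and is left NULL otherwise, just like the narrower widths. A toy mock of that guarded registration; MAX_VECTOR_BYTES and the tables below are invented for illustration, while the real answer comes from Matcher::vector_size_supported:

    #include <cstdio>

    // Hypothetical per-CPU limit; 64 models an AVX-512 machine, 32 an AVX2 one.
    static const int MAX_VECTOR_BYTES = 64;

    static bool vector_size_supported(int bytes) {
      return bytes <= MAX_VECTOR_BYTES;
    }

    int main() {
      const int   bytes[] = { 4, 8, 16, 32, 64 };
      const char* name[]  = { "VECTS", "VECTD", "VECTX", "VECTY", "VECTZ" };
      const char* vect_type[5];                 // stand-in for the VECT* statics
      for (int i = 0; i < 5; i++) {
        vect_type[i] = vector_size_supported(bytes[i]) ? name[i] : 0;
      }
      for (int i = 0; i < 5; i++) {
        printf("%s: %s\n", name[i], vect_type[i] ? "created" : "left NULL");
      }
      return 0;
    }
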
@@ -798,6 +805,7 @@
   Bad,          // VectorD - handled in v-call
   Bad,          // VectorX - handled in v-call
   Bad,          // VectorY - handled in v-call
+  Bad,          // VectorZ - handled in v-call
 
   Bad,          // AnyPtr - handled in v-call
   Bad,          // RawPtr - handled in v-call
@@ -2051,6 +2059,7 @@
 const TypeVect *TypeVect::VECTD = NULL; //  64-bit vectors
 const TypeVect *TypeVect::VECTX = NULL; // 128-bit vectors
 const TypeVect *TypeVect::VECTY = NULL; // 256-bit vectors
+const TypeVect *TypeVect::VECTZ = NULL; // 512-bit vectors
 
 //------------------------------make-------------------------------------------
 const TypeVect* TypeVect::make(const Type *elem, uint length) {
@@ -2070,6 +2079,8 @@
     return (TypeVect*)(new TypeVectX(elem, length))->hashcons();
   case Op_VecY:
     return (TypeVect*)(new TypeVectY(elem, length))->hashcons();
+  case Op_VecZ:
+    return (TypeVect*)(new TypeVectZ(elem, length))->hashcons();
   }
  ShouldNotReachHere();
   return NULL;
@@ -2093,7 +2104,8 @@
   case VectorS:
   case VectorD:
   case VectorX:
-  case VectorY: {                // Meeting 2 vectors?
+  case VectorY:
+  case VectorZ: {                // Meeting 2 vectors?
     const TypeVect* v = t->is_vect();
     assert(  base() == v->base(), "");
     assert(length() == v->length(), "");
@@ -2151,6 +2163,8 @@
     st->print("vectorx["); break;
   case VectorY:
     st->print("vectory["); break;
+  case VectorZ:
+    st->print("vectorz["); break;
   default:
     ShouldNotReachHere();
   }
--- a/hotspot/src/share/vm/opto/type.hpp	Thu May 07 15:34:45 2015 -0700
+++ b/hotspot/src/share/vm/opto/type.hpp	Fri May 08 11:49:20 2015 -0700
@@ -57,6 +57,7 @@
 class     TypeVectD;
 class     TypeVectX;
 class     TypeVectY;
+class     TypeVectZ;
 class   TypePtr;
 class     TypeRawPtr;
 class     TypeOopPtr;
@@ -90,6 +91,7 @@
     VectorD,                    //  64bit Vector types
     VectorX,                    // 128bit Vector types
     VectorY,                    // 256bit Vector types
+    VectorZ,                    // 512bit Vector types
 
     AnyPtr,                     // Any old raw, klass, inst, or array pointer
     RawPtr,                     // Raw (non-oop) pointers
@@ -729,6 +731,7 @@
   static const TypeVect *VECTD;
   static const TypeVect *VECTX;
   static const TypeVect *VECTY;
+  static const TypeVect *VECTZ;
 
 #ifndef PRODUCT
   virtual void dump2(Dict &d, uint, outputStream *st) const; // Specialized per-Type dumping
@@ -755,6 +758,11 @@
   TypeVectY(const Type* elem, uint length) : TypeVect(VectorY, elem, length) {}
 };
 
+class TypeVectZ : public TypeVect {
+  friend class TypeVect;
+  TypeVectZ(const Type* elem, uint length) : TypeVect(VectorZ, elem, length) {}
+};
+
 //------------------------------TypePtr----------------------------------------
 // Class of machine Pointer Types: raw data, instances or arrays.
 // If the _base enum is AnyPtr, then this refers to all of the above.
@@ -1568,12 +1576,12 @@
 }
 
 inline const TypeVect *Type::is_vect() const {
-  assert( _base >= VectorS && _base <= VectorY, "Not a Vector" );
+  assert( _base >= VectorS && _base <= VectorZ, "Not a Vector" );
   return (TypeVect*)this;
 }
 
 inline const TypeVect *Type::isa_vect() const {
-  return (_base >= VectorS && _base <= VectorY) ? (TypeVect*)this : NULL;
+  return (_base >= VectorS && _base <= VectorZ) ? (TypeVect*)this : NULL;
 }
 
 inline const TypePtr *Type::is_ptr() const {
--- a/hotspot/src/share/vm/opto/vectornode.cpp	Thu May 07 15:34:45 2015 -0700
+++ b/hotspot/src/share/vm/opto/vectornode.cpp	Fri May 08 11:49:20 2015 -0700
@@ -77,6 +77,9 @@
     case T_INT:    return Op_MulVI;
     }
     ShouldNotReachHere();
+  case Op_MulL:
+    assert(bt == T_LONG, "must be");
+    return Op_MulVL;
   case Op_MulF:
     assert(bt == T_FLOAT, "must be");
     return Op_MulVF;
@@ -267,6 +270,7 @@
 
   case Op_MulVS: return new MulVSNode(n1, n2, vt);
   case Op_MulVI: return new MulVINode(n1, n2, vt);
+  case Op_MulVL: return new MulVLNode(n1, n2, vt);
   case Op_MulVF: return new MulVFNode(n1, n2, vt);
   case Op_MulVD: return new MulVDNode(n1, n2, vt);
 
@@ -463,6 +467,10 @@
       assert(bt == T_INT, "must be");
       vopc = Op_MulReductionVI;
       break;
+    case Op_MulL:
+      assert(bt == T_LONG, "must be");
+      vopc = Op_MulReductionVL;
+      break;
     case Op_MulF:
       assert(bt == T_FLOAT, "must be");
       vopc = Op_MulReductionVF;
@@ -492,6 +500,7 @@
   case Op_AddReductionVF: return new AddReductionVFNode(ctrl, n1, n2);
   case Op_AddReductionVD: return new AddReductionVDNode(ctrl, n1, n2);
   case Op_MulReductionVI: return new MulReductionVINode(ctrl, n1, n2);
+  case Op_MulReductionVL: return new MulReductionVLNode(ctrl, n1, n2);
   case Op_MulReductionVF: return new MulReductionVFNode(ctrl, n1, n2);
   case Op_MulReductionVD: return new MulReductionVDNode(ctrl, n1, n2);
   }
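
The Op_MulL cases above extend both the scalar-to-vector opcode mapping and the reduction-opcode selection to long multiplies, so SuperWord can now emit MulVL and MulReductionVL nodes. A toy sketch of that mapping as a plain table (not the C2 switch statements themselves):

    #include <cstdio>
    #include <cstring>

    // Toy scalar-opcode -> vector/reduction-opcode table for the multiply
    // family; the "MulL" row is the one this change adds.
    struct MulMapping { const char* scalar; const char* vector; const char* reduction; };

    static const MulMapping mul_map[] = {
      { "MulI", "MulVI", "MulReductionVI" },
      { "MulL", "MulVL", "MulReductionVL" },   // new with this change
      { "MulF", "MulVF", "MulReductionVF" },
      { "MulD", "MulVD", "MulReductionVD" },
    };

    static const MulMapping* lookup(const char* scalar_op) {
      for (const MulMapping& m : mul_map) {
        if (strcmp(m.scalar, scalar_op) == 0) return &m;
      }
      return 0;
    }

    int main() {
      const MulMapping* m = lookup("MulL");
      if (m != 0) {
        printf("%s -> %s / %s\n", m->scalar, m->vector, m->reduction);
      }
      return 0;
    }
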
--- a/hotspot/src/share/vm/opto/vectornode.hpp	Thu May 07 15:34:45 2015 -0700
+++ b/hotspot/src/share/vm/opto/vectornode.hpp	Fri May 08 11:49:20 2015 -0700
@@ -90,6 +90,30 @@
   virtual int Opcode() const;
 };
 
+//------------------------------AddVLNode--------------------------------------
+// Vector add long
+class AddVLNode : public VectorNode {
+public:
+  AddVLNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1, in2, vt) {}
+  virtual int Opcode() const;
+};
+
+//------------------------------AddVFNode--------------------------------------
+// Vector add float
+class AddVFNode : public VectorNode {
+public:
+  AddVFNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1, in2, vt) {}
+  virtual int Opcode() const;
+};
+
+//------------------------------AddVDNode--------------------------------------
+// Vector add double
+class AddVDNode : public VectorNode {
+public:
+  AddVDNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1, in2, vt) {}
+  virtual int Opcode() const;
+};
+
 //------------------------------ReductionNode------------------------------------
 // Perform reduction of a vector
 class ReductionNode : public Node {
@@ -121,22 +145,6 @@
   virtual uint ideal_reg() const { return Op_RegL; }
 };
 
-//------------------------------AddVLNode--------------------------------------
-// Vector add long
-class AddVLNode : public VectorNode {
- public:
-  AddVLNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
-  virtual int Opcode() const;
-};
-
-//------------------------------AddVFNode--------------------------------------
-// Vector add float
-class AddVFNode : public VectorNode {
- public:
-  AddVFNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
-  virtual int Opcode() const;
-};
-
 //------------------------------AddReductionVFNode--------------------------------------
 // Vector add float as a reduction
 class AddReductionVFNode : public ReductionNode {
@@ -147,14 +155,6 @@
   virtual uint ideal_reg() const { return Op_RegF; }
 };
 
-//------------------------------AddVDNode--------------------------------------
-// Vector add double
-class AddVDNode : public VectorNode {
- public:
-  AddVDNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
-  virtual int Opcode() const;
-};
-
 //------------------------------AddReductionVDNode--------------------------------------
 // Vector add double as a reduction
 class AddReductionVDNode : public ReductionNode {
@@ -229,6 +229,30 @@
   virtual int Opcode() const;
 };
 
+//------------------------------MulVLNode--------------------------------------
+// Vector multiply long
+class MulVLNode : public VectorNode {
+public:
+  MulVLNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1, in2, vt) {}
+  virtual int Opcode() const;
+};
+
+//------------------------------MulVFNode--------------------------------------
+// Vector multiply float
+class MulVFNode : public VectorNode {
+public:
+  MulVFNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1, in2, vt) {}
+  virtual int Opcode() const;
+};
+
+//------------------------------MulVDNode--------------------------------------
+// Vector multiply double
+class MulVDNode : public VectorNode {
+public:
+  MulVDNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1, in2, vt) {}
+  virtual int Opcode() const;
+};
+
 //------------------------------MulReductionVINode--------------------------------------
 // Vector multiply int as a reduction
 class MulReductionVINode : public ReductionNode {
@@ -239,12 +263,14 @@
   virtual uint ideal_reg() const { return Op_RegI; }
 };
 
-//------------------------------MulVFNode--------------------------------------
-// Vector multiply float
-class MulVFNode : public VectorNode {
- public:
-  MulVFNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
+//------------------------------MulReductionVLNode--------------------------------------
+// Vector multiply long as a reduction
+class MulReductionVLNode : public ReductionNode {
+public:
+  MulReductionVLNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
   virtual int Opcode() const;
+  virtual const Type* bottom_type() const { return TypeLong::LONG; }
+  virtual uint ideal_reg() const { return Op_RegL; }
 };
 
 //------------------------------MulReductionVFNode--------------------------------------
@@ -257,14 +283,6 @@
   virtual uint ideal_reg() const { return Op_RegF; }
 };
 
-//------------------------------MulVDNode--------------------------------------
-// Vector multiply double
-class MulVDNode : public VectorNode {
- public:
-  MulVDNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
-  virtual int Opcode() const;
-};
-
 //------------------------------MulReductionVDNode--------------------------------------
 // Vector multiply double as a reduction
 class MulReductionVDNode : public ReductionNode {
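
MulReductionVLNode, declared above, multiplies the incoming scalar accumulator by every lane of a long vector. A scalar model of that reduction step, based on a reading of the ReductionNode contract rather than taken from the sources (8 lanes matches a 512-bit vector of longs):

    #include <cstdint>
    #include <cstdio>

    // Scalar model of a multiply reduction over long lanes: combine the
    // incoming scalar with the product of all vector elements. Assumed
    // semantics for illustration only.
    static int64_t mul_reduction_vl(int64_t scalar, const int64_t* lanes, int n) {
      int64_t acc = scalar;
      for (int i = 0; i < n; i++) {
        acc *= lanes[i];
      }
      return acc;
    }

    int main() {
      int64_t lanes[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
      printf("result = %lld\n", (long long)mul_reduction_vl(10, lanes, 8)); // 403200
      return 0;
    }
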
--- a/hotspot/src/share/vm/runtime/vmStructs.cpp	Thu May 07 15:34:45 2015 -0700
+++ b/hotspot/src/share/vm/runtime/vmStructs.cpp	Fri May 08 11:49:20 2015 -0700
@@ -2010,6 +2010,8 @@
   declare_c2_type(SubVFNode, VectorNode)                                  \
   declare_c2_type(SubVDNode, VectorNode)                                  \
   declare_c2_type(MulVSNode, VectorNode)                                  \
+  declare_c2_type(MulVLNode, VectorNode)                                  \
+  declare_c2_type(MulReductionVLNode, ReductionNode)                      \
   declare_c2_type(MulVINode, VectorNode)                                  \
   declare_c2_type(MulReductionVINode, ReductionNode)                      \
   declare_c2_type(MulVFNode, VectorNode)                                  \
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/test/compiler/loopopts/superword/SumRed_Long.java	Fri May 08 11:49:20 2015 -0700
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 8076276
+ * @summary Add C2 x86 Superword support for scalar sum reduction optimizations : long test
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 SumRed_Long
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 SumRed_Long
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 SumRed_Long
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 SumRed_Long
+ *
+ */
+
+public class SumRed_Long
+{
+  public static void main(String[] args) throws Exception {
+    long[] a = new long[256*1024];
+    long[] b = new long[256*1024];
+    long[] c = new long[256*1024];
+    long[] d = new long[256*1024];
+    sumReductionInit(a,b,c);
+    long total = 0;
+    long valid = 262144000;
+    for(int j = 0; j < 2000; j++) {
+      total = sumReductionImplement(a,b,c,d,total);
+    }
+    total = (int)total;
+    if(total == valid) {
+      System.out.println("Success");
+    } else {
+      System.out.println("Invalid sum of elements variable in total: " + total);
+      System.out.println("Expected value = " + valid);
+      throw new Exception("Failed");
+    }
+  }
+
+  public static void sumReductionInit(
+    long[] a,
+    long[] b,
+    long[] c)
+  {
+    for(int j = 0; j < 1; j++)
+    {
+      for(int i = 0; i < a.length; i++)
+      {
+        a[i] = i * 1 + j;
+        b[i] = i * 1 - j;
+        c[i] = i + j;
+      }
+    }
+  }
+
+  public static long sumReductionImplement(
+    long[] a,
+    long[] b,
+    long[] c,
+    long[] d,
+    long total)
+  {
+    for(int i = 0; i < a.length; i++)
+    {
+      d[i]= (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]);
+      total += d[i];
+    }
+    return total;
+  }
+
+}