8151002: Make Assembler methods vextract and vinsert match actual instructions
author	mikael
date	Mon, 07 Mar 2016 15:03:48 -0800
changeset 36561 b18243f4d955
parent 36560 59ca2b7f6162
child 36562 4d1e93624d6a
8151002: Make Assembler methods vextract and vinsert match actual instructions
Reviewed-by: kvn, vlivanov, mcberg
hotspot/src/cpu/x86/vm/assembler_x86.cpp
hotspot/src/cpu/x86/vm/assembler_x86.hpp
hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp
hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp
hotspot/src/cpu/x86/vm/sharedRuntime_x86_32.cpp
hotspot/src/cpu/x86/vm/sharedRuntime_x86_64.cpp
hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp
hotspot/src/cpu/x86/vm/vm_version_x86.cpp
hotspot/src/cpu/x86/vm/x86.ad
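
The rename drops the trailing "h" from the vinsert/vextract helpers and makes the lane selector an explicit imm8 operand, so each method now matches the x86 instruction it emits instead of hard-coding the upper half. A minimal sketch of the call-shape change, assuming a MacroAssembler bound to __ (register names illustrative):

    // Before: the trailing-h helpers always targeted the upper 128 bits.
    //   vinsertf128h(dst, nds, src);      // imm8 implicitly 1
    // After: the selector is explicit, matching VINSERTF128 et al.
    __ vinsertf128(dst, nds, src, 1);      // insert into upper 128 bits
    __ vextractf128(dst, src, 0);          // extract the lower 128 bits
    // MacroAssembler gains _high/_low wrappers for the two common cases:
    __ vextractf128_high(dst, src);        // equivalent to imm8 = 1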
--- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp	Mon Mar 07 10:03:06 2016 -0300
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp	Mon Mar 07 15:03:48 2016 -0800
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -5699,8 +5699,9 @@
 }
 
 
-void Assembler::vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src) {
+void Assembler::vinsertf128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_avx(), "");
+  assert(imm8 <= 0x01, "imm8: %u", imm8);
   int vector_len = VM_Version::supports_evex() ? AVX_512bit : AVX_256bit;
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
@@ -5709,11 +5710,12 @@
   emit_int8((unsigned char)(0xC0 | encode));
   // 0x00 - insert into lower 128 bits
   // 0x01 - insert into upper 128 bits
-  emit_int8(0x01);
-}
-
-void Assembler::vinsertf64x4h(XMMRegister dst, XMMRegister nds, XMMRegister src, int value) {
+  emit_int8(imm8 & 0x01);
+}
+
+void Assembler::vinsertf64x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_evex(), "");
+  assert(imm8 <= 0x01, "imm8: %u", imm8);
   InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
@@ -5721,26 +5723,29 @@
   emit_int8((unsigned char)(0xC0 | encode));
   // 0x00 - insert into lower 256 bits
   // 0x01 - insert into upper 256 bits
-  emit_int8(value & 0x01);
-}
-
-void Assembler::vinsertf64x4h(XMMRegister dst, Address src, int value) {
+  emit_int8(imm8 & 0x01);
+}
+
+void Assembler::vinsertf64x4(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8) {
   assert(VM_Version::supports_evex(), "");
   assert(dst != xnoreg, "sanity");
+  assert(imm8 <= 0x01, "imm8: %u", imm8);
   InstructionMark im(this);
   InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_64bit);
   // swap src<->dst for encoding
-  vex_prefix(src, dst->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x1A);
   emit_operand(dst, src);
   // 0x00 - insert into lower 256 bits
-  // 0x01 - insert into upper 128 bits
-  emit_int8(value & 0x01);
-}
-
-void Assembler::vinsertf32x4h(XMMRegister dst, XMMRegister nds, XMMRegister src, int value) {
+  // 0x01 - insert into upper 256 bits
+  emit_int8(imm8 & 0x01);
+}
+
+void Assembler::vinsertf32x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_evex(), "");
+  assert(imm8 <= 0x03, "imm8: %u", imm8);
   InstructionAttr attributes(AVX_512bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
@@ -5750,57 +5755,64 @@
   // 0x01 - insert into q1 128 bits (128..255)
   // 0x02 - insert into q2 128 bits (256..383)
   // 0x03 - insert into q3 128 bits (384..511)
-  emit_int8(value & 0x3);
-}
-
-void Assembler::vinsertf32x4h(XMMRegister dst, Address src, int value) {
+  emit_int8(imm8 & 0x03);
+}
+
+void Assembler::vinsertf32x4(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8) {
   assert(VM_Version::supports_avx(), "");
   assert(dst != xnoreg, "sanity");
+  assert(imm8 <= 0x03, "imm8: %u", imm8);
   int vector_len = VM_Version::supports_evex() ? AVX_512bit : AVX_256bit;
+  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_32bit);
   // swap src<->dst for encoding
-  vex_prefix(src, dst->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x18);
   emit_operand(dst, src);
   // 0x00 - insert into q0 128 bits (0..127)
   // 0x01 - insert into q1 128 bits (128..255)
   // 0x02 - insert into q2 128 bits (256..383)
   // 0x03 - insert into q3 128 bits (384..511)
-  emit_int8(value & 0x3);
-}
-
-void Assembler::vinsertf128h(XMMRegister dst, Address src) {
+  emit_int8(imm8 & 0x03);
+}
+
+void Assembler::vinsertf128(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8) {
   assert(VM_Version::supports_avx(), "");
   assert(dst != xnoreg, "sanity");
+  assert(imm8 <= 0x01, "imm8: %u", imm8);
   int vector_len = VM_Version::supports_evex() ? AVX_512bit : AVX_256bit;
+  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_32bit);
   // swap src<->dst for encoding
-  vex_prefix(src, dst->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x18);
   emit_operand(dst, src);
+  // 0x00 - insert into lower 128 bits
   // 0x01 - insert into upper 128 bits
-  emit_int8(0x01);
-}
-
-void Assembler::vextractf128h(XMMRegister dst, XMMRegister src) {
+  emit_int8(imm8 & 0x01);
+}
+
+void Assembler::vextractf128(XMMRegister dst, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_avx(), "");
+  assert(imm8 <= 0x01, "imm8: %u", imm8);
   int vector_len = VM_Version::supports_evex() ? AVX_512bit : AVX_256bit;
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x19);
   emit_int8((unsigned char)(0xC0 | encode));
-  // 0x00 - insert into lower 128 bits
-  // 0x01 - insert into upper 128 bits
-  emit_int8(0x01);
-}
-
-void Assembler::vextractf128h(Address dst, XMMRegister src) {
+  // 0x00 - extract from lower 128 bits
+  // 0x01 - extract from upper 128 bits
+  emit_int8(imm8 & 0x01);
+}
+
+void Assembler::vextractf128(Address dst, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_avx(), "");
   assert(src != xnoreg, "sanity");
+  assert(imm8 <= 0x01, "imm8: %u", imm8);
   int vector_len = VM_Version::supports_evex() ? AVX_512bit : AVX_256bit;
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
@@ -5808,12 +5820,14 @@
   vex_prefix(dst, 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x19);
   emit_operand(src, dst);
+  // 0x00 - extract from lower 128 bits
   // 0x01 - extract from upper 128 bits
-  emit_int8(0x01);
-}
-
-void Assembler::vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src) {
+  emit_int8(imm8 & 0x01);
+}
+
+void Assembler::vinserti128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_avx2(), "");
+  assert(imm8 <= 0x01, "imm8: %u", imm8);
   int vector_len = VM_Version::supports_evex() ? AVX_512bit : AVX_256bit;
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
@@ -5822,11 +5836,12 @@
   emit_int8((unsigned char)(0xC0 | encode));
   // 0x00 - insert into lower 128 bits
   // 0x01 - insert into upper 128 bits
-  emit_int8(0x01);
-}
-
-void Assembler::vinserti64x4h(XMMRegister dst, XMMRegister nds, XMMRegister src, int value) {
+  emit_int8(imm8 & 0x01);
+}
+
+void Assembler::vinserti64x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_evex(), "");
+  assert(imm8 <= 0x01, "imm8: %u", imm8);
   InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
@@ -5834,39 +5849,44 @@
   emit_int8((unsigned char)(0xC0 | encode));
   // 0x00 - insert into lower 256 bits
   // 0x01 - insert into upper 256 bits
-  emit_int8(value & 0x01);
-}
-
-void Assembler::vinserti128h(XMMRegister dst, Address src) {
+  emit_int8(imm8 & 0x01);
+}
+
+void Assembler::vinserti128(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8) {
   assert(VM_Version::supports_avx2(), "");
   assert(dst != xnoreg, "sanity");
+  assert(imm8 <= 0x01, "imm8: %u", imm8);
   int vector_len = VM_Version::supports_evex() ? AVX_512bit : AVX_256bit;
+  int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_32bit);
   // swap src<->dst for encoding
-  vex_prefix(src, dst->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x38);
   emit_operand(dst, src);
+  // 0x00 - insert into lower 128 bits
   // 0x01 - insert into upper 128 bits
-  emit_int8(0x01);
-}
-
-void Assembler::vextracti128h(XMMRegister dst, XMMRegister src) {
+  emit_int8(imm8 & 0x01);
+}
+
+void Assembler::vextracti128(XMMRegister dst, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_avx(), "");
+  assert(imm8 <= 0x01, "imm8: %u", imm8);
   int vector_len = VM_Version::supports_evex() ? AVX_512bit : AVX_256bit;
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x39);
   emit_int8((unsigned char)(0xC0 | encode));
-  // 0x00 - insert into lower 128 bits
-  // 0x01 - insert into upper 128 bits
-  emit_int8(0x01);
-}
-
-void Assembler::vextracti128h(Address dst, XMMRegister src) {
+  // 0x00 - extract from lower 128 bits
+  // 0x01 - extract from upper 128 bits
+  emit_int8(imm8 & 0x01);
+}
+
+void Assembler::vextracti128(Address dst, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_avx2(), "");
   assert(src != xnoreg, "sanity");
+  assert(imm8 <= 0x01, "imm8: %u", imm8);
   int vector_len = VM_Version::supports_evex() ? AVX_512bit : AVX_256bit;
   InstructionMark im(this);
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
@@ -5874,47 +5894,53 @@
   vex_prefix(dst, 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x39);
   emit_operand(src, dst);
+  // 0x00 - extract from lower 128 bits
   // 0x01 - extract from upper 128 bits
-  emit_int8(0x01);
-}
-
-void Assembler::vextracti64x4h(XMMRegister dst, XMMRegister src, int value) {
+  emit_int8(imm8 & 0x01);
+}
+
+void Assembler::vextracti64x4(XMMRegister dst, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_evex(), "");
+  assert(imm8 <= 0x01, "imm8: %u", imm8);
   InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x3B);
   emit_int8((unsigned char)(0xC0 | encode));
   // 0x00 - extract from lower 256 bits
   // 0x01 - extract from upper 256 bits
-  emit_int8(value & 0x01);
-}
-
-void Assembler::vextracti64x2h(XMMRegister dst, XMMRegister src, int value) {
+  emit_int8(imm8 & 0x01);
+}
+
+void Assembler::vextracti64x2(XMMRegister dst, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_evex(), "");
+  assert(imm8 <= 0x03, "imm8: %u", imm8);
   InstructionAttr attributes(AVX_512bit, /* vex_w */ !_legacy_mode_dq, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x39);
   emit_int8((unsigned char)(0xC0 | encode));
+  // 0x00 - extract from bits 127:0
   // 0x01 - extract from bits 255:128
   // 0x02 - extract from bits 383:256
   // 0x03 - extract from bits 511:384
-  emit_int8(value & 0x3);
-}
-
-void Assembler::vextractf64x4h(XMMRegister dst, XMMRegister src, int value) {
+  emit_int8(imm8 & 0x03);
+}
+
+void Assembler::vextractf64x4(XMMRegister dst, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_evex(), "");
+  assert(imm8 <= 0x01, "imm8: %u", imm8);
   InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x1B);
   emit_int8((unsigned char)(0xC0 | encode));
   // 0x00 - extract from lower 256 bits
   // 0x01 - extract from upper 256 bits
-  emit_int8(value & 0x1);
-}
-
-void Assembler::vextractf64x4h(Address dst, XMMRegister src, int value) {
+  emit_int8(imm8 & 0x01);
+}
+
+void Assembler::vextractf64x4(Address dst, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_evex(), "");
   assert(src != xnoreg, "sanity");
+  assert(imm8 <= 0x01, "imm8: %u", imm8);
   InstructionMark im(this);
   InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T4,/* input_size_in_bits */  EVEX_64bit);
@@ -5923,11 +5949,12 @@
   emit_operand(src, dst);
   // 0x00 - extract from lower 256 bits
   // 0x01 - extract from upper 256 bits
-  emit_int8(value & 0x01);
-}
-
-void Assembler::vextractf32x4h(XMMRegister dst, XMMRegister src, int value) {
+  emit_int8(imm8 & 0x01);
+}
+
+void Assembler::vextractf32x4(XMMRegister dst, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_avx(), "");
+  assert(imm8 <= 0x03, "imm8: %u", imm8);
   int vector_len = VM_Version::supports_evex() ? AVX_512bit : AVX_256bit;
   InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
@@ -5937,12 +5964,13 @@
   // 0x01 - extract from bits 255:128
   // 0x02 - extract from bits 383:256
   // 0x03 - extract from bits 511:384
-  emit_int8(value & 0x3);
-}
-
-void Assembler::vextractf32x4h(Address dst, XMMRegister src, int value) {
+  emit_int8(imm8 & 0x03);
+}
+
+void Assembler::vextractf32x4(Address dst, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_evex(), "");
   assert(src != xnoreg, "sanity");
+  assert(imm8 <= 0x03, "imm8: %u", imm8);
   InstructionMark im(this);
   InstructionAttr attributes(AVX_512bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_32bit);
@@ -5953,19 +5981,21 @@
   // 0x01 - extract from bits 255:128
   // 0x02 - extract from bits 383:256
   // 0x03 - extract from bits 511:384
-  emit_int8(value & 0x3);
-}
-
-void Assembler::vextractf64x2h(XMMRegister dst, XMMRegister src, int value) {
+  emit_int8(imm8 & 0x03);
+}
+
+void Assembler::vextractf64x2(XMMRegister dst, XMMRegister src, uint8_t imm8) {
   assert(VM_Version::supports_evex(), "");
+  assert(imm8 <= 0x03, "imm8: %u", imm8);
   InstructionAttr attributes(AVX_512bit, /* vex_w */ !_legacy_mode_dq, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int encode = vex_prefix_and_encode(src->encoding(), 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x19);
   emit_int8((unsigned char)(0xC0 | encode));
+  // 0x00 - extract from bits 127:0
   // 0x01 - extract from bits 255:128
   // 0x02 - extract from bits 383:256
   // 0x03 - extract from bits 511:384
-  emit_int8(value & 0x3);
+  emit_int8(imm8 & 0x03);
 }
 
 // duplicate 4-bytes integer data from src into 8 locations in dest
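
Every emitter above now asserts the accepted range and masks the immediate (imm8 & 0x01 or imm8 & 0x03). For the 32x4/64x2 forms, imm8 picks one of four 128-bit lanes of a ZMM register, covering bits 128*imm8 .. 128*imm8+127; the 64x4 forms pick one of two 256-bit halves. A self-contained sketch of that lane arithmetic (lane_range_128 is an illustrative helper, not part of this patch):

    #include <cstdint>
    #include <cstdio>

    // Bit range selected by imm8 for a 128-bit-lane insert/extract,
    // mirroring the "q0..q3" comments in vinsertf32x4 above.
    static void lane_range_128(uint8_t imm8, unsigned* lo, unsigned* hi) {
      *lo = 128u * (imm8 & 0x03);  // the emitters mask with 0x03
      *hi = *lo + 127u;
    }

    int main() {
      for (uint8_t imm8 = 0; imm8 <= 3; imm8++) {
        unsigned lo, hi;
        lane_range_128(imm8, &lo, &hi);
        printf("imm8=%u -> bits %u..%u\n", imm8, lo, hi);  // 0..127, 128..255, 256..383, 384..511
      }
      return 0;
    }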
--- a/hotspot/src/cpu/x86/vm/assembler_x86.hpp	Mon Mar 07 10:03:06 2016 -0300
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.hpp	Mon Mar 07 15:03:48 2016 -0800
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -1971,33 +1971,31 @@
   void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
   void vpxor(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
 
-  // Copy low 128bit into high 128bit of YMM registers.
-  void vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src);
-  void vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src);
-  void vextractf128h(XMMRegister dst, XMMRegister src);
-  void vextracti128h(XMMRegister dst, XMMRegister src);
-
-  // Load/store high 128bit of YMM registers which does not destroy other half.
-  void vinsertf128h(XMMRegister dst, Address src);
-  void vinserti128h(XMMRegister dst, Address src);
-  void vextractf128h(Address dst, XMMRegister src);
-  void vextracti128h(Address dst, XMMRegister src);
-
-  // Copy low 256bit into high 256bit of ZMM registers.
-  void vinserti64x4h(XMMRegister dst, XMMRegister nds, XMMRegister src, int value);
-  void vinsertf64x4h(XMMRegister dst, XMMRegister nds, XMMRegister src, int value);
-  void vextracti64x4h(XMMRegister dst, XMMRegister src, int value);
-  void vextractf64x4h(XMMRegister dst, XMMRegister src, int value);
-  void vextractf64x4h(Address dst, XMMRegister src, int value);
-  void vinsertf64x4h(XMMRegister dst, Address src, int value);
-
-  // Copy targeted 128bit segments of the ZMM registers
-  void vextracti64x2h(XMMRegister dst, XMMRegister src, int value);
-  void vextractf64x2h(XMMRegister dst, XMMRegister src, int value);
-  void vextractf32x4h(XMMRegister dst, XMMRegister src, int value);
-  void vextractf32x4h(Address dst, XMMRegister src, int value);
-  void vinsertf32x4h(XMMRegister dst, XMMRegister nds, XMMRegister src, int value);
-  void vinsertf32x4h(XMMRegister dst, Address src, int value);
+  // 128bit copy from/to 256bit (YMM) vector registers
+  void vinsertf128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
+  void vinserti128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
+  void vextractf128(XMMRegister dst, XMMRegister src, uint8_t imm8);
+  void vextracti128(XMMRegister dst, XMMRegister src, uint8_t imm8);
+  void vinsertf128(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);
+  void vinserti128(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);
+  void vextractf128(Address dst, XMMRegister src, uint8_t imm8);
+  void vextracti128(Address dst, XMMRegister src, uint8_t imm8);
+
+  // 256bit copy from/to 512bit (ZMM) vector registers
+  void vinserti64x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
+  void vinsertf64x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
+  void vextracti64x4(XMMRegister dst, XMMRegister src, uint8_t imm8);
+  void vextractf64x4(XMMRegister dst, XMMRegister src, uint8_t imm8);
+  void vextractf64x4(Address dst, XMMRegister src, uint8_t imm8);
+  void vinsertf64x4(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);
+
+  // 128bit copy from/to 256bit (YMM) or 512bit (ZMM) vector registers
+  void vextracti64x2(XMMRegister dst, XMMRegister src, uint8_t imm8);
+  void vextractf64x2(XMMRegister dst, XMMRegister src, uint8_t imm8);
+  void vextractf32x4(XMMRegister dst, XMMRegister src, uint8_t imm8);
+  void vextractf32x4(Address dst, XMMRegister src, uint8_t imm8);
+  void vinsertf32x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
+  void vinsertf32x4(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);
 
   // duplicate 4-bytes integer data from src into 8 locations in dest
   void vpbroadcastd(XMMRegister dst, XMMRegister src);
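
The regrouped declarations also make the imm8 contract visible at a glance; as a quick reference (taken directly from the assertions in the .cpp above):

    // Accepted imm8 ranges per form:
    //   vinsert/vextract [fi]128  : imm8 in {0, 1}  -- 128-bit lanes of YMM
    //   vinsert/vextract [fi]64x4 : imm8 in {0, 1}  -- 256-bit halves of ZMM
    //   vinsert/vextract [fi]32x4 : imm8 in {0..3}  -- 128-bit lanes of ZMM
    //   vextract [fi]64x2         : imm8 in {0..3}  -- 128-bit lanes of ZMM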
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp	Mon Mar 07 10:03:06 2016 -0300
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp	Mon Mar 07 15:03:48 2016 -0800
@@ -3445,7 +3445,7 @@
 
 void MacroAssembler::movdqu(Address dst, XMMRegister src) {
   if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (src->encoding() > 15)) {
-    Assembler::vextractf32x4h(dst, src, 0);
+    Assembler::vextractf32x4(dst, src, 0);
   } else {
     Assembler::movdqu(dst, src);
   }
@@ -3453,7 +3453,7 @@
 
 void MacroAssembler::movdqu(XMMRegister dst, Address src) {
   if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (dst->encoding() > 15)) {
-    Assembler::vinsertf32x4h(dst, src, 0);
+    Assembler::vinsertf32x4(dst, dst, src, 0);
   } else {
     Assembler::movdqu(dst, src);
   }
@@ -3478,7 +3478,7 @@
 
 void MacroAssembler::vmovdqu(Address dst, XMMRegister src) {
   if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (src->encoding() > 15)) {
-    Assembler::vextractf64x4h(dst, src, 0);
+    vextractf64x4_low(dst, src);
   } else {
     Assembler::vmovdqu(dst, src);
   }
@@ -3486,7 +3486,7 @@
 
 void MacroAssembler::vmovdqu(XMMRegister dst, Address src) {
   if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (dst->encoding() > 15)) {
-    Assembler::vinsertf64x4h(dst, src, 0);
+    vinsertf64x4_low(dst, src);
   } else {
     Assembler::vmovdqu(dst, src);
   }
@@ -5649,14 +5649,14 @@
         // Save upper half of ZMM registers
         subptr(rsp, 32*num_xmm_regs);
         for (int n = 0; n < num_xmm_regs; n++) {
-          vextractf64x4h(Address(rsp, n*32), as_XMMRegister(n), 1);
+          vextractf64x4_high(Address(rsp, n*32), as_XMMRegister(n));
         }
       }
       assert(UseAVX > 0, "256 bit vectors are supported only with AVX");
       // Save upper half of YMM registers
       subptr(rsp, 16*num_xmm_regs);
       for (int n = 0; n < num_xmm_regs; n++) {
-        vextractf128h(Address(rsp, n*16), as_XMMRegister(n));
+        vextractf128_high(Address(rsp, n*16), as_XMMRegister(n));
       }
     }
 #endif
@@ -5665,7 +5665,7 @@
 #ifdef _LP64
     if (VM_Version::supports_evex()) {
       for (int n = 0; n < num_xmm_regs; n++) {
-        vextractf32x4h(Address(rsp, n*16), as_XMMRegister(n), 0);
+        vextractf32x4(Address(rsp, n*16), as_XMMRegister(n), 0);
       }
     } else {
       for (int n = 0; n < num_xmm_regs; n++) {
@@ -5753,7 +5753,7 @@
 #ifdef _LP64
   if (VM_Version::supports_evex()) {
     for (int n = 0; n < num_xmm_regs; n++) {
-      vinsertf32x4h(as_XMMRegister(n), Address(rsp, n*16), 0);
+      vinsertf32x4(as_XMMRegister(n), as_XMMRegister(n), Address(rsp, n*16), 0);
     }
   } else {
     for (int n = 0; n < num_xmm_regs; n++) {
@@ -5771,12 +5771,12 @@
     if (MaxVectorSize > 16) {
       // Restore upper half of YMM registers.
       for (int n = 0; n < num_xmm_regs; n++) {
-        vinsertf128h(as_XMMRegister(n), Address(rsp, n*16));
+        vinsertf128_high(as_XMMRegister(n), Address(rsp, n*16));
       }
       addptr(rsp, 16*num_xmm_regs);
       if(UseAVX > 2) {
         for (int n = 0; n < num_xmm_regs; n++) {
-          vinsertf64x4h(as_XMMRegister(n), Address(rsp, n*32), 1);
+          vinsertf64x4_high(as_XMMRegister(n), Address(rsp, n*32));
         }
         addptr(rsp, 32*num_xmm_regs);
       }
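
The movdqu/vmovdqu changes cover the high-bank registers: xmm16..31 are only encodable with EVEX, and without AVX-512VL a plain 128/256-bit MOVDQU cannot reach them. The fallback therefore goes through the 512-bit insert/extract forms with lane 0, e.g. for the 128-bit store (a fragment of the dispatch above, with comments added):

    // 128-bit store of a high-bank register (encoding > 15) without AVX-512VL:
    // plain MOVDQU cannot encode xmm16..31, so lane 0 is stored via the
    // EVEX-encoded VEXTRACTF32X4 instead.
    if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (src->encoding() > 15)) {
      Assembler::vextractf32x4(dst, src, 0);
    } else {
      Assembler::movdqu(dst, src);
    }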
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp	Mon Mar 07 10:03:06 2016 -0300
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp	Mon Mar 07 15:03:48 2016 -0800
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -1215,13 +1215,130 @@
   void vpxor(XMMRegister dst, XMMRegister src) { Assembler::vpxor(dst, dst, src, true); }
   void vpxor(XMMRegister dst, Address src) { Assembler::vpxor(dst, dst, src, true); }
 
-  // Move packed integer values from low 128 bit to hign 128 bit in 256 bit vector.
-  void vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src) {
-    if (UseAVX > 1) // vinserti128h is available only in AVX2
-      Assembler::vinserti128h(dst, nds, src);
-    else
-      Assembler::vinsertf128h(dst, nds, src);
+  void vinserti128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
+    if (UseAVX > 1) { // vinserti128 is available only in AVX2
+      Assembler::vinserti128(dst, nds, src, imm8);
+    } else {
+      Assembler::vinsertf128(dst, nds, src, imm8);
+    }
+  }
+
+  void vinserti128(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8) {
+    if (UseAVX > 1) { // vinserti128 is available only in AVX2
+      Assembler::vinserti128(dst, nds, src, imm8);
+    } else {
+      Assembler::vinsertf128(dst, nds, src, imm8);
+    }
+  }
+
+  void vextracti128(XMMRegister dst, XMMRegister src, uint8_t imm8) {
+    if (UseAVX > 1) { // vextracti128 is available only in AVX2
+      Assembler::vextracti128(dst, src, imm8);
+    } else {
+      Assembler::vextractf128(dst, src, imm8);
+    }
+  }
+
+  void vextracti128(Address dst, XMMRegister src, uint8_t imm8) {
+    if (UseAVX > 1) { // vextracti128 is available only in AVX2
+      Assembler::vextracti128(dst, src, imm8);
+    } else {
+      Assembler::vextractf128(dst, src, imm8);
+    }
+  }
+
+  // 128bit copy to/from high 128 bits of 256bit (YMM) vector registers
+  void vinserti128_high(XMMRegister dst, XMMRegister src) {
+    vinserti128(dst, dst, src, 1);
+  }
+  void vinserti128_high(XMMRegister dst, Address src) {
+    vinserti128(dst, dst, src, 1);
+  }
+  void vextracti128_high(XMMRegister dst, XMMRegister src) {
+    vextracti128(dst, src, 1);
+  }
+  void vextracti128_high(Address dst, XMMRegister src) {
+    vextracti128(dst, src, 1);
+  }
+  void vinsertf128_high(XMMRegister dst, XMMRegister src) {
+    vinsertf128(dst, dst, src, 1);
+  }
+  void vinsertf128_high(XMMRegister dst, Address src) {
+    vinsertf128(dst, dst, src, 1);
+  }
+  void vextractf128_high(XMMRegister dst, XMMRegister src) {
+    vextractf128(dst, src, 1);
+  }
+  void vextractf128_high(Address dst, XMMRegister src) {
+    vextractf128(dst, src, 1);
+  }
+
+  // 256bit copy to/from high 256 bits of 512bit (ZMM) vector registers
+  void vinserti64x4_high(XMMRegister dst, XMMRegister src) {
+    vinserti64x4(dst, dst, src, 1);
   }
+  void vinsertf64x4_high(XMMRegister dst, XMMRegister src) {
+    vinsertf64x4(dst, dst, src, 1);
+  }
+  void vextracti64x4_high(XMMRegister dst, XMMRegister src) {
+    vextracti64x4(dst, src, 1);
+  }
+  void vextractf64x4_high(XMMRegister dst, XMMRegister src) {
+    vextractf64x4(dst, src, 1);
+  }
+  void vextractf64x4_high(Address dst, XMMRegister src) {
+    vextractf64x4(dst, src, 1);
+  }
+  void vinsertf64x4_high(XMMRegister dst, Address src) {
+    vinsertf64x4(dst, dst, src, 1);
+  }
+
+  // 128bit copy to/from low 128 bits of 256bit (YMM) vector registers
+  void vinserti128_low(XMMRegister dst, XMMRegister src) {
+    vinserti128(dst, dst, src, 0);
+  }
+  void vinserti128_low(XMMRegister dst, Address src) {
+    vinserti128(dst, dst, src, 0);
+  }
+  void vextracti128_low(XMMRegister dst, XMMRegister src) {
+    vextracti128(dst, src, 0);
+  }
+  void vextracti128_low(Address dst, XMMRegister src) {
+    vextracti128(dst, src, 0);
+  }
+  void vinsertf128_low(XMMRegister dst, XMMRegister src) {
+    vinsertf128(dst, dst, src, 0);
+  }
+  void vinsertf128_low(XMMRegister dst, Address src) {
+    vinsertf128(dst, dst, src, 0);
+  }
+  void vextractf128_low(XMMRegister dst, XMMRegister src) {
+    vextractf128(dst, src, 0);
+  }
+  void vextractf128_low(Address dst, XMMRegister src) {
+    vextractf128(dst, src, 0);
+  }
+
+  // 256bit copy to/from low 256 bits of 512bit (ZMM) vector registers
+  void vinserti64x4_low(XMMRegister dst, XMMRegister src) {
+    vinserti64x4(dst, dst, src, 0);
+  }
+  void vinsertf64x4_low(XMMRegister dst, XMMRegister src) {
+    vinsertf64x4(dst, dst, src, 0);
+  }
+  void vextracti64x4_low(XMMRegister dst, XMMRegister src) {
+    vextracti64x4(dst, src, 0);
+  }
+  void vextractf64x4_low(XMMRegister dst, XMMRegister src) {
+    vextractf64x4(dst, src, 0);
+  }
+  void vextractf64x4_low(Address dst, XMMRegister src) {
+    vextractf64x4(dst, src, 0);
+  }
+  void vinsertf64x4_low(XMMRegister dst, Address src) {
+    vinsertf64x4(dst, dst, src, 0);
+  }
+
 
   // Carry-Less Multiplication Quadword
   void vpclmulldq(XMMRegister dst, XMMRegister nds, XMMRegister src) {
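
Note the dispatch in the vinserti128/vextracti128 wrappers above: the integer forms require AVX2, and on AVX-only hardware the float forms move the same 128 bits, so callers can use one name everywhere. A usage fragment, assuming a MacroAssembler bound to __:

    // Safe on both AVX1 and AVX2: the wrapper emits VINSERTI128 when
    // UseAVX > 1 and falls back to VINSERTF128 otherwise.
    __ vinserti128_high(xmm0, xmm1);   // upper half of ymm0 = xmm1
    __ vextracti128_low(xmm2, xmm0);   // xmm2 = lower half of ymm0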
--- a/hotspot/src/cpu/x86/vm/sharedRuntime_x86_32.cpp	Mon Mar 07 10:03:06 2016 -0300
+++ b/hotspot/src/cpu/x86/vm/sharedRuntime_x86_32.cpp	Mon Mar 07 15:03:48 2016 -0800
@@ -208,13 +208,13 @@
     __ subptr(rsp, ymm_bytes);
     // Save upper half of YMM registers
     for (int n = 0; n < num_xmm_regs; n++) {
-      __ vextractf128h(Address(rsp, n*16), as_XMMRegister(n));
+      __ vextractf128_high(Address(rsp, n*16), as_XMMRegister(n));
     }
     if (UseAVX > 2) {
       __ subptr(rsp, zmm_bytes);
       // Save upper half of ZMM registers
       for (int n = 0; n < num_xmm_regs; n++) {
-        __ vextractf64x4h(Address(rsp, n*32), as_XMMRegister(n), 1);
+        __ vextractf64x4_high(Address(rsp, n*32), as_XMMRegister(n));
       }
     }
   }
@@ -304,13 +304,13 @@
     if (UseAVX > 2) {
       // Restore upper half of ZMM registers.
       for (int n = 0; n < num_xmm_regs; n++) {
-        __ vinsertf64x4h(as_XMMRegister(n), Address(rsp, n*32), 1);
+        __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, n*32));
       }
       __ addptr(rsp, zmm_bytes);
     }
     // Restore upper half of YMM registers.
     for (int n = 0; n < num_xmm_regs; n++) {
-      __ vinsertf128h(as_XMMRegister(n), Address(rsp, n*16));
+      __ vinsertf128_high(as_XMMRegister(n), Address(rsp, n*16));
     }
     __ addptr(rsp, ymm_bytes);
   }
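
On 32-bit the upper YMM halves are pushed first and, with AVX-512, the upper ZMM halves on top of them, so the restore above walks in the reverse order. The save side, annotated (the same code as the hunk above):

    __ subptr(rsp, ymm_bytes);
    for (int n = 0; n < num_xmm_regs; n++) {
      __ vextractf128_high(Address(rsp, n*16), as_XMMRegister(n));    // bits 255:128
    }
    if (UseAVX > 2) {
      __ subptr(rsp, zmm_bytes);
      for (int n = 0; n < num_xmm_regs; n++) {
        __ vextractf64x4_high(Address(rsp, n*32), as_XMMRegister(n)); // bits 511:256
      }
    }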
--- a/hotspot/src/cpu/x86/vm/sharedRuntime_x86_64.cpp	Mon Mar 07 10:03:06 2016 -0300
+++ b/hotspot/src/cpu/x86/vm/sharedRuntime_x86_64.cpp	Mon Mar 07 15:03:48 2016 -0800
@@ -179,13 +179,13 @@
     // Save upper half of YMM registers(0..15)
     int base_addr = XSAVE_AREA_YMM_BEGIN;
     for (int n = 0; n < 16; n++) {
-      __ vextractf128h(Address(rsp, base_addr+n*16), as_XMMRegister(n));
+      __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
     }
     if (VM_Version::supports_evex()) {
       // Save upper half of ZMM registers(0..15)
       base_addr = XSAVE_AREA_ZMM_BEGIN;
       for (int n = 0; n < 16; n++) {
-        __ vextractf64x4h(Address(rsp, base_addr+n*32), as_XMMRegister(n), 1);
+        __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
       }
       // Save full ZMM registers(16..num_xmm_regs)
       base_addr = XSAVE_AREA_UPPERBANK;
@@ -333,13 +333,13 @@
     // Restore upper half of YMM registers (0..15)
     int base_addr = XSAVE_AREA_YMM_BEGIN;
     for (int n = 0; n < 16; n++) {
-      __ vinsertf128h(as_XMMRegister(n), Address(rsp,  base_addr+n*16));
+      __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
     }
     if (VM_Version::supports_evex()) {
       // Restore upper half of ZMM registers (0..15)
       base_addr = XSAVE_AREA_ZMM_BEGIN;
       for (int n = 0; n < 16; n++) {
-        __ vinsertf64x4h(as_XMMRegister(n), Address(rsp, base_addr+n*32), 1);
+        __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
       }
       // Restore full ZMM registers(16..num_xmm_regs)
       base_addr = XSAVE_AREA_UPPERBANK;
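
On 64-bit the upper halves go to fixed slots in the save area rather than pushes: 16 bytes per register from XSAVE_AREA_YMM_BEGIN and 32 bytes per register from XSAVE_AREA_ZMM_BEGIN. A self-contained sketch of the slot arithmetic (the base values here are illustrative placeholders, not the real constants):

    #include <cstdio>

    int main() {
      const int ymm_begin = 0;    // stands in for XSAVE_AREA_YMM_BEGIN
      const int zmm_begin = 256;  // stands in for XSAVE_AREA_ZMM_BEGIN
      for (int n = 0; n < 16; n++) {
        printf("xmm%-2d ymm-high slot %4d  zmm-high slot %4d\n",
               n, ymm_begin + n * 16, zmm_begin + n * 32);
      }
      return 0;
    }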
--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Mon Mar 07 10:03:06 2016 -0300
+++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Mon Mar 07 15:03:48 2016 -0800
@@ -275,7 +275,7 @@
     }
     if (VM_Version::supports_evex()) {
       for (int i = xmm_save_first; i <= last_reg; i++) {
-        __ vextractf32x4h(xmm_save(i), as_XMMRegister(i), 0);
+        __ vextractf32x4(xmm_save(i), as_XMMRegister(i), 0);
       }
     } else {
       for (int i = xmm_save_first; i <= last_reg; i++) {
@@ -393,7 +393,7 @@
     // emit the restores for xmm regs
     if (VM_Version::supports_evex()) {
       for (int i = xmm_save_first; i <= last_reg; i++) {
-        __ vinsertf32x4h(as_XMMRegister(i), xmm_save(i), 0);
+        __ vinsertf32x4(as_XMMRegister(i), as_XMMRegister(i), xmm_save(i), 0);
       }
     } else {
       for (int i = xmm_save_first; i <= last_reg; i++) {
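
The restore side shows why the memory-form inserts grew an explicit nds: passing the destination register as its own nds reloads lane 0 from the save slot while the lanes above bit 127 are carried over unchanged. A fragment from the restore loop above, with a comment added:

    // nds == dst: bits 127:0 come from the save slot, the bits above are
    // taken from the register itself, i.e. left intact.
    __ vinsertf32x4(as_XMMRegister(i), as_XMMRegister(i), xmm_save(i), 0);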
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.cpp	Mon Mar 07 10:03:06 2016 -0300
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.cpp	Mon Mar 07 15:03:48 2016 -0800
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -385,7 +385,7 @@
 
     __ movdl(xmm0, rcx);
     __ pshufd(xmm0, xmm0, 0x00);
-    __ vinsertf128h(xmm0, xmm0, xmm0);
+    __ vinsertf128_high(xmm0, xmm0);
     __ vmovdqu(xmm7, xmm0);
 #ifdef _LP64
     __ vmovdqu(xmm8, xmm0);
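
The CPUID warm-up code uses the same replicate idiom as the .ad patterns below: splat a 32-bit value across the low 128 bits, then duplicate that half upward. Annotated (the same instructions as the hunk above):

    __ movdl(xmm0, rcx);              // 32-bit value into lane 0
    __ pshufd(xmm0, xmm0, 0x00);      // splat across the low 128 bits
    __ vinsertf128_high(xmm0, xmm0);  // copy the low half into the upper half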
--- a/hotspot/src/cpu/x86/vm/x86.ad	Mon Mar 07 10:03:06 2016 -0300
+++ b/hotspot/src/cpu/x86/vm/x86.ad	Mon Mar 07 15:03:48 2016 -0800
@@ -3179,13 +3179,13 @@
             "punpcklbw $dst,$dst\n\t"
             "pshuflw $dst,$dst,0x00\n\t"
             "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! replicate32B" %}
+            "vinserti128_high $dst,$dst\t! replicate32B" %}
   ins_encode %{
     __ movdl($dst$$XMMRegister, $src$$Register);
     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3196,12 +3196,12 @@
   format %{ "punpcklbw $dst,$mem\n\t"
             "pshuflw $dst,$dst,0x00\n\t"
             "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! replicate32B" %}
+            "vinserti128_high $dst,$dst\t! replicate32B" %}
   ins_encode %{
     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3223,11 +3223,11 @@
   match(Set dst (ReplicateB con));
   format %{ "movq    $dst,[$constantaddress]\n\t"
             "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! lreplicate32B($con)" %}
+            "vinserti128_high $dst,$dst\t! lreplicate32B($con)" %}
   ins_encode %{
     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3298,12 +3298,12 @@
   format %{ "movd    $dst,$src\n\t"
             "pshuflw $dst,$dst,0x00\n\t"
             "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! replicate16S" %}
+            "vinserti128_high $dst,$dst\t! replicate16S" %}
   ins_encode %{
     __ movdl($dst$$XMMRegister, $src$$Register);
     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3313,11 +3313,11 @@
   match(Set dst (ReplicateS (LoadS mem)));
   format %{ "pshuflw $dst,$mem,0x00\n\t"
             "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! replicate16S" %}
+            "vinserti128_high $dst,$dst\t! replicate16S" %}
   ins_encode %{
     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3327,11 +3327,11 @@
   match(Set dst (ReplicateS con));
   format %{ "movq    $dst,[$constantaddress]\n\t"
             "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! replicate16S($con)" %}
+            "vinserti128_high $dst,$dst\t! replicate16S($con)" %}
   ins_encode %{
     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3363,11 +3363,11 @@
   match(Set dst (ReplicateI src));
   format %{ "movd    $dst,$src\n\t"
             "pshufd  $dst,$dst,0x00\n\t"
-            "vinserti128h $dst,$dst,$dst\t! replicate8I" %}
+            "vinserti128_high $dst,$dst\t! replicate8I" %}
   ins_encode %{
     __ movdl($dst$$XMMRegister, $src$$Register);
     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3376,10 +3376,10 @@
   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
   match(Set dst (ReplicateI (LoadI mem)));
   format %{ "pshufd  $dst,$mem,0x00\n\t"
-            "vinserti128h $dst,$dst,$dst\t! replicate8I" %}
+            "vinserti128_high $dst,$dst\t! replicate8I" %}
   ins_encode %{
     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3401,11 +3401,11 @@
   match(Set dst (ReplicateI con));
   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
             "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst" %}
+            "vinserti128_high $dst,$dst" %}
   ins_encode %{
     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3430,11 +3430,11 @@
   match(Set dst (ReplicateL src));
   format %{ "movdq   $dst,$src\n\t"
             "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
+            "vinserti128_high $dst,$dst\t! replicate4L" %}
   ins_encode %{
     __ movdq($dst$$XMMRegister, $src$$Register);
     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3447,13 +3447,13 @@
             "movdl   $tmp,$src.hi\n\t"
             "punpckldq $dst,$tmp\n\t"
             "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
+            "vinserti128_high $dst,$dst\t! replicate4L" %}
   ins_encode %{
     __ movdl($dst$$XMMRegister, $src$$Register);
     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3464,11 +3464,11 @@
   match(Set dst (ReplicateL con));
   format %{ "movq    $dst,[$constantaddress]\n\t"
             "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! replicate4L($con)" %}
+            "vinserti128_high $dst,$dst\t! replicate4L($con)" %}
   ins_encode %{
     __ movq($dst$$XMMRegister, $constantaddress($con));
     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3478,11 +3478,11 @@
   match(Set dst (ReplicateL (LoadL mem)));
   format %{ "movq    $dst,$mem\n\t"
             "punpcklqdq $dst,$dst\n\t"
-            "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
+            "vinserti128_high $dst,$dst\t! replicate4L" %}
   ins_encode %{
     __ movq($dst$$XMMRegister, $mem$$Address);
     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
-    __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3511,10 +3511,10 @@
   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
   match(Set dst (ReplicateF src));
   format %{ "pshufd  $dst,$src,0x00\n\t"
-            "vinsertf128h $dst,$dst,$dst\t! replicate8F" %}
+            "vinsertf128_high $dst,$dst\t! replicate8F" %}
   ins_encode %{
     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
-    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3523,10 +3523,10 @@
   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
   match(Set dst (ReplicateF (LoadF mem)));
   format %{ "pshufd  $dst,$mem,0x00\n\t"
-            "vinsertf128h $dst,$dst,$dst\t! replicate8F" %}
+            "vinsertf128_high $dst,$dst\t! replicate8F" %}
   ins_encode %{
     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
-    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3576,10 +3576,10 @@
   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
   match(Set dst (ReplicateD src));
   format %{ "pshufd  $dst,$src,0x44\n\t"
-            "vinsertf128h $dst,$dst,$dst\t! replicate4D" %}
+            "vinsertf128_high $dst,$dst\t! replicate4D" %}
   ins_encode %{
     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
-    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -3588,10 +3588,10 @@
   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
   match(Set dst (ReplicateD (LoadD mem)));
   format %{ "pshufd  $dst,$mem,0x44\n\t"
-            "vinsertf128h $dst,$dst,$dst\t! replicate4D" %}
+            "vinsertf128_high $dst,$dst\t! replicate4D" %}
   ins_encode %{
     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
-    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+    __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
   %}
   ins_pipe( pipe_slow );
 %}
@@ -4791,7 +4791,7 @@
   effect(TEMP tmp, TEMP tmp2);
   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
             "vphaddd  $tmp,$tmp,$tmp2\n\t"
-            "vextracti128  $tmp2,$tmp\n\t"
+            "vextracti128_high  $tmp2,$tmp\n\t"
             "vpaddd   $tmp,$tmp,$tmp2\n\t"
             "movd     $tmp2,$src1\n\t"
             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
@@ -4800,7 +4800,7 @@
     int vector_len = 1;
     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
-    __ vextracti128h($tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ vextracti128_high($tmp2$$XMMRegister, $tmp$$XMMRegister);
     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
     __ movdl($tmp2$$XMMRegister, $src1$$Register);
     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
@@ -4813,7 +4813,7 @@
   predicate(UseAVX > 2);
   match(Set dst (AddReductionVI src1 src2));
   effect(TEMP tmp, TEMP tmp2);
-  format %{ "vextracti128  $tmp,$src2\n\t"
+  format %{ "vextracti128_high  $tmp,$src2\n\t"
             "vpaddd  $tmp,$tmp,$src2\n\t"
             "pshufd  $tmp2,$tmp,0xE\n\t"
             "vpaddd  $tmp,$tmp,$tmp2\n\t"
@@ -4824,7 +4824,7 @@
             "movd    $dst,$tmp2\t! add reduction8I" %}
   ins_encode %{
     int vector_len = 0;
-    __ vextracti128h($tmp$$XMMRegister, $src2$$XMMRegister);
+    __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
@@ -4841,9 +4841,9 @@
   predicate(UseAVX > 2);
   match(Set dst (AddReductionVI src1 src2));
   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
-  format %{ "vextracti64x4  $tmp3,$src2,0x1\n\t"
+  format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
             "vpaddd  $tmp3,$tmp3,$src2\n\t"
-            "vextracti128   $tmp,$tmp3\n\t"
+            "vextracti128_high  $tmp,$tmp3\n\t"
             "vpaddd  $tmp,$tmp,$tmp3\n\t"
             "pshufd  $tmp2,$tmp,0xE\n\t"
             "vpaddd  $tmp,$tmp,$tmp2\n\t"
@@ -4853,9 +4853,9 @@
             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
             "movd    $dst,$tmp2\t! mul reduction16I" %}
   ins_encode %{
-    __ vextracti64x4h($tmp3$$XMMRegister, $src2$$XMMRegister, 1);
+    __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
     __ vpaddd($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
-    __ vextracti128h($tmp$$XMMRegister, $tmp3$$XMMRegister);
+    __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
@@ -4892,7 +4892,7 @@
   predicate(UseAVX > 2);
   match(Set dst (AddReductionVL src1 src2));
   effect(TEMP tmp, TEMP tmp2);
-  format %{ "vextracti128  $tmp,$src2\n\t"
+  format %{ "vextracti128_high  $tmp,$src2\n\t"
             "vpaddq  $tmp2,$tmp,$src2\n\t"
             "pshufd  $tmp,$tmp2,0xE\n\t"
             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
@@ -4900,7 +4900,7 @@
             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
             "movdq   $dst,$tmp2\t! add reduction4L" %}
   ins_encode %{
-    __ vextracti128h($tmp$$XMMRegister, $src2$$XMMRegister);
+    __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
     __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
@@ -4915,9 +4915,9 @@
   predicate(UseAVX > 2);
   match(Set dst (AddReductionVL src1 src2));
   effect(TEMP tmp, TEMP tmp2);
-  format %{ "vextracti64x4  $tmp2,$src2,0x1\n\t"
+  format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
             "vpaddq  $tmp2,$tmp2,$src2\n\t"
-            "vextracti128   $tmp,$tmp2\n\t"
+            "vextracti128_high  $tmp,$tmp2\n\t"
             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
             "pshufd  $tmp,$tmp2,0xE\n\t"
             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
@@ -4925,9 +4925,9 @@
             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
             "movdq   $dst,$tmp2\t! add reduction8L" %}
   ins_encode %{
-    __ vextracti64x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 1);
+    __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
-    __ vextracti128h($tmp$$XMMRegister, $tmp2$$XMMRegister);
+    __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
@@ -5026,7 +5026,7 @@
             "vaddss  $dst,$dst,$tmp\n\t"
             "pshufd  $tmp,$src2,0x03\n\t"
             "vaddss  $dst,$dst,$tmp\n\t"
-            "vextractf128  $tmp2,$src2\n\t"
+            "vextractf128_high  $tmp2,$src2\n\t"
             "vaddss  $dst,$dst,$tmp2\n\t"
             "pshufd  $tmp,$tmp2,0x01\n\t"
             "vaddss  $dst,$dst,$tmp\n\t"
@@ -5042,7 +5042,7 @@
     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
-    __ vextractf128h($tmp2$$XMMRegister, $src2$$XMMRegister);
+    __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
@@ -5065,7 +5065,7 @@
             "vaddss  $dst,$dst,$tmp\n\t"
             "pshufd  $tmp,$src2,0x03\n\t"
             "vaddss  $dst,$dst,$tmp\n\t"
-            "vextractf32x4  $tmp2,$src2, 0x1\n\t"
+            "vextractf32x4  $tmp2,$src2,0x1\n\t"
             "vaddss  $dst,$dst,$tmp2\n\t"
             "pshufd  $tmp,$tmp2,0x01\n\t"
             "vaddss  $dst,$dst,$tmp\n\t"
@@ -5073,7 +5073,7 @@
             "vaddss  $dst,$dst,$tmp\n\t"
             "pshufd  $tmp,$tmp2,0x03\n\t"
             "vaddss  $dst,$dst,$tmp\n\t"
-            "vextractf32x4  $tmp2,$src2, 0x2\n\t"
+            "vextractf32x4  $tmp2,$src2,0x2\n\t"
             "vaddss  $dst,$dst,$tmp2\n\t"
             "pshufd  $tmp,$tmp2,0x01\n\t"
             "vaddss  $dst,$dst,$tmp\n\t"
@@ -5081,7 +5081,7 @@
             "vaddss  $dst,$dst,$tmp\n\t"
             "pshufd  $tmp,$tmp2,0x03\n\t"
             "vaddss  $dst,$dst,$tmp\n\t"
-            "vextractf32x4  $tmp2,$src2, 0x3\n\t"
+            "vextractf32x4  $tmp2,$src2,0x3\n\t"
             "vaddss  $dst,$dst,$tmp2\n\t"
             "pshufd  $tmp,$tmp2,0x01\n\t"
             "vaddss  $dst,$dst,$tmp\n\t"
@@ -5097,7 +5097,7 @@
     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
-    __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
+    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
@@ -5105,7 +5105,7 @@
     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
-    __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
+    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
@@ -5113,7 +5113,7 @@
     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
-    __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
+    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
@@ -5162,7 +5162,7 @@
   format %{ "vaddsd  $dst,$dst,$src2\n\t"
             "pshufd  $tmp,$src2,0xE\n\t"
             "vaddsd  $dst,$dst,$tmp\n\t"
-            "vextractf32x4h  $tmp2,$src2, 0x1\n\t"
+            "vextractf32x4  $tmp2,$src2,0x1\n\t"
             "vaddsd  $dst,$dst,$tmp2\n\t"
             "pshufd  $tmp,$tmp2,0xE\n\t"
             "vaddsd  $dst,$dst,$tmp\t! add reduction4D" %}
@@ -5170,7 +5170,7 @@
     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
-    __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
+    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
@@ -5185,15 +5185,15 @@
   format %{ "vaddsd  $dst,$dst,$src2\n\t"
             "pshufd  $tmp,$src2,0xE\n\t"
             "vaddsd  $dst,$dst,$tmp\n\t"
-            "vextractf32x4  $tmp2,$src2, 0x1\n\t"
+            "vextractf32x4  $tmp2,$src2,0x1\n\t"
             "vaddsd  $dst,$dst,$tmp2\n\t"
             "pshufd  $tmp,$tmp2,0xE\n\t"
             "vaddsd  $dst,$dst,$tmp\n\t"
-            "vextractf32x4  $tmp2,$src2, 0x2\n\t"
+            "vextractf32x4  $tmp2,$src2,0x2\n\t"
             "vaddsd  $dst,$dst,$tmp2\n\t"
             "pshufd  $tmp,$tmp2,0xE\n\t"
             "vaddsd  $dst,$dst,$tmp\n\t"
-            "vextractf32x4  $tmp2,$src2, 0x3\n\t"
+            "vextractf32x4  $tmp2,$src2,0x3\n\t"
             "vaddsd  $dst,$dst,$tmp2\n\t"
             "pshufd  $tmp,$tmp2,0xE\n\t"
             "vaddsd  $dst,$dst,$tmp\t! add reduction8D" %}
@@ -5201,15 +5201,15 @@
     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
-    __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
+    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
-    __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
+    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
-    __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
+    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
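
The 8D variant is the double-precision analogue: each extracted 128-bit lane holds two doubles, so a single pshufd 0xE suffices to bring the upper double down for vaddsd. In scalar terms (hypothetical name, sketch only):

// Scalar model of the 8D add reduction: two doubles per lane;
// pshufd 0xE corresponds to the odd element of each lane.
double add_reduction8D(double acc, const double src2[8]) {
  for (int lane = 0; lane < 4; ++lane) {
    acc += src2[2 * lane];       // vaddsd with the lane's low double
    acc += src2[2 * lane + 1];   // vaddsd after pshufd 0xE
  }
  return acc;
}
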
@@ -5307,7 +5307,7 @@
   predicate(UseAVX > 0);
   match(Set dst (MulReductionVI src1 src2));
   effect(TEMP tmp, TEMP tmp2);
-  format %{ "vextracti128  $tmp,$src2\n\t"
+  format %{ "vextracti128_high  $tmp,$src2\n\t"
             "vpmulld  $tmp,$tmp,$src2\n\t"
             "pshufd   $tmp2,$tmp,0xE\n\t"
             "vpmulld  $tmp,$tmp,$tmp2\n\t"
@@ -5318,7 +5318,7 @@
             "movd     $dst,$tmp2\t! mul reduction8I" %}
   ins_encode %{
     int vector_len = 0;
-    __ vextracti128h($tmp$$XMMRegister, $src2$$XMMRegister);
+    __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
@@ -5335,9 +5335,9 @@
   predicate(UseAVX > 2);
   match(Set dst (MulReductionVI src1 src2));
   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
-  format %{ "vextracti64x4  $tmp3,$src2,0x1\n\t"
+  format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
             "vpmulld  $tmp3,$tmp3,$src2\n\t"
-            "vextracti128   $tmp,$tmp3\n\t"
+            "vextracti128_high  $tmp,$tmp3\n\t"
             "vpmulld  $tmp,$tmp,$src2\n\t"
             "pshufd   $tmp2,$tmp,0xE\n\t"
             "vpmulld  $tmp,$tmp,$tmp2\n\t"
@@ -5347,9 +5347,9 @@
             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
             "movd     $dst,$tmp2\t! mul reduction16I" %}
   ins_encode %{
-    __ vextracti64x4h($tmp3$$XMMRegister, $src2$$XMMRegister, 1);
+    __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
     __ vpmulld($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
-    __ vextracti128h($tmp$$XMMRegister, $tmp3$$XMMRegister);
+    __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
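
Unlike the float reductions, the 16I multiply reduction narrows in halves rather than walking lanes: vextracti64x4_high folds the upper 256 bits onto the lower 256, vextracti128_high folds the upper 128 onto the lower 128, and the pshufd/vpmulld pairs finish within one lane. The same halving fold as a plain C++ sketch (hypothetical name; unsigned arithmetic because vpmulld wraps mod 2^32):

// Each pass multiplies the upper half of the working vector into the
// lower half: 16->8 (vextracti64x4_high), 8->4 (vextracti128_high),
// 4->2 and 2->1 (pshufd + vpmulld).
int mul_reduction16I(int src1, const int src2[16]) {
  unsigned v[16];
  for (int i = 0; i < 16; ++i) v[i] = (unsigned)src2[i];
  for (int width = 8; width >= 1; width /= 2) {
    for (int i = 0; i < width; ++i) v[i] *= v[i + width];
  }
  return (int)((unsigned)src1 * v[0]);
}
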
@@ -5386,7 +5386,7 @@
   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
   match(Set dst (MulReductionVL src1 src2));
   effect(TEMP tmp, TEMP tmp2);
-  format %{ "vextracti128  $tmp,$src2\n\t"
+  format %{ "vextracti128_high  $tmp,$src2\n\t"
             "vpmullq  $tmp2,$tmp,$src2\n\t"
             "pshufd   $tmp,$tmp2,0xE\n\t"
             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
@@ -5394,7 +5394,7 @@
             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
             "movdq    $dst,$tmp2\t! mul reduction4L" %}
   ins_encode %{
-    __ vextracti128h($tmp$$XMMRegister, $src2$$XMMRegister);
+    __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
     __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
@@ -5409,9 +5409,9 @@
   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
   match(Set dst (MulReductionVL src1 src2));
   effect(TEMP tmp, TEMP tmp2);
-  format %{ "vextracti64x4  $tmp2,$src2,0x1\n\t"
+  format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
             "vpmullq  $tmp2,$tmp2,$src2\n\t"
-            "vextracti128   $tmp,$tmp2\n\t"
+            "vextracti128_high  $tmp,$tmp2\n\t"
             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
             "pshufd   $tmp,$tmp2,0xE\n\t"
             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
@@ -5419,9 +5419,9 @@
             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
             "movdq    $dst,$tmp2\t! mul reduction8L" %}
   ins_encode %{
-    __ vextracti64x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 1);
+    __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
-    __ vextracti128h($tmp$$XMMRegister, $tmp2$$XMMRegister);
+    __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
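
The 8L form is the same halving fold; the extra predicate clause matters because vpmullq (a full 64x64->64 multiply) only exists with AVX-512DQ, hence VM_Version::supports_avx512dq(). A scalar sketch (hypothetical name):

#include <cstdint>

// Halving fold for the 8L multiply reduction; unsigned arithmetic
// wraps mod 2^64, matching vpmullq's low-64-bit result.
int64_t mul_reduction8L(int64_t src1, const int64_t src2[8]) {
  uint64_t v[8];
  for (int i = 0; i < 8; ++i) v[i] = (uint64_t)src2[i];
  for (int width = 4; width >= 1; width /= 2) {
    for (int i = 0; i < width; ++i) v[i] *= v[i + width];
  }
  return (int64_t)((uint64_t)src1 * v[0]);
}
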
@@ -5520,7 +5520,7 @@
             "vmulss  $dst,$dst,$tmp\n\t"
             "pshufd  $tmp,$src2,0x03\n\t"
             "vmulss  $dst,$dst,$tmp\n\t"
-            "vextractf128  $tmp2,$src2\n\t"
+            "vextractf128_high  $tmp2,$src2\n\t"
             "vmulss  $dst,$dst,$tmp2\n\t"
             "pshufd  $tmp,$tmp2,0x01\n\t"
             "vmulss  $dst,$dst,$tmp\n\t"
@@ -5536,7 +5536,7 @@
     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
-    __ vextractf128h($tmp2$$XMMRegister, $src2$$XMMRegister);
+    __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
@@ -5559,7 +5559,7 @@
             "vmulss  $dst,$dst,$tmp\n\t"
             "pshufd  $tmp,$src2,0x03\n\t"
             "vmulss  $dst,$dst,$tmp\n\t"
-            "vextractf32x4  $tmp2,$src2, 0x1\n\t"
+            "vextractf32x4  $tmp2,$src2,0x1\n\t"
             "vmulss  $dst,$dst,$tmp2\n\t"
             "pshufd  $tmp,$tmp2,0x01\n\t"
             "vmulss  $dst,$dst,$tmp\n\t"
@@ -5567,7 +5567,7 @@
             "vmulss  $dst,$dst,$tmp\n\t"
             "pshufd  $tmp,$tmp2,0x03\n\t"
             "vmulss  $dst,$dst,$tmp\n\t"
-            "vextractf32x4  $tmp2,$src2, 0x2\n\t"
+            "vextractf32x4  $tmp2,$src2,0x2\n\t"
             "vmulss  $dst,$dst,$tmp2\n\t"
             "pshufd  $tmp,$tmp2,0x01\n\t"
             "vmulss  $dst,$dst,$tmp\n\t"
@@ -5575,7 +5575,7 @@
             "vmulss  $dst,$dst,$tmp\n\t"
             "pshufd  $tmp,$tmp2,0x03\n\t"
             "vmulss  $dst,$dst,$tmp\n\t"
-            "vextractf32x4  $tmp2,$src2, 0x3\n\t"
+            "vextractf32x4  $tmp2,$src2,0x3\n\t"
             "vmulss  $dst,$dst,$tmp2\n\t"
             "pshufd  $tmp,$tmp2,0x01\n\t"
             "vmulss  $dst,$dst,$tmp\n\t"
@@ -5591,7 +5591,7 @@
     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
-    __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
+    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
@@ -5599,7 +5599,7 @@
     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
-    __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
+    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
@@ -5607,7 +5607,7 @@
     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
-    __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
+    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
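
Throughout these reductions the renamed vextractf32x4 names its lane explicitly: imm8 0x0 is the low 128-bit lane and 0x1..0x3 select the higher lanes of a 512-bit source (the assembler side of this change asserts that range). What the extract copies, as a minimal model (hypothetical name):

#include <cstdint>
#include <cstring>

// Model of vextractf32x4(dst, src, imm8) on a 512-bit source:
// copy the imm8-th 128-bit lane, i.e. four consecutive floats.
void extractf32x4_model(float dst[4], const float src[16], uint8_t imm8) {
  std::memcpy(dst, src + 4 * (imm8 & 0x3), 4 * sizeof(float));
}
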
@@ -5656,7 +5656,7 @@
   format %{ "vmulsd  $dst,$dst,$src2\n\t"
             "pshufd  $tmp,$src2,0xE\n\t"
             "vmulsd  $dst,$dst,$tmp\n\t"
-            "vextractf128  $tmp2,$src2\n\t"
+            "vextractf128_high  $tmp2,$src2\n\t"
             "vmulsd  $dst,$dst,$tmp2\n\t"
             "pshufd  $tmp,$tmp2,0xE\n\t"
             "vmulsd  $dst,$dst,$tmp\t! mul reduction4D" %}
@@ -5664,7 +5664,7 @@
     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
-    __ vextractf128h($tmp2$$XMMRegister, $src2$$XMMRegister);
+    __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
@@ -5679,15 +5679,15 @@
   format %{ "vmulsd  $dst,$dst,$src2\n\t"
             "pshufd  $tmp,$src2,0xE\n\t"
             "vmulsd  $dst,$dst,$tmp\n\t"
-            "vextractf32x4  $tmp2,$src2, 0x1\n\t"
+            "vextractf32x4  $tmp2,$src2,0x1\n\t"
             "vmulsd  $dst,$dst,$tmp2\n\t"
             "pshufd  $tmp,$src2,0xE\n\t"
             "vmulsd  $dst,$dst,$tmp\n\t"
-            "vextractf32x4  $tmp2,$src2, 0x2\n\t"
+            "vextractf32x4  $tmp2,$src2,0x2\n\t"
             "vmulsd  $dst,$dst,$tmp2\n\t"
             "pshufd  $tmp,$tmp2,0xE\n\t"
             "vmulsd  $dst,$dst,$tmp\n\t"
-            "vextractf32x4  $tmp2,$src2, 0x3\n\t"
+            "vextractf32x4  $tmp2,$src2,0x3\n\t"
             "vmulsd  $dst,$dst,$tmp2\n\t"
             "pshufd  $tmp,$tmp2,0xE\n\t"
             "vmulsd  $dst,$dst,$tmp\t! mul reduction8D" %}
@@ -5695,15 +5695,15 @@
     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
-    __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
+    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
-    __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
+    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
-    __ vextractf32x4h($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
+    __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
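
The net effect of the rename across all of these hunks: helpers that mean a specific half now say so in the name (_high), and helpers that take a lane pass it as an explicit imm8, matching the actual vextract/vinsert instruction operands. The _high conveniences reduce to the explicit-imm8 forms, roughly (a sketch of the shape, not the verbatim HotSpot code):

// Convenience/primitive split after the rename:
void vextracti128_high(XMMRegister dst, XMMRegister src) {
  vextracti128(dst, src, 1);    // upper 128 bits of a 256-bit source
}
void vextracti64x4_high(XMMRegister dst, XMMRegister src) {
  vextracti64x4(dst, src, 1);   // upper 256 bits of a 512-bit source
}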