--- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp Mon Jun 22 14:22:19 2015 +0200
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp Tue Jun 23 12:45:08 2015 -0700
@@ -1347,7 +1347,7 @@
void Assembler::andnl(Register dst, Register src1, Register src2) {
assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
- int encode = vex_prefix_0F38_and_encode(dst, src1, src2, false);
+ int encode = vex_prefix_0F38_and_encode_legacy(dst, src1, src2, false);
emit_int8((unsigned char)0xF2);
emit_int8((unsigned char)(0xC0 | encode));
}
@@ -1355,7 +1355,7 @@
void Assembler::andnl(Register dst, Register src1, Address src2) {
InstructionMark im(this);
assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
- vex_prefix_0F38(dst, src1, src2, false);
+ vex_prefix_0F38_legacy(dst, src1, src2, false);
emit_int8((unsigned char)0xF2);
emit_operand(dst, src2);
}
@@ -1382,7 +1382,7 @@
void Assembler::blsil(Register dst, Register src) {
assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
- int encode = vex_prefix_0F38_and_encode(rbx, dst, src, false);
+ int encode = vex_prefix_0F38_and_encode_legacy(rbx, dst, src, false);
emit_int8((unsigned char)0xF3);
emit_int8((unsigned char)(0xC0 | encode));
}
@@ -1390,14 +1390,14 @@
void Assembler::blsil(Register dst, Address src) {
InstructionMark im(this);
assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
- vex_prefix_0F38(rbx, dst, src, false);
+ vex_prefix_0F38_legacy(rbx, dst, src, false);
emit_int8((unsigned char)0xF3);
emit_operand(rbx, src);
}
void Assembler::blsmskl(Register dst, Register src) {
assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
- int encode = vex_prefix_0F38_and_encode(rdx, dst, src, false);
+ int encode = vex_prefix_0F38_and_encode_legacy(rdx, dst, src, false);
emit_int8((unsigned char)0xF3);
emit_int8((unsigned char)(0xC0 | encode));
}
@@ -1412,7 +1412,7 @@
void Assembler::blsrl(Register dst, Register src) {
assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
- int encode = vex_prefix_0F38_and_encode(rcx, dst, src, false);
+ int encode = vex_prefix_0F38_and_encode_legacy(rcx, dst, src, false);
emit_int8((unsigned char)0xF3);
emit_int8((unsigned char)(0xC0 | encode));
}
@@ -1420,7 +1420,7 @@
void Assembler::blsrl(Register dst, Address src) {
InstructionMark im(this);
assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
- vex_prefix_0F38(rcx, dst, src, false);
+ vex_prefix_0F38_legacy(rcx, dst, src, false);
emit_int8((unsigned char)0xF3);
emit_operand(rcx, src);
}
@@ -3114,15 +3114,16 @@
assert(VM_Version::supports_sse4_1(), "");
assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
InstructionMark im(this);
- simd_prefix(dst, src, VEX_SIMD_66, false, VEX_OPCODE_0F_38);
+ simd_prefix(dst, xnoreg, src, VEX_SIMD_66, false,
+ VEX_OPCODE_0F_38, false, AVX_128bit, true);
emit_int8(0x17);
emit_operand(dst, src);
}
void Assembler::ptest(XMMRegister dst, XMMRegister src) {
assert(VM_Version::supports_sse4_1(), "");
- int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66,
- false, VEX_OPCODE_0F_38);
+ int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, false,
+ VEX_OPCODE_0F_38, false, AVX_128bit, true);
emit_int8(0x17);
emit_int8((unsigned char)(0xC0 | encode));
}
@@ -3134,7 +3135,7 @@
assert(dst != xnoreg, "sanity");
int dst_enc = dst->encoding();
// swap src<->dst for encoding
- vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len);
+ vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len, true, false);
emit_int8(0x17);
emit_operand(dst, src);
}
@@ -3143,7 +3144,7 @@
assert(VM_Version::supports_avx(), "");
int vector_len = AVX_256bit;
int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66,
- vector_len, VEX_OPCODE_0F_38);
+ vector_len, VEX_OPCODE_0F_38, true, false);
emit_int8(0x17);
emit_int8((unsigned char)(0xC0 | encode));
}
@@ -3154,12 +3155,12 @@
if (VM_Version::supports_evex()) {
tuple_type = EVEX_FVM;
}
- emit_simd_arith(0x60, dst, src, VEX_SIMD_66);
+ emit_simd_arith(0x60, dst, src, VEX_SIMD_66, false, (VM_Version::supports_avx512vlbw() == false));
}
void Assembler::punpcklbw(XMMRegister dst, XMMRegister src) {
NOT_LP64(assert(VM_Version::supports_sse2(), ""));
- emit_simd_arith(0x60, dst, src, VEX_SIMD_66);
+ emit_simd_arith(0x60, dst, src, VEX_SIMD_66, false, (VM_Version::supports_avx512vlbw() == false));
}
void Assembler::punpckldq(XMMRegister dst, Address src) {
@@ -4987,7 +4988,51 @@
emit_int8((unsigned char)(0xC0 | encode));
}
-// duplicate 4-bytes integer data from src into 8 locations in dest
+// duplicate 1-byte integer data from src into 16||32|64 locations in dest : requires AVX512BW and AVX512VL
+void Assembler::evpbroadcastb(XMMRegister dst, XMMRegister src, int vector_len) {
+ assert(VM_Version::supports_evex(), "");
+ int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66,
+ vector_len, VEX_OPCODE_0F_38, false);
+ emit_int8(0x78);
+ emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::evpbroadcastb(XMMRegister dst, Address src, int vector_len) {
+ assert(VM_Version::supports_evex(), "");
+ tuple_type = EVEX_T1S;
+ input_size_in_bits = EVEX_8bit;
+ InstructionMark im(this);
+ assert(dst != xnoreg, "sanity");
+ int dst_enc = dst->encoding();
+ // swap src<->dst for encoding
+ vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len);
+ emit_int8(0x78);
+ emit_operand(dst, src);
+}
+
+// duplicate 2-byte integer data from src into 8|16||32 locations in dest : requires AVX512BW and AVX512VL
+void Assembler::evpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len) {
+ assert(VM_Version::supports_evex(), "");
+ int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66,
+ vector_len, VEX_OPCODE_0F_38, false);
+ emit_int8(0x79);
+ emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::evpbroadcastw(XMMRegister dst, Address src, int vector_len) {
+ assert(VM_Version::supports_evex(), "");
+ tuple_type = EVEX_T1S;
+ input_size_in_bits = EVEX_16bit;
+ InstructionMark im(this);
+ assert(dst != xnoreg, "sanity");
+ int dst_enc = dst->encoding();
+ // swap src<->dst for encoding
+ vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len);
+ emit_int8(0x79);
+ emit_operand(dst, src);
+}
+
+// duplicate 4-byte integer data from src into 4|8|16 locations in dest : requires AVX512VL
void Assembler::evpbroadcastd(XMMRegister dst, XMMRegister src, int vector_len) {
assert(VM_Version::supports_evex(), "");
int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66,
@@ -4996,6 +5041,121 @@
emit_int8((unsigned char)(0xC0 | encode));
}
+void Assembler::evpbroadcastd(XMMRegister dst, Address src, int vector_len) {
+ assert(VM_Version::supports_evex(), "");
+ tuple_type = EVEX_T1S;
+ input_size_in_bits = EVEX_32bit;
+ InstructionMark im(this);
+ assert(dst != xnoreg, "sanity");
+ int dst_enc = dst->encoding();
+ // swap src<->dst for encoding
+ vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len);
+ emit_int8(0x58);
+ emit_operand(dst, src);
+}
+
+// duplicate 8-byte integer data from src into 4|8|16 locations in dest : requires AVX512VL
+void Assembler::evpbroadcastq(XMMRegister dst, XMMRegister src, int vector_len) {
+ assert(VM_Version::supports_evex(), "");
+ int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
+ VEX_OPCODE_0F_38, true, vector_len, false, false);
+ emit_int8(0x59);
+ emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::evpbroadcastq(XMMRegister dst, Address src, int vector_len) {
+ assert(VM_Version::supports_evex(), "");
+ tuple_type = EVEX_T1S;
+ input_size_in_bits = EVEX_64bit;
+ InstructionMark im(this);
+ assert(dst != xnoreg, "sanity");
+ int dst_enc = dst->encoding();
+ // swap src<->dst for encoding
+ vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, true, vector_len);
+ emit_int8(0x59);
+ emit_operand(dst, src);
+}
+
+// duplicate single precision fp from src into 4|8|16 locations in dest : requires AVX512VL
+void Assembler::evpbroadcastss(XMMRegister dst, XMMRegister src, int vector_len) {
+ assert(VM_Version::supports_evex(), "");
+ int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
+ VEX_OPCODE_0F_38, false, vector_len, false, false);
+ emit_int8(0x18);
+ emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::evpbroadcastss(XMMRegister dst, Address src, int vector_len) {
+ assert(VM_Version::supports_evex(), "");
+ tuple_type = EVEX_T1S;
+ input_size_in_bits = EVEX_32bit;
+ InstructionMark im(this);
+ assert(dst != xnoreg, "sanity");
+ int dst_enc = dst->encoding();
+ // swap src<->dst for encoding
+ vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len);
+ emit_int8(0x18);
+ emit_operand(dst, src);
+}
+
+// duplicate double precision fp from src into 2|4|8 locations in dest : requires AVX512VL
+void Assembler::evpbroadcastsd(XMMRegister dst, XMMRegister src, int vector_len) {
+ assert(VM_Version::supports_evex(), "");
+ int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
+ VEX_OPCODE_0F_38, true, vector_len, false, false);
+ emit_int8(0x19);
+ emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::evpbroadcastsd(XMMRegister dst, Address src, int vector_len) {
+ assert(VM_Version::supports_evex(), "");
+ tuple_type = EVEX_T1S;
+ input_size_in_bits = EVEX_64bit;
+ InstructionMark im(this);
+ assert(dst != xnoreg, "sanity");
+ int dst_enc = dst->encoding();
+ // swap src<->dst for encoding
+ vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, true, vector_len);
+ emit_int8(0x19);
+ emit_operand(dst, src);
+}
+
+// duplicate 1-byte integer data from src into 16||32|64 locations in dest : requires AVX512BW and AVX512VL
+void Assembler::evpbroadcastb(XMMRegister dst, Register src, int vector_len) {
+ assert(VM_Version::supports_evex(), "");
+ int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
+ VEX_OPCODE_0F_38, false, vector_len, false, false);
+ emit_int8(0x7A);
+ emit_int8((unsigned char)(0xC0 | encode));
+}
+
+// duplicate 2-byte integer data from src into 8|16||32 locations in dest : requires AVX512BW and AVX512VL
+void Assembler::evpbroadcastw(XMMRegister dst, Register src, int vector_len) {
+ assert(VM_Version::supports_evex(), "");
+ int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
+ VEX_OPCODE_0F_38, false, vector_len, false, false);
+ emit_int8(0x7B);
+ emit_int8((unsigned char)(0xC0 | encode));
+}
+
+// duplicate 4-byte integer data from src into 4|8|16 locations in dest : requires AVX512VL
+void Assembler::evpbroadcastd(XMMRegister dst, Register src, int vector_len) {
+ assert(VM_Version::supports_evex(), "");
+ int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
+ VEX_OPCODE_0F_38, false, vector_len, false, false);
+ emit_int8(0x7C);
+ emit_int8((unsigned char)(0xC0 | encode));
+}
+
+// duplicate 8-byte integer data from src into 4|8|16 locations in dest : requires AVX512VL
+void Assembler::evpbroadcastq(XMMRegister dst, Register src, int vector_len) {
+ assert(VM_Version::supports_evex(), "");
+ int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
+ VEX_OPCODE_0F_38, true, vector_len, false, false);
+ emit_int8(0x7C);
+ emit_int8((unsigned char)(0xC0 | encode));
+}
+
// Carry-Less Multiplication Quadword
void Assembler::pclmulqdq(XMMRegister dst, XMMRegister src, int mask) {
assert(VM_Version::supports_clmul(), "");
@@ -5606,7 +5766,7 @@
void Assembler::vex_prefix(Address adr, int nds_enc, int xreg_enc, VexSimdPrefix pre,
VexOpcode opc, bool vex_w, int vector_len, bool legacy_mode, bool no_mask_reg) {
- bool vex_r = (xreg_enc >= 8);
+ bool vex_r = ((xreg_enc & 8) == 8) ? 1 : 0;
bool vex_b = adr.base_needs_rex();
bool vex_x = adr.index_needs_rex();
avx_vector_len = vector_len;
@@ -5634,8 +5794,8 @@
int Assembler::vex_prefix_and_encode(int dst_enc, int nds_enc, int src_enc, VexSimdPrefix pre, VexOpcode opc,
bool vex_w, int vector_len, bool legacy_mode, bool no_mask_reg ) {
- bool vex_r = (dst_enc >= 8);
- bool vex_b = (src_enc >= 8);
+ bool vex_r = ((dst_enc & 8) == 8) ? 1 : 0;
+ bool vex_b = ((src_enc & 8) == 8) ? 1 : 0;
bool vex_x = false;
avx_vector_len = vector_len;
@@ -6280,19 +6440,15 @@
void Assembler::andnq(Register dst, Register src1, Register src2) {
assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
- int encode = vex_prefix_0F38_and_encode_q(dst, src1, src2);
+ int encode = vex_prefix_0F38_and_encode_q_legacy(dst, src1, src2);
emit_int8((unsigned char)0xF2);
emit_int8((unsigned char)(0xC0 | encode));
}
void Assembler::andnq(Register dst, Register src1, Address src2) {
- if (VM_Version::supports_evex()) {
- tuple_type = EVEX_T1S;
- input_size_in_bits = EVEX_64bit;
- }
InstructionMark im(this);
assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
- vex_prefix_0F38_q(dst, src1, src2);
+ vex_prefix_0F38_q_legacy(dst, src1, src2);
emit_int8((unsigned char)0xF2);
emit_operand(dst, src2);
}
@@ -6319,7 +6475,7 @@
void Assembler::blsiq(Register dst, Register src) {
assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
- int encode = vex_prefix_0F38_and_encode_q(rbx, dst, src);
+ int encode = vex_prefix_0F38_and_encode_q_legacy(rbx, dst, src);
emit_int8((unsigned char)0xF3);
emit_int8((unsigned char)(0xC0 | encode));
}
@@ -6327,14 +6483,14 @@
void Assembler::blsiq(Register dst, Address src) {
InstructionMark im(this);
assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
- vex_prefix_0F38_q(rbx, dst, src);
+ vex_prefix_0F38_q_legacy(rbx, dst, src);
emit_int8((unsigned char)0xF3);
emit_operand(rbx, src);
}
void Assembler::blsmskq(Register dst, Register src) {
assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
- int encode = vex_prefix_0F38_and_encode_q(rdx, dst, src);
+ int encode = vex_prefix_0F38_and_encode_q_legacy(rdx, dst, src);
emit_int8((unsigned char)0xF3);
emit_int8((unsigned char)(0xC0 | encode));
}
@@ -6342,14 +6498,14 @@
void Assembler::blsmskq(Register dst, Address src) {
InstructionMark im(this);
assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
- vex_prefix_0F38_q(rdx, dst, src);
+ vex_prefix_0F38_q_legacy(rdx, dst, src);
emit_int8((unsigned char)0xF3);
emit_operand(rdx, src);
}
void Assembler::blsrq(Register dst, Register src) {
assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
- int encode = vex_prefix_0F38_and_encode_q(rcx, dst, src);
+ int encode = vex_prefix_0F38_and_encode_q_legacy(rcx, dst, src);
emit_int8((unsigned char)0xF3);
emit_int8((unsigned char)(0xC0 | encode));
}
@@ -6357,7 +6513,7 @@
void Assembler::blsrq(Register dst, Address src) {
InstructionMark im(this);
assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
- vex_prefix_0F38_q(rcx, dst, src);
+ vex_prefix_0F38_q_legacy(rcx, dst, src);
emit_int8((unsigned char)0xF3);
emit_operand(rcx, src);
}
--- a/hotspot/src/cpu/x86/vm/x86.ad Mon Jun 22 14:22:19 2015 +0200
+++ b/hotspot/src/cpu/x86/vm/x86.ad Tue Jun 23 12:45:08 2015 -0700
@@ -2894,6 +2894,457 @@
ins_pipe( pipe_slow );
%}
+// ====================LEGACY REPLICATE=======================================
+
+instruct Repl4B_mem(vecS dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateB (LoadB mem)));
+ format %{ "punpcklbw $dst,$mem\n\t"
+ "pshuflw $dst,$dst,0x00\t! replicate4B" %}
+ ins_encode %{
+ __ punpcklbw($dst$$XMMRegister, $mem$$Address);
+ __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl8B_mem(vecD dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateB (LoadB mem)));
+ format %{ "punpcklbw $dst,$mem\n\t"
+ "pshuflw $dst,$dst,0x00\t! replicate8B" %}
+ ins_encode %{
+ __ punpcklbw($dst$$XMMRegister, $mem$$Address);
+ __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl16B(vecX dst, rRegI src) %{
+ predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateB src));
+ format %{ "movd $dst,$src\n\t"
+ "punpcklbw $dst,$dst\n\t"
+ "pshuflw $dst,$dst,0x00\n\t"
+ "punpcklqdq $dst,$dst\t! replicate16B" %}
+ ins_encode %{
+ __ movdl($dst$$XMMRegister, $src$$Register);
+ __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
+ __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+ __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl16B_mem(vecX dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 16 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateB (LoadB mem)));
+ format %{ "punpcklbw $dst,$mem\n\t"
+ "pshuflw $dst,$dst,0x00\n\t"
+ "punpcklqdq $dst,$dst\t! replicate16B" %}
+ ins_encode %{
+ __ punpcklbw($dst$$XMMRegister, $mem$$Address);
+ __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+ __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl32B(vecY dst, rRegI src) %{
+ predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateB src));
+ format %{ "movd $dst,$src\n\t"
+ "punpcklbw $dst,$dst\n\t"
+ "pshuflw $dst,$dst,0x00\n\t"
+ "punpcklqdq $dst,$dst\n\t"
+ "vinserti128h $dst,$dst,$dst\t! replicate32B" %}
+ ins_encode %{
+ __ movdl($dst$$XMMRegister, $src$$Register);
+ __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
+ __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+ __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+ __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl32B_mem(vecY dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateB (LoadB mem)));
+ format %{ "punpcklbw $dst,$mem\n\t"
+ "pshuflw $dst,$dst,0x00\n\t"
+ "punpcklqdq $dst,$dst\n\t"
+ "vinserti128h $dst,$dst,$dst\t! replicate32B" %}
+ ins_encode %{
+ __ punpcklbw($dst$$XMMRegister, $mem$$Address);
+ __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+ __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+ __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl16B_imm(vecX dst, immI con) %{
+ predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateB con));
+ format %{ "movq $dst,[$constantaddress]\n\t"
+ "punpcklqdq $dst,$dst\t! replicate16B($con)" %}
+ ins_encode %{
+ __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
+ __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl32B_imm(vecY dst, immI con) %{
+ predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateB con));
+ format %{ "movq $dst,[$constantaddress]\n\t"
+ "punpcklqdq $dst,$dst\n\t"
+ "vinserti128h $dst,$dst,$dst\t! lreplicate32B($con)" %}
+ ins_encode %{
+ __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
+ __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+ __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl4S(vecD dst, rRegI src) %{
+ predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateS src));
+ format %{ "movd $dst,$src\n\t"
+ "pshuflw $dst,$dst,0x00\t! replicate4S" %}
+ ins_encode %{
+ __ movdl($dst$$XMMRegister, $src$$Register);
+ __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl4S_mem(vecD dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateS (LoadS mem)));
+ format %{ "pshuflw $dst,$mem,0x00\t! replicate4S" %}
+ ins_encode %{
+ __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl8S(vecX dst, rRegI src) %{
+ predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateS src));
+ format %{ "movd $dst,$src\n\t"
+ "pshuflw $dst,$dst,0x00\n\t"
+ "punpcklqdq $dst,$dst\t! replicate8S" %}
+ ins_encode %{
+ __ movdl($dst$$XMMRegister, $src$$Register);
+ __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+ __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl8S_mem(vecX dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateS (LoadS mem)));
+ format %{ "pshuflw $dst,$mem,0x00\n\t"
+ "punpcklqdq $dst,$dst\t! replicate8S" %}
+ ins_encode %{
+ __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
+ __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl8S_imm(vecX dst, immI con) %{
+ predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateS con));
+ format %{ "movq $dst,[$constantaddress]\n\t"
+ "punpcklqdq $dst,$dst\t! replicate8S($con)" %}
+ ins_encode %{
+ __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
+ __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl16S(vecY dst, rRegI src) %{
+ predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateS src));
+ format %{ "movd $dst,$src\n\t"
+ "pshuflw $dst,$dst,0x00\n\t"
+ "punpcklqdq $dst,$dst\n\t"
+ "vinserti128h $dst,$dst,$dst\t! replicate16S" %}
+ ins_encode %{
+ __ movdl($dst$$XMMRegister, $src$$Register);
+ __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+ __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+ __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl16S_mem(vecY dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateS (LoadS mem)));
+ format %{ "pshuflw $dst,$mem,0x00\n\t"
+ "punpcklqdq $dst,$dst\n\t"
+ "vinserti128h $dst,$dst,$dst\t! replicate16S" %}
+ ins_encode %{
+ __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
+ __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+ __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl16S_imm(vecY dst, immI con) %{
+ predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateS con));
+ format %{ "movq $dst,[$constantaddress]\n\t"
+ "punpcklqdq $dst,$dst\n\t"
+ "vinserti128h $dst,$dst,$dst\t! replicate16S($con)" %}
+ ins_encode %{
+ __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
+ __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+ __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl4I(vecX dst, rRegI src) %{
+ predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateI src));
+ format %{ "movd $dst,$src\n\t"
+ "pshufd $dst,$dst,0x00\t! replicate4I" %}
+ ins_encode %{
+ __ movdl($dst$$XMMRegister, $src$$Register);
+ __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl4I_mem(vecX dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateI (LoadI mem)));
+ format %{ "pshufd $dst,$mem,0x00\t! replicate4I" %}
+ ins_encode %{
+ __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl8I(vecY dst, rRegI src) %{
+ predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateI src));
+ format %{ "movd $dst,$src\n\t"
+ "pshufd $dst,$dst,0x00\n\t"
+ "vinserti128h $dst,$dst,$dst\t! replicate8I" %}
+ ins_encode %{
+ __ movdl($dst$$XMMRegister, $src$$Register);
+ __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+ __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl8I_mem(vecY dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateI (LoadI mem)));
+ format %{ "pshufd $dst,$mem,0x00\n\t"
+ "vinserti128h $dst,$dst,$dst\t! replicate8I" %}
+ ins_encode %{
+ __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
+ __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl4I_imm(vecX dst, immI con) %{
+ predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateI con));
+ format %{ "movq $dst,[$constantaddress]\t! replicate4I($con)\n\t"
+ "punpcklqdq $dst,$dst" %}
+ ins_encode %{
+ __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
+ __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl8I_imm(vecY dst, immI con) %{
+ predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateI con));
+ format %{ "movq $dst,[$constantaddress]\t! replicate8I($con)\n\t"
+ "punpcklqdq $dst,$dst\n\t"
+ "vinserti128h $dst,$dst,$dst" %}
+ ins_encode %{
+ __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
+ __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+ __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+// Long could be loaded into xmm register directly from memory.
+instruct Repl2L_mem(vecX dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 2 && !VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateL (LoadL mem)));
+ format %{ "movq $dst,$mem\n\t"
+ "punpcklqdq $dst,$dst\t! replicate2L" %}
+ ins_encode %{
+ __ movq($dst$$XMMRegister, $mem$$Address);
+ __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+// Replicate long (8 byte) scalar to be vector
+#ifdef _LP64
+instruct Repl4L(vecY dst, rRegL src) %{
+ predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateL src));
+ format %{ "movdq $dst,$src\n\t"
+ "punpcklqdq $dst,$dst\n\t"
+ "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
+ ins_encode %{
+ __ movdq($dst$$XMMRegister, $src$$Register);
+ __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+ __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+ %}
+ ins_pipe( pipe_slow );
+%}
+#else // _LP64
+instruct Repl4L(vecY dst, eRegL src, regD tmp) %{
+ predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateL src));
+ effect(TEMP dst, USE src, TEMP tmp);
+ format %{ "movdl $dst,$src.lo\n\t"
+ "movdl $tmp,$src.hi\n\t"
+ "punpckldq $dst,$tmp\n\t"
+ "punpcklqdq $dst,$dst\n\t"
+ "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
+ ins_encode %{
+ __ movdl($dst$$XMMRegister, $src$$Register);
+ __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
+ __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
+ __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+ __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+ %}
+ ins_pipe( pipe_slow );
+%}
+#endif // _LP64
+
+instruct Repl4L_imm(vecY dst, immL con) %{
+ predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateL con));
+ format %{ "movq $dst,[$constantaddress]\n\t"
+ "punpcklqdq $dst,$dst\n\t"
+ "vinserti128h $dst,$dst,$dst\t! replicate4L($con)" %}
+ ins_encode %{
+ __ movq($dst$$XMMRegister, $constantaddress($con));
+ __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+ __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl4L_mem(vecY dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateL (LoadL mem)));
+ format %{ "movq $dst,$mem\n\t"
+ "punpcklqdq $dst,$dst\n\t"
+ "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
+ ins_encode %{
+ __ movq($dst$$XMMRegister, $mem$$Address);
+ __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+ __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl2F_mem(vecD dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateF (LoadF mem)));
+ format %{ "pshufd $dst,$mem,0x00\t! replicate2F" %}
+ ins_encode %{
+ __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl4F_mem(vecX dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateF (LoadF mem)));
+ format %{ "pshufd $dst,$mem,0x00\t! replicate4F" %}
+ ins_encode %{
+ __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl8F(vecY dst, regF src) %{
+ predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateF src));
+ format %{ "pshufd $dst,$src,0x00\n\t"
+ "vinsertf128h $dst,$dst,$dst\t! replicate8F" %}
+ ins_encode %{
+ __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
+ __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl8F_mem(vecY dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateF (LoadF mem)));
+ format %{ "pshufd $dst,$mem,0x00\n\t"
+ "vinsertf128h $dst,$dst,$dst\t! replicate8F" %}
+ ins_encode %{
+ __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
+ __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl2D_mem(vecX dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateD (LoadD mem)));
+ format %{ "pshufd $dst,$mem,0x44\t! replicate2D" %}
+ ins_encode %{
+ __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl4D(vecY dst, regD src) %{
+ predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateD src));
+ format %{ "pshufd $dst,$src,0x44\n\t"
+ "vinsertf128h $dst,$dst,$dst\t! replicate4D" %}
+ ins_encode %{
+ __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
+ __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl4D_mem(vecY dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateD (LoadD mem)));
+ format %{ "pshufd $dst,$mem,0x44\n\t"
+ "vinsertf128h $dst,$dst,$dst\t! replicate4D" %}
+ ins_encode %{
+ __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
+ __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+// ====================GENERIC REPLICATE==========================================
+
// Replicate byte scalar to be vector
instruct Repl4B(vecS dst, rRegI src) %{
predicate(n->as_Vector()->length() == 4);
@@ -2923,60 +3374,6 @@
ins_pipe( pipe_slow );
%}
-instruct Repl16B(vecX dst, rRegI src) %{
- predicate(n->as_Vector()->length() == 16);
- match(Set dst (ReplicateB src));
- format %{ "movd $dst,$src\n\t"
- "punpcklbw $dst,$dst\n\t"
- "pshuflw $dst,$dst,0x00\n\t"
- "punpcklqdq $dst,$dst\t! replicate16B" %}
- ins_encode %{
- __ movdl($dst$$XMMRegister, $src$$Register);
- __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
- __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
- __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
-instruct Repl32B(vecY dst, rRegI src) %{
- predicate(n->as_Vector()->length() == 32);
- match(Set dst (ReplicateB src));
- format %{ "movd $dst,$src\n\t"
- "punpcklbw $dst,$dst\n\t"
- "pshuflw $dst,$dst,0x00\n\t"
- "punpcklqdq $dst,$dst\n\t"
- "vinserti128h $dst,$dst,$dst\t! replicate32B" %}
- ins_encode %{
- __ movdl($dst$$XMMRegister, $src$$Register);
- __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
- __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
- __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
-instruct Repl64B(vecZ dst, rRegI src) %{
- predicate(n->as_Vector()->length() == 64);
- match(Set dst (ReplicateB src));
- format %{ "movd $dst,$src\n\t"
- "punpcklbw $dst,$dst\n\t"
- "pshuflw $dst,$dst,0x00\n\t"
- "punpcklqdq $dst,$dst\n\t"
- "vinserti128h $dst,$dst,$dst\t! lower replicate32B\n\t"
- "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate632B" %}
- ins_encode %{
- __ movdl($dst$$XMMRegister, $src$$Register);
- __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
- __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
- __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
// Replicate byte scalar immediate to be vector by loading from const table.
instruct Repl4B_imm(vecS dst, immI con) %{
predicate(n->as_Vector()->length() == 4);
@@ -2998,48 +3395,6 @@
ins_pipe( pipe_slow );
%}
-instruct Repl16B_imm(vecX dst, immI con) %{
- predicate(n->as_Vector()->length() == 16);
- match(Set dst (ReplicateB con));
- format %{ "movq $dst,[$constantaddress]\n\t"
- "punpcklqdq $dst,$dst\t! replicate16B($con)" %}
- ins_encode %{
- __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
- __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
-instruct Repl32B_imm(vecY dst, immI con) %{
- predicate(n->as_Vector()->length() == 32);
- match(Set dst (ReplicateB con));
- format %{ "movq $dst,[$constantaddress]\n\t"
- "punpcklqdq $dst,$dst\n\t"
- "vinserti128h $dst,$dst,$dst\t! lreplicate32B($con)" %}
- ins_encode %{
- __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
- __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
-instruct Repl64B_imm(vecZ dst, immI con) %{
- predicate(n->as_Vector()->length() == 64);
- match(Set dst (ReplicateB con));
- format %{ "movq $dst,[$constantaddress]\n\t"
- "punpcklqdq $dst,$dst\n\t"
- "vinserti128h $dst,$dst,$dst\t! lower replicate32B($con)\n\t"
- "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate32B($con)" %}
- ins_encode %{
- __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
- __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
// Replicate byte scalar zero to be vector
instruct Repl4B_zero(vecS dst, immI0 zero) %{
predicate(n->as_Vector()->length() == 4);
@@ -3083,18 +3438,6 @@
ins_pipe( fpu_reg_reg );
%}
-instruct Repl64B_zero(vecZ dst, immI0 zero) %{
- predicate(n->as_Vector()->length() == 64);
- match(Set dst (ReplicateB zero));
- format %{ "vpxor $dst k0,$dst,$dst\t! replicate64B zero" %}
- ins_encode %{
- // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
- int vector_len = 2;
- __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
- %}
- ins_pipe( fpu_reg_reg );
-%}
-
// Replicate char/short (2 byte) scalar to be vector
instruct Repl2S(vecS dst, rRegI src) %{
predicate(n->as_Vector()->length() == 2);
@@ -3108,66 +3451,6 @@
ins_pipe( fpu_reg_reg );
%}
-instruct Repl4S(vecD dst, rRegI src) %{
- predicate(n->as_Vector()->length() == 4);
- match(Set dst (ReplicateS src));
- format %{ "movd $dst,$src\n\t"
- "pshuflw $dst,$dst,0x00\t! replicate4S" %}
- ins_encode %{
- __ movdl($dst$$XMMRegister, $src$$Register);
- __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
- %}
- ins_pipe( fpu_reg_reg );
-%}
-
-instruct Repl8S(vecX dst, rRegI src) %{
- predicate(n->as_Vector()->length() == 8);
- match(Set dst (ReplicateS src));
- format %{ "movd $dst,$src\n\t"
- "pshuflw $dst,$dst,0x00\n\t"
- "punpcklqdq $dst,$dst\t! replicate8S" %}
- ins_encode %{
- __ movdl($dst$$XMMRegister, $src$$Register);
- __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
- __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
-instruct Repl16S(vecY dst, rRegI src) %{
- predicate(n->as_Vector()->length() == 16);
- match(Set dst (ReplicateS src));
- format %{ "movd $dst,$src\n\t"
- "pshuflw $dst,$dst,0x00\n\t"
- "punpcklqdq $dst,$dst\n\t"
- "vinserti128h $dst,$dst,$dst\t! replicate16S" %}
- ins_encode %{
- __ movdl($dst$$XMMRegister, $src$$Register);
- __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
- __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
-instruct Repl32S(vecZ dst, rRegI src) %{
- predicate(n->as_Vector()->length() == 32);
- match(Set dst (ReplicateS src));
- format %{ "movd $dst,$src\n\t"
- "pshuflw $dst,$dst,0x00\n\t"
- "punpcklqdq $dst,$dst\n\t"
- "vinserti128h $dst,$dst,$dst\t! lower replicate16S\n\t"
- "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate16S" %}
- ins_encode %{
- __ movdl($dst$$XMMRegister, $src$$Register);
- __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
- __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
// Replicate char/short (2 byte) scalar immediate to be vector by loading from const table.
instruct Repl2S_imm(vecS dst, immI con) %{
predicate(n->as_Vector()->length() == 2);
@@ -3189,48 +3472,6 @@
ins_pipe( fpu_reg_reg );
%}
-instruct Repl8S_imm(vecX dst, immI con) %{
- predicate(n->as_Vector()->length() == 8);
- match(Set dst (ReplicateS con));
- format %{ "movq $dst,[$constantaddress]\n\t"
- "punpcklqdq $dst,$dst\t! replicate8S($con)" %}
- ins_encode %{
- __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
- __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
-instruct Repl16S_imm(vecY dst, immI con) %{
- predicate(n->as_Vector()->length() == 16);
- match(Set dst (ReplicateS con));
- format %{ "movq $dst,[$constantaddress]\n\t"
- "punpcklqdq $dst,$dst\n\t"
- "vinserti128h $dst,$dst,$dst\t! replicate16S($con)" %}
- ins_encode %{
- __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
- __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
-instruct Repl32S_imm(vecZ dst, immI con) %{
- predicate(n->as_Vector()->length() == 32);
- match(Set dst (ReplicateS con));
- format %{ "movq $dst,[$constantaddress]\n\t"
- "punpcklqdq $dst,$dst\n\t"
- "vinserti128h $dst,$dst,$dst\t! lower replicate16S($con)\n\t"
- "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate16S($con)" %}
- ins_encode %{
- __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
- __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
// Replicate char/short (2 byte) scalar zero to be vector
instruct Repl2S_zero(vecS dst, immI0 zero) %{
predicate(n->as_Vector()->length() == 2);
@@ -3274,18 +3515,6 @@
ins_pipe( fpu_reg_reg );
%}
-instruct Repl32S_zero(vecZ dst, immI0 zero) %{
- predicate(n->as_Vector()->length() == 32);
- match(Set dst (ReplicateS zero));
- format %{ "vpxor $dst k0,$dst,$dst\t! replicate32S zero" %}
- ins_encode %{
- // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
- int vector_len = 2;
- __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
- %}
- ins_pipe( fpu_reg_reg );
-%}
-
// Replicate integer (4 byte) scalar to be vector
instruct Repl2I(vecD dst, rRegI src) %{
predicate(n->as_Vector()->length() == 2);
@@ -3299,101 +3528,6 @@
ins_pipe( fpu_reg_reg );
%}
-instruct Repl4I(vecX dst, rRegI src) %{
- predicate(n->as_Vector()->length() == 4);
- match(Set dst (ReplicateI src));
- format %{ "movd $dst,$src\n\t"
- "pshufd $dst,$dst,0x00\t! replicate4I" %}
- ins_encode %{
- __ movdl($dst$$XMMRegister, $src$$Register);
- __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
- %}
- ins_pipe( pipe_slow );
-%}
-
-instruct Repl8I(vecY dst, rRegI src) %{
- predicate(n->as_Vector()->length() == 8);
- match(Set dst (ReplicateI src));
- format %{ "movd $dst,$src\n\t"
- "pshufd $dst,$dst,0x00\n\t"
- "vinserti128h $dst,$dst,$dst\t! replicate8I" %}
- ins_encode %{
- __ movdl($dst$$XMMRegister, $src$$Register);
- __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
- __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
-instruct Repl16I(vecZ dst, rRegI src) %{
- predicate(n->as_Vector()->length() == 16);
- match(Set dst (ReplicateI src));
- format %{ "movd $dst,$src\n\t"
- "pshufd $dst,$dst,0x00\n\t"
- "vinserti128h $dst,$dst,$dst\t! lower replicate8I\n\t"
- "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate8I" %}
- ins_encode %{
- __ movdl($dst$$XMMRegister, $src$$Register);
- __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
- __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
-// Replicate integer (4 byte) scalar immediate to be vector by loading from const table.
-instruct Repl2I_imm(vecD dst, immI con) %{
- predicate(n->as_Vector()->length() == 2);
- match(Set dst (ReplicateI con));
- format %{ "movq $dst,[$constantaddress]\t! replicate2I($con)" %}
- ins_encode %{
- __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
- %}
- ins_pipe( fpu_reg_reg );
-%}
-
-instruct Repl4I_imm(vecX dst, immI con) %{
- predicate(n->as_Vector()->length() == 4);
- match(Set dst (ReplicateI con));
- format %{ "movq $dst,[$constantaddress]\t! replicate4I($con)\n\t"
- "punpcklqdq $dst,$dst" %}
- ins_encode %{
- __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
- __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
-instruct Repl8I_imm(vecY dst, immI con) %{
- predicate(n->as_Vector()->length() == 8);
- match(Set dst (ReplicateI con));
- format %{ "movq $dst,[$constantaddress]\t! replicate8I($con)\n\t"
- "punpcklqdq $dst,$dst\n\t"
- "vinserti128h $dst,$dst,$dst" %}
- ins_encode %{
- __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
- __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
-instruct Repl16I_imm(vecZ dst, immI con) %{
- predicate(n->as_Vector()->length() == 16);
- match(Set dst (ReplicateI con));
- format %{ "movq $dst,[$constantaddress]\t! replicate16I($con)\n\t"
- "punpcklqdq $dst,$dst\n\t"
- "vinserti128h $dst,$dst,$dst\n\t"
- "vinserti64x4h $dst k0,$dst,$dst" %}
- ins_encode %{
- __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
- __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
// Integer could be loaded into xmm register directly from memory.
instruct Repl2I_mem(vecD dst, memory mem) %{
predicate(n->as_Vector()->length() == 2);
@@ -3407,46 +3541,15 @@
ins_pipe( fpu_reg_reg );
%}
-instruct Repl4I_mem(vecX dst, memory mem) %{
- predicate(n->as_Vector()->length() == 4);
- match(Set dst (ReplicateI (LoadI mem)));
- format %{ "movd $dst,$mem\n\t"
- "pshufd $dst,$dst,0x00\t! replicate4I" %}
- ins_encode %{
- __ movdl($dst$$XMMRegister, $mem$$Address);
- __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
- %}
- ins_pipe( pipe_slow );
-%}
-
-instruct Repl8I_mem(vecY dst, memory mem) %{
- predicate(n->as_Vector()->length() == 8);
- match(Set dst (ReplicateI (LoadI mem)));
- format %{ "movd $dst,$mem\n\t"
- "pshufd $dst,$dst,0x00\n\t"
- "vinserti128h $dst,$dst,$dst\t! replicate8I" %}
- ins_encode %{
- __ movdl($dst$$XMMRegister, $mem$$Address);
- __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
- __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
-instruct Repl16I_mem(vecZ dst, memory mem) %{
- predicate(n->as_Vector()->length() == 16);
- match(Set dst (ReplicateI (LoadI mem)));
- format %{ "movd $dst,$mem\n\t"
- "pshufd $dst,$dst,0x00\n\t"
- "vinserti128h $dst,$dst,$dst\t! lower replicate8I\n\t"
- "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate8I" %}
- ins_encode %{
- __ movdl($dst$$XMMRegister, $mem$$Address);
- __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
- __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
+// Replicate integer (4 byte) scalar immediate to be vector by loading from const table.
+instruct Repl2I_imm(vecD dst, immI con) %{
+ predicate(n->as_Vector()->length() == 2);
+ match(Set dst (ReplicateI con));
+ format %{ "movq $dst,[$constantaddress]\t! replicate2I($con)" %}
+ ins_encode %{
+ __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
+ %}
+ ins_pipe( fpu_reg_reg );
%}
// Replicate integer (4 byte) scalar zero to be vector
@@ -3482,18 +3585,6 @@
ins_pipe( fpu_reg_reg );
%}
-instruct Repl16I_zero(vecZ dst, immI0 zero) %{
- predicate(n->as_Vector()->length() == 16);
- match(Set dst (ReplicateI zero));
- format %{ "vpxor $dst k0,$dst,$dst\t! replicate16I zero" %}
- ins_encode %{
- // Use vxorpd since AVX does not have vpxor for 512-bit (AVX2 will have it).
- int vector_len = 2;
- __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
- %}
- ins_pipe( fpu_reg_reg );
-%}
-
// Replicate long (8 byte) scalar to be vector
#ifdef _LP64
instruct Repl2L(vecX dst, rRegL src) %{
@@ -3507,36 +3598,6 @@
%}
ins_pipe( pipe_slow );
%}
-
-instruct Repl4L(vecY dst, rRegL src) %{
- predicate(n->as_Vector()->length() == 4);
- match(Set dst (ReplicateL src));
- format %{ "movdq $dst,$src\n\t"
- "punpcklqdq $dst,$dst\n\t"
- "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
- ins_encode %{
- __ movdq($dst$$XMMRegister, $src$$Register);
- __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
-instruct Repl8L(vecZ dst, rRegL src) %{
- predicate(n->as_Vector()->length() == 8);
- match(Set dst (ReplicateL src));
- format %{ "movdq $dst,$src\n\t"
- "punpcklqdq $dst,$dst\n\t"
- "vinserti128h $dst,$dst,$dst\t! lower replicate4L\n\t"
- "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate4L" %}
- ins_encode %{
- __ movdq($dst$$XMMRegister, $src$$Register);
- __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
#else // _LP64
instruct Repl2L(vecX dst, eRegL src, regD tmp) %{
predicate(n->as_Vector()->length() == 2);
@@ -3554,45 +3615,6 @@
%}
ins_pipe( pipe_slow );
%}
-
-instruct Repl4L(vecY dst, eRegL src, regD tmp) %{
- predicate(n->as_Vector()->length() == 4);
- match(Set dst (ReplicateL src));
- effect(TEMP dst, USE src, TEMP tmp);
- format %{ "movdl $dst,$src.lo\n\t"
- "movdl $tmp,$src.hi\n\t"
- "punpckldq $dst,$tmp\n\t"
- "punpcklqdq $dst,$dst\n\t"
- "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
- ins_encode %{
- __ movdl($dst$$XMMRegister, $src$$Register);
- __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
- __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
- __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
-instruct Repl8L(vecZ dst, eRegL src, regD tmp) %{
- predicate(n->as_Vector()->length() == 4);
- match(Set dst (ReplicateL src));
- effect(TEMP dst, USE src, TEMP tmp);
- format %{ "movdl $dst,$src.lo\n\t"
- "movdl $tmp,$src.hi\n\t"
- "punpckldq $dst,$tmp\n\t"
- "vinserti128h $dst,$dst,$dst\t! lower replicate4L\n\t"
- "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate4L" %}
- ins_encode %{
- __ movdl($dst$$XMMRegister, $src$$Register);
- __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
- __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
- __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
#endif // _LP64
// Replicate long (8 byte) scalar immediate to be vector by loading from const table.
@@ -3608,79 +3630,6 @@
ins_pipe( pipe_slow );
%}
-instruct Repl4L_imm(vecY dst, immL con) %{
- predicate(n->as_Vector()->length() == 4);
- match(Set dst (ReplicateL con));
- format %{ "movq $dst,[$constantaddress]\n\t"
- "punpcklqdq $dst,$dst\n\t"
- "vinserti128h $dst,$dst,$dst\t! replicate4L($con)" %}
- ins_encode %{
- __ movq($dst$$XMMRegister, $constantaddress($con));
- __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
-instruct Repl8L_imm(vecZ dst, immL con) %{
- predicate(n->as_Vector()->length() == 8);
- match(Set dst (ReplicateL con));
- format %{ "movq $dst,[$constantaddress]\n\t"
- "punpcklqdq $dst,$dst\n\t"
- "vinserti128h $dst,$dst,$dst\t! lower replicate4L($con)\n\t"
- "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate4L($con)" %}
- ins_encode %{
- __ movq($dst$$XMMRegister, $constantaddress($con));
- __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
-// Long could be loaded into xmm register directly from memory.
-instruct Repl2L_mem(vecX dst, memory mem) %{
- predicate(n->as_Vector()->length() == 2);
- match(Set dst (ReplicateL (LoadL mem)));
- format %{ "movq $dst,$mem\n\t"
- "punpcklqdq $dst,$dst\t! replicate2L" %}
- ins_encode %{
- __ movq($dst$$XMMRegister, $mem$$Address);
- __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
-instruct Repl4L_mem(vecY dst, memory mem) %{
- predicate(n->as_Vector()->length() == 4);
- match(Set dst (ReplicateL (LoadL mem)));
- format %{ "movq $dst,$mem\n\t"
- "punpcklqdq $dst,$dst\n\t"
- "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
- ins_encode %{
- __ movq($dst$$XMMRegister, $mem$$Address);
- __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
-instruct Repl8L_mem(vecZ dst, memory mem) %{
- predicate(n->as_Vector()->length() == 8);
- match(Set dst (ReplicateL (LoadL mem)));
- format %{ "movq $dst,$mem\n\t"
- "punpcklqdq $dst,$dst\n\t"
- "vinserti128h $dst,$dst,$dst\t! lower replicate4L\n\t"
- "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate4L" %}
- ins_encode %{
- __ movq($dst$$XMMRegister, $mem$$Address);
- __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
// Replicate long (8 byte) scalar zero to be vector
instruct Repl2L_zero(vecX dst, immL0 zero) %{
predicate(n->as_Vector()->length() == 2);
@@ -3704,18 +3653,6 @@
ins_pipe( fpu_reg_reg );
%}
-instruct Repl8L_zero(vecZ dst, immL0 zero) %{
- predicate(n->as_Vector()->length() == 8);
- match(Set dst (ReplicateL zero));
- format %{ "vpxor $dst k0,$dst,$dst\t! replicate8L zero" %}
- ins_encode %{
- // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
- int vector_len = 2;
- __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
- %}
- ins_pipe( fpu_reg_reg );
-%}
-
// Replicate float (4 byte) scalar to be vector
instruct Repl2F(vecD dst, regF src) %{
predicate(n->as_Vector()->length() == 2);
@@ -3737,32 +3674,6 @@
ins_pipe( pipe_slow );
%}
-instruct Repl8F(vecY dst, regF src) %{
- predicate(n->as_Vector()->length() == 8);
- match(Set dst (ReplicateF src));
- format %{ "pshufd $dst,$src,0x00\n\t"
- "vinsertf128h $dst,$dst,$dst\t! replicate8F" %}
- ins_encode %{
- __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
- __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
-instruct Repl16F(vecZ dst, regF src) %{
- predicate(n->as_Vector()->length() == 16);
- match(Set dst (ReplicateF src));
- format %{ "pshufd $dst,$src,0x00\n\t"
- "vinsertf128h $dst,$dst,$dst\t! lower replicate8F\n\t"
- "vinsertf64x4h $dst k0,$dst,$dst\t! lower replicate8F" %}
- ins_encode %{
- __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
- __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- __ vinsertf64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
// Replicate float (4 byte) scalar zero to be vector
instruct Repl2F_zero(vecD dst, immF0 zero) %{
predicate(n->as_Vector()->length() == 2);
@@ -3795,17 +3706,6 @@
ins_pipe( fpu_reg_reg );
%}
-instruct Repl16F_zero(vecZ dst, immF0 zero) %{
- predicate(n->as_Vector()->length() == 16);
- match(Set dst (ReplicateF zero));
- format %{ "vxorps $dst k0,$dst,$dst\t! replicate16F zero" %}
- ins_encode %{
- int vector_len = 2;
- __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
- %}
- ins_pipe( fpu_reg_reg );
-%}
-
// Replicate double (8 bytes) scalar to be vector
instruct Repl2D(vecX dst, regD src) %{
predicate(n->as_Vector()->length() == 2);
@@ -3817,32 +3717,6 @@
ins_pipe( pipe_slow );
%}
-instruct Repl4D(vecY dst, regD src) %{
- predicate(n->as_Vector()->length() == 4);
- match(Set dst (ReplicateD src));
- format %{ "pshufd $dst,$src,0x44\n\t"
- "vinsertf128h $dst,$dst,$dst\t! replicate4D" %}
- ins_encode %{
- __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
- __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
-instruct Repl8D(vecZ dst, regD src) %{
- predicate(n->as_Vector()->length() == 8);
- match(Set dst (ReplicateD src));
- format %{ "pshufd $dst,$src,0x44\n\t"
- "vinsertf128h $dst,$dst,$dst\t! lower replicate4D\n\t"
- "vinsertf64x4h $dst k0,$dst,$dst\t! upper replicate4D" %}
- ins_encode %{
- __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
- __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- __ vinsertf64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
// Replicate double (8 byte) scalar zero to be vector
instruct Repl2D_zero(vecX dst, immD0 zero) %{
predicate(n->as_Vector()->length() == 2);
@@ -3865,8 +3739,636 @@
ins_pipe( fpu_reg_reg );
%}
-instruct Repl8D_zero(vecZ dst, immD0 zero) %{
- predicate(n->as_Vector()->length() == 8);
+// ====================EVEX REPLICATE=============================================
+
+instruct Repl4B_mem_evex(vecS dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateB (LoadB mem)));
+ format %{ "vpbroadcastb $dst,$mem\t! replicate4B" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl8B_mem_evex(vecD dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateB (LoadB mem)));
+ format %{ "vpbroadcastb $dst,$mem\t! replicate8B" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl16B_evex(vecX dst, rRegI src) %{
+ predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateB src));
+ format %{ "vpbroadcastb $dst,$src\t! replicate16B" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl16B_mem_evex(vecX dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateB (LoadB mem)));
+ format %{ "vpbroadcastb $dst,$mem\t! replicate16B" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl32B_evex(vecY dst, rRegI src) %{
+ predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateB src));
+ format %{ "vpbroadcastb $dst,$src\t! replicate32B" %}
+ ins_encode %{
+ int vector_len = 1;
+ __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl32B_mem_evex(vecY dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateB (LoadB mem)));
+ format %{ "vpbroadcastb $dst,$mem\t! replicate32B" %}
+ ins_encode %{
+ int vector_len = 1;
+ __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl64B_evex(vecZ dst, rRegI src) %{
+ predicate(n->as_Vector()->length() == 64 && UseAVX > 2);
+ match(Set dst (ReplicateB src));
+ format %{ "vpbroadcastb $dst,$src\t! upper replicate64B" %}
+ ins_encode %{
+ int vector_len = 2;
+ __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl64B_mem_evex(vecZ dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateB (LoadB mem)));
+ format %{ "vpbroadcastb $dst,$mem\t! replicate64B" %}
+ ins_encode %{
+ int vector_len = 2;
+ __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl16B_imm_evex(vecX dst, immI con) %{
+ predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateB con));
+ format %{ "movq $dst,[$constantaddress]\n\t"
+ "vpbroadcastb $dst,$dst\t! replicate16B" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
+ __ evpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl32B_imm_evex(vecY dst, immI con) %{
+ predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateB con));
+ format %{ "movq $dst,[$constantaddress]\n\t"
+ "vpbroadcastb $dst,$dst\t! replicate32B" %}
+ ins_encode %{
+ int vector_len = 1;
+ __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
+ __ evpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl64B_imm_evex(vecZ dst, immI con) %{
+ predicate(n->as_Vector()->length() == 64 && UseAVX > 2);
+ match(Set dst (ReplicateB con));
+ format %{ "movq $dst,[$constantaddress]\n\t"
+ "vpbroadcastb $dst,$dst\t! upper replicate64B" %}
+ ins_encode %{
+ int vector_len = 2;
+ __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
+ __ evpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl64B_zero_evex(vecZ dst, immI0 zero) %{
+ predicate(n->as_Vector()->length() == 64 && UseAVX > 2);
+ match(Set dst (ReplicateB zero));
+ format %{ "vpxor $dst k0,$dst,$dst\t! replicate64B zero" %}
+ ins_encode %{
+ // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
+ int vector_len = 2;
+ __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+ %}
+ ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4S_evex(vecD dst, rRegI src) %{
+ predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateS src));
+ format %{ "vpbroadcastw $dst,$src\t! replicate4S" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl4S_mem_evex(vecD dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateS (LoadS mem)));
+ format %{ "vpbroadcastw $dst,$mem\t! replicate4S" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl8S_evex(vecX dst, rRegI src) %{
+ predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateS src));
+ format %{ "vpbroadcastw $dst,$src\t! replicate8S" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl8S_mem_evex(vecX dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateS (LoadS mem)));
+ format %{ "vpbroadcastw $dst,$mem\t! replicate8S" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl16S_evex(vecY dst, rRegI src) %{
+ predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateS src));
+ format %{ "vpbroadcastw $dst,$src\t! replicate16S" %}
+ ins_encode %{
+ int vector_len = 1;
+ __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl16S_mem_evex(vecY dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateS (LoadS mem)));
+ format %{ "vpbroadcastw $dst,$mem\t! replicate16S" %}
+ ins_encode %{
+ int vector_len = 1;
+ __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl32S_evex(vecZ dst, rRegI src) %{
+ predicate(n->as_Vector()->length() == 32 && UseAVX > 2);
+ match(Set dst (ReplicateS src));
+ format %{ "vpbroadcastw $dst,$src\t! replicate32S" %}
+ ins_encode %{
+ int vector_len = 2;
+ __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl32S_mem_evex(vecZ dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 32 && UseAVX > 2);
+ match(Set dst (ReplicateS (LoadS mem)));
+ format %{ "vpbroadcastw $dst,$mem\t! replicate32S" %}
+ ins_encode %{
+ int vector_len = 2;
+ __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl8S_imm_evex(vecX dst, immI con) %{
+ predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateS con));
+ format %{ "movq $dst,[$constantaddress]\n\t"
+ "vpbroadcastw $dst,$dst\t! replicate8S" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
+ __ evpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl16S_imm_evex(vecY dst, immI con) %{
+ predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateS con));
+ format %{ "movq $dst,[$constantaddress]\n\t"
+ "vpbroadcastw $dst,$dst\t! replicate16S" %}
+ ins_encode %{
+ int vector_len = 1;
+ __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
+ __ evpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl32S_imm_evex(vecZ dst, immI con) %{
+ predicate(n->as_Vector()->length() == 32 && UseAVX > 2);
+ match(Set dst (ReplicateS con));
+ format %{ "movq $dst,[$constantaddress]\n\t"
+ "vpbroadcastw $dst,$dst\t! replicate32S" %}
+ ins_encode %{
+ int vector_len = 2;
+ __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
+ __ evpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl32S_zero_evex(vecZ dst, immI0 zero) %{
+ predicate(n->as_Vector()->length() == 32 && UseAVX > 2);
+ match(Set dst (ReplicateS zero));
+ format %{ "vpxor $dst k0,$dst,$dst\t! replicate32S zero" %}
+ ins_encode %{
+ // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
+ int vector_len = 2;
+ __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+ %}
+ ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4I_evex(vecX dst, rRegI src) %{
+ predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateI src));
+ format %{ "vpbroadcastd $dst,$src\t! replicate4I" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl4I_mem_evex(vecX dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateI (LoadI mem)));
+ format %{ "vpbroadcastd $dst,$mem\t! replicate4I" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ evpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl8I_evex(vecY dst, rRegI src) %{
+ predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateI src));
+ format %{ "vpbroadcastd $dst,$src\t! replicate8I" %}
+ ins_encode %{
+ int vector_len = 1;
+ __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl8I_mem_evex(vecY dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateI (LoadI mem)));
+ format %{ "vpbroadcastd $dst,$mem\t! replicate8I" %}
+ ins_encode %{
+ int vector_len = 1;
+ __ evpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl16I_evex(vecZ dst, rRegI src) %{
+ predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
+ match(Set dst (ReplicateI src));
+ format %{ "vpbroadcastd $dst,$src\t! replicate16I" %}
+ ins_encode %{
+ int vector_len = 2;
+ __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl16I_mem_evex(vecZ dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
+ match(Set dst (ReplicateI (LoadI mem)));
+ format %{ "vpbroadcastd $dst,$mem\t! replicate16I" %}
+ ins_encode %{
+ int vector_len = 2;
+ __ evpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl4I_imm_evex(vecX dst, immI con) %{
+ predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateI con));
+ format %{ "movq $dst,[$constantaddress]\t! replicate8I($con)\n\t"
+ "vpbroadcastd $dst,$dst\t! replicate4I" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
+ __ evpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl8I_imm_evex(vecY dst, immI con) %{
+ predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateI con));
+ format %{ "movq $dst,[$constantaddress]\t! replicate8I($con)\n\t"
+ "vpbroadcastd $dst,$dst\t! replicate8I" %}
+ ins_encode %{
+ int vector_len = 1;
+ __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
+ __ evpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl16I_imm_evex(vecZ dst, immI con) %{
+ predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
+ match(Set dst (ReplicateI con));
+ format %{ "movq $dst,[$constantaddress]\t! replicate16I($con)\n\t"
+ "vpbroadcastd $dst,$dst\t! replicate16I" %}
+ ins_encode %{
+ int vector_len = 2;
+ __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
+ __ evpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl16I_zero_evex(vecZ dst, immI0 zero) %{
+ predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
+ match(Set dst (ReplicateI zero));
+ format %{ "vpxor $dst k0,$dst,$dst\t! replicate16I zero" %}
+ ins_encode %{
+ // Use vxorpd since AVX does not have vpxor for 512-bit (AVX2 will have it).
+ int vector_len = 2;
+ __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+ %}
+ ins_pipe( fpu_reg_reg );
+%}
+
+// Replicate long (8 byte) scalar to be vector
+#ifdef _LP64
+instruct Repl4L_evex(vecY dst, rRegL src) %{
+ predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateL src));
+ format %{ "vpbroadcastq $dst,$src\t! replicate4L" %}
+ ins_encode %{
+ int vector_len = 1;
+ __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl8L_evex(vecZ dst, rRegL src) %{
+ predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
+ match(Set dst (ReplicateL src));
+ format %{ "vpbroadcastq $dst,$src\t! replicate8L" %}
+ ins_encode %{
+ int vector_len = 2;
+ __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+#else // _LP64
+instruct Repl4L_evex(vecY dst, eRegL src, regD tmp) %{
+ predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateL src));
+ effect(TEMP dst, USE src, TEMP tmp);
+ format %{ "movdl $dst,$src.lo\n\t"
+ "movdl $tmp,$src.hi\n\t"
+ "punpckldq $dst,$tmp\n\t"
+ "vpbroadcastq $dst,$dst\t! replicate4L" %}
+ ins_encode %{
+ int vector_len = 1;
+ __ movdl($dst$$XMMRegister, $src$$Register);
+ __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
+ __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
+ __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl8L_evex(vecZ dst, eRegL src, regD tmp) %{
+ predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
+ match(Set dst (ReplicateL src));
+ effect(TEMP dst, USE src, TEMP tmp);
+ format %{ "movdl $dst,$src.lo\n\t"
+ "movdl $tmp,$src.hi\n\t"
+ "punpckldq $dst,$tmp\n\t"
+ "vpbroadcastq $dst,$dst\t! replicate8L" %}
+ ins_encode %{
+ int vector_len = 2;
+ __ movdl($dst$$XMMRegister, $src$$Register);
+ __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
+ __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
+ __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+#endif // _LP64
+
+instruct Repl4L_imm_evex(vecY dst, immL con) %{
+ predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateL con));
+ format %{ "movq $dst,[$constantaddress]\n\t"
+ "vpbroadcastq $dst,$dst\t! replicate4L" %}
+ ins_encode %{
+ int vector_len = 1;
+ __ movq($dst$$XMMRegister, $constantaddress($con));
+ __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl8L_imm_evex(vecZ dst, immL con) %{
+ predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
+ match(Set dst (ReplicateL con));
+ format %{ "movq $dst,[$constantaddress]\n\t"
+ "vpbroadcastq $dst,$dst\t! replicate8L" %}
+ ins_encode %{
+ int vector_len = 2;
+ __ movq($dst$$XMMRegister, $constantaddress($con));
+ __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl2L_mem_evex(vecX dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 2 && VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateL (LoadL mem)));
+ format %{ "vpbroadcastd $dst,$mem\t! replicate2L" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ evpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl4L_mem_evex(vecY dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateL (LoadL mem)));
+ format %{ "vpbroadcastd $dst,$mem\t! replicate4L" %}
+ ins_encode %{
+ int vector_len = 1;
+ __ evpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl8L_mem_evex(vecZ dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
+ match(Set dst (ReplicateL (LoadL mem)));
+ format %{ "vpbroadcastd $dst,$mem\t! replicate8L" %}
+ ins_encode %{
+ int vector_len = 2;
+ __ evpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl8L_zero_evex(vecZ dst, immL0 zero) %{
+ predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
+ match(Set dst (ReplicateL zero));
+ format %{ "vpxor $dst k0,$dst,$dst\t! replicate8L zero" %}
+ ins_encode %{
+ // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
+ int vector_len = 2;
+ __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+ %}
+ ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl8F_evex(vecY dst, regF src) %{
+ predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateF src));
+ format %{ "vbroadcastss $dst,$src\t! replicate8F" %}
+ ins_encode %{
+ int vector_len = 1;
+ __ evpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl8F_mem_evex(vecY dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateF (LoadF mem)));
+ format %{ "vbroadcastss $dst,$mem\t! replicate8F" %}
+ ins_encode %{
+ int vector_len = 1;
+ __ evpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl16F_evex(vecZ dst, regF src) %{
+ predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
+ match(Set dst (ReplicateF src));
+ format %{ "vbroadcastss $dst,$src\t! replicate16F" %}
+ ins_encode %{
+ int vector_len = 2;
+ __ evpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl16F_mem_evex(vecZ dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
+ match(Set dst (ReplicateF (LoadF mem)));
+ format %{ "vbroadcastss $dst,$mem\t! replicate16F" %}
+ ins_encode %{
+ int vector_len = 2;
+ __ evpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl16F_zero_evex(vecZ dst, immF0 zero) %{
+ predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
+ match(Set dst (ReplicateF zero));
+ format %{ "vxorps $dst k0,$dst,$dst\t! replicate16F zero" %}
+ ins_encode %{
+ int vector_len = 2;
+ __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+ %}
+ ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4D_evex(vecY dst, regD src) %{
+ predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateD src));
+ format %{ "vbroadcastsd $dst,$src\t! replicate4D" %}
+ ins_encode %{
+ int vector_len = 1;
+ __ evpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl4D_mem_evex(vecY dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateD (LoadD mem)));
+ format %{ "vbroadcastsd $dst,$mem\t! replicate4D" %}
+ ins_encode %{
+ int vector_len = 1;
+ __ evpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl8D_evex(vecZ dst, regD src) %{
+ predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
+ match(Set dst (ReplicateD src));
+ format %{ "vbroadcastsd $dst,$src\t! replicate8D" %}
+ ins_encode %{
+ int vector_len = 2;
+ __ evpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl8D_mem_evex(vecZ dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
+ match(Set dst (ReplicateD (LoadD mem)));
+ format %{ "vbroadcastsd $dst,$mem\t! replicate8D" %}
+ ins_encode %{
+ int vector_len = 2;
+ __ evpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl8D_zero_evex(vecZ dst, immD0 zero) %{
+ predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
match(Set dst (ReplicateD zero));
format %{ "vxorpd $dst k0,$dst,$dst,vect512\t! replicate8D zero" %}
ins_encode %{
@@ -4972,6 +5474,17 @@
ins_pipe( pipe_slow );
%}
+instruct vadd4B_mem(vecS dst, vecS src, memory mem) %{
+ predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+ match(Set dst (AddVB src (LoadVector mem)));
+ format %{ "vpaddb $dst,$src,$mem\t! add packed4B" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
instruct vadd8B(vecD dst, vecD src) %{
predicate(n->as_Vector()->length() == 8);
match(Set dst (AddVB dst src));
@@ -4993,6 +5506,17 @@
ins_pipe( pipe_slow );
%}
+instruct vadd8B_mem(vecD dst, vecD src, memory mem) %{
+ predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
+ match(Set dst (AddVB src (LoadVector mem)));
+ format %{ "vpaddb $dst,$src,$mem\t! add packed8B" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
instruct vadd16B(vecX dst, vecX src) %{
predicate(n->as_Vector()->length() == 16);
match(Set dst (AddVB dst src));
@@ -5091,6 +5615,17 @@
ins_pipe( pipe_slow );
%}
+instruct vadd2S_mem(vecS dst, vecS src, memory mem) %{
+ predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+ match(Set dst (AddVS src (LoadVector mem)));
+ format %{ "vpaddw $dst,$src,$mem\t! add packed2S" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
instruct vadd4S(vecD dst, vecD src) %{
predicate(n->as_Vector()->length() == 4);
match(Set dst (AddVS dst src));
@@ -5112,6 +5647,17 @@
ins_pipe( pipe_slow );
%}
+instruct vadd4S_mem(vecD dst, vecD src, memory mem) %{
+ predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+ match(Set dst (AddVS src (LoadVector mem)));
+ format %{ "vpaddw $dst,$src,$mem\t! add packed4S" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
instruct vadd8S(vecX dst, vecX src) %{
predicate(n->as_Vector()->length() == 8);
match(Set dst (AddVS dst src));
@@ -5210,6 +5756,17 @@
ins_pipe( pipe_slow );
%}
+instruct vadd2I_mem(vecD dst, vecD src, memory mem) %{
+ predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+ match(Set dst (AddVI src (LoadVector mem)));
+ format %{ "vpaddd $dst,$src,$mem\t! add packed2I" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
instruct vadd4I(vecX dst, vecX src) %{
predicate(n->as_Vector()->length() == 4);
match(Set dst (AddVI dst src));
@@ -5385,6 +5942,17 @@
ins_pipe( pipe_slow );
%}
+instruct vadd2F_mem(vecD dst, vecD src, memory mem) %{
+ predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+ match(Set dst (AddVF src (LoadVector mem)));
+ format %{ "vaddps $dst,$src,$mem\t! add packed2F" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
instruct vadd4F(vecX dst, vecX src) %{
predicate(n->as_Vector()->length() == 4);
match(Set dst (AddVF dst src));
@@ -5562,6 +6130,17 @@
ins_pipe( pipe_slow );
%}
+instruct vsub4B_mem(vecS dst, vecS src, memory mem) %{
+ predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+ match(Set dst (SubVB src (LoadVector mem)));
+ format %{ "vpsubb $dst,$src,$mem\t! sub packed4B" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
instruct vsub8B(vecD dst, vecD src) %{
predicate(n->as_Vector()->length() == 8);
match(Set dst (SubVB dst src));
@@ -5583,6 +6162,17 @@
ins_pipe( pipe_slow );
%}
+instruct vsub8B_mem(vecD dst, vecD src, memory mem) %{
+ predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
+ match(Set dst (SubVB src (LoadVector mem)));
+ format %{ "vpsubb $dst,$src,$mem\t! sub packed8B" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
instruct vsub16B(vecX dst, vecX src) %{
predicate(n->as_Vector()->length() == 16);
match(Set dst (SubVB dst src));
@@ -5681,6 +6271,17 @@
ins_pipe( pipe_slow );
%}
+instruct vsub2S_mem(vecS dst, vecS src, memory mem) %{
+ predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+ match(Set dst (SubVS src (LoadVector mem)));
+ format %{ "vpsubw $dst,$src,$mem\t! sub packed2S" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
instruct vsub4S(vecD dst, vecD src) %{
predicate(n->as_Vector()->length() == 4);
match(Set dst (SubVS dst src));
@@ -5702,6 +6303,17 @@
ins_pipe( pipe_slow );
%}
+instruct vsub4S_mem(vecD dst, vecD src, memory mem) %{
+ predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+ match(Set dst (SubVS src (LoadVector mem)));
+ format %{ "vpsubw $dst,$src,$mem\t! sub packed4S" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
instruct vsub8S(vecX dst, vecX src) %{
predicate(n->as_Vector()->length() == 8);
match(Set dst (SubVS dst src));
@@ -5800,6 +6412,17 @@
ins_pipe( pipe_slow );
%}
+instruct vsub2I_mem(vecD dst, vecD src, memory mem) %{
+ predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+ match(Set dst (SubVI src (LoadVector mem)));
+ format %{ "vpsubd $dst,$src,$mem\t! sub packed2I" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
instruct vsub4I(vecX dst, vecX src) %{
predicate(n->as_Vector()->length() == 4);
match(Set dst (SubVI dst src));
@@ -5975,6 +6598,17 @@
ins_pipe( pipe_slow );
%}
+instruct vsub2F_mem(vecD dst, vecD src, memory mem) %{
+ predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+ match(Set dst (SubVF src (LoadVector mem)));
+ format %{ "vsubps $dst,$src,$mem\t! sub packed2F" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
instruct vsub4F(vecX dst, vecX src) %{
predicate(n->as_Vector()->length() == 4);
match(Set dst (SubVF dst src));
@@ -6152,6 +6786,17 @@
ins_pipe( pipe_slow );
%}
+instruct vmul2S_mem(vecS dst, vecS src, memory mem) %{
+ predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+ match(Set dst (MulVS src (LoadVector mem)));
+ format %{ "vpmullw $dst,$src,$mem\t! mul packed2S" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
instruct vmul4S(vecD dst, vecD src) %{
predicate(n->as_Vector()->length() == 4);
match(Set dst (MulVS dst src));
@@ -6173,6 +6818,17 @@
ins_pipe( pipe_slow );
%}
+instruct vmul4S_mem(vecD dst, vecD src, memory mem) %{
+ predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+ match(Set dst (MulVS src (LoadVector mem)));
+ format %{ "vpmullw $dst,$src,$mem\t! mul packed4S" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
instruct vmul8S(vecX dst, vecX src) %{
predicate(n->as_Vector()->length() == 8);
match(Set dst (MulVS dst src));
@@ -6271,13 +6927,13 @@
ins_pipe( pipe_slow );
%}
-instruct vmul2L_reg(vecX dst, vecX src1, vecX src2) %{
- predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq());
- match(Set dst (MulVL src1 src2));
- format %{ "vpmullq $dst,$src1,$src2\t! mul packed2L" %}
- ins_encode %{
- int vector_len = 0;
- __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+instruct vmul2I_mem(vecD dst, vecD src, memory mem) %{
+ predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+ match(Set dst (MulVI src (LoadVector mem)));
+ format %{ "vpmulld $dst,$src,$mem\t! mul packed2I" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
%}
ins_pipe( pipe_slow );
%}
@@ -6314,6 +6970,28 @@
ins_pipe( pipe_slow );
%}
+instruct vmul2L_reg(vecX dst, vecX src1, vecX src2) %{
+ predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq());
+ match(Set dst (MulVL src1 src2));
+ format %{ "vpmullq $dst,$src1,$src2\t! mul packed2L" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct vmul2L_mem(vecX dst, vecX src, memory mem) %{
+ predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq());
+ match(Set dst (MulVL src (LoadVector mem)));
+ format %{ "vpmullq $dst,$src,$mem\t! mul packed2L" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
instruct vmul4L_reg(vecY dst, vecY src1, vecY src2) %{
predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && VM_Version::supports_avx512dq());
match(Set dst (MulVL src1 src2));
@@ -6336,17 +7014,6 @@
ins_pipe( pipe_slow );
%}
-instruct vmul8I_reg(vecY dst, vecY src1, vecY src2) %{
- predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
- match(Set dst (MulVI src1 src2));
- format %{ "vpmulld $dst,$src1,$src2\t! mul packed8I" %}
- ins_encode %{
- int vector_len = 1;
- __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
- %}
- ins_pipe( pipe_slow );
-%}
-
instruct vmul8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
match(Set dst (MulVL src1 src2));
@@ -6358,12 +7025,23 @@
ins_pipe( pipe_slow );
%}
-instruct vmul16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
- predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
- match(Set dst (MulVI src1 src2));
- format %{ "vpmulld $dst,$src1,$src2\t! mul packed16I" %}
+instruct vmul8L_mem(vecZ dst, vecZ src, memory mem) %{
+ predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
+ match(Set dst (MulVL src (LoadVector mem)));
+ format %{ "vpmullq $dst,$src,$mem\t! mul packed8L" %}
ins_encode %{
int vector_len = 2;
+ __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct vmul8I_reg(vecY dst, vecY src1, vecY src2) %{
+ predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
+ match(Set dst (MulVI src1 src2));
+ format %{ "vpmulld $dst,$src1,$src2\t! mul packed8I" %}
+ ins_encode %{
+ int vector_len = 1;
__ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
%}
ins_pipe( pipe_slow );
@@ -6380,13 +7058,13 @@
ins_pipe( pipe_slow );
%}
-instruct vmul8L_mem(vecZ dst, vecZ src, memory mem) %{
- predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
- match(Set dst (MulVL src (LoadVector mem)));
- format %{ "vpmullq $dst,$src,$mem\t! mul packed8L" %}
+instruct vmul16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
+ predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
+ match(Set dst (MulVI src1 src2));
+ format %{ "vpmulld $dst,$src1,$src2\t! mul packed16I" %}
ins_encode %{
int vector_len = 2;
- __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+ __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
%}
ins_pipe( pipe_slow );
%}
@@ -6424,6 +7102,17 @@
ins_pipe( pipe_slow );
%}
+instruct vmul2F_mem(vecD dst, vecD src, memory mem) %{
+ predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+ match(Set dst (MulVF src (LoadVector mem)));
+ format %{ "vmulps $dst,$src,$mem\t! mul packed2F" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
instruct vmul4F(vecX dst, vecX src) %{
predicate(n->as_Vector()->length() == 4);
match(Set dst (MulVF dst src));
@@ -6601,6 +7290,17 @@
ins_pipe( pipe_slow );
%}
+instruct vdiv2F_mem(vecD dst, vecD src, memory mem) %{
+ predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+ match(Set dst (DivVF src (LoadVector mem)));
+ format %{ "vdivps $dst,$src,$mem\t! div packed2F" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
instruct vdiv4F(vecX dst, vecX src) %{
predicate(n->as_Vector()->length() == 4);
match(Set dst (DivVF dst src));
@@ -7878,6 +8578,17 @@
ins_pipe( pipe_slow );
%}
+instruct vand4B_mem(vecS dst, vecS src, memory mem) %{
+ predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
+ match(Set dst (AndV src (LoadVector mem)));
+ format %{ "vpand $dst,$src,$mem\t! and vectors (4 bytes)" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
instruct vand8B(vecD dst, vecD src) %{
predicate(n->as_Vector()->length_in_bytes() == 8);
match(Set dst (AndV dst src));
@@ -7899,6 +8610,17 @@
ins_pipe( pipe_slow );
%}
+instruct vand8B_mem(vecD dst, vecD src, memory mem) %{
+ predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
+ match(Set dst (AndV src (LoadVector mem)));
+ format %{ "vpand $dst,$src,$mem\t! and vectors (8 bytes)" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
instruct vand16B(vecX dst, vecX src) %{
predicate(n->as_Vector()->length_in_bytes() == 16);
match(Set dst (AndV dst src));
@@ -7998,6 +8720,17 @@
ins_pipe( pipe_slow );
%}
+instruct vor4B_mem(vecS dst, vecS src, memory mem) %{
+ predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
+ match(Set dst (OrV src (LoadVector mem)));
+ format %{ "vpor $dst,$src,$mem\t! or vectors (4 bytes)" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
instruct vor8B(vecD dst, vecD src) %{
predicate(n->as_Vector()->length_in_bytes() == 8);
match(Set dst (OrV dst src));
@@ -8019,6 +8752,17 @@
ins_pipe( pipe_slow );
%}
+instruct vor8B_mem(vecD dst, vecD src, memory mem) %{
+ predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
+ match(Set dst (OrV src (LoadVector mem)));
+ format %{ "vpor $dst,$src,$mem\t! or vectors (8 bytes)" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
instruct vor16B(vecX dst, vecX src) %{
predicate(n->as_Vector()->length_in_bytes() == 16);
match(Set dst (OrV dst src));
@@ -8118,6 +8862,17 @@
ins_pipe( pipe_slow );
%}
+instruct vxor4B_mem(vecS dst, vecS src, memory mem) %{
+ predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
+ match(Set dst (XorV src (LoadVector mem)));
+ format %{ "vpxor $dst,$src,$mem\t! xor vectors (4 bytes)" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
instruct vxor8B(vecD dst, vecD src) %{
predicate(n->as_Vector()->length_in_bytes() == 8);
match(Set dst (XorV dst src));
@@ -8139,6 +8894,17 @@
ins_pipe( pipe_slow );
%}
+instruct vxor8B_mem(vecD dst, vecD src, memory mem) %{
+ predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
+ match(Set dst (XorV src (LoadVector mem)));
+ format %{ "vpxor $dst,$src,$mem\t! xor vectors (8 bytes)" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
instruct vxor16B(vecX dst, vecX src) %{
predicate(n->as_Vector()->length_in_bytes() == 16);
match(Set dst (XorV dst src));