8221092: UseAVX=3 has performance degradation on Skylake (X7) processors
Summary: Fix performance degradation with UseAVX=3 on Skylake (X7) processors
Reviewed-by: kvn
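The change adds a diagnostic flag, AVX3Threshold (default 4096 bytes, range 0..max_jint, with
non-default values expected to be a power of two), lowers the default UseAVX to 2 on Skylake
server parts (model 0x55, stepping < 5), and gates the 64-byte (ZMM) loops in the copy, fill
and inflate stubs so that they are only taken for inputs at or above the threshold. Setting
the threshold to zero also allows compare, vectorizedMismatch and compress to use their
AVX-512 variants. The separate EVEX replicate-zero patterns for small vectors in x86.ad are
removed in favor of the generic AVX forms.

A minimal conceptual sketch of the threshold dispatch, written in plain C++ rather than the
MacroAssembler/stub code the patch actually changes (names below are illustrative only, not
taken from the patch):

    #include <cstddef>

    // Large inputs (or an explicit threshold of 0) take the 64-byte AVX-512 loop;
    // everything else stays on the 32-byte AVX2 loop.
    enum class VectorPath { Avx2_32Byte, Avx512_64Byte };

    VectorPath pick_path(std::size_t byte_count, int avx3_threshold /* e.g. 4096 */) {
      if (avx3_threshold == 0 || byte_count >= static_cast<std::size_t>(avx3_threshold)) {
        return VectorPath::Avx512_64Byte;   // evmovdqul-based loop in the real stubs
      }
      return VectorPath::Avx2_32Byte;       // vmovdqu-based loop in the real stubs
    }

Being a diagnostic flag, AVX3Threshold is set explicitly with
-XX:+UnlockDiagnosticVMOptions -XX:AVX3Threshold=<bytes>.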
--- a/src/hotspot/cpu/x86/globals_x86.hpp Fri Oct 04 12:00:16 2019 -0400
+++ b/src/hotspot/cpu/x86/globals_x86.hpp Fri Oct 04 11:45:16 2019 -0700
@@ -211,5 +211,15 @@
"Use BMI2 instructions") \
\
diagnostic(bool, UseLibmIntrinsic, true, \
- "Use Libm Intrinsics")
+ "Use Libm Intrinsics") \
+ \
+ /* Minimum array size in bytes to use AVX512 intrinsics */ \
+ /* for copy, inflate and fill, which don't bail out early based on any */ \
+ /* condition. When this value is set to zero, compare operations like */ \
+ /* compare, vectorizedMismatch, compress can also use AVX512 intrinsics.*/\
+ diagnostic(int, AVX3Threshold, 4096, \
+ "Minimum array size in bytes to use AVX512 intrinsics" \
+ "for copy, inflate and fill. When this value is set as zero" \
+ "compare operations can also use AVX512 intrinsics.") \
+ range(0, max_jint)
#endif // CPU_X86_GLOBALS_X86_HPP
--- a/src/hotspot/cpu/x86/macroAssembler_x86.cpp Fri Oct 04 12:00:16 2019 -0400
+++ b/src/hotspot/cpu/x86/macroAssembler_x86.cpp Fri Oct 04 11:45:16 2019 -0700
@@ -6593,7 +6593,7 @@
bind(COMPARE_WIDE_VECTORS_LOOP);
#ifdef _LP64
- if (VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
+ if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
cmpl(cnt2, stride2x2);
jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
testl(cnt2, stride2x2-1); // cnt2 holds the vector count
@@ -6853,7 +6853,7 @@
testl(len, len);
jcc(Assembler::zero, FALSE_LABEL);
- if ((UseAVX > 2) && // AVX512
+ if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
VM_Version::supports_avx512vlbw() &&
VM_Version::supports_bmi2()) {
@@ -6926,7 +6926,7 @@
} else {
movl(result, len); // copy
- if (UseAVX == 2 && UseSSE >= 2) {
+ if (UseAVX >= 2 && UseSSE >= 2) {
// With AVX2, use 32-byte vector compare
Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
@@ -7099,14 +7099,12 @@
lea(ary2, Address(ary2, limit, Address::times_1));
negptr(limit);
- bind(COMPARE_WIDE_VECTORS);
-
#ifdef _LP64
- if (VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
+ if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
cmpl(limit, -64);
- jccb(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
+ jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
@@ -7139,7 +7137,7 @@
}//if (VM_Version::supports_avx512vlbw())
#endif //_LP64
-
+ bind(COMPARE_WIDE_VECTORS);
vmovdqu(vec1, Address(ary1, limit, Address::times_1));
vmovdqu(vec2, Address(ary2, limit, Address::times_1));
vpxor(vec1, vec2);
@@ -7365,32 +7363,33 @@
assert( UseSSE >= 2, "supported cpu only" );
Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
movdl(xtmp, value);
- if (UseAVX > 2 && UseUnalignedLoadStores) {
+ if (UseAVX >= 2 && UseUnalignedLoadStores) {
+ Label L_check_fill_32_bytes;
+ if (UseAVX > 2) {
+ // Fill 64-byte chunks
+ Label L_fill_64_bytes_loop_avx3, L_check_fill_64_bytes_avx2;
+
+ // If number of bytes to fill < AVX3Threshold, perform fill using AVX2
+ cmpl(count, AVX3Threshold);
+ jccb(Assembler::below, L_check_fill_64_bytes_avx2);
+
+ vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
+
+ subl(count, 16 << shift);
+ jccb(Assembler::less, L_check_fill_32_bytes);
+ align(16);
+
+ BIND(L_fill_64_bytes_loop_avx3);
+ evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
+ addptr(to, 64);
+ subl(count, 16 << shift);
+ jcc(Assembler::greaterEqual, L_fill_64_bytes_loop_avx3);
+ jmpb(L_check_fill_32_bytes);
+
+ BIND(L_check_fill_64_bytes_avx2);
+ }
// Fill 64-byte chunks
- Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
- vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
-
- subl(count, 16 << shift);
- jcc(Assembler::less, L_check_fill_32_bytes);
- align(16);
-
- BIND(L_fill_64_bytes_loop);
- evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
- addptr(to, 64);
- subl(count, 16 << shift);
- jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
-
- BIND(L_check_fill_32_bytes);
- addl(count, 8 << shift);
- jccb(Assembler::less, L_check_fill_8_bytes);
- vmovdqu(Address(to, 0), xtmp);
- addptr(to, 32);
- subl(count, 8 << shift);
-
- BIND(L_check_fill_8_bytes);
- } else if (UseAVX == 2 && UseUnalignedLoadStores) {
- // Fill 64-byte chunks
- Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
+ Label L_fill_64_bytes_loop;
vpbroadcastd(xtmp, xtmp, Assembler::AVX_256bit);
subl(count, 16 << shift);
@@ -8104,12 +8103,13 @@
shlq(length);
xorq(result, result);
- if ((UseAVX > 2) &&
+ if ((AVX3Threshold == 0) && (UseAVX > 2) &&
VM_Version::supports_avx512vlbw()) {
Label VECTOR64_LOOP, VECTOR64_NOT_EQUAL, VECTOR32_TAIL;
cmpq(length, 64);
jcc(Assembler::less, VECTOR32_TAIL);
+
movq(tmp1, length);
andq(tmp1, 0x3F); // tail count
andq(length, ~(0x3F)); //vector count
@@ -9566,7 +9566,7 @@
// save length for return
push(len);
- if ((UseAVX > 2) && // AVX512
+ if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
VM_Version::supports_avx512vlbw() &&
VM_Version::supports_bmi2()) {
@@ -9758,7 +9758,7 @@
// }
void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
XMMRegister tmp1, Register tmp2) {
- Label copy_chars_loop, done, below_threshold;
+ Label copy_chars_loop, done, below_threshold, avx3_threshold;
// rsi: src
// rdi: dst
// rdx: len
@@ -9768,7 +9768,7 @@
// rdi holds start addr of destination char[]
// rdx holds length
assert_different_registers(src, dst, len, tmp2);
-
+ movl(tmp2, len);
if ((UseAVX > 2) && // AVX512
VM_Version::supports_avx512vlbw() &&
VM_Version::supports_bmi2()) {
@@ -9780,9 +9780,11 @@
testl(len, -16);
jcc(Assembler::zero, below_threshold);
+ testl(len, -1 * AVX3Threshold);
+ jcc(Assembler::zero, avx3_threshold);
+
// In order to use only one arithmetic operation for the main loop we use
// this pre-calculation
- movl(tmp2, len);
andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop
andl(len, -32); // vector count
jccb(Assembler::zero, copy_tail);
@@ -9813,12 +9815,11 @@
evmovdquw(Address(dst, 0), k2, tmp1, Assembler::AVX_512bit);
jmp(done);
+ bind(avx3_threshold);
}
if (UseSSE42Intrinsics) {
Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail;
- movl(tmp2, len);
-
if (UseAVX > 1) {
andl(tmp2, (16 - 1));
andl(len, -16);
@@ -9843,13 +9844,7 @@
bind(below_threshold);
bind(copy_new_tail);
- if ((UseAVX > 2) &&
- VM_Version::supports_avx512vlbw() &&
- VM_Version::supports_bmi2()) {
- movl(tmp2, len);
- } else {
- movl(len, tmp2);
- }
+ movl(len, tmp2);
andl(tmp2, 0x00000007);
andl(len, 0xFFFFFFF8);
jccb(Assembler::zero, copy_tail);
--- a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp Fri Oct 04 12:00:16 2019 -0400
+++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp Fri Oct 04 11:45:16 2019 -0700
@@ -1288,30 +1288,58 @@
if (UseUnalignedLoadStores) {
Label L_end;
// Copy 64-bytes per iteration
- __ BIND(L_loop);
if (UseAVX > 2) {
+ Label L_loop_avx512, L_loop_avx2, L_32_byte_head, L_above_threshold, L_below_threshold;
+
+ __ BIND(L_copy_bytes);
+ __ cmpptr(qword_count, (-1 * AVX3Threshold / 8));
+ __ jccb(Assembler::less, L_above_threshold);
+ __ jmpb(L_below_threshold);
+
+ __ bind(L_loop_avx512);
__ evmovdqul(xmm0, Address(end_from, qword_count, Address::times_8, -56), Assembler::AVX_512bit);
__ evmovdqul(Address(end_to, qword_count, Address::times_8, -56), xmm0, Assembler::AVX_512bit);
- } else if (UseAVX == 2) {
+ __ bind(L_above_threshold);
+ __ addptr(qword_count, 8);
+ __ jcc(Assembler::lessEqual, L_loop_avx512);
+ __ jmpb(L_32_byte_head);
+
+ __ bind(L_loop_avx2);
__ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
__ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
__ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
__ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1);
+ __ bind(L_below_threshold);
+ __ addptr(qword_count, 8);
+ __ jcc(Assembler::lessEqual, L_loop_avx2);
+
+ __ bind(L_32_byte_head);
+ __ subptr(qword_count, 4); // sub(8) and add(4)
+ __ jccb(Assembler::greater, L_end);
} else {
- __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
- __ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
- __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40));
- __ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1);
- __ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24));
- __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2);
- __ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8));
- __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm3);
+ __ BIND(L_loop);
+ if (UseAVX == 2) {
+ __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
+ __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
+ __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
+ __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1);
+ } else {
+ __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
+ __ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
+ __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40));
+ __ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1);
+ __ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24));
+ __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2);
+ __ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8));
+ __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm3);
+ }
+
+ __ BIND(L_copy_bytes);
+ __ addptr(qword_count, 8);
+ __ jcc(Assembler::lessEqual, L_loop);
+ __ subptr(qword_count, 4); // sub(8) and add(4)
+ __ jccb(Assembler::greater, L_end);
}
- __ BIND(L_copy_bytes);
- __ addptr(qword_count, 8);
- __ jcc(Assembler::lessEqual, L_loop);
- __ subptr(qword_count, 4); // sub(8) and add(4)
- __ jccb(Assembler::greater, L_end);
// Copy trailing 32 bytes
if (UseAVX >= 2) {
__ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
@@ -1368,31 +1396,59 @@
if (UseUnalignedLoadStores) {
Label L_end;
// Copy 64-bytes per iteration
- __ BIND(L_loop);
if (UseAVX > 2) {
+ Label L_loop_avx512, L_loop_avx2, L_32_byte_head, L_above_threshold, L_below_threshold;
+
+ __ BIND(L_copy_bytes);
+ __ cmpptr(qword_count, (AVX3Threshold / 8));
+ __ jccb(Assembler::greater, L_above_threshold);
+ __ jmpb(L_below_threshold);
+
+ __ BIND(L_loop_avx512);
__ evmovdqul(xmm0, Address(from, qword_count, Address::times_8, 0), Assembler::AVX_512bit);
__ evmovdqul(Address(dest, qword_count, Address::times_8, 0), xmm0, Assembler::AVX_512bit);
- } else if (UseAVX == 2) {
+ __ bind(L_above_threshold);
+ __ subptr(qword_count, 8);
+ __ jcc(Assembler::greaterEqual, L_loop_avx512);
+ __ jmpb(L_32_byte_head);
+
+ __ bind(L_loop_avx2);
__ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
__ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
- __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8, 0));
- __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm1);
+ __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8, 0));
+ __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm1);
+ __ bind(L_below_threshold);
+ __ subptr(qword_count, 8);
+ __ jcc(Assembler::greaterEqual, L_loop_avx2);
+
+ __ bind(L_32_byte_head);
+ __ addptr(qword_count, 4); // add(8) and sub(4)
+ __ jccb(Assembler::less, L_end);
} else {
- __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48));
- __ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0);
- __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32));
- __ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1);
- __ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16));
- __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2);
- __ movdqu(xmm3, Address(from, qword_count, Address::times_8, 0));
- __ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm3);
+ __ BIND(L_loop);
+ if (UseAVX == 2) {
+ __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
+ __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
+ __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8, 0));
+ __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm1);
+ } else {
+ __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48));
+ __ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0);
+ __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32));
+ __ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1);
+ __ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16));
+ __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2);
+ __ movdqu(xmm3, Address(from, qword_count, Address::times_8, 0));
+ __ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm3);
+ }
+
+ __ BIND(L_copy_bytes);
+ __ subptr(qword_count, 8);
+ __ jcc(Assembler::greaterEqual, L_loop);
+
+ __ addptr(qword_count, 4); // add(8) and sub(4)
+ __ jccb(Assembler::less, L_end);
}
- __ BIND(L_copy_bytes);
- __ subptr(qword_count, 8);
- __ jcc(Assembler::greaterEqual, L_loop);
-
- __ addptr(qword_count, 4); // add(8) and sub(4)
- __ jccb(Assembler::less, L_end);
// Copy trailing 32 bytes
if (UseAVX >= 2) {
__ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 0));
--- a/src/hotspot/cpu/x86/vm_version_x86.cpp Fri Oct 04 12:00:16 2019 -0400
+++ b/src/hotspot/cpu/x86/vm_version_x86.cpp Fri Oct 04 11:45:16 2019 -0700
@@ -381,6 +381,10 @@
__ cmpl(rax, 0xE0);
__ jccb(Assembler::notEqual, legacy_setup); // jump if EVEX is not supported
+ __ lea(rsi, Address(rbp, in_bytes(VM_Version::std_cpuid1_offset())));
+ __ movl(rax, Address(rsi, 0));
+ __ cmpl(rax, 0x50654); // Skylake: CPUID signature family 6, model 0x55, stepping 4
+ __ jcc(Assembler::equal, legacy_setup);
// If UseAVX is unitialized or is set by the user to include EVEX
if (use_evex) {
// EVEX setup: run in lowest evex mode
@@ -465,6 +469,11 @@
__ cmpl(rax, 0xE0);
__ jcc(Assembler::notEqual, legacy_save_restore);
+ __ lea(rsi, Address(rbp, in_bytes(VM_Version::std_cpuid1_offset())));
+ __ movl(rax, Address(rsi, 0));
+ __ cmpl(rax, 0x50654); // Skylake: CPUID signature family 6, model 0x55, stepping 4
+ __ jcc(Assembler::equal, legacy_save_restore);
+
// If UseAVX is unitialized or is set by the user to include EVEX
if (use_evex) {
// EVEX check: run in lowest evex mode
@@ -660,6 +669,9 @@
}
if (FLAG_IS_DEFAULT(UseAVX)) {
FLAG_SET_DEFAULT(UseAVX, use_avx_limit);
+ if (is_intel_family_core() && _model == CPU_MODEL_SKYLAKE && _stepping < 5) {
+ FLAG_SET_DEFAULT(UseAVX, 2); // Set UseAVX=2 for Skylake
+ }
} else if (UseAVX > use_avx_limit) {
warning("UseAVX=%d is not supported on this CPU, setting it to UseAVX=%d", (int) UseAVX, use_avx_limit);
FLAG_SET_DEFAULT(UseAVX, use_avx_limit);
@@ -1059,6 +1071,13 @@
}
#endif // COMPILER2 && ASSERT
+ if (!FLAG_IS_DEFAULT(AVX3Threshold)) {
+ if (!is_power_of_2(AVX3Threshold)) {
+ warning("AVX3Threshold must be a power of 2");
+ FLAG_SET_DEFAULT(AVX3Threshold, 4096);
+ }
+ }
+
#ifdef _LP64
if (FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) {
UseMultiplyToLenIntrinsic = true;
--- a/src/hotspot/cpu/x86/vm_version_x86.hpp Fri Oct 04 12:00:16 2019 -0400
+++ b/src/hotspot/cpu/x86/vm_version_x86.hpp Fri Oct 04 11:45:16 2019 -0700
@@ -366,7 +366,7 @@
CPU_MODEL_HASWELL_E3 = 0x3c,
CPU_MODEL_HASWELL_E7 = 0x3f,
CPU_MODEL_BROADWELL = 0x3d,
- CPU_MODEL_SKYLAKE = CPU_MODEL_HASWELL_E3
+ CPU_MODEL_SKYLAKE = 0x55
};
// cpuid information block. All info derived from executing cpuid with
--- a/src/hotspot/cpu/x86/x86.ad Fri Oct 04 12:00:16 2019 -0400
+++ b/src/hotspot/cpu/x86/x86.ad Fri Oct 04 11:45:16 2019 -0700
@@ -3861,7 +3861,7 @@
%}
instruct Repl2F_zero(vecD dst, immF0 zero) %{
- predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
+ predicate(n->as_Vector()->length() == 2);
match(Set dst (ReplicateF zero));
format %{ "xorps $dst,$dst\t! replicate2F zero" %}
ins_encode %{
@@ -3871,7 +3871,7 @@
%}
instruct Repl4F_zero(vecX dst, immF0 zero) %{
- predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
+ predicate(n->as_Vector()->length() == 4);
match(Set dst (ReplicateF zero));
format %{ "xorps $dst,$dst\t! replicate4F zero" %}
ins_encode %{
@@ -3881,7 +3881,7 @@
%}
instruct Repl8F_zero(vecY dst, immF0 zero) %{
- predicate(n->as_Vector()->length() == 8 && UseAVX < 3);
+ predicate(n->as_Vector()->length() == 8 && UseAVX > 0);
match(Set dst (ReplicateF zero));
format %{ "vxorps $dst,$dst,$dst\t! replicate8F zero" %}
ins_encode %{
@@ -3955,7 +3955,7 @@
// Replicate double (8 byte) scalar zero to be vector
instruct Repl2D_zero(vecX dst, immD0 zero) %{
- predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
+ predicate(n->as_Vector()->length() == 2);
match(Set dst (ReplicateD zero));
format %{ "xorpd $dst,$dst\t! replicate2D zero" %}
ins_encode %{
@@ -3965,7 +3965,7 @@
%}
instruct Repl4D_zero(vecY dst, immD0 zero) %{
- predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
+ predicate(n->as_Vector()->length() == 4 && UseAVX > 0);
match(Set dst (ReplicateD zero));
format %{ "vxorpd $dst,$dst,$dst,vect256\t! replicate4D zero" %}
ins_encode %{
@@ -4890,42 +4890,6 @@
ins_pipe( pipe_slow );
%}
-instruct Repl2F_zero_evex(vecD dst, immF0 zero) %{
- predicate(n->as_Vector()->length() == 2 && UseAVX > 2);
- match(Set dst (ReplicateF zero));
- format %{ "vpxor $dst k0,$dst,$dst\t! replicate2F zero" %}
- ins_encode %{
- // Use vpxor in place of vxorps since EVEX has a constriant on dq for vxorps: this is a 512-bit operation
- int vector_len = 2;
- __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
- %}
- ins_pipe( fpu_reg_reg );
-%}
-
-instruct Repl4F_zero_evex(vecX dst, immF0 zero) %{
- predicate(n->as_Vector()->length() == 4 && UseAVX > 2);
- match(Set dst (ReplicateF zero));
- format %{ "vpxor $dst k0,$dst,$dst\t! replicate4F zero" %}
- ins_encode %{
- // Use vpxor in place of vxorps since EVEX has a constriant on dq for vxorps: this is a 512-bit operation
- int vector_len = 2;
- __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
- %}
- ins_pipe( fpu_reg_reg );
-%}
-
-instruct Repl8F_zero_evex(vecY dst, immF0 zero) %{
- predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
- match(Set dst (ReplicateF zero));
- format %{ "vpxor $dst k0,$dst,$dst\t! replicate8F zero" %}
- ins_encode %{
- // Use vpxor in place of vxorps since EVEX has a constriant on dq for vxorps: this is a 512-bit operation
- int vector_len = 2;
- __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
- %}
- ins_pipe( fpu_reg_reg );
-%}
-
instruct Repl16F_zero_evex(vecZ dst, immF0 zero) %{
predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
match(Set dst (ReplicateF zero));
@@ -4982,30 +4946,6 @@
ins_pipe( pipe_slow );
%}
-instruct Repl2D_zero_evex(vecX dst, immD0 zero) %{
- predicate(n->as_Vector()->length() == 2 && UseAVX > 2);
- match(Set dst (ReplicateD zero));
- format %{ "vpxor $dst k0,$dst,$dst\t! replicate2D zero" %}
- ins_encode %{
- // Use vpxor in place of vxorpd since EVEX has a constriant on dq for vxorpd: this is a 512-bit operation
- int vector_len = 2;
- __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
- %}
- ins_pipe( fpu_reg_reg );
-%}
-
-instruct Repl4D_zero_evex(vecY dst, immD0 zero) %{
- predicate(n->as_Vector()->length() == 4 && UseAVX > 2);
- match(Set dst (ReplicateD zero));
- format %{ "vpxor $dst k0,$dst,$dst\t! replicate4D zero" %}
- ins_encode %{
- // Use vpxor in place of vxorpd since EVEX has a constriant on dq for vxorpd: this is a 512-bit operation
- int vector_len = 2;
- __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
- %}
- ins_pipe( fpu_reg_reg );
-%}
-
instruct Repl8D_zero_evex(vecZ dst, immD0 zero) %{
predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
match(Set dst (ReplicateD zero));