--- a/src/hotspot/cpu/x86/macroAssembler_x86.cpp Fri Oct 04 12:00:16 2019 -0400
+++ b/src/hotspot/cpu/x86/macroAssembler_x86.cpp Fri Oct 04 11:45:16 2019 -0700
@@ -6593,7 +6593,7 @@
bind(COMPARE_WIDE_VECTORS_LOOP);
#ifdef _LP64
- if (VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
+ if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
cmpl(cnt2, stride2x2);
jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
testl(cnt2, stride2x2-1); // cnt2 holds the vector count
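
The guard added on the '+' line above can be read as the sketch below (the function and parameter names are illustrative, not part of the patch). The threshold and CPU-feature checks are resolved once, when the stub is generated; only the cnt2 check runs per call:

    // Sketch of the condition gating the 64-byte compare loop.
    // A zero AVX3Threshold means "no limit": 512-bit code may be emitted.
    static bool take_64_byte_compare_loop(long avx3_threshold, bool has_avx512vlbw,
                                          unsigned cnt2, unsigned stride2x2) {
      return avx3_threshold == 0      // any other value keeps the 32-byte AVX2 loop
          && has_avx512vlbw           // VM_Version::supports_avx512vlbw()
          && cnt2 >= stride2x2;       // the cmpl/jccb(below, ...) pair above
    }
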
@@ -6853,7 +6853,7 @@
testl(len, len);
jcc(Assembler::zero, FALSE_LABEL);
- if ((UseAVX > 2) && // AVX512
+ if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
VM_Version::supports_avx512vlbw() &&
VM_Version::supports_bmi2()) {
@@ -6926,7 +6926,7 @@
} else {
movl(result, len); // copy
- if (UseAVX == 2 && UseSSE >= 2) {
+ if (UseAVX >= 2 && UseSSE >= 2) {
// With AVX2, use 32-byte vector compare
Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
@@ -7099,14 +7099,12 @@
lea(ary2, Address(ary2, limit, Address::times_1));
negptr(limit);
- bind(COMPARE_WIDE_VECTORS);
-
#ifdef _LP64
- if (VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
+ if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
cmpl(limit, -64);
- jccb(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
+ jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
@@ -7139,7 +7137,7 @@
}//if (VM_Version::supports_avx512vlbw())
#endif //_LP64
-
+ bind(COMPARE_WIDE_VECTORS);
vmovdqu(vec1, Address(ary1, limit, Address::times_1));
vmovdqu(vec2, Address(ary2, limit, Address::times_1));
vpxor(vec1, vec2);
@@ -7365,32 +7363,33 @@
assert( UseSSE >= 2, "supported cpu only" );
Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
movdl(xtmp, value);
- if (UseAVX > 2 && UseUnalignedLoadStores) {
+ if (UseAVX >= 2 && UseUnalignedLoadStores) {
+ Label L_check_fill_32_bytes;
+ if (UseAVX > 2) {
+ // Fill 64-byte chunks
+ Label L_fill_64_bytes_loop_avx3, L_check_fill_64_bytes_avx2;
+
+ // If number of bytes to fill < AVX3Threshold, perform fill using AVX2
+ cmpl(count, AVX3Threshold);
+ jccb(Assembler::below, L_check_fill_64_bytes_avx2);
+
+ vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
+
+ subl(count, 16 << shift);
+ jccb(Assembler::less, L_check_fill_32_bytes);
+ align(16);
+
+ BIND(L_fill_64_bytes_loop_avx3);
+ evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
+ addptr(to, 64);
+ subl(count, 16 << shift);
+ jcc(Assembler::greaterEqual, L_fill_64_bytes_loop_avx3);
+ jmpb(L_check_fill_32_bytes);
+
+ BIND(L_check_fill_64_bytes_avx2);
+ }
// Fill 64-byte chunks
- Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
- vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
-
- subl(count, 16 << shift);
- jcc(Assembler::less, L_check_fill_32_bytes);
- align(16);
-
- BIND(L_fill_64_bytes_loop);
- evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
- addptr(to, 64);
- subl(count, 16 << shift);
- jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
-
- BIND(L_check_fill_32_bytes);
- addl(count, 8 << shift);
- jccb(Assembler::less, L_check_fill_8_bytes);
- vmovdqu(Address(to, 0), xtmp);
- addptr(to, 32);
- subl(count, 8 << shift);
-
- BIND(L_check_fill_8_bytes);
- } else if (UseAVX == 2 && UseUnalignedLoadStores) {
- // Fill 64-byte chunks
- Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
+ Label L_fill_64_bytes_loop;
vpbroadcastd(xtmp, xtmp, Assembler::AVX_256bit);
subl(count, 16 << shift);
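
In outline, the restructured fill path now checks the fill count against AVX3Threshold before choosing a vector width; a minimal C++ stand-in (FillWidth and pick_fill_width are illustrative names only):

    // 'below' in jccb(Assembler::below, ...) is an unsigned compare, so an
    // AVX3Threshold of 0 never rejects the 64-byte path.
    enum class FillWidth { ZMM64, YMM32 };
    static FillWidth pick_fill_width(unsigned count, unsigned avx3_threshold,
                                     bool use_avx512 /* UseAVX > 2 */) {
      return (use_avx512 && count >= avx3_threshold) ? FillWidth::ZMM64   // 64-byte stores
                                                     : FillWidth::YMM32;  // 32-byte stores
    }
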
@@ -8104,12 +8103,13 @@
shlq(length);
xorq(result, result);
- if ((UseAVX > 2) &&
+ if ((AVX3Threshold == 0) && (UseAVX > 2) &&
VM_Version::supports_avx512vlbw()) {
Label VECTOR64_LOOP, VECTOR64_NOT_EQUAL, VECTOR32_TAIL;
cmpq(length, 64);
jcc(Assembler::less, VECTOR32_TAIL);
+
movq(tmp1, length);
andq(tmp1, 0x3F); // tail count
andq(length, ~(0x3F)); //vector count
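
For reference, the tail/vector split performed by the two andq instructions above is a plain power-of-two decomposition; a worked stand-in (split_for_64_byte_loop is an illustrative name):

    // Sketch only: splits the byte length for the 64-byte vectorized loop.
    static void split_for_64_byte_loop(unsigned long length,
                                       unsigned long* vector, unsigned long* tail) {
      *tail   = length & 0x3F;     // 0..63 leftover bytes
      *vector = length & ~0x3FUL;  // bytes covered by whole 64-byte iterations
    }
    // e.g. length = 200 -> vector = 192 (three 64-byte iterations), tail = 8
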
@@ -9566,7 +9566,7 @@
// save length for return
push(len);
- if ((UseAVX > 2) && // AVX512
+ if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
VM_Version::supports_avx512vlbw() &&
VM_Version::supports_bmi2()) {
@@ -9758,7 +9758,7 @@
// }
void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
XMMRegister tmp1, Register tmp2) {
- Label copy_chars_loop, done, below_threshold;
+ Label copy_chars_loop, done, below_threshold, avx3_threshold;
// rsi: src
// rdi: dst
// rdx: len
@@ -9768,7 +9768,7 @@
// rdi holds start addr of destination char[]
// rdx holds length
assert_different_registers(src, dst, len, tmp2);
-
+ movl(tmp2, len);
if ((UseAVX > 2) && // AVX512
VM_Version::supports_avx512vlbw() &&
VM_Version::supports_bmi2()) {
@@ -9780,9 +9780,11 @@
testl(len, -16);
jcc(Assembler::zero, below_threshold);
+ testl(len, -1 * AVX3Threshold); // with a power-of-two threshold, ZF is set iff len < AVX3Threshold
+ jcc(Assembler::zero, avx3_threshold);
+
// In order to use only one arithmetic operation for the main loop we use
// this pre-calculation
- movl(tmp2, len);
andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop
andl(len, -32); // vector count
jccb(Assembler::zero, copy_tail);
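
The added testl/jcc pair encodes "len < AVX3Threshold" without a compare; it relies on the threshold being a power of two, so that -AVX3Threshold has every bit from log2(threshold) upward set. A small stand-in with a worked example (names are illustrative):

    // Sketch of 'testl(len, -1 * AVX3Threshold); jcc(zero, avx3_threshold)'.
    static bool branch_to_avx3_threshold(unsigned len, unsigned threshold) {
      // For a power-of-two threshold, (len & -threshold) == 0 iff len < threshold.
      return (len & (0u - threshold)) == 0;
    }
    // e.g. threshold = 4096: len = 1000 -> taken     (skip the 512-bit loop)
    //                        len = 8192 -> not taken (use the 512-bit inflate loop)
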
@@ -9813,12 +9815,11 @@
evmovdquw(Address(dst, 0), k2, tmp1, Assembler::AVX_512bit);
jmp(done);
+ bind(avx3_threshold);
}
if (UseSSE42Intrinsics) {
Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail;
- movl(tmp2, len);
-
if (UseAVX > 1) {
andl(tmp2, (16 - 1));
andl(len, -16);
@@ -9843,13 +9844,7 @@
bind(below_threshold);
bind(copy_new_tail);
- if ((UseAVX > 2) &&
- VM_Version::supports_avx512vlbw() &&
- VM_Version::supports_bmi2()) {
- movl(tmp2, len);
- } else {
- movl(len, tmp2);
- }
+ movl(len, tmp2);
andl(tmp2, 0x00000007);
andl(len, 0xFFFFFFF8);
jccb(Assembler::zero, copy_tail);