--- a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp Fri Oct 04 12:00:16 2019 -0400
+++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp Fri Oct 04 11:45:16 2019 -0700
@@ -1288,30 +1288,58 @@
if (UseUnalignedLoadStores) {
Label L_end;
// Copy 64-bytes per iteration
- __ BIND(L_loop);
if (UseAVX > 2) {
+ Label L_loop_avx512, L_loop_avx2, L_32_byte_head, L_above_threshold, L_below_threshold;
+
+ __ BIND(L_copy_bytes);
+ __ cmpptr(qword_count, (-1 * AVX3Threshold / 8));
+ __ jccb(Assembler::less, L_above_threshold);
+ __ jmpb(L_below_threshold);
+
+ __ bind(L_loop_avx512);
__ evmovdqul(xmm0, Address(end_from, qword_count, Address::times_8, -56), Assembler::AVX_512bit);
__ evmovdqul(Address(end_to, qword_count, Address::times_8, -56), xmm0, Assembler::AVX_512bit);
- } else if (UseAVX == 2) {
+ __ bind(L_above_threshold);
+ __ addptr(qword_count, 8);
+ __ jcc(Assembler::lessEqual, L_loop_avx512);
+ __ jmpb(L_32_byte_head);
+
+ __ bind(L_loop_avx2);
__ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
__ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
__ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
__ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1);
+ __ bind(L_below_threshold);
+ __ addptr(qword_count, 8);
+ __ jcc(Assembler::lessEqual, L_loop_avx2);
+
+ __ bind(L_32_byte_head);
+ __ subptr(qword_count, 4); // sub(8) and add(4)
+ __ jccb(Assembler::greater, L_end);
} else {
- __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
- __ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
- __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40));
- __ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1);
- __ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24));
- __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2);
- __ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8));
- __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm3);
+ __ BIND(L_loop);
+ if (UseAVX == 2) {
+ __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
+ __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
+ __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
+ __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1);
+ } else {
+ __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
+ __ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
+ __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40));
+ __ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1);
+ __ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24));
+ __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2);
+ __ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8));
+ __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm3);
+ }
+
+ __ BIND(L_copy_bytes);
+ __ addptr(qword_count, 8);
+ __ jcc(Assembler::lessEqual, L_loop);
+ __ subptr(qword_count, 4); // sub(8) and add(4)
+ __ jccb(Assembler::greater, L_end);
}
- __ BIND(L_copy_bytes);
- __ addptr(qword_count, 8);
- __ jcc(Assembler::lessEqual, L_loop);
- __ subptr(qword_count, 4); // sub(8) and add(4)
- __ jccb(Assembler::greater, L_end);
// Copy trailing 32 bytes
if (UseAVX >= 2) {
__ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
@@ -1368,31 +1396,59 @@
if (UseUnalignedLoadStores) {
Label L_end;
// Copy 64-bytes per iteration
- __ BIND(L_loop);
if (UseAVX > 2) {
+ Label L_loop_avx512, L_loop_avx2, L_32_byte_head, L_above_threshold, L_below_threshold;
+
+ __ BIND(L_copy_bytes);
+ __ cmpptr(qword_count, (AVX3Threshold / 8));
+ __ jccb(Assembler::greater, L_above_threshold);
+ __ jmpb(L_below_threshold);
+
+ __ BIND(L_loop_avx512);
__ evmovdqul(xmm0, Address(from, qword_count, Address::times_8, 0), Assembler::AVX_512bit);
__ evmovdqul(Address(dest, qword_count, Address::times_8, 0), xmm0, Assembler::AVX_512bit);
- } else if (UseAVX == 2) {
+ __ bind(L_above_threshold);
+ __ subptr(qword_count, 8);
+ __ jcc(Assembler::greaterEqual, L_loop_avx512);
+ __ jmpb(L_32_byte_head);
+
+ __ bind(L_loop_avx2);
__ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
__ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
- __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8, 0));
- __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm1);
+ __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8, 0));
+ __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm1);
+ __ bind(L_below_threshold);
+ __ subptr(qword_count, 8);
+ __ jcc(Assembler::greaterEqual, L_loop_avx2);
+
+ __ bind(L_32_byte_head);
+ __ addptr(qword_count, 4); // add(8) and sub(4)
+ __ jccb(Assembler::less, L_end);
} else {
- __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48));
- __ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0);
- __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32));
- __ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1);
- __ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16));
- __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2);
- __ movdqu(xmm3, Address(from, qword_count, Address::times_8, 0));
- __ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm3);
+ __ BIND(L_loop);
+ if (UseAVX == 2) {
+ __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
+ __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
+ __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8, 0));
+ __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm1);
+ } else {
+ __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48));
+ __ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0);
+ __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32));
+ __ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1);
+ __ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16));
+ __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2);
+ __ movdqu(xmm3, Address(from, qword_count, Address::times_8, 0));
+ __ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm3);
+ }
+
+ __ BIND(L_copy_bytes);
+ __ subptr(qword_count, 8);
+ __ jcc(Assembler::greaterEqual, L_loop);
+
+ __ addptr(qword_count, 4); // add(8) and sub(4)
+ __ jccb(Assembler::less, L_end);
}
- __ BIND(L_copy_bytes);
- __ subptr(qword_count, 8);
- __ jcc(Assembler::greaterEqual, L_loop);
-
- __ addptr(qword_count, 4); // add(8) and sub(4)
- __ jccb(Assembler::less, L_end);
// Copy trailing 32 bytes
if (UseAVX >= 2) {
__ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 0));