6591 |
6591 |
6592 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) |
6592 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) |
6593 bind(COMPARE_WIDE_VECTORS_LOOP); |
6593 bind(COMPARE_WIDE_VECTORS_LOOP); |
6594 |
6594 |
6595 #ifdef _LP64 |
6595 #ifdef _LP64 |
6596 if (VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop |
6596 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop |
6597 cmpl(cnt2, stride2x2); |
6597 cmpl(cnt2, stride2x2); |
6598 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); |
6598 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); |
6599 testl(cnt2, stride2x2-1); // cnt2 holds the vector count |
6599 testl(cnt2, stride2x2-1); // cnt2 holds the vector count |
6600 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 |
6600 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 |
6601 |
6601 |
6924 |
6924 |
6925 jmp(FALSE_LABEL); |
6925 jmp(FALSE_LABEL); |
6926 } else { |
6926 } else { |
6927 movl(result, len); // copy |
6927 movl(result, len); // copy |
6928 |
6928 |
6929 if (UseAVX == 2 && UseSSE >= 2) { |
6929 if (UseAVX >= 2 && UseSSE >= 2) { |
6930 // With AVX2, use 32-byte vector compare |
6930 // With AVX2, use 32-byte vector compare |
6931 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; |
6931 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; |
6932 |
6932 |
6933 // Compare 32-byte vectors |
6933 // Compare 32-byte vectors |
6934 andl(result, 0x0000001f); // tail count (in bytes) |
6934 andl(result, 0x0000001f); // tail count (in bytes) |
7097 |
7097 |
7098 lea(ary1, Address(ary1, limit, Address::times_1)); |
7098 lea(ary1, Address(ary1, limit, Address::times_1)); |
7099 lea(ary2, Address(ary2, limit, Address::times_1)); |
7099 lea(ary2, Address(ary2, limit, Address::times_1)); |
7100 negptr(limit); |
7100 negptr(limit); |
7101 |
7101 |
7102 bind(COMPARE_WIDE_VECTORS); |
|
7103 |
|
7104 #ifdef _LP64 |
7102 #ifdef _LP64 |
7105 if (VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop |
7103 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop |
7106 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3; |
7104 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3; |
7107 |
7105 |
7108 cmpl(limit, -64); |
7106 cmpl(limit, -64); |
7109 jccb(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2); |
7107 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2); |
7110 |
7108 |
7111 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop |
7109 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop |
7112 |
7110 |
7113 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit); |
7111 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit); |
7114 evpcmpeqb(k7, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit); |
7112 evpcmpeqb(k7, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit); |
7137 |
7135 |
7138 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); |
7136 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); |
7139 |
7137 |
7140 }//if (VM_Version::supports_avx512vlbw()) |
7138 }//if (VM_Version::supports_avx512vlbw()) |
7141 #endif //_LP64 |
7139 #endif //_LP64 |
7142 |
7140 bind(COMPARE_WIDE_VECTORS); |
7143 vmovdqu(vec1, Address(ary1, limit, Address::times_1)); |
7141 vmovdqu(vec1, Address(ary1, limit, Address::times_1)); |
7144 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); |
7142 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); |
7145 vpxor(vec1, vec2); |
7143 vpxor(vec1, vec2); |
7146 |
7144 |
7147 vptest(vec1, vec1); |
7145 vptest(vec1, vec1); |
7363 BIND(L_fill_32_bytes); |
7361 BIND(L_fill_32_bytes); |
7364 { |
7362 { |
7365 assert( UseSSE >= 2, "supported cpu only" ); |
7363 assert( UseSSE >= 2, "supported cpu only" ); |
7366 Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes; |
7364 Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes; |
7367 movdl(xtmp, value); |
7365 movdl(xtmp, value); |
7368 if (UseAVX > 2 && UseUnalignedLoadStores) { |
7366 if (UseAVX >= 2 && UseUnalignedLoadStores) { |
|
7367 Label L_check_fill_32_bytes; |
|
7368 if (UseAVX > 2) { |
|
7369 // Fill 64-byte chunks |
|
7370 Label L_fill_64_bytes_loop_avx3, L_check_fill_64_bytes_avx2; |
|
7371 |
|
7372 // If number of bytes to fill < AVX3Threshold, perform fill using AVX2 |
|
7373 cmpl(count, AVX3Threshold); |
|
7374 jccb(Assembler::below, L_check_fill_64_bytes_avx2); |
|
7375 |
|
7376 vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit); |
|
7377 |
|
7378 subl(count, 16 << shift); |
|
7379 jccb(Assembler::less, L_check_fill_32_bytes); |
|
7380 align(16); |
|
7381 |
|
7382 BIND(L_fill_64_bytes_loop_avx3); |
|
7383 evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit); |
|
7384 addptr(to, 64); |
|
7385 subl(count, 16 << shift); |
|
7386 jcc(Assembler::greaterEqual, L_fill_64_bytes_loop_avx3); |
|
7387 jmpb(L_check_fill_32_bytes); |
|
7388 |
|
7389 BIND(L_check_fill_64_bytes_avx2); |
|
7390 } |
7369 // Fill 64-byte chunks |
7391 // Fill 64-byte chunks |
7370 Label L_fill_64_bytes_loop, L_check_fill_32_bytes; |
7392 Label L_fill_64_bytes_loop; |
7371 vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit); |
|
7372 |
|
7373 subl(count, 16 << shift); |
|
7374 jcc(Assembler::less, L_check_fill_32_bytes); |
|
7375 align(16); |
|
7376 |
|
7377 BIND(L_fill_64_bytes_loop); |
|
7378 evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit); |
|
7379 addptr(to, 64); |
|
7380 subl(count, 16 << shift); |
|
7381 jcc(Assembler::greaterEqual, L_fill_64_bytes_loop); |
|
7382 |
|
7383 BIND(L_check_fill_32_bytes); |
|
7384 addl(count, 8 << shift); |
|
7385 jccb(Assembler::less, L_check_fill_8_bytes); |
|
7386 vmovdqu(Address(to, 0), xtmp); |
|
7387 addptr(to, 32); |
|
7388 subl(count, 8 << shift); |
|
7389 |
|
7390 BIND(L_check_fill_8_bytes); |
|
7391 } else if (UseAVX == 2 && UseUnalignedLoadStores) { |
|
7392 // Fill 64-byte chunks |
|
7393 Label L_fill_64_bytes_loop, L_check_fill_32_bytes; |
|
7394 vpbroadcastd(xtmp, xtmp, Assembler::AVX_256bit); |
7393 vpbroadcastd(xtmp, xtmp, Assembler::AVX_256bit); |
7395 |
7394 |
7396 subl(count, 16 << shift); |
7395 subl(count, 16 << shift); |
7397 jcc(Assembler::less, L_check_fill_32_bytes); |
7396 jcc(Assembler::less, L_check_fill_32_bytes); |
7398 align(16); |
7397 align(16); |
8102 ShortBranchVerifier sbv(this); |
8101 ShortBranchVerifier sbv(this); |
8103 |
8102 |
8104 shlq(length); |
8103 shlq(length); |
8105 xorq(result, result); |
8104 xorq(result, result); |
8106 |
8105 |
8107 if ((UseAVX > 2) && |
8106 if ((AVX3Threshold == 0) && (UseAVX > 2) && |
8108 VM_Version::supports_avx512vlbw()) { |
8107 VM_Version::supports_avx512vlbw()) { |
8109 Label VECTOR64_LOOP, VECTOR64_NOT_EQUAL, VECTOR32_TAIL; |
8108 Label VECTOR64_LOOP, VECTOR64_NOT_EQUAL, VECTOR32_TAIL; |
8110 |
8109 |
8111 cmpq(length, 64); |
8110 cmpq(length, 64); |
8112 jcc(Assembler::less, VECTOR32_TAIL); |
8111 jcc(Assembler::less, VECTOR32_TAIL); |
|
8112 |
8113 movq(tmp1, length); |
8113 movq(tmp1, length); |
8114 andq(tmp1, 0x3F); // tail count |
8114 andq(tmp1, 0x3F); // tail count |
8115 andq(length, ~(0x3F)); //vector count |
8115 andq(length, ~(0x3F)); //vector count |
8116 |
8116 |
8117 bind(VECTOR64_LOOP); |
8117 bind(VECTOR64_LOOP); |
9564 assert(len != result, ""); |
9564 assert(len != result, ""); |
9565 |
9565 |
9566 // save length for return |
9566 // save length for return |
9567 push(len); |
9567 push(len); |
9568 |
9568 |
9569 if ((UseAVX > 2) && // AVX512 |
9569 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 |
9570 VM_Version::supports_avx512vlbw() && |
9570 VM_Version::supports_avx512vlbw() && |
9571 VM_Version::supports_bmi2()) { |
9571 VM_Version::supports_bmi2()) { |
9572 |
9572 |
9573 Label copy_32_loop, copy_loop_tail, below_threshold; |
9573 Label copy_32_loop, copy_loop_tail, below_threshold; |
9574 |
9574 |
9756 // dst[dstOff++] = (char)(src[srcOff++] & 0xff); |
9756 // dst[dstOff++] = (char)(src[srcOff++] & 0xff); |
9757 // } |
9757 // } |
9758 // } |
9758 // } |
9759 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len, |
9759 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len, |
9760 XMMRegister tmp1, Register tmp2) { |
9760 XMMRegister tmp1, Register tmp2) { |
9761 Label copy_chars_loop, done, below_threshold; |
9761 Label copy_chars_loop, done, below_threshold, avx3_threshold; |
9762 // rsi: src |
9762 // rsi: src |
9763 // rdi: dst |
9763 // rdi: dst |
9764 // rdx: len |
9764 // rdx: len |
9765 // rcx: tmp2 |
9765 // rcx: tmp2 |
9766 |
9766 |
9767 // rsi holds start addr of source byte[] to be inflated |
9767 // rsi holds start addr of source byte[] to be inflated |
9768 // rdi holds start addr of destination char[] |
9768 // rdi holds start addr of destination char[] |
9769 // rdx holds length |
9769 // rdx holds length |
9770 assert_different_registers(src, dst, len, tmp2); |
9770 assert_different_registers(src, dst, len, tmp2); |
9771 |
9771 movl(tmp2, len); |
9772 if ((UseAVX > 2) && // AVX512 |
9772 if ((UseAVX > 2) && // AVX512 |
9773 VM_Version::supports_avx512vlbw() && |
9773 VM_Version::supports_avx512vlbw() && |
9774 VM_Version::supports_bmi2()) { |
9774 VM_Version::supports_bmi2()) { |
9775 |
9775 |
9776 Label copy_32_loop, copy_tail; |
9776 Label copy_32_loop, copy_tail; |
9778 |
9778 |
9779 // if length of the string is less than 16, handle it in an old fashioned way |
9779 // if length of the string is less than 16, handle it in an old fashioned way |
9780 testl(len, -16); |
9780 testl(len, -16); |
9781 jcc(Assembler::zero, below_threshold); |
9781 jcc(Assembler::zero, below_threshold); |
9782 |
9782 |
|
9783 testl(len, -1 * AVX3Threshold); |
|
9784 jcc(Assembler::zero, avx3_threshold); |
|
9785 |
9783 // In order to use only one arithmetic operation for the main loop we use |
9786 // In order to use only one arithmetic operation for the main loop we use |
9784 // this pre-calculation |
9787 // this pre-calculation |
9785 movl(tmp2, len); |
|
9786 andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop |
9788 andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop |
9787 andl(len, -32); // vector count |
9789 andl(len, -32); // vector count |
9788 jccb(Assembler::zero, copy_tail); |
9790 jccb(Assembler::zero, copy_tail); |
9789 |
9791 |
9790 lea(src, Address(src, len, Address::times_1)); |
9792 lea(src, Address(src, len, Address::times_1)); |
9811 kmovdl(k2, tmp3_aliased); |
9813 kmovdl(k2, tmp3_aliased); |
9812 evpmovzxbw(tmp1, k2, Address(src, 0), Assembler::AVX_512bit); |
9814 evpmovzxbw(tmp1, k2, Address(src, 0), Assembler::AVX_512bit); |
9813 evmovdquw(Address(dst, 0), k2, tmp1, Assembler::AVX_512bit); |
9815 evmovdquw(Address(dst, 0), k2, tmp1, Assembler::AVX_512bit); |
9814 |
9816 |
9815 jmp(done); |
9817 jmp(done); |
|
9818 bind(avx3_threshold); |
9816 } |
9819 } |
9817 if (UseSSE42Intrinsics) { |
9820 if (UseSSE42Intrinsics) { |
9818 Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail; |
9821 Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail; |
9819 |
|
9820 movl(tmp2, len); |
|
9821 |
9822 |
9822 if (UseAVX > 1) { |
9823 if (UseAVX > 1) { |
9823 andl(tmp2, (16 - 1)); |
9824 andl(tmp2, (16 - 1)); |
9824 andl(len, -16); |
9825 andl(len, -16); |
9825 jccb(Assembler::zero, copy_new_tail); |
9826 jccb(Assembler::zero, copy_new_tail); |
9841 addptr(len, 16); |
9842 addptr(len, 16); |
9842 jcc(Assembler::notZero, copy_16_loop); |
9843 jcc(Assembler::notZero, copy_16_loop); |
9843 |
9844 |
9844 bind(below_threshold); |
9845 bind(below_threshold); |
9845 bind(copy_new_tail); |
9846 bind(copy_new_tail); |
9846 if ((UseAVX > 2) && |
9847 movl(len, tmp2); |
9847 VM_Version::supports_avx512vlbw() && |
|
9848 VM_Version::supports_bmi2()) { |
|
9849 movl(tmp2, len); |
|
9850 } else { |
|
9851 movl(len, tmp2); |
|
9852 } |
|
9853 andl(tmp2, 0x00000007); |
9848 andl(tmp2, 0x00000007); |
9854 andl(len, 0xFFFFFFF8); |
9849 andl(len, 0xFFFFFFF8); |
9855 jccb(Assembler::zero, copy_tail); |
9850 jccb(Assembler::zero, copy_tail); |
9856 |
9851 |
9857 pmovzxbw(tmp1, Address(src, 0)); |
9852 pmovzxbw(tmp1, Address(src, 0)); |