src/hotspot/cpu/x86/macroAssembler_x86.cpp
changeset 58462 c6f1226cfb72
parent 58421 6fc57e391539
child 58536 1b76d17440a0
child 58625 9b54aee889b4
equal deleted inserted replaced
58461:26f0ed77734e 58462:c6f1226cfb72
  6591 
  6591 
  6592     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
  6592     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
  6593     bind(COMPARE_WIDE_VECTORS_LOOP);
  6593     bind(COMPARE_WIDE_VECTORS_LOOP);
  6594 
  6594 
  6595 #ifdef _LP64
  6595 #ifdef _LP64
  6596     if (VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
  6596     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
  6597       cmpl(cnt2, stride2x2);
  6597       cmpl(cnt2, stride2x2);
  6598       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
  6598       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
  6599       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
  6599       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
  6600       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
  6600       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
  6601 
  6601 
  6851 
  6851 
  6852   // len == 0
  6852   // len == 0
  6853   testl(len, len);
  6853   testl(len, len);
  6854   jcc(Assembler::zero, FALSE_LABEL);
  6854   jcc(Assembler::zero, FALSE_LABEL);
  6855 
  6855 
  6856   if ((UseAVX > 2) && // AVX512
  6856   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
  6857     VM_Version::supports_avx512vlbw() &&
  6857     VM_Version::supports_avx512vlbw() &&
  6858     VM_Version::supports_bmi2()) {
  6858     VM_Version::supports_bmi2()) {
  6859 
  6859 
  6860     Label test_64_loop, test_tail;
  6860     Label test_64_loop, test_tail;
  6861     Register tmp3_aliased = len;
  6861     Register tmp3_aliased = len;
  6924 
  6924 
  6925     jmp(FALSE_LABEL);
  6925     jmp(FALSE_LABEL);
  6926   } else {
  6926   } else {
  6927     movl(result, len); // copy
  6927     movl(result, len); // copy
  6928 
  6928 
  6929     if (UseAVX == 2 && UseSSE >= 2) {
  6929     if (UseAVX >= 2 && UseSSE >= 2) {
  6930       // With AVX2, use 32-byte vector compare
  6930       // With AVX2, use 32-byte vector compare
  6931       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
  6931       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
  6932 
  6932 
  6933       // Compare 32-byte vectors
  6933       // Compare 32-byte vectors
  6934       andl(result, 0x0000001f);  //   tail count (in bytes)
  6934       andl(result, 0x0000001f);  //   tail count (in bytes)
  7097 
  7097 
  7098     lea(ary1, Address(ary1, limit, Address::times_1));
  7098     lea(ary1, Address(ary1, limit, Address::times_1));
  7099     lea(ary2, Address(ary2, limit, Address::times_1));
  7099     lea(ary2, Address(ary2, limit, Address::times_1));
  7100     negptr(limit);
  7100     negptr(limit);
  7101 
  7101 
  7102     bind(COMPARE_WIDE_VECTORS);
       
  7103 
       
  7104 #ifdef _LP64
  7102 #ifdef _LP64
  7105     if (VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
  7103     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
  7106       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
  7104       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
  7107 
  7105 
  7108       cmpl(limit, -64);
  7106       cmpl(limit, -64);
  7109       jccb(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
  7107       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
  7110 
  7108 
  7111       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
  7109       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
  7112 
  7110 
  7113       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
  7111       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
  7114       evpcmpeqb(k7, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
  7112       evpcmpeqb(k7, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
  7137 
  7135 
  7138       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
  7136       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
  7139 
  7137 
  7140     }//if (VM_Version::supports_avx512vlbw())
  7138     }//if (VM_Version::supports_avx512vlbw())
  7141 #endif //_LP64
  7139 #endif //_LP64
  7142 
  7140     bind(COMPARE_WIDE_VECTORS);
  7143     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
  7141     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
  7144     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
  7142     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
  7145     vpxor(vec1, vec2);
  7143     vpxor(vec1, vec2);
  7146 
  7144 
  7147     vptest(vec1, vec1);
  7145     vptest(vec1, vec1);
  7363     BIND(L_fill_32_bytes);
  7361     BIND(L_fill_32_bytes);
  7364     {
  7362     {
  7365       assert( UseSSE >= 2, "supported cpu only" );
  7363       assert( UseSSE >= 2, "supported cpu only" );
  7366       Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
  7364       Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
  7367       movdl(xtmp, value);
  7365       movdl(xtmp, value);
  7368       if (UseAVX > 2 && UseUnalignedLoadStores) {
  7366       if (UseAVX >= 2 && UseUnalignedLoadStores) {
       
  7367         Label L_check_fill_32_bytes;
       
  7368         if (UseAVX > 2) {
       
  7369           // Fill 64-byte chunks
       
  7370           Label L_fill_64_bytes_loop_avx3, L_check_fill_64_bytes_avx2;
       
  7371 
       
  7372           // If number of bytes to fill < AVX3Threshold, perform fill using AVX2
       
  7373           cmpl(count, AVX3Threshold);
       
  7374           jccb(Assembler::below, L_check_fill_64_bytes_avx2);
       
  7375 
       
  7376           vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
       
  7377 
       
  7378           subl(count, 16 << shift);
       
  7379           jccb(Assembler::less, L_check_fill_32_bytes);
       
  7380           align(16);
       
  7381 
       
  7382           BIND(L_fill_64_bytes_loop_avx3);
       
  7383           evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
       
  7384           addptr(to, 64);
       
  7385           subl(count, 16 << shift);
       
  7386           jcc(Assembler::greaterEqual, L_fill_64_bytes_loop_avx3);
       
  7387           jmpb(L_check_fill_32_bytes);
       
  7388 
       
  7389           BIND(L_check_fill_64_bytes_avx2);
       
  7390         }
  7369         // Fill 64-byte chunks
  7391         // Fill 64-byte chunks
  7370         Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
  7392         Label L_fill_64_bytes_loop;
  7371         vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
       
  7372 
       
  7373         subl(count, 16 << shift);
       
  7374         jcc(Assembler::less, L_check_fill_32_bytes);
       
  7375         align(16);
       
  7376 
       
  7377         BIND(L_fill_64_bytes_loop);
       
  7378         evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
       
  7379         addptr(to, 64);
       
  7380         subl(count, 16 << shift);
       
  7381         jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
       
  7382 
       
  7383         BIND(L_check_fill_32_bytes);
       
  7384         addl(count, 8 << shift);
       
  7385         jccb(Assembler::less, L_check_fill_8_bytes);
       
  7386         vmovdqu(Address(to, 0), xtmp);
       
  7387         addptr(to, 32);
       
  7388         subl(count, 8 << shift);
       
  7389 
       
  7390         BIND(L_check_fill_8_bytes);
       
  7391       } else if (UseAVX == 2 && UseUnalignedLoadStores) {
       
  7392         // Fill 64-byte chunks
       
  7393         Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
       
  7394         vpbroadcastd(xtmp, xtmp, Assembler::AVX_256bit);
  7393         vpbroadcastd(xtmp, xtmp, Assembler::AVX_256bit);
  7395 
  7394 
  7396         subl(count, 16 << shift);
  7395         subl(count, 16 << shift);
  7397         jcc(Assembler::less, L_check_fill_32_bytes);
  7396         jcc(Assembler::less, L_check_fill_32_bytes);
  7398         align(16);
  7397         align(16);
  8102   ShortBranchVerifier sbv(this);
  8101   ShortBranchVerifier sbv(this);
  8103 
  8102 
  8104   shlq(length);
  8103   shlq(length);
  8105   xorq(result, result);
  8104   xorq(result, result);
  8106 
  8105 
  8107   if ((UseAVX > 2) &&
  8106   if ((AVX3Threshold == 0) && (UseAVX > 2) &&
  8108       VM_Version::supports_avx512vlbw()) {
  8107       VM_Version::supports_avx512vlbw()) {
  8109     Label VECTOR64_LOOP, VECTOR64_NOT_EQUAL, VECTOR32_TAIL;
  8108     Label VECTOR64_LOOP, VECTOR64_NOT_EQUAL, VECTOR32_TAIL;
  8110 
  8109 
  8111     cmpq(length, 64);
  8110     cmpq(length, 64);
  8112     jcc(Assembler::less, VECTOR32_TAIL);
  8111     jcc(Assembler::less, VECTOR32_TAIL);
       
  8112 
  8113     movq(tmp1, length);
  8113     movq(tmp1, length);
  8114     andq(tmp1, 0x3F);      // tail count
  8114     andq(tmp1, 0x3F);      // tail count
  8115     andq(length, ~(0x3F)); //vector count
  8115     andq(length, ~(0x3F)); //vector count
  8116 
  8116 
  8117     bind(VECTOR64_LOOP);
  8117     bind(VECTOR64_LOOP);
  9564   assert(len != result, "");
  9564   assert(len != result, "");
  9565 
  9565 
  9566   // save length for return
  9566   // save length for return
  9567   push(len);
  9567   push(len);
  9568 
  9568 
  9569   if ((UseAVX > 2) && // AVX512
  9569   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
  9570     VM_Version::supports_avx512vlbw() &&
  9570     VM_Version::supports_avx512vlbw() &&
  9571     VM_Version::supports_bmi2()) {
  9571     VM_Version::supports_bmi2()) {
  9572 
  9572 
  9573     Label copy_32_loop, copy_loop_tail, below_threshold;
  9573     Label copy_32_loop, copy_loop_tail, below_threshold;
  9574 
  9574 
  9756 //       dst[dstOff++] = (char)(src[srcOff++] & 0xff);
  9756 //       dst[dstOff++] = (char)(src[srcOff++] & 0xff);
  9757 //     }
  9757 //     }
  9758 //   }
  9758 //   }
  9759 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
  9759 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
  9760   XMMRegister tmp1, Register tmp2) {
  9760   XMMRegister tmp1, Register tmp2) {
  9761   Label copy_chars_loop, done, below_threshold;
  9761   Label copy_chars_loop, done, below_threshold, avx3_threshold;
  9762   // rsi: src
  9762   // rsi: src
  9763   // rdi: dst
  9763   // rdi: dst
  9764   // rdx: len
  9764   // rdx: len
  9765   // rcx: tmp2
  9765   // rcx: tmp2
  9766 
  9766 
  9767   // rsi holds start addr of source byte[] to be inflated
  9767   // rsi holds start addr of source byte[] to be inflated
  9768   // rdi holds start addr of destination char[]
  9768   // rdi holds start addr of destination char[]
  9769   // rdx holds length
  9769   // rdx holds length
  9770   assert_different_registers(src, dst, len, tmp2);
  9770   assert_different_registers(src, dst, len, tmp2);
  9771 
  9771   movl(tmp2, len);
  9772   if ((UseAVX > 2) && // AVX512
  9772   if ((UseAVX > 2) && // AVX512
  9773     VM_Version::supports_avx512vlbw() &&
  9773     VM_Version::supports_avx512vlbw() &&
  9774     VM_Version::supports_bmi2()) {
  9774     VM_Version::supports_bmi2()) {
  9775 
  9775 
  9776     Label copy_32_loop, copy_tail;
  9776     Label copy_32_loop, copy_tail;
  9778 
  9778 
  9779     // if length of the string is less than 16, handle it in an old fashioned way
  9779     // if length of the string is less than 16, handle it in an old fashioned way
  9780     testl(len, -16);
  9780     testl(len, -16);
  9781     jcc(Assembler::zero, below_threshold);
  9781     jcc(Assembler::zero, below_threshold);
  9782 
  9782 
       
  9783     testl(len, -1 * AVX3Threshold);
       
  9784     jcc(Assembler::zero, avx3_threshold);
       
  9785 
  9783     // In order to use only one arithmetic operation for the main loop we use
  9786     // In order to use only one arithmetic operation for the main loop we use
  9784     // this pre-calculation
  9787     // this pre-calculation
  9785     movl(tmp2, len);
       
  9786     andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop
  9788     andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop
  9787     andl(len, -32);     // vector count
  9789     andl(len, -32);     // vector count
  9788     jccb(Assembler::zero, copy_tail);
  9790     jccb(Assembler::zero, copy_tail);
  9789 
  9791 
  9790     lea(src, Address(src, len, Address::times_1));
  9792     lea(src, Address(src, len, Address::times_1));
  9811     kmovdl(k2, tmp3_aliased);
  9813     kmovdl(k2, tmp3_aliased);
  9812     evpmovzxbw(tmp1, k2, Address(src, 0), Assembler::AVX_512bit);
  9814     evpmovzxbw(tmp1, k2, Address(src, 0), Assembler::AVX_512bit);
  9813     evmovdquw(Address(dst, 0), k2, tmp1, Assembler::AVX_512bit);
  9815     evmovdquw(Address(dst, 0), k2, tmp1, Assembler::AVX_512bit);
  9814 
  9816 
  9815     jmp(done);
  9817     jmp(done);
       
  9818     bind(avx3_threshold);
  9816   }
  9819   }
  9817   if (UseSSE42Intrinsics) {
  9820   if (UseSSE42Intrinsics) {
  9818     Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail;
  9821     Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail;
  9819 
       
  9820     movl(tmp2, len);
       
  9821 
  9822 
  9822     if (UseAVX > 1) {
  9823     if (UseAVX > 1) {
  9823       andl(tmp2, (16 - 1));
  9824       andl(tmp2, (16 - 1));
  9824       andl(len, -16);
  9825       andl(len, -16);
  9825       jccb(Assembler::zero, copy_new_tail);
  9826       jccb(Assembler::zero, copy_new_tail);
  9841       addptr(len, 16);
  9842       addptr(len, 16);
  9842       jcc(Assembler::notZero, copy_16_loop);
  9843       jcc(Assembler::notZero, copy_16_loop);
  9843 
  9844 
  9844       bind(below_threshold);
  9845       bind(below_threshold);
  9845       bind(copy_new_tail);
  9846       bind(copy_new_tail);
  9846       if ((UseAVX > 2) &&
  9847       movl(len, tmp2);
  9847         VM_Version::supports_avx512vlbw() &&
       
  9848         VM_Version::supports_bmi2()) {
       
  9849         movl(tmp2, len);
       
  9850       } else {
       
  9851         movl(len, tmp2);
       
  9852       }
       
  9853       andl(tmp2, 0x00000007);
  9848       andl(tmp2, 0x00000007);
  9854       andl(len, 0xFFFFFFF8);
  9849       andl(len, 0xFFFFFFF8);
  9855       jccb(Assembler::zero, copy_tail);
  9850       jccb(Assembler::zero, copy_tail);
  9856 
  9851 
  9857       pmovzxbw(tmp1, Address(src, 0));
  9852       pmovzxbw(tmp1, Address(src, 0));