diff -r ed04bc1ff453 -r 21154cb84d2a src/hotspot/cpu/x86/macroAssembler_x86.cpp
--- a/src/hotspot/cpu/x86/macroAssembler_x86.cpp Tue Sep 04 18:32:28 2018 +0100
+++ b/src/hotspot/cpu/x86/macroAssembler_x86.cpp Tue Sep 04 12:44:02 2018 -0700
@@ -4166,19 +4166,22 @@
   if ((dst_enc < 16) && (nds_enc < 16)) {
     vandps(dst, nds, negate_field, vector_len);
   } else if ((src_enc < 16) && (dst_enc < 16)) {
+    // Use src scratch register
     evmovdqul(src, nds, Assembler::AVX_512bit);
     vandps(dst, src, negate_field, vector_len);
+  } else if (dst_enc < 16) {
+    evmovdqul(dst, nds, Assembler::AVX_512bit);
+    vandps(dst, dst, negate_field, vector_len);
+  } else if (nds_enc < 16) {
+    vandps(nds, nds, negate_field, vector_len);
+    evmovdqul(dst, nds, Assembler::AVX_512bit);
   } else if (src_enc < 16) {
     evmovdqul(src, nds, Assembler::AVX_512bit);
     vandps(src, src, negate_field, vector_len);
     evmovdqul(dst, src, Assembler::AVX_512bit);
-  } else if (dst_enc < 16) {
-    evmovdqul(src, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-    vandps(dst, xmm0, negate_field, vector_len);
-    evmovdqul(xmm0, src, Assembler::AVX_512bit);
   } else {
     if (src_enc != dst_enc) {
+      // Use src scratch register
       evmovdqul(src, xmm0, Assembler::AVX_512bit);
       evmovdqul(xmm0, nds, Assembler::AVX_512bit);
       vandps(xmm0, xmm0, negate_field, vector_len);
@@ -4201,17 +4204,19 @@
   if ((dst_enc < 16) && (nds_enc < 16)) {
     vandpd(dst, nds, negate_field, vector_len);
   } else if ((src_enc < 16) && (dst_enc < 16)) {
+    // Use src scratch register
     evmovdqul(src, nds, Assembler::AVX_512bit);
     vandpd(dst, src, negate_field, vector_len);
+  } else if (dst_enc < 16) {
+    evmovdqul(dst, nds, Assembler::AVX_512bit);
+    vandpd(dst, dst, negate_field, vector_len);
+  } else if (nds_enc < 16) {
+    vandpd(nds, nds, negate_field, vector_len);
+    evmovdqul(dst, nds, Assembler::AVX_512bit);
   } else if (src_enc < 16) {
     evmovdqul(src, nds, Assembler::AVX_512bit);
     vandpd(src, src, negate_field, vector_len);
     evmovdqul(dst, src, Assembler::AVX_512bit);
-  } else if (dst_enc < 16) {
-    evmovdqul(src, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-    vandpd(dst, xmm0, negate_field, vector_len);
-    evmovdqul(xmm0, src, Assembler::AVX_512bit);
   } else {
     if (src_enc != dst_enc) {
       evmovdqul(src, xmm0, Assembler::AVX_512bit);
@@ -4282,6 +4287,7 @@
     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
     Assembler::vpaddb(xmm0, xmm0, src, vector_len);
+    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
   }
 }
@@ -4330,7 +4336,7 @@
   } else if (dst_enc < 16) {
     Assembler::vpaddw(dst, dst, src, vector_len);
   } else if (nds_enc < 16) {
-    // implies dst_enc in upper bank with src as scratch
+    // implies dst_enc in upper bank with nds as scratch
     evmovdqul(nds, dst, Assembler::AVX_512bit);
     Assembler::vpaddw(nds, nds, src, vector_len);
     evmovdqul(dst, nds, Assembler::AVX_512bit);
@@ -4339,6 +4345,7 @@
     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
     Assembler::vpaddw(xmm0, xmm0, src, vector_len);
+    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
   }
 }
@@ -4522,6 +4529,7 @@
     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
     Assembler::vpmullw(xmm0, xmm0, src, vector_len);
+    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
   }
 }
@@ -4578,7 +4586,8 @@
     // worse case scenario, all regs in upper bank
     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::vpsubw(xmm0, xmm0, src, vector_len);
+    Assembler::vpsubb(xmm0, xmm0, src, vector_len);
+    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
   }
 }
@@ -4636,6 +4645,7 @@
     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
     Assembler::vpsubw(xmm0, xmm0, src, vector_len);
+    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
   }
 }
@@ -4649,7 +4659,7 @@
   } else if ((dst_enc < 16) && (shift_enc < 16)) {
     Assembler::vpsraw(dst, dst, shift, vector_len);
   } else if ((dst_enc < 16) && (nds_enc < 16)) {
-    // use nds_enc as scratch with shift
+    // use nds as scratch with shift
     evmovdqul(nds, shift, Assembler::AVX_512bit);
     Assembler::vpsraw(dst, dst, nds, vector_len);
   } else if ((shift_enc < 16) && (nds_enc < 16)) {
@@ -4664,7 +4674,7 @@
     Assembler::vpsraw(dst, dst, xmm0, vector_len);
     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
   } else if (nds_enc < 16) {
-    // use nds as dest as temps
+    // use nds and dst as temps
     evmovdqul(nds, dst, Assembler::AVX_512bit);
     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
     evmovdqul(xmm0, shift, Assembler::AVX_512bit);
@@ -4677,8 +4687,7 @@
     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
     evmovdqul(xmm1, shift, Assembler::AVX_512bit);
     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::vpsllw(xmm0, xmm0, xmm1, vector_len);
-    evmovdqul(xmm1, dst, Assembler::AVX_512bit);
+    Assembler::vpsraw(xmm0, xmm0, xmm1, vector_len);
     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
     pop_zmm(xmm1);
@@ -4702,6 +4711,7 @@
     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
     Assembler::vpsraw(xmm0, xmm0, shift, vector_len);
+    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
   }
 }
@@ -4715,7 +4725,7 @@
   } else if ((dst_enc < 16) && (shift_enc < 16)) {
     Assembler::vpsrlw(dst, dst, shift, vector_len);
   } else if ((dst_enc < 16) && (nds_enc < 16)) {
-    // use nds_enc as scratch with shift
+    // use nds as scratch with shift
     evmovdqul(nds, shift, Assembler::AVX_512bit);
     Assembler::vpsrlw(dst, dst, nds, vector_len);
   } else if ((shift_enc < 16) && (nds_enc < 16)) {
@@ -4730,7 +4740,7 @@
     Assembler::vpsrlw(dst, dst, xmm0, vector_len);
     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
   } else if (nds_enc < 16) {
-    // use nds as dest as temps
+    // use nds and dst as temps
     evmovdqul(nds, dst, Assembler::AVX_512bit);
     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
     evmovdqul(xmm0, shift, Assembler::AVX_512bit);
@@ -4743,8 +4753,7 @@
     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
     evmovdqul(xmm1, shift, Assembler::AVX_512bit);
     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::vpsllw(xmm0, xmm0, xmm1, vector_len);
-    evmovdqul(xmm1, dst, Assembler::AVX_512bit);
+    Assembler::vpsrlw(xmm0, xmm0, xmm1, vector_len);
     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
     pop_zmm(xmm1);
@@ -4768,6 +4777,7 @@
     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
     Assembler::vpsrlw(xmm0, xmm0, shift, vector_len);
+    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
   }
 }
@@ -4781,7 +4791,7 @@
   } else if ((dst_enc < 16) && (shift_enc < 16)) {
     Assembler::vpsllw(dst, dst, shift, vector_len);
   } else if ((dst_enc < 16) && (nds_enc < 16)) {
-    // use nds_enc as scratch with shift
+    // use nds as scratch with shift
     evmovdqul(nds, shift, Assembler::AVX_512bit);
     Assembler::vpsllw(dst, dst, nds, vector_len);
   } else if ((shift_enc < 16) && (nds_enc < 16)) {
@@ -4796,7 +4806,7 @@
     Assembler::vpsllw(dst, dst, xmm0, vector_len);
     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
   } else if (nds_enc < 16) {
-    // use nds as dest as temps
+    // use nds and dst as temps
     evmovdqul(nds, dst, Assembler::AVX_512bit);
     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
     evmovdqul(xmm0, shift, Assembler::AVX_512bit);
@@ -4810,7 +4820,6 @@
     evmovdqul(xmm1, shift, Assembler::AVX_512bit);
     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
     Assembler::vpsllw(xmm0, xmm0, xmm1, vector_len);
-    evmovdqul(xmm1, dst, Assembler::AVX_512bit);
     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
     pop_zmm(xmm1);
@@ -4834,6 +4843,7 @@
     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
     Assembler::vpsllw(xmm0, xmm0, shift, vector_len);
+    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
   }
 }
@@ -7130,7 +7140,7 @@

   bind(RET_NOT_FOUND);
   movl(result, -1);
-  jmpb(CLEANUP);
+  jmp(CLEANUP);

   bind(FOUND_SUBSTR);
   // Compute start addr of substr
@@ -7148,7 +7158,7 @@
     addl(tmp, cnt2);
     // Found result if we matched whole substring.
     cmpl(tmp, stride);
-    jccb(Assembler::lessEqual, RET_FOUND);
+    jcc(Assembler::lessEqual, RET_FOUND);

     // Repeat search for small substring (<= 8 chars)
     // from new point 'str1' without reloading substring.
@@ -7248,7 +7258,7 @@
     jcc(Assembler::carryClear, FOUND_CHAR);
     addptr(result, 32);
     subl(tmp, 2*stride);
-    jccb(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
+    jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
     jmp(SCAN_TO_8_CHAR);
     bind(SCAN_TO_8_CHAR_INIT);
     movdl(vec1, ch);
@@ -7278,7 +7288,7 @@
   jcc(Assembler::carryClear, FOUND_CHAR);
   addptr(result, 16);
   subl(tmp, stride);
-  jccb(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
+  jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
   bind(SCAN_TO_CHAR);
   testl(cnt1, cnt1);
   jcc(Assembler::zero, RET_NOT_FOUND);
@@ -7857,7 +7867,7 @@
     // Compare 16-byte vectors
     andl(result, 0x0000000f); // tail count (in bytes)
     andl(len, 0xfffffff0); // vector count (in bytes)
-    jccb(Assembler::zero, COMPARE_TAIL);
+    jcc(Assembler::zero, COMPARE_TAIL);

     lea(ary1, Address(ary1, len, Address::times_1));
     negptr(len);
@@ -7869,12 +7879,12 @@
     bind(COMPARE_WIDE_VECTORS);
     movdqu(vec1, Address(ary1, len, Address::times_1));
     ptest(vec1, vec2);
-    jccb(Assembler::notZero, TRUE_LABEL);
+    jcc(Assembler::notZero, TRUE_LABEL);
     addptr(len, 16);
     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

     testl(result, result);
-    jccb(Assembler::zero, FALSE_LABEL);
+    jcc(Assembler::zero, FALSE_LABEL);

     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
     ptest(vec1, vec2);
@@ -9069,7 +9079,7 @@
     jcc(Assembler::notZero, VECTOR32_NOT_EQUAL);//mismatch found
     addq(result, 32);
     subq(length, 32);
-    jccb(Assembler::greaterEqual, VECTOR32_LOOP);
+    jcc(Assembler::greaterEqual, VECTOR32_LOOP);
     addq(length, 32);
     jcc(Assembler::equal, SAME_TILL_END);
     //falling through if less than 32 bytes left //close the branch here.
@@ -9140,24 +9150,24 @@
     load_unsigned_byte(tmp2, Address(objb, result));
     xorl(tmp1, tmp2);
     testl(tmp1, tmp1);
-    jccb(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
+    jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
     decq(length);
-    jccb(Assembler::zero, SAME_TILL_END);
+    jcc(Assembler::zero, SAME_TILL_END);
     incq(result);
     load_unsigned_byte(tmp1, Address(obja, result));
     load_unsigned_byte(tmp2, Address(objb, result));
     xorl(tmp1, tmp2);
     testl(tmp1, tmp1);
-    jccb(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
+    jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
     decq(length);
-    jccb(Assembler::zero, SAME_TILL_END);
+    jcc(Assembler::zero, SAME_TILL_END);
     incq(result);
     load_unsigned_byte(tmp1, Address(obja, result));
     load_unsigned_byte(tmp2, Address(objb, result));
     xorl(tmp1, tmp2);
     testl(tmp1, tmp1);
-    jccb(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
-    jmpb(SAME_TILL_END);
+    jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
+    jmp(SAME_TILL_END);

     if (UseAVX >= 2) {
       bind(VECTOR32_NOT_EQUAL);
@@ -9168,7 +9178,7 @@
       bsfq(tmp1, tmp1);
       addq(result, tmp1);
       shrq(result);
-      jmpb(DONE);
+      jmp(DONE);
     }

     bind(VECTOR16_NOT_EQUAL);
@@ -10590,7 +10600,7 @@
     andl(len, 0xfffffff0); // vector count (in chars)
     andl(result, 0x0000000f); // tail count (in chars)
     testl(len, len);
-    jccb(Assembler::zero, copy_16);
+    jcc(Assembler::zero, copy_16);

     // compress 16 chars per iter
     movdl(tmp1Reg, tmp5);