--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp Mon Dec 07 15:42:47 2015 +0100
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp Mon Dec 14 14:48:30 2015 -0800
@@ -7999,9 +7999,15 @@
XMMRegister vec1, int ae) {
ShortBranchVerifier sbv(this);
Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
+ Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // bound and used only under _LP64 when VM_Version::supports_avx512vlbw()
int stride, stride2, adr_stride, adr_stride1, adr_stride2;
+ int stride2x2 = 0x40;
Address::ScaleFactor scale, scale1, scale2;
+ if (ae != StrIntrinsicNode::LL) {
+ stride2x2 = 0x20;
+ }
+
if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
shrl(cnt2, 1);
}
@@ -8011,15 +8017,15 @@
movl(result, cnt1);
subl(cnt1, cnt2);
push(cnt1);
- cmov32(Assembler::lessEqual, cnt2, result);
+ cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2)
// Is the minimum length zero?
testl(cnt2, cnt2);
jcc(Assembler::zero, LENGTH_DIFF_LABEL);
if (ae == StrIntrinsicNode::LL) {
// Load first bytes
- load_unsigned_byte(result, Address(str1, 0));
- load_unsigned_byte(cnt1, Address(str2, 0));
+ load_unsigned_byte(result, Address(str1, 0)); // result = str1[0]
+ load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0]
} else if (ae == StrIntrinsicNode::UU) {
// Load first characters
load_unsigned_short(result, Address(str1, 0));
@@ -8060,7 +8066,10 @@
assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
+ Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
Label COMPARE_TAIL_LONG;
+ Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // bound and used only under _LP64 when VM_Version::supports_avx512vlbw()
+
int pcmpmask = 0x19;
if (ae == StrIntrinsicNode::LL) {
pcmpmask &= ~0x01;
@@ -8123,11 +8132,40 @@
}
subl(result, stride2);
subl(cnt2, stride2);
- jccb(Assembler::zero, COMPARE_WIDE_TAIL);
+ jcc(Assembler::zero, COMPARE_WIDE_TAIL);
negptr(result);
// In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
bind(COMPARE_WIDE_VECTORS_LOOP);
+
+#ifdef _LP64
+ if (VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
+ cmpl(cnt2, stride2x2);
+ jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
+ testl(cnt2, stride2x2-1); // cnt2 holds the vector count
+ jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // cnt2 is not a multiple of stride2x2 (0x40 for LL, 0x20 otherwise), so the wide loop's subl would step past zero
+
+ bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
+ if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
+ evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
+ evpcmpeqb(k7, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
+ } else {
+ vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
+ evpcmpeqb(k7, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
+ }
+ kortestql(k7, k7);
+ jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare
+ addptr(result, stride2x2); // update since we already compared at this addr
+ subl(cnt2, stride2x2); // and sub the size too
+ jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
+
+ vpxor(vec1, vec1);
+ jmpb(COMPARE_WIDE_TAIL);
+ }//if (VM_Version::supports_avx512vlbw())
+#endif // _LP64
+
+
+ bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
vmovdqu(vec1, Address(str1, result, scale));
vpxor(vec1, Address(str2, result, scale));
@@ -8136,7 +8174,7 @@
vpxor(vec1, Address(str2, result, scale2));
}
vptest(vec1, vec1);
- jccb(Assembler::notZero, VECTOR_NOT_EQUAL);
+ jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
addptr(result, stride2);
subl(cnt2, stride2);
jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
@@ -8151,7 +8189,7 @@
movl(result, stride2);
movl(cnt2, result);
negptr(result);
- jmpb(COMPARE_WIDE_VECTORS_LOOP);
+ jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
// Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors.
bind(VECTOR_NOT_EQUAL);
@@ -8295,6 +8333,34 @@
}
jmpb(DONE_LABEL);
+#ifdef _LP64
+ if (VM_Version::supports_avx512vlbw()) {
+
+ bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
+
+ kmovql(cnt1, k7);
+ notq(cnt1);
+ bsfq(cnt2, cnt1);
+ if (ae != StrIntrinsicNode::LL) {
+ // cnt2 is the byte offset of the first mismatch (from bsfq); halve it to get the char offset
+ sarl(cnt2, 1);
+ }
+ addq(result, cnt2);
+ if (ae == StrIntrinsicNode::LL) {
+ load_unsigned_byte(cnt1, Address(str2, result));
+ load_unsigned_byte(result, Address(str1, result));
+ } else if (ae == StrIntrinsicNode::UU) {
+ load_unsigned_short(cnt1, Address(str2, result, scale));
+ load_unsigned_short(result, Address(str1, result, scale));
+ } else {
+ load_unsigned_short(cnt1, Address(str2, result, scale2));
+ load_unsigned_byte(result, Address(str1, result, scale1));
+ }
+ subl(result, cnt1);
+ jmpb(POP_LABEL);
+ }//if (VM_Version::supports_avx512vlbw())
+#endif // _LP64
+
// Discard the stored length difference
bind(POP_LABEL);
pop(cnt1);
@@ -8304,6 +8370,7 @@
if(ae == StrIntrinsicNode::UL) {
negl(result);
}
+
}
// Search for Non-ASCII character (Negative byte value) in a byte array,