diff -r cbc557f166f2 -r 82fd8793ba5e src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp --- a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp Wed May 22 20:12:19 2019 +0300 +++ b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp Wed May 22 20:39:04 2019 +0300 @@ -4035,14 +4035,14 @@ : "compare_long_string_different_encoding UL"); address entry = __ pc(); Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, - DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, SMALL_LOOP_ENTER, + DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH, LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; RegSet spilled_regs = RegSet::of(tmp3, tmp4); - int prefetchLoopExitCondition = MAX(32, SoftwarePrefetchHintDistance/2); + int prefetchLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance/2); __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); // cnt2 == amount of characters left to compare @@ -4069,7 +4069,7 @@ if (SoftwarePrefetchHintDistance >= 0) { __ subs(rscratch2, cnt2, prefetchLoopExitCondition); - __ br(__ LT, SMALL_LOOP); + __ br(__ LT, NO_PREFETCH); __ bind(LARGE_LOOP_PREFETCH); __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); __ mov(tmp4, 2); @@ -4089,51 +4089,20 @@ __ br(__ GE, LARGE_LOOP_PREFETCH); } __ cbz(cnt2, LOAD_LAST); // no characters left except last load + __ bind(NO_PREFETCH); __ subs(cnt2, cnt2, 16); __ br(__ LT, TAIL); - __ b(SMALL_LOOP_ENTER); __ bind(SMALL_LOOP); // smaller loop __ subs(cnt2, cnt2, 16); - __ bind(SMALL_LOOP_ENTER); compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); __ br(__ GE, SMALL_LOOP); - __ cbz(cnt2, LOAD_LAST); - __ bind(TAIL); // 1..15 characters left - __ subs(zr, cnt2, -8); - __ br(__ GT, TAIL_LOAD_16); - __ ldrd(vtmp, Address(tmp2)); - __ zip1(vtmp3, __ T8B, vtmp, vtmpZ); - - __ ldr(tmpU, Address(__ post(cnt1, 8))); - __ fmovd(tmpL, vtmp3); - __ eor(rscratch2, tmp3, tmpL); - __ cbnz(rscratch2, DIFF2); - __ umov(tmpL, vtmp3, __ D, 1); - __ eor(rscratch2, tmpU, tmpL); - __ cbnz(rscratch2, DIFF1); - __ b(LOAD_LAST); - __ bind(TAIL_LOAD_16); - __ ldrq(vtmp, Address(tmp2)); - __ ldr(tmpU, Address(__ post(cnt1, 8))); - __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); - __ zip2(vtmp, __ T16B, vtmp, vtmpZ); - __ fmovd(tmpL, vtmp3); - __ eor(rscratch2, tmp3, tmpL); - __ cbnz(rscratch2, DIFF2); - - __ ldr(tmp3, Address(__ post(cnt1, 8))); - __ umov(tmpL, vtmp3, __ D, 1); - __ eor(rscratch2, tmpU, tmpL); - __ cbnz(rscratch2, DIFF1); - - __ ldr(tmpU, Address(__ post(cnt1, 8))); - __ fmovd(tmpL, vtmp); - __ eor(rscratch2, tmp3, tmpL); - __ cbnz(rscratch2, DIFF2); - - __ umov(tmpL, vtmp, __ D, 1); - __ eor(rscratch2, tmpU, tmpL); - __ cbnz(rscratch2, DIFF1); + __ cmn(cnt2, (u1)16); + __ br(__ EQ, LOAD_LAST); + __ bind(TAIL); // 1..15 characters left until last load (last 4 characters) + __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 8 bytes before last 4 characters in UTF-16 string + __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string + __ ldr(tmp3, Address(cnt1, -8)); + compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load __ b(LOAD_LAST); __ bind(DIFF2); __ mov(tmpU, tmp3); @@ -4141,10 +4110,12 @@ __ pop(spilled_regs, sp); __ b(CALCULATE_DIFFERENCE); __ bind(LOAD_LAST); + // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU. + // No need to load it again + __ mov(tmpU, tmp3); __ pop(spilled_regs, sp); __ ldrs(vtmp, Address(strL)); - __ ldr(tmpU, Address(strU)); __ zip1(vtmp, __ T8B, vtmp, vtmpZ); __ fmovd(tmpL, vtmp); @@ -4206,10 +4177,10 @@ compare_string_16_bytes_same(DIFF, DIFF2); __ br(__ GT, LARGE_LOOP_PREFETCH); __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left? - // less than 16 bytes left? - __ subs(cnt2, cnt2, isLL ? 16 : 8); - __ br(__ LT, TAIL); } + // less than 16 bytes left? + __ subs(cnt2, cnt2, isLL ? 16 : 8); + __ br(__ LT, TAIL); __ bind(SMALL_LOOP); compare_string_16_bytes_same(DIFF, DIFF2); __ subs(cnt2, cnt2, isLL ? 16 : 8);