8189113: AARCH64: StringLatin1 inflate intrinsic doesn't use prefetch instruction
Reviewed-by: aph
--- a/src/hotspot/cpu/aarch64/aarch64.ad Mon Jun 25 16:32:02 2018 +0300
+++ b/src/hotspot/cpu/aarch64/aarch64.ad Mon Jun 25 16:32:23 2018 +0300
@@ -16168,7 +16168,7 @@
// fast byte[] to char[] inflation
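+// tmp1..tmp3 are pinned to V0..V2 because byte_array_inflate may call the
+// large_byte_array_inflate stub, which expects zero in v0 and the first
+// eight source bytes in v1.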
instruct string_inflate(Universe dummy, iRegP_R0 src, iRegP_R1 dst, iRegI_R2 len,
- vRegD tmp1, vRegD tmp2, vRegD tmp3, iRegP_R3 tmp4, rFlagsReg cr)
+ vRegD_V0 tmp1, vRegD_V1 tmp2, vRegD_V2 tmp3, iRegP_R3 tmp4, rFlagsReg cr)
%{
match(Set dummy (StrInflatedCopy src (Binary dst len)));
effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, USE_KILL src, USE_KILL dst, USE_KILL len, KILL cr);
--- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp Mon Jun 25 16:32:02 2018 +0300
+++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp Mon Jun 25 16:32:23 2018 +0300
@@ -5681,26 +5681,24 @@
void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
Register tmp4) {
- Label big, done;
+ Label big, done, after_init, to_stub;
assert_different_registers(src, dst, len, tmp4, rscratch1);
- fmovd(vtmp1 , zr);
- lsrw(rscratch1, len, 3);
-
- cbnzw(rscratch1, big);
-
+ fmovd(vtmp1, zr);
+ lsrw(tmp4, len, 3);
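+ // after_init doubles as the resume point once the large-array stub returns.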
+ bind(after_init);
+ cbnzw(tmp4, big);
// Short string: less than 8 bytes.
{
- Label loop, around, tiny;
-
- subsw(len, len, 4);
- andw(len, len, 3);
- br(LO, tiny);
-
+ Label loop, tiny;
+
+ cmpw(len, 4);
+ br(LT, tiny);
// Use SIMD to do 4 bytes.
ldrs(vtmp2, post(src, 4));
zip1(vtmp3, T8B, vtmp2, vtmp1);
+ subw(len, len, 4);
strd(vtmp3, post(dst, 8));
cbzw(len, done);
@@ -5714,35 +5712,65 @@
bind(tiny);
cbnz(len, loop);
- bind(around);
b(done);
}
+ if (SoftwarePrefetchHintDistance >= 0) {
+ bind(to_stub);
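+ // At this point tmp4 (r3) holds len >> 3, vtmp1 (v0) is zero and vtmp2 (v1)
+ // holds the first 8 source bytes, matching the stub's register contract.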
+ RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
+ assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
+ trampoline_call(stub);
+ b(after_init);
+ }
+
// Unpack the bytes 8 at a time.
bind(big);
- andw(len, len, 7);
-
{
- Label loop, around;
-
- bind(loop);
- ldrd(vtmp2, post(src, 8));
- sub(rscratch1, rscratch1, 1);
- zip1(vtmp3, T16B, vtmp2, vtmp1);
- st1(vtmp3, T8H, post(dst, 16));
- cbnz(rscratch1, loop);
-
- bind(around);
+ Label loop, around, loop_last, loop_start;
+
+ if (SoftwarePrefetchHintDistance >= 0) {
+ const int large_loop_threshold = (64 + 16)/8;
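+ // The stub inflates 16 bytes in its prologue and 64 bytes per loop
+ // iteration, so only divert there if at least (64 + 16)/8 octets remain.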
+ ldrd(vtmp2, post(src, 8));
+ andw(len, len, 7);
+ cmp(tmp4, large_loop_threshold);
+ br(GE, to_stub);
+ b(loop_start);
+
+ bind(loop);
+ ldrd(vtmp2, post(src, 8));
+ bind(loop_start);
+ subs(tmp4, tmp4, 1);
+ br(EQ, loop_last);
+ zip1(vtmp2, T16B, vtmp2, vtmp1);
+ ldrd(vtmp3, post(src, 8));
+ st1(vtmp2, T8H, post(dst, 16));
+ subs(tmp4, tmp4, 1);
+ zip1(vtmp3, T16B, vtmp3, vtmp1);
+ st1(vtmp3, T8H, post(dst, 16));
+ br(NE, loop);
+ b(around);
+ bind(loop_last);
+ zip1(vtmp2, T16B, vtmp2, vtmp1);
+ st1(vtmp2, T8H, post(dst, 16));
+ bind(around);
+ cbz(len, done);
+ } else {
+ andw(len, len, 7);
+ bind(loop);
+ ldrd(vtmp2, post(src, 8));
+ sub(tmp4, tmp4, 1);
+ zip1(vtmp3, T16B, vtmp2, vtmp1);
+ st1(vtmp3, T8H, post(dst, 16));
+ cbnz(tmp4, loop);
+ }
}
// Do the tail of up to 8 bytes.
- sub(src, src, 8);
- add(src, src, len, ext::uxtw, 0);
- ldrd(vtmp2, Address(src));
- sub(dst, dst, 16);
+ add(src, src, len);
+ ldrd(vtmp3, Address(src, -8));
add(dst, dst, len, ext::uxtw, 1);
- zip1(vtmp3, T16B, vtmp2, vtmp1);
- st1(vtmp3, T8H, Address(dst));
+ zip1(vtmp3, T16B, vtmp3, vtmp1);
+ strq(vtmp3, Address(dst, -16));
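+ // This final 16-byte store may overlap chars already written by the loop,
+ // but the overlapping lanes hold identical values, so rewriting is harmless.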
bind(done);
}
--- a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp Mon Jun 25 16:32:02 2018 +0300
+++ b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp Mon Jun 25 16:32:23 2018 +0300
@@ -4624,6 +4624,68 @@
StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
}
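+ // Inflate the 32 bytes held in src1/src2 by zipping them with the zero
+ // vector in v0, producing 32 chars (64 bytes) stored with a single st1.
+ // When generatePrfm is set, also emit a PSTL1STRM prefetch for dst.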
+ void inflate_and_store_2_fp_registers(bool generatePrfm,
+ FloatRegister src1, FloatRegister src2) {
+ Register dst = r1;
+ __ zip1(v1, __ T16B, src1, v0);
+ __ zip2(v2, __ T16B, src1, v0);
+ if (generatePrfm) {
+ __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
+ }
+ __ zip1(v3, __ T16B, src2, v0);
+ __ zip2(v4, __ T16B, src2, v0);
+ __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
+ }
+
+ // R0 = src
+ // R1 = dst
+ // R2 = len
+ // R3 = len >> 3
+ // V0 = 0
+ // V1 = loaded 8 bytes
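+ // Clobbers: r0, r1, r3, v1-v6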
+ address generate_large_byte_array_inflate() {
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
+ address entry = __ pc();
+ Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
+ Register src = r0, dst = r1, len = r2, octetCounter = r3;
+ const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
+
+ // Do one more 8-byte read so that the address is 16-byte aligned in most
+ // cases, and so that a single store instruction can be used below.
+ __ ldrd(v2, __ post(src, 8));
+ __ sub(octetCounter, octetCounter, 2);
+ __ zip1(v1, __ T16B, v1, v0);
+ __ zip1(v2, __ T16B, v2, v0);
+ __ st1(v1, v2, __ T16B, __ post(dst, 32));
+ __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
+ __ cmp(octetCounter, large_loop_threshold);
+ __ br(__ LE, LOOP_START);
+ __ b(LOOP_PRFM_START);
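+ // Prefetching main loop: inflate 64 source bytes (8 octets) per iteration.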
+ __ bind(LOOP_PRFM);
+ __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
+ __ bind(LOOP_PRFM_START);
+ __ prfm(Address(src, SoftwarePrefetchHintDistance));
+ __ sub(octetCounter, octetCounter, 8);
+ __ cmp(octetCounter, large_loop_threshold);
+ inflate_and_store_2_fp_registers(true, v3, v4);
+ inflate_and_store_2_fp_registers(true, v5, v6);
+ __ br(__ GT, LOOP_PRFM);
+ __ cmp(octetCounter, 8);
+ __ br(__ LT, DONE);
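+ // Tail loop without prefetch, still inflating 64 bytes per iteration.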
+ __ bind(LOOP);
+ __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
+ __ bind(LOOP_START);
+ __ sub(octetCounter, octetCounter, 8);
+ __ cmp(octetCounter, 8);
+ inflate_and_store_2_fp_registers(false, v3, v4);
+ inflate_and_store_2_fp_registers(false, v5, v6);
+ __ br(__ GE, LOOP);
+ __ bind(DONE);
+ __ ret(lr);
+ return entry;
+ }
+
/**
* Arguments:
*
@@ -5727,6 +5789,9 @@
generate_string_indexof_stubs();
+ // byte_array_inflate stub for large arrays.
+ StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
+
if (UseMultiplyToLenIntrinsic) {
StubRoutines::_multiplyToLen = generate_multiplyToLen();
}
--- a/src/hotspot/cpu/aarch64/stubRoutines_aarch64.cpp Mon Jun 25 16:32:02 2018 +0300
+++ b/src/hotspot/cpu/aarch64/stubRoutines_aarch64.cpp Mon Jun 25 16:32:23 2018 +0300
@@ -55,6 +55,7 @@
address StubRoutines::aarch64::_string_indexof_linear_ll = NULL;
address StubRoutines::aarch64::_string_indexof_linear_uu = NULL;
address StubRoutines::aarch64::_string_indexof_linear_ul = NULL;
+address StubRoutines::aarch64::_large_byte_array_inflate = NULL;
bool StubRoutines::aarch64::_completed = false;
/**
--- a/src/hotspot/cpu/aarch64/stubRoutines_aarch64.hpp Mon Jun 25 16:32:02 2018 +0300
+++ b/src/hotspot/cpu/aarch64/stubRoutines_aarch64.hpp Mon Jun 25 16:32:23 2018 +0300
@@ -73,6 +73,7 @@
static address _string_indexof_linear_ll;
static address _string_indexof_linear_uu;
static address _string_indexof_linear_ul;
+ static address _large_byte_array_inflate;
static bool _completed;
public:
@@ -171,6 +172,10 @@
return _string_indexof_linear_uu;
}
+ static address large_byte_array_inflate() {
+ return _large_byte_array_inflate;
+ }
+
static bool complete() {
return _completed;
}