8184943: AARCH64: Intrinsify hasNegatives
Reviewed-by: aph
Contributed-by: dmitrij.pochepko@bell-sw.com, stuart.monteith@linaro.org
--- a/hotspot/src/cpu/aarch64/vm/aarch64.ad Tue Aug 15 02:49:48 2017 +0000
+++ b/hotspot/src/cpu/aarch64/vm/aarch64.ad Tue Aug 15 15:02:40 2017 +0300
@@ -16102,6 +16102,16 @@
ins_pipe(pipe_class_memory);
%}
+instruct has_negatives(iRegP_R1 ary1, iRegI_R2 len, iRegI_R0 result, rFlagsReg cr)
+%{
+ match(Set result (HasNegatives ary1 len)); // rule for the HasNegatives node
+ effect(USE_KILL ary1, USE_KILL len, KILL cr); // the emitted code advances ary1, decrements len and sets the flags
+ format %{ "has negatives byte[] $ary1,$len -> $result" %}
+ ins_encode %{
+ __ has_negatives($ary1$$Register, $len$$Register, $result$$Register); // operands are pinned to r1/r2/r0, the registers the out-of-line stubs use
+ %}
+ ins_pipe( pipe_slow );
+%}
// fast char[] to byte[] compression
instruct string_compress(iRegP_R2 src, iRegP_R1 dst, iRegI_R3 len,
--- a/hotspot/src/cpu/aarch64/vm/globals_aarch64.hpp Tue Aug 15 02:49:48 2017 +0000
+++ b/hotspot/src/cpu/aarch64/vm/globals_aarch64.hpp Tue Aug 15 15:02:40 2017 +0300
@@ -154,8 +154,11 @@
product(intx, BlockZeroingLowLimit, 256, \
"Minimum size in bytes when block zeroing will be used") \
range(1, max_jint) \
- product(bool, TraceTraps, false, "Trace all traps the signal handler")
-
+ product(bool, TraceTraps, false, "Trace all traps the signal handler")\
+ product(int, SoftwarePrefetchHintDistance, -1, \
+ "Use prfm hint with specified distance in compiled code. " \
+ "Value -1 means off.") \
+ range(-1, 32760) /* -1 = off; other values are rounded down to a multiple of 8 at startup */
#endif
--- a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp Tue Aug 15 02:49:48 2017 +0000
+++ b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp Tue Aug 15 15:02:40 2017 +0300
@@ -4829,6 +4829,62 @@
BLOCK_COMMENT("} string_compare");
}
+// Checks whether the byte array at ary1 (len bytes) contains a byte with its highest bit set.
+void MacroAssembler::has_negatives(Register ary1, Register len, Register result) { // result = 1 if found, else 0; clobbers ary1, len, rscratch1, rscratch2 and the flags
+ // Simple and most common case of aligned small array which is not at the
+ // end of memory page is placed here. All other cases are in stub.
+ Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE;
+ const uint64_t UPPER_BIT_MASK=0x8080808080808080; // top bit of each of the 8 bytes in a word
+ assert_different_registers(ary1, len, result);
+
+ cmpw(len, 0);
+ br(LE, SET_RESULT); // len == 0 leaves EQ set, so SET_RESULT yields false
+ cmpw(len, 4 * wordSize);
+ br(GE, STUB_LONG); // size >= 32 then go to stub
+
+ int shift = 64 - exact_log2(os::vm_page_size());
+ lsl(rscratch1, ary1, shift); // keep only the in-page offset, moved to the top bits
+ mov(rscratch2, (size_t)(4 * wordSize) << shift);
+ adds(rscratch2, rscratch1, rscratch2); // At end of page?
+ br(CS, STUB); // at the end of page then go to stub
+ subs(len, len, wordSize);
+ br(LT, END); // fewer than 8 bytes: only the tail load is needed
+
+ BIND(LOOP);
+ ldr(rscratch1, Address(post(ary1, wordSize))); // load 8 bytes and advance ary1
+ tst(rscratch1, UPPER_BIT_MASK);
+ br(NE, SET_RESULT); // some byte has its top bit set
+ subs(len, len, wordSize);
+ br(GE, LOOP);
+ cmpw(len, -wordSize);
+ br(EQ, SET_RESULT); // nothing left after the last full word: EQ makes the result false
+
+ BIND(END);
+ ldr(result, Address(ary1)); // may read past the array end; the page-end check above keeps the load inside the page
+ sub(len, zr, len, LSL, 3); // LSL 3 is to get bits from bytes
+ lslv(result, result, len); // drop the high-order bytes, i.e. those past the array end
+ tst(result, UPPER_BIT_MASK);
+ b(SET_RESULT);
+
+ BIND(STUB); // short array close to the end of a page
+ RuntimeAddress has_neg = RuntimeAddress(StubRoutines::aarch64::has_negatives());
+ assert(has_neg.target() != NULL, "has_negatives stub has not been generated");
+ trampoline_call(has_neg);
+ b(DONE); // the stub has already written result
+
+ BIND(STUB_LONG); // len >= 32
+ RuntimeAddress has_neg_long = RuntimeAddress(
+ StubRoutines::aarch64::has_negatives_long());
+ assert(has_neg_long.target() != NULL, "has_negatives stub has not been generated");
+ trampoline_call(has_neg_long);
+ b(DONE); // the stub has already written result
+
+ BIND(SET_RESULT);
+ cset(result, NE); // set true or false
+
+ BIND(DONE);
+}
+
// Compare Strings or char/byte arrays.
// is_string is true iff this is a string comparison.
--- a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp Tue Aug 15 02:49:48 2017 +0000
+++ b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp Tue Aug 15 15:02:40 2017 +0300
@@ -1210,6 +1210,8 @@
Register tmp1,
FloatRegister vtmp, FloatRegister vtmpZ, int ae);
+ void has_negatives(Register ary1, Register len, Register result); // result = 1 if ary1[0..len) has a byte with the top bit set, else 0
+
void arrays_equals(Register a1, Register a2,
Register result, Register cnt1,
int elem_size, bool is_string);
--- a/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp Tue Aug 15 02:49:48 2017 +0000
+++ b/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp Tue Aug 15 15:02:40 2017 +0300
@@ -3670,6 +3670,167 @@
__ eor(result, __ T16B, lo, t0);
}
+ address generate_has_negatives(address &has_negatives_long) { // returns the 1st entry point; the 2nd one is stored in has_negatives_long
+ StubCodeMark mark(this, "StubRoutines", "has_negatives");
+ const int large_loop_size = 64; // bytes checked per LARGE_LOOP iteration
+ const uint64_t UPPER_BIT_MASK=0x8080808080808080; // top bit of each of the 8 bytes in a word
+ int dcache_line = VM_Version::dcache_line_size();
+
+ Register ary1 = r1, len = r2, result = r0;
+
+ __ align(CodeEntryAlignment);
+ address entry = __ pc(); // 1st entry point
+
+ __ enter();
+
+ Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
+ LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
+
+ __ cmp(len, 15);
+ __ br(Assembler::GT, LEN_OVER_15); // lands after the 2nd entry's enter(), so the frame is set up only once
+ // The only case when execution falls into this code is when pointer is near
+ // the end of memory page and we have to avoid reading next page
+ __ add(ary1, ary1, len); // ary1 now points just past the last byte; the loads below read backwards from it
+ __ subs(len, len, 8);
+ __ br(Assembler::GT, LEN_OVER_8);
+ __ ldr(rscratch2, Address(ary1, -8)); // the 8 bytes ending at the array end
+ __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes.
+ __ lsrv(rscratch2, rscratch2, rscratch1); // drop the low-order bytes, i.e. those before the array start
+ __ tst(rscratch2, UPPER_BIT_MASK);
+ __ cset(result, Assembler::NE);
+ __ leave();
+ __ ret(lr);
+ __ bind(LEN_OVER_8);
+ __ ldp(rscratch1, rscratch2, Address(ary1, -16)); // the 16 bytes ending at the array end
+ __ sub(len, len, 8); // no data dep., then sub can be executed while loading
+ __ tst(rscratch2, UPPER_BIT_MASK); // the last 8 bytes lie fully inside the array
+ __ br(Assembler::NE, RET_TRUE_NO_POP); // nothing has been pushed on this path
+ __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
+ __ lsrv(rscratch1, rscratch1, rscratch2); // drop the low-order bytes, i.e. those before the array start
+ __ tst(rscratch1, UPPER_BIT_MASK);
+ __ cset(result, Assembler::NE);
+ __ leave();
+ __ ret(lr);
+
+ Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
+ const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6; // saved on entry to the long path, restored before returning
+
+ has_negatives_long = __ pc(); // 2nd entry point
+
+ __ enter();
+
+ __ bind(LEN_OVER_15);
+ __ push(spilled_regs, sp);
+ __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
+ __ cbz(rscratch2, ALIGNED);
+ __ ldp(tmp6, tmp1, Address(ary1)); // check the first 16 bytes; in bounds because len > 15
+ __ mov(tmp5, 16);
+ __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
+ __ add(ary1, ary1, rscratch1); // continue from the next 16-byte boundary; the overlap is simply checked twice
+ __ sub(len, len, rscratch1);
+ __ orr(tmp6, tmp6, tmp1);
+ __ tst(tmp6, UPPER_BIT_MASK);
+ __ br(Assembler::NE, RET_TRUE);
+
+ __ bind(ALIGNED);
+ __ cmp(len, large_loop_size);
+ __ br(Assembler::LT, CHECK_16);
+ // Perform 16-byte load as early return in pre-loop to handle situation
+ // when initially aligned large array has negative values at starting bytes,
+ // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is
+ // slower. Cases with negative bytes further ahead won't be affected that
+ // much. In fact, it'll be faster due to early loads, less instructions and
+ // less branches in LARGE_LOOP.
+ __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
+ __ sub(len, len, 16);
+ __ orr(tmp6, tmp6, tmp1);
+ __ tst(tmp6, UPPER_BIT_MASK);
+ __ br(Assembler::NE, RET_TRUE);
+ __ cmp(len, large_loop_size);
+ __ br(Assembler::LT, CHECK_16);
+
+ if (SoftwarePrefetchHintDistance >= 0 // NOTE(review): implied by the next test whenever dcache_line is non-negative
+ && SoftwarePrefetchHintDistance >= dcache_line) {
+ // initial prefetch
+ __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
+ }
+ __ bind(LARGE_LOOP);
+ if (SoftwarePrefetchHintDistance >= 0) {
+ __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
+ }
+ // Issue load instructions first, since it can save few CPU/MEM cycles, also
+ // instead of 4 triples of "orr(...); andr(...); cbnz(...);" (for each ldp)
+ // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3
+ // instructions per loop iteration and has fewer branches, but this approach
+ // disables early return, thus, all 64 bytes are loaded and checked every time.
+ __ ldp(tmp2, tmp3, Address(ary1));
+ __ ldp(tmp4, tmp5, Address(ary1, 16));
+ __ ldp(rscratch1, rscratch2, Address(ary1, 32));
+ __ ldp(tmp6, tmp1, Address(ary1, 48));
+ __ add(ary1, ary1, large_loop_size);
+ __ sub(len, len, large_loop_size);
+ __ orr(tmp2, tmp2, tmp3);
+ __ orr(tmp4, tmp4, tmp5);
+ __ orr(rscratch1, rscratch1, rscratch2);
+ __ orr(tmp6, tmp6, tmp1);
+ __ orr(tmp2, tmp2, tmp4);
+ __ orr(rscratch1, rscratch1, tmp6);
+ __ orr(tmp2, tmp2, rscratch1); // tmp2 = OR of all eight words loaded above
+ __ tst(tmp2, UPPER_BIT_MASK);
+ __ br(Assembler::NE, RET_TRUE);
+ __ cmp(len, large_loop_size);
+ __ br(Assembler::GE, LARGE_LOOP);
+
+ __ bind(CHECK_16); // small 16-byte load pre-loop
+ __ cmp(len, 16);
+ __ br(Assembler::LT, POST_LOOP16);
+
+ __ bind(LOOP16); // small 16-byte load loop
+ __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
+ __ sub(len, len, 16);
+ __ orr(tmp2, tmp2, tmp3);
+ __ tst(tmp2, UPPER_BIT_MASK);
+ __ br(Assembler::NE, RET_TRUE);
+ __ cmp(len, 16);
+ __ br(Assembler::GE, LOOP16); // 16-byte load loop end
+
+ __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
+ __ cmp(len, 8);
+ __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
+ __ ldr(tmp3, Address(__ post(ary1, 8)));
+ __ sub(len, len, 8);
+ __ tst(tmp3, UPPER_BIT_MASK);
+ __ br(Assembler::NE, RET_TRUE);
+
+ __ bind(POST_LOOP16_LOAD_TAIL);
+ __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
+ __ ldr(tmp1, Address(ary1));
+ __ mov(tmp2, 64);
+ __ sub(tmp4, tmp2, len, __ LSL, 3); // tmp4 = 64 - 8 * len
+ __ lslv(tmp1, tmp1, tmp4); // drop the high-order bytes, i.e. those past the array end
+ __ tst(tmp1, UPPER_BIT_MASK);
+ __ br(Assembler::NE, RET_TRUE);
+ // Fallthrough
+
+ __ bind(RET_FALSE);
+ __ pop(spilled_regs, sp);
+ __ leave();
+ __ mov(result, zr);
+ __ ret(lr);
+
+ __ bind(RET_TRUE);
+ __ pop(spilled_regs, sp);
+ __ bind(RET_TRUE_NO_POP);
+ __ leave();
+ __ mov(result, 1);
+ __ ret(lr);
+
+ __ bind(DONE); // NOTE(review): nothing in this stub branches to DONE, so this epilogue looks unreachable
+ __ pop(spilled_regs, sp);
+ __ leave();
+ __ ret(lr);
+ return entry;
+ }
/**
* Arguments:
*
@@ -4686,6 +4847,7 @@
// }
};
+
// Initialization
void generate_initial() {
// Generate initial stubs and initializes the entry points
@@ -4744,6 +4906,9 @@
// arraycopy stubs used by compilers
generate_arraycopy_stubs();
+ // has_negatives stubs: one entry for short arrays near a page end, one for long arrays.
+ StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
+
if (UseMultiplyToLenIntrinsic) {
StubRoutines::_multiplyToLen = generate_multiplyToLen();
}
--- a/hotspot/src/cpu/aarch64/vm/stubRoutines_aarch64.cpp Tue Aug 15 02:49:48 2017 +0000
+++ b/hotspot/src/cpu/aarch64/vm/stubRoutines_aarch64.cpp Tue Aug 15 15:02:40 2017 +0300
@@ -44,6 +44,8 @@
address StubRoutines::aarch64::_double_sign_mask = NULL;
address StubRoutines::aarch64::_double_sign_flip = NULL;
address StubRoutines::aarch64::_zero_blocks = NULL;
+address StubRoutines::aarch64::_has_negatives = NULL;
+address StubRoutines::aarch64::_has_negatives_long = NULL;
bool StubRoutines::aarch64::_completed = false;
/**
--- a/hotspot/src/cpu/aarch64/vm/stubRoutines_aarch64.hpp Tue Aug 15 02:49:48 2017 +0000
+++ b/hotspot/src/cpu/aarch64/vm/stubRoutines_aarch64.hpp Tue Aug 15 15:02:40 2017 +0300
@@ -62,6 +62,9 @@
static address _double_sign_flip;
static address _zero_blocks;
+
+ static address _has_negatives;
+ static address _has_negatives_long;
static bool _completed;
public:
@@ -120,6 +123,14 @@
return _zero_blocks;
}
+ static address has_negatives() { // stub entry used for short arrays close to the end of a page
+ return _has_negatives;
+ }
+
+ static address has_negatives_long() { // stub entry used for arrays of 32 bytes or more
+ return _has_negatives_long;
+ }
+
static bool complete() {
return _completed;
}
--- a/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp Tue Aug 15 02:49:48 2017 +0000
+++ b/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp Tue Aug 15 15:02:40 2017 +0300
@@ -137,6 +137,8 @@
FLAG_SET_DEFAULT(PrefetchScanIntervalInBytes, 3*dcache_line);
if (FLAG_IS_DEFAULT(PrefetchCopyIntervalInBytes))
FLAG_SET_DEFAULT(PrefetchCopyIntervalInBytes, 3*dcache_line);
+ if (FLAG_IS_DEFAULT(SoftwarePrefetchHintDistance))
+ FLAG_SET_DEFAULT(SoftwarePrefetchHintDistance, 3*dcache_line); // same 3*dcache_line default as the Prefetch*IntervalInBytes flags above
if (PrefetchCopyIntervalInBytes != -1 &&
((PrefetchCopyIntervalInBytes & 7) || (PrefetchCopyIntervalInBytes >= 32768))) {
@@ -146,6 +148,12 @@
PrefetchCopyIntervalInBytes = 32760;
}
+ if (SoftwarePrefetchHintDistance != -1 && // -1 means the hint is switched off
+ (SoftwarePrefetchHintDistance & 7)) { // any other value must be a multiple of 8
+ warning("SoftwarePrefetchHintDistance must be -1, or a multiple of 8");
+ SoftwarePrefetchHintDistance &= ~7; // round down to a multiple of 8
+ }
+
unsigned long auxv = getauxval(AT_HWCAP);
char buf[512];