8153713: aarch64: improve short array clearing using store pair
Summary: aarch64: generate store pair instruction to clear short arrays
Reviewed-by: aph
--- a/hotspot/src/cpu/aarch64/vm/aarch64.ad Thu Apr 14 08:32:39 2016 +0200
+++ b/hotspot/src/cpu/aarch64/vm/aarch64.ad Tue Apr 12 11:53:44 2016 +0800
@@ -13321,6 +13321,20 @@
ins_pipe(pipe_class_memory);
%}
+instruct clearArray_imm_reg(immL cnt, iRegP base, Universe dummy, rFlagsReg cr)
+%{
+ match(Set dummy (ClearArray cnt base));
+ // ClearArray whose count is a constant: the encoding hands base's register and the constant count to zero_words.
+ ins_cost(4 * INSN_COST);
+ format %{ "ClearArray $cnt, $base" %}
+ // NOTE(review): operand cr is declared in the operand list but is not referenced anywhere in this instruct body - confirm it is needed.
+ ins_encode %{
+ __ zero_words($base$$Register, (u_int64_t)$cnt$$constant); // immediate-count overload; cnt is passed as a compile-time value, not a register
+ %}
+
+ ins_pipe(pipe_class_memory);
+%}
+
// ============================================================================
// Overflow Math Instructions
--- a/hotspot/src/cpu/aarch64/vm/globals_aarch64.hpp Thu Apr 14 08:32:39 2016 +0200
+++ b/hotspot/src/cpu/aarch64/vm/globals_aarch64.hpp Tue Apr 12 11:53:44 2016 +0800
@@ -76,7 +76,8 @@
// avoid biased locking while we are bootstrapping the aarch64 build
define_pd_global(bool, UseBiasedLocking, false);
-define_pd_global(intx, InitArrayShortSize, 18*BytesPerLong);
+// Threshold is a single long (BytesPerLong): short arrays bigger than one word are cleared in an arch-specific way
+define_pd_global(intx, InitArrayShortSize, BytesPerLong);
#if defined(COMPILER1) || defined(COMPILER2)
define_pd_global(intx, InlineSmallCode, 1000);
--- a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp Thu Apr 14 08:32:39 2016 +0200
+++ b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp Tue Apr 12 11:53:44 2016 +0800
@@ -4677,6 +4677,39 @@
fill_words(base, cnt, zr);
}
+// Emits code that zeroes cnt 8-byte words starting at base. base: Address of a buffer to be zeroed, 8 bytes aligned.
+// cnt: Immediate count in 8-byte unit, known while generating code; the looping path below writes rscratch1 and rscratch2.
+#define ShortArraySize (18 * BytesPerLong) // 18 longs expressed via BytesPerLong; divided back into a word count below
+void MacroAssembler::zero_words(Register base, u_int64_t cnt)
+{
+ int i = cnt & 1; // store any odd word to start, so everything after it can be zeroed in pairs
+ if (i) str(zr, Address(base)); // single zero word at offset 0; i == 1 is then the index of the next word
+
+ if (cnt <= ShortArraySize / BytesPerLong) { // short case: straight-line stp sequence, no loop emitted
+ for (; i < (int)cnt; i += 2)
+ stp(zr, zr, Address(base, i * wordSize)); // zeroes words i and i + 1
+ } else {
+ const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
+ int remainder = cnt % (2 * unroll); // words left over once cnt is split into whole iterations of 2 * unroll words
+ for (; i < remainder; i += 2) // i and remainder have the same parity as cnt, so i ends exactly at remainder
+ stp(zr, zr, Address(base, i * wordSize)); // leftover words are zeroed straight-line, ahead of the loop
+
+ Label loop;
+ Register cnt_reg = rscratch1; // number of words the emitted loop still has to zero
+ Register loop_base = rscratch2; // running store address used by the emitted loop
+ cnt = cnt - remainder; // nonzero multiple of 2 * unroll here (cnt > 18 and remainder < 2 * unroll)
+ mov(cnt_reg, cnt);
+ // adjust base and prebias by -2 * wordSize so we can pre-increment
+ add(loop_base, base, (remainder - 2) * wordSize);
+ bind(loop);
+ sub(cnt_reg, cnt_reg, 2 * unroll); // each iteration accounts for 2 * unroll words
+ for (i = 1; i < unroll; i++)
+ stp(zr, zr, Address(loop_base, 2 * i * wordSize)); // first unroll - 1 pairs, addressed off the not-yet-advanced loop_base
+ stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize))); // last pair: advance loop_base by 2 * unroll words, then store there
+ cbnz(cnt_reg, loop); // repeat until the remaining word count reaches zero
+ }
+}
+
// base: Address of a buffer to be filled, 8 bytes aligned.
// cnt: Count in 8-byte unit.
// value: Value to be filled with.
--- a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp Thu Apr 14 08:32:39 2016 +0200
+++ b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp Tue Apr 12 11:53:44 2016 +0800
@@ -1186,6 +1186,7 @@
void fill_words(Register base, Register cnt, Register value);
void zero_words(Register base, Register cnt);
+ void zero_words(Register base, u_int64_t cnt); // cnt is an immediate count in 8-byte units; base must be 8-byte aligned
void encode_iso_array(Register src, Register dst,
Register len, Register result,