8153797: aarch64: Add Arrays.fill stub code
Reviewed-by: aph
Contributed-by: long.chen@linaro.org
--- a/hotspot/src/cpu/aarch64/vm/aarch64.ad Thu Apr 07 08:57:26 2016 -1000
+++ b/hotspot/src/cpu/aarch64/vm/aarch64.ad Thu Apr 07 22:36:16 2016 +0000
@@ -4190,55 +4190,6 @@
}
%}
- enc_class aarch64_enc_clear_array_reg_reg(iRegL_R11 cnt, iRegP_R10 base) %{
- MacroAssembler _masm(&cbuf);
- Register cnt_reg = as_Register($cnt$$reg);
- Register base_reg = as_Register($base$$reg);
- // base is word aligned
- // cnt is count of words
-
- Label loop;
- Label entry;
-
-// Algorithm:
-//
-// scratch1 = cnt & 7;
-// cnt -= scratch1;
-// p += scratch1;
-// switch (scratch1) {
-// do {
-// cnt -= 8;
-// p[-8] = 0;
-// case 7:
-// p[-7] = 0;
-// case 6:
-// p[-6] = 0;
-// // ...
-// case 1:
-// p[-1] = 0;
-// case 0:
-// p += 8;
-// } while (cnt);
-// }
-
- const int unroll = 8; // Number of str(zr) instructions we'll unroll
-
- __ andr(rscratch1, cnt_reg, unroll - 1); // tmp1 = cnt % unroll
- __ sub(cnt_reg, cnt_reg, rscratch1); // cnt -= unroll
- // base_reg always points to the end of the region we're about to zero
- __ add(base_reg, base_reg, rscratch1, Assembler::LSL, exact_log2(wordSize));
- __ adr(rscratch2, entry);
- __ sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
- __ br(rscratch2);
- __ bind(loop);
- __ sub(cnt_reg, cnt_reg, unroll);
- for (int i = -unroll; i < 0; i++)
- __ str(zr, Address(base_reg, i * wordSize));
- __ bind(entry);
- __ add(base_reg, base_reg, unroll * wordSize);
- __ cbnz(cnt_reg, loop);
- %}
-
/// mov envcodings
enc_class aarch64_enc_movw_imm(iRegI dst, immI src) %{
@@ -13363,7 +13314,9 @@
ins_cost(4 * INSN_COST);
format %{ "ClearArray $cnt, $base" %}
- ins_encode(aarch64_enc_clear_array_reg_reg(cnt, base));
+ ins_encode %{
+ __ zero_words($base$$Register, $cnt$$Register);
+ %}
ins_pipe(pipe_class_memory);
%}
--- a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp Thu Apr 07 08:57:26 2016 -1000
+++ b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp Thu Apr 07 22:36:16 2016 +0000
@@ -4670,6 +4670,61 @@
BLOCK_COMMENT(is_string ? "} string_equals" : "} array_equals");
}
+// Zero cnt 8-byte words starting at base by delegating to fill_words with zr as the value.
+// base: Address of a buffer to be zeroed, 8-byte aligned. cnt: Count in 8-byte units.
+void MacroAssembler::zero_words(Register base, Register cnt)
+{
+ fill_words(base, cnt, zr); // fill_words modifies base, cnt, rscratch1 and rscratch2
+}
+
+// Fill cnt 8-byte words starting at base with value, using an 8-way unrolled store loop.
+// base: Address of a buffer to be filled, 8-byte aligned. cnt: Count in 8-byte units.
+// value: Value to be filled with. rscratch1 and rscratch2 are used as temporaries.
+// On exit base points to the end of the buffer and cnt is zero.
+void MacroAssembler::fill_words(Register base, Register cnt, Register value)
+{
+// Algorithm (the loop is entered part-way through to handle cnt % 8 first):
+//
+// scratch1 = cnt & 7;
+// cnt -= scratch1;
+// p += scratch1;
+// switch (scratch1) {
+// do {
+// p += 8; cnt -= 8;
+// p[-8] = v;
+// case 7:
+// p[-7] = v;
+// case 6:
+// p[-6] = v;
+// // ...
+// case 1:
+// p[-1] = v;
+// case 0:
+// ; // nothing left to store
+// } while (cnt);
+// }
+
+ assert_different_registers(base, cnt, value, rscratch1, rscratch2);
+
+ Label entry, loop;
+ const int unroll = 8; // Number of str instructions we'll unroll
+
+ andr(rscratch1, cnt, unroll - 1); // tmp1 = cnt % unroll
+ cbz(rscratch1, entry); // no remainder: go straight to the loop test
+ sub(cnt, cnt, rscratch1); // cnt -= tmp1
+ // base always points to the end of the region we're about to fill
+ add(base, base, rscratch1, Assembler::LSL, 3); // base += tmp1 * 8 bytes
+ adr(rscratch2, entry);
+ sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2); // back up tmp1 stores; assumes 4 bytes per str
+ br(rscratch2); // jump into the unrolled stores so only the last tmp1 of them run
+ bind(loop);
+ add(base, base, unroll * 8);
+ sub(cnt, cnt, unroll);
+ for (int i = -unroll; i < 0; i++)
+ str(value, Address(base, i * 8)); // stores to word slots base[-8] .. base[-1]
+ bind(entry);
+ cbnz(cnt, loop); // repeat while whole groups of 8 words remain
+}
// encode char[] to byte[] in ISO_8859_1
void MacroAssembler::encode_iso_array(Register src, Register dst,
--- a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp Thu Apr 07 08:57:26 2016 -1000
+++ b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp Thu Apr 07 22:36:16 2016 +0000
@@ -1184,6 +1184,9 @@
Register result, Register cnt1,
int elem_size, bool is_string);
+ void fill_words(Register base, Register cnt, Register value); // store value into cnt 8-byte words at base
+ void zero_words(Register base, Register cnt); // fill_words with zr as the value
+
void encode_iso_array(Register src, Register dst,
Register len, Register result,
FloatRegister Vtmp1, FloatRegister Vtmp2,
--- a/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp Thu Apr 07 08:57:26 2016 -1000
+++ b/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp Thu Apr 07 22:36:16 2016 +0000
@@ -2022,6 +2022,136 @@
return start;
}
+ // Emits a fill stub for T_BYTE, T_SHORT or T_INT elements and returns its entry address.
+ // Generate stub for array fill. If "aligned" is true, the
+ // "to" address is assumed to be heapword aligned.
+ //
+ // Arguments for generated stub:
+ // to: c_rarg0
+ // value: c_rarg1
+ // count: c_rarg2 32-bit element count (NOTE(review): the short-array check below uses unsigned LO)
+ // c_rarg3 is used as a temporary holding the number of 8-byte words.
+ address generate_fill(BasicType t, bool aligned, const char *name) {
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", name);
+ address start = __ pc();
+
+ BLOCK_COMMENT("Entry:");
+
+ const Register to = c_rarg0; // destination array address
+ const Register value = c_rarg1; // value
+ const Register count = c_rarg2; // elements count
+ const Register cnt_words = c_rarg3; // temp register
+
+ __ enter();
+
+ Label L_fill_elements;
+
+ int shift = -1; // log2 of the element size in bytes, set per type below
+ switch (t) {
+ case T_BYTE:
+ shift = 0;
+ __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
+ __ bfi(value, value, 8, 8); // 8 bit -> 16 bit
+ __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
+ __ br(Assembler::LO, L_fill_elements);
+ break;
+ case T_SHORT:
+ shift = 1;
+ __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
+ __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
+ __ br(Assembler::LO, L_fill_elements);
+ break;
+ case T_INT:
+ shift = 2;
+ __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
+ __ br(Assembler::LO, L_fill_elements);
+ break;
+ default: ShouldNotReachHere();
+ }
+
+ // Align destination address at 8 bytes address boundary.
+ Label L_skip_align1, L_skip_align2, L_skip_align4;
+ if (!aligned) {
+ switch (t) {
+ case T_BYTE:
+ // One byte misalignment happens only for byte arrays.
+ __ tbz(to, 0, L_skip_align1);
+ __ strb(value, Address(__ post(to, 1)));
+ __ subw(count, count, 1);
+ __ bind(L_skip_align1);
+ // Fallthrough
+ case T_SHORT:
+ // Two bytes misalignment happens only for byte and short (char) arrays.
+ __ tbz(to, 1, L_skip_align2);
+ __ strh(value, Address(__ post(to, 2)));
+ __ subw(count, count, 2 >> shift);
+ __ bind(L_skip_align2);
+ // Fallthrough
+ case T_INT:
+ // Align to 8 bytes, we know we are 4 byte aligned to start.
+ __ tbz(to, 2, L_skip_align4);
+ __ strw(value, Address(__ post(to, 4)));
+ __ subw(count, count, 4 >> shift);
+ __ bind(L_skip_align4);
+ break;
+ default: ShouldNotReachHere();
+ }
+ }
+
+ //
+ // Fill large chunks
+ //
+ __ lsrw(cnt_words, count, 3 - shift); // number of words
+ __ bfi(value, value, 32, 32); // 32 bit -> 64 bit
+ __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); // count = elements left after the whole words
+ __ fill_words(to, cnt_words, value); // leaves "to" pointing just past the filled words
+
+ // Remaining count is less than 8 bytes. Fill it by a single store.
+ // Note that the total length is no less than 8 bytes.
+ if (t == T_BYTE || t == T_SHORT) {
+ Label L_exit1;
+ __ cbzw(count, L_exit1);
+ __ add(to, to, count, Assembler::LSL, shift); // points to the end
+ __ str(value, Address(to, -8)); // overwrite some elements
+ __ bind(L_exit1);
+ __ leave();
+ __ ret(lr);
+ }
+
+ // Handle fills of less than 8 bytes (the T_INT path above also falls through to here).
+ Label L_fill_2, L_fill_4, L_exit2;
+ __ bind(L_fill_elements);
+ switch (t) {
+ case T_BYTE:
+ __ tbz(count, 0, L_fill_2);
+ __ strb(value, Address(__ post(to, 1)));
+ __ bind(L_fill_2);
+ __ tbz(count, 1, L_fill_4);
+ __ strh(value, Address(__ post(to, 2)));
+ __ bind(L_fill_4);
+ __ tbz(count, 2, L_exit2);
+ __ strw(value, Address(to));
+ break;
+ case T_SHORT:
+ __ tbz(count, 0, L_fill_4);
+ __ strh(value, Address(__ post(to, 2)));
+ __ bind(L_fill_4);
+ __ tbz(count, 1, L_exit2);
+ __ strw(value, Address(to));
+ break;
+ case T_INT:
+ __ cbzw(count, L_exit2);
+ __ strw(value, Address(to));
+ break;
+ default: ShouldNotReachHere();
+ }
+ __ bind(L_exit2);
+ __ leave();
+ __ ret(lr);
+ return start;
+ }
+
void generate_arraycopy_stubs() {
address entry;
address entry_jbyte_arraycopy;
@@ -2125,6 +2255,12 @@
entry_jlong_arraycopy,
entry_checkcast_arraycopy);
+ StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
+ StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
+ StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
+ StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
+ StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
+ StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
}
void generate_math_stubs() { Unimplemented(); }