# HG changeset patch
# User enevill
# Date 1460068576 0
# Node ID be8cc044b136601ae4119e2d555929c940fa42b9
# Parent 8a5693d27400227606dbce15406f833d8698eed7
8153797: aarch64: Add Arrays.fill stub code
Reviewed-by: aph
Contributed-by: long.chen@linaro.org

diff -r 8a5693d27400 -r be8cc044b136 hotspot/src/cpu/aarch64/vm/aarch64.ad
--- a/hotspot/src/cpu/aarch64/vm/aarch64.ad Thu Apr 07 08:57:26 2016 -1000
+++ b/hotspot/src/cpu/aarch64/vm/aarch64.ad Thu Apr 07 22:36:16 2016 +0000
@@ -4190,55 +4190,6 @@
     }
   %}

-  enc_class aarch64_enc_clear_array_reg_reg(iRegL_R11 cnt, iRegP_R10 base) %{
-    MacroAssembler _masm(&cbuf);
-    Register cnt_reg = as_Register($cnt$$reg);
-    Register base_reg = as_Register($base$$reg);
-    // base is word aligned
-    // cnt is count of words
-
-    Label loop;
-    Label entry;
-
-//  Algorithm:
-//
-//    scratch1 = cnt & 7;
-//    cnt -= scratch1;
-//    p += scratch1;
-//    switch (scratch1) {
-//      do {
-//        cnt -= 8;
-//          p[-8] = 0;
-//        case 7:
-//          p[-7] = 0;
-//        case 6:
-//          p[-6] = 0;
-//          // ...
-//        case 1:
-//          p[-1] = 0;
-//        case 0:
-//          p += 8;
-//      } while (cnt);
-//    }
-
-    const int unroll = 8; // Number of str(zr) instructions we'll unroll
-
-    __ andr(rscratch1, cnt_reg, unroll - 1); // tmp1 = cnt % unroll
-    __ sub(cnt_reg, cnt_reg, rscratch1);     // cnt -= unroll
-    // base_reg always points to the end of the region we're about to zero
-    __ add(base_reg, base_reg, rscratch1, Assembler::LSL, exact_log2(wordSize));
-    __ adr(rscratch2, entry);
-    __ sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
-    __ br(rscratch2);
-    __ bind(loop);
-    __ sub(cnt_reg, cnt_reg, unroll);
-    for (int i = -unroll; i < 0; i++)
-      __ str(zr, Address(base_reg, i * wordSize));
-    __ bind(entry);
-    __ add(base_reg, base_reg, unroll * wordSize);
-    __ cbnz(cnt_reg, loop);
-  %}
-
   /// mov envcodings

   enc_class aarch64_enc_movw_imm(iRegI dst, immI src) %{
@@ -13363,7 +13314,9 @@
   ins_cost(4 * INSN_COST);
   format %{ "ClearArray $cnt, $base" %}

-  ins_encode(aarch64_enc_clear_array_reg_reg(cnt, base));
+  ins_encode %{
+    __ zero_words($base$$Register, $cnt$$Register);
+  %}

   ins_pipe(pipe_class_memory);
 %}
diff -r 8a5693d27400 -r be8cc044b136 hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp
--- a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp Thu Apr 07 08:57:26 2016 -1000
+++ b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp Thu Apr 07 22:36:16 2016 +0000
@@ -4670,6 +4670,61 @@
   BLOCK_COMMENT(is_string ? "} string_equals" : "} array_equals");
 }

+// base:  Address of a buffer to be zeroed, 8 bytes aligned.
+// cnt:   Count in 8-byte unit.
+void MacroAssembler::zero_words(Register base, Register cnt)
+{
+  fill_words(base, cnt, zr);
+}
+
+// base:  Address of a buffer to be filled, 8 bytes aligned.
+// cnt:   Count in 8-byte unit.
+// value: Value to be filled with.
+// base will point to the end of the buffer after filling.
+void MacroAssembler::fill_words(Register base, Register cnt, Register value)
+{
+//  Algorithm:
+//
+//    scratch1 = cnt & 7;
+//    cnt -= scratch1;
+//    p += scratch1;
+//    switch (scratch1) {
+//      do {
+//        cnt -= 8;
+//          p[-8] = v;
+//        case 7:
+//          p[-7] = v;
+//        case 6:
+//          p[-6] = v;
+//          // ...
+//        case 1:
+//          p[-1] = v;
+//        case 0:
+//          p += 8;
+//      } while (cnt);
+//    }
+
+  assert_different_registers(base, cnt, value, rscratch1, rscratch2);
+
+  Label entry, loop;
+  const int unroll = 8; // Number of str instructions we'll unroll
+
+  andr(rscratch1, cnt, unroll - 1); // tmp1 = cnt % unroll
+  cbz(rscratch1, entry);
+  sub(cnt, cnt, rscratch1);         // cnt -= tmp1
+  // base always points to the end of the region we're about to fill
+  add(base, base, rscratch1, Assembler::LSL, 3);
+  adr(rscratch2, entry);
+  sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
+  br(rscratch2);
+  bind(loop);
+  add(base, base, unroll * 8);
+  sub(cnt, cnt, unroll);
+  for (int i = -unroll; i < 0; i++)
+    str(value, Address(base, i * 8));
+  bind(entry);
+  cbnz(cnt, loop);
+}

 // encode char[] to byte[] in ISO_8859_1
 void MacroAssembler::encode_iso_array(Register src, Register dst,
diff -r 8a5693d27400 -r be8cc044b136 hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp
--- a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp Thu Apr 07 08:57:26 2016 -1000
+++ b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp Thu Apr 07 22:36:16 2016 +0000
@@ -1184,6 +1184,9 @@
                      Register result, Register cnt1,
                      int elem_size, bool is_string);

+  void fill_words(Register base, Register cnt, Register value);
+  void zero_words(Register base, Register cnt);
+
   void encode_iso_array(Register src, Register dst,
                         Register len, Register result,
                         FloatRegister Vtmp1, FloatRegister Vtmp2,
diff -r 8a5693d27400 -r be8cc044b136 hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp
--- a/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp Thu Apr 07 08:57:26 2016 -1000
+++ b/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp Thu Apr 07 22:36:16 2016 +0000
@@ -2022,6 +2022,136 @@
     return start;
   }

+  //
+  // Generate stub for array fill. If "aligned" is true, the
+  // "to" address is assumed to be heapword aligned.
+  //
+  // Arguments for generated stub:
+  //   to:    c_rarg0
+  //   value: c_rarg1
+  //   count: c_rarg2 treated as signed
+  //
+  address generate_fill(BasicType t, bool aligned, const char *name) {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", name);
+    address start = __ pc();
+
+    BLOCK_COMMENT("Entry:");
+
+    const Register to        = c_rarg0;  // source array address
+    const Register value     = c_rarg1;  // value
+    const Register count     = c_rarg2;  // elements count
+    const Register cnt_words = c_rarg3;  // temp register
+
+    __ enter();
+
+    Label L_fill_elements, L_exit1;
+
+    int shift = -1;
+    switch (t) {
+      case T_BYTE:
+        shift = 0;
+        __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
+        __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
+        __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
+        __ br(Assembler::LO, L_fill_elements);
+        break;
+      case T_SHORT:
+        shift = 1;
+        __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
+        __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
+        __ br(Assembler::LO, L_fill_elements);
+        break;
+      case T_INT:
+        shift = 2;
+        __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
+        __ br(Assembler::LO, L_fill_elements);
+        break;
+      default: ShouldNotReachHere();
+    }
+
+    // Align source address at 8 bytes address boundary.
+    Label L_skip_align1, L_skip_align2, L_skip_align4;
+    if (!aligned) {
+      switch (t) {
+        case T_BYTE:
+          // One byte misalignment happens only for byte arrays.
+          __ tbz(to, 0, L_skip_align1);
+          __ strb(value, Address(__ post(to, 1)));
+          __ subw(count, count, 1);
+          __ bind(L_skip_align1);
+          // Fallthrough
+        case T_SHORT:
+          // Two bytes misalignment happens only for byte and short (char) arrays.
+          __ tbz(to, 1, L_skip_align2);
+          __ strh(value, Address(__ post(to, 2)));
+          __ subw(count, count, 2 >> shift);
+          __ bind(L_skip_align2);
+          // Fallthrough
+        case T_INT:
+          // Align to 8 bytes, we know we are 4 byte aligned to start.
+          __ tbz(to, 2, L_skip_align4);
+          __ strw(value, Address(__ post(to, 4)));
+          __ subw(count, count, 4 >> shift);
+          __ bind(L_skip_align4);
+          break;
+        default: ShouldNotReachHere();
+      }
+    }
+
+    //
+    // Fill large chunks
+    //
+    __ lsrw(cnt_words, count, 3 - shift); // number of words
+    __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
+    __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
+    __ fill_words(to, cnt_words, value);
+
+    // Remaining count is less than 8 bytes. Fill it by a single store.
+    // Note that the total length is no less than 8 bytes.
+    if (t == T_BYTE || t == T_SHORT) {
+      Label L_exit1;
+      __ cbzw(count, L_exit1);
+      __ add(to, to, count, Assembler::LSL, shift); // points to the end
+      __ str(value, Address(to, -8)); // overwrite some elements
+      __ bind(L_exit1);
+      __ leave();
+      __ ret(lr);
+    }
+
+    // Handle copies less than 8 bytes.
+    Label L_fill_2, L_fill_4, L_exit2;
+    __ bind(L_fill_elements);
+    switch (t) {
+      case T_BYTE:
+        __ tbz(count, 0, L_fill_2);
+        __ strb(value, Address(__ post(to, 1)));
+        __ bind(L_fill_2);
+        __ tbz(count, 1, L_fill_4);
+        __ strh(value, Address(__ post(to, 2)));
+        __ bind(L_fill_4);
+        __ tbz(count, 2, L_exit2);
+        __ strw(value, Address(to));
+        break;
+      case T_SHORT:
+        __ tbz(count, 0, L_fill_4);
+        __ strh(value, Address(__ post(to, 2)));
+        __ bind(L_fill_4);
+        __ tbz(count, 1, L_exit2);
+        __ strw(value, Address(to));
+        break;
+      case T_INT:
+        __ cbzw(count, L_exit2);
+        __ strw(value, Address(to));
+        break;
+      default: ShouldNotReachHere();
+    }
+    __ bind(L_exit2);
+    __ leave();
+    __ ret(lr);
+    return start;
+  }
+
   void generate_arraycopy_stubs() {
     address entry;
     address entry_jbyte_arraycopy;
@@ -2125,6 +2255,12 @@
                                                   entry_jlong_arraycopy,
                                                   entry_checkcast_arraycopy);

+    StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
+    StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
+    StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
+    StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
+    StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
+    StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
   }

   void generate_math_stubs() { Unimplemented(); }
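
For reference, here is a plain C++ sketch of what fill_words does (fill_words_sketch is a hypothetical illustration, not HotSpot code). It assumes an 8-byte aligned destination and a word count in 8-byte units, as the comments above require. The emitted code fuses the two parts by computing a branch target inside the unrolled str sequence; the sketch separates them into a switch for the leading remainder and a loop of eight stores per iteration, which gives the same stores and leaves the pointer at the end of the filled region.

#include <cstdint>
#include <cstddef>

// Hypothetical C++ model of the word-fill pattern emitted by fill_words.
// p must be 8-byte aligned; cnt is the number of 64-bit words to fill.
static void fill_words_sketch(uint64_t* p, size_t cnt, uint64_t v) {
  size_t rem = cnt & 7;     // words handled before the unrolled loop
  if (rem != 0) {
    cnt -= rem;
    p += rem;               // p points to the end of the region about to be filled
    switch (rem) {          // models the computed branch into the str sequence
      case 7: p[-7] = v;    // deliberate fallthrough, one store per case
      case 6: p[-6] = v;
      case 5: p[-5] = v;
      case 4: p[-4] = v;
      case 3: p[-3] = v;
      case 2: p[-2] = v;
      case 1: p[-1] = v;
    }
  }
  while (cnt != 0) {        // main loop: eight 8-byte stores per iteration
    p += 8;
    cnt -= 8;
    p[-8] = v; p[-7] = v; p[-6] = v; p[-5] = v;
    p[-4] = v; p[-3] = v; p[-2] = v; p[-1] = v;
  }                         // on exit p points one past the last word written
}

Within the patch itself, zero_words is simply the value == zr case of this pattern and now backs the ClearArray node, while generate_fill widens the fill value with bfi, aligns the destination, calls fill_words for the bulk, and finishes the sub-8-byte tail by element. The generated stubs are installed in StubRoutines::_jbyte_fill, _jshort_fill, _jint_fill and their arrayof_ variants, the entry points C2 can call when it recognizes fill loops such as Arrays.fill over byte, short and int arrays.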