8155617: aarch64: ClearArray does not use DC ZVA
Summary: Implement block zero using DC ZVA
Reviewed-by: aph
Contributed-by: long.chen@linaro.org, edward.nevill@gmail.com
--- a/hotspot/src/cpu/aarch64/vm/aarch64.ad Thu Apr 28 17:36:37 2016 +0200
+++ b/hotspot/src/cpu/aarch64/vm/aarch64.ad Thu Apr 28 13:26:29 2016 +0000
@@ -13470,9 +13470,10 @@
ins_pipe(pipe_class_memory);
%}
-instruct clearArray_imm_reg(immL cnt, iRegP base, Universe dummy, rFlagsReg cr)
+instruct clearArray_imm_reg(immL cnt, iRegP_R10 base, iRegL_R11 tmp, Universe dummy, rFlagsReg cr)
%{
match(Set dummy (ClearArray cnt base));
+ effect(USE_KILL base, TEMP tmp);
ins_cost(4 * INSN_COST);
format %{ "ClearArray $cnt, $base" %}
--- a/hotspot/src/cpu/aarch64/vm/assembler_aarch64.hpp Thu Apr 28 17:36:37 2016 +0200
+++ b/hotspot/src/cpu/aarch64/vm/assembler_aarch64.hpp Thu Apr 28 13:26:29 2016 +0000
@@ -1032,12 +1032,28 @@
system(0b00, 0b011, 0b00011, SY, 0b110);
}
- void dc(Register Rt) {
- system(0b01, 0b011, 0b0111, 0b1011, 0b001, Rt);
+ void sys(int op1, int CRn, int CRm, int op2, // common emitter behind dc() and ic()
+ Register rt = (Register)0b11111) { // rt defaults to register number 31 when the caller supplies none
+ system(0b01, op1, CRn, CRm, op2, rt); // forwards all fields to system() with a fixed leading 0b01
}
- void ic(Register Rt) {
- system(0b01, 0b011, 0b0111, 0b0101, 0b001, Rt);
+ // Only implement operations accessible from EL0 or higher, i.e.,
+ // op1 CRn CRm op2
+ // IC IVAU 3 7 5 1
+ // DC CVAC 3 7 10 1
+ // DC CVAU 3 7 11 1
+ // DC CIVAC 3 7 14 1
+ // DC ZVA 3 7 4 1
+ // op1, CRn and op2 are the same in every row, so the enums below carry only CRm.
+ enum icache_maintenance {IVAU = 0b0101}; // CRm selector passed to ic()
+ enum dcache_maintenance {CVAC = 0b1010, CVAU = 0b1011, CIVAC = 0b1110, ZVA = 0b100}; // CRm selector passed to dc()
+
+ void dc(dcache_maintenance cm, Register Rt) { // data cache maintenance operation 'cm', operand register Rt
+ sys(0b011, 0b0111, cm, 0b001, Rt); // op1 = 3, CRn = 7, CRm = cm, op2 = 1
+ }
+
+ void ic(icache_maintenance cm, Register Rt) { // instruction cache maintenance operation 'cm', operand register Rt
+ sys(0b011, 0b0111, cm, 0b001, Rt); // op1 = 3, CRn = 7, CRm = cm, op2 = 1
}
// A more convenient access to dmb for our purposes
--- a/hotspot/src/cpu/aarch64/vm/globals_aarch64.hpp Thu Apr 28 17:36:37 2016 +0200
+++ b/hotspot/src/cpu/aarch64/vm/globals_aarch64.hpp Thu Apr 28 13:26:29 2016 +0000
@@ -132,6 +132,11 @@
"Use SIMD instructions in generated memory move code") \
product(bool, UseLSE, false, \
"Use LSE instructions") \
+ product(bool, UseBlockZeroing, true, \
+ "Use DC ZVA for block zeroing") \
+ product(intx, BlockZeroingLowLimit, 256, \
+ "Minimum size in bytes when block zeroing will be used") \
+ range(1, max_jint) \
product(bool, TraceTraps, false, "Trace all traps the signal handler")
#endif
--- a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp Thu Apr 28 17:36:37 2016 +0200
+++ b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp Thu Apr 28 13:26:29 2016 +0000
@@ -4670,24 +4670,35 @@
BLOCK_COMMENT(is_string ? "} string_equals" : "} array_equals");
}
-// base: Address of a buffer to be zeroed, 8 bytes aligned.
-// cnt: Count in 8-byte unit.
+
+// base: Address of a buffer to be zeroed, 8 bytes aligned.
+// cnt: Count in HeapWords.
+// Dispatches to block_zero() when UseBlockZeroing is set, else to fill_words() with zr.
void MacroAssembler::zero_words(Register base, Register cnt)
{
- fill_words(base, cnt, zr);
+ if (UseBlockZeroing) {
+ block_zero(base, cnt);
+ } else {
+ fill_words(base, cnt, zr);
+ }
}
-// base: Address of a buffer to be zeroed, 8 bytes aligned.
-// cnt: Immediate count in 8-byte unit.
+// r10 = base: Address of a buffer to be zeroed, 8 bytes aligned.
+// cnt: Immediate count in HeapWords.
+// r11 = tmp: For use as cnt if we need to call out
#define ShortArraySize (18 * BytesPerLong)
void MacroAssembler::zero_words(Register base, u_int64_t cnt)
{
+ Register tmp = r11;
int i = cnt & 1; // store any odd word to start
if (i) str(zr, Address(base));
if (cnt <= ShortArraySize / BytesPerLong) {
for (; i < (int)cnt; i += 2)
stp(zr, zr, Address(base, i * wordSize));
+ } else if (UseBlockZeroing && cnt >= (u_int64_t)(BlockZeroingLowLimit >> LogBytesPerWord)) {
+ mov(tmp, cnt);
+ block_zero(base, tmp, true);
} else {
const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
int remainder = cnt % (2 * unroll);
@@ -4739,24 +4750,95 @@
assert_different_registers(base, cnt, value, rscratch1, rscratch2);
- Label entry, loop;
- const int unroll = 8; // Number of str instructions we'll unroll
-
- andr(rscratch1, cnt, unroll - 1); // tmp1 = cnt % unroll
- cbz(rscratch1, entry);
- sub(cnt, cnt, rscratch1); // cnt -= tmp1
- // base always points to the end of the region we're about to fill
+ Label fini, skip, entry, loop;
+ const int unroll = 8; // Number of stp instructions we'll unroll
+
+ cbz(cnt, fini);
+ tbz(base, 3, skip);
+ str(value, Address(post(base, 8)));
+ sub(cnt, cnt, 1);
+ bind(skip);
+
+ andr(rscratch1, cnt, (unroll-1) * 2);
+ sub(cnt, cnt, rscratch1);
add(base, base, rscratch1, Assembler::LSL, 3);
adr(rscratch2, entry);
- sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
+ sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
br(rscratch2);
+
bind(loop);
- add(base, base, unroll * 8);
- sub(cnt, cnt, unroll);
for (int i = -unroll; i < 0; i++)
- str(value, Address(base, i * 8));
+ stp(value, value, Address(base, i * 16));
bind(entry);
- cbnz(cnt, loop);
+ subs(cnt, cnt, unroll * 2);
+ add(base, base, unroll * 16);
+ br(Assembler::GE, loop);
+
+ tbz(cnt, 0, fini);
+ str(value, Address(base, -unroll * 16));
+ bind(fini);
+}
+
+// Use DC ZVA to do fast zeroing.
+// base: Address of a buffer to be zeroed, 8 bytes aligned; must be r10.
+// cnt: Count in HeapWords; must be r11. Clobbers rscratch1 and rscratch2.
+// is_large: True when 'cnt' is known to be >= BlockZeroingLowLimit.
+void MacroAssembler::block_zero(Register base, Register cnt, bool is_large)
+{
+  Label small, done, base_aligned;
+
+  // The out-of-line zero_longs stub called below is generated for r10/r11 only.
+  assert(base == r10 && cnt == r11, "fix register usage");
+  assert_different_registers(base, cnt, rscratch1, rscratch2);
+
+  Register tmp = rscratch1;
+  Register tmp2 = rscratch2;
+  int zva_length = VM_Version::zva_length();
+
+  // Ensure ZVA length can be divided by 16. This is required by
+  // the subsequent operations.
+  assert (zva_length % 16 == 0, "Unexpected ZVA Length");
+
+  if (!is_large) cbz(cnt, done);
+  tbz(base, 3, base_aligned);       // bit 3 clear: base is already 16-byte aligned
+  str(zr, Address(post(base, 8)));  // zero one word so that base becomes 16-byte aligned
+  sub(cnt, cnt, 1);
+  bind(base_aligned);
+
+  // Ensure count >= zva_length * 2 so that it still deserves a zva after
+  // alignment.
+  if (!is_large || !(BlockZeroingLowLimit >= zva_length * 2)) {
+    int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
+    cmp(cnt, low_limit >> 3);       // low_limit is in bytes, cnt in 8-byte words
+    br(Assembler::LT, small);
+  }
+
+  far_call(StubRoutines::aarch64::get_zero_longs());  // returns with the not-yet-zeroed word count in cnt
+
+  bind(small);
+
+  const int unroll = 8; // Number of stp instructions we'll unroll
+  Label small_loop, small_table_end;
+
+  andr(tmp, cnt, (unroll-1) * 2);           // tmp = even number of words (0..14) handled on the first pass
+  sub(cnt, cnt, tmp);
+  add(base, base, tmp, Assembler::LSL, 3);  // base points past the words the first pass will zero
+  adr(tmp2, small_table_end);
+  sub(tmp2, tmp2, tmp, Assembler::LSL, 1);  // one 4-byte stp per 2 words: back up tmp * 2 bytes of code
+  br(tmp2);
+
+  bind(small_loop);
+  for (int i = -unroll; i < 0; i++)
+    stp(zr, zr, Address(base, i * 16));
+  bind(small_table_end);
+  subs(cnt, cnt, unroll * 2);
+  add(base, base, unroll * 16);
+  br(Assembler::GE, small_loop);
+
+  tbz(cnt, 0, done);                        // bit 0 set: one odd trailing word remains
+  str(zr, Address(base, -unroll * 16));
+
+  bind(done);
}
// encode char[] to byte[] in ISO_8859_1
--- a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp Thu Apr 28 17:36:37 2016 +0200
+++ b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp Thu Apr 28 13:26:29 2016 +0000
@@ -536,6 +536,15 @@
msr(0b011, 0b0100, 0b0100, 0b001, zr);
}
+ // Read DCZID_EL0 into reg. Encoding: op1 == 011
+ // CRn == 0000
+ // CRm == 0000
+ // op2 == 111
+ inline void get_dczid_el0(Register reg)
+ {
+ mrs(0b011, 0b0000, 0b0000, 0b111, reg); // arguments in the order op1, CRn, CRm, op2 listed above
+ }
+
// idiv variant which deals with MINLONG as dividend and -1 as divisor
int corrected_idivl(Register result, Register ra, Register rb,
bool want_remainder, Register tmp = rscratch1);
@@ -1185,8 +1194,9 @@
int elem_size, bool is_string);
void fill_words(Register base, Register cnt, Register value);
+ void zero_words(Register base, u_int64_t cnt);
void zero_words(Register base, Register cnt);
- void zero_words(Register base, u_int64_t cnt);
+ void block_zero(Register base, Register cnt, bool is_large = false);
void encode_iso_array(Register src, Register dst,
Register len, Register result,
--- a/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp Thu Apr 28 17:36:37 2016 +0200
+++ b/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp Thu Apr 28 13:26:29 2016 +0000
@@ -719,6 +719,43 @@
}
}
+ address generate_zero_longs(Register base, Register cnt) { // emits the "zero_longs" stub and returns its entry address
+ Register tmp = rscratch1;
+ Register tmp2 = rscratch2;
+ int zva_length = VM_Version::zva_length();
+ Label initial_table_end, loop_zva;
+
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", "zero_longs");
+ address start = __ pc();
+
+ // Align base with ZVA length.
+ __ neg(tmp, base);
+ __ andr(tmp, tmp, zva_length - 1);
+
+ // tmp: the number of bytes to be filled to align the base with ZVA length.
+ __ add(base, base, tmp);
+ __ sub(cnt, cnt, tmp, Assembler::ASR, 3); // tmp is in bytes, cnt in 8-byte words
+ __ adr(tmp2, initial_table_end);
+ __ sub(tmp2, tmp2, tmp, Assembler::LSR, 2); // one 4-byte stp zeroes 16 bytes: back up tmp / 4 bytes of code
+ __ br(tmp2); // enter the stp table part-way; assumes tmp is a multiple of 16 - TODO confirm callers 16-byte align base
+
+ for (int i = -zva_length + 16; i < 0; i += 16)
+ __ stp(zr, zr, Address(base, i));
+ __ bind(initial_table_end);
+
+ __ sub(cnt, cnt, zva_length >> 3); // pre-bias by one block so the GE test below exits with less than a block left
+ __ bind(loop_zva);
+ __ dc(Assembler::ZVA, base); // issued before any count check: at least one full block must remain here
+ __ subs(cnt, cnt, zva_length >> 3);
+ __ add(base, base, zva_length);
+ __ br(Assembler::GE, loop_zva);
+ __ add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
+ __ ret(lr);
+
+ return start;
+ }
+
typedef enum {
copy_forwards = 1,
copy_backwards = -1
@@ -2104,7 +2141,21 @@
__ lsrw(cnt_words, count, 3 - shift); // number of words
__ bfi(value, value, 32, 32); // 32 bit -> 64 bit
__ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
- __ fill_words(to, cnt_words, value);
+ if (UseBlockZeroing) {
+ Label non_block_zeroing, rest;
+ // count >= BlockZeroingLowLimit && value == 0
+ __ cmp(cnt_words, BlockZeroingLowLimit >> 3);
+ __ ccmp(value, 0 /* comparing value */, 0 /* NZCV */, Assembler::GE);
+ __ br(Assembler::NE, non_block_zeroing);
+ __ block_zero(to, cnt_words, true);
+ __ b(rest);
+ __ bind(non_block_zeroing);
+ __ fill_words(to, cnt_words, value);
+ __ bind(rest);
+ }
+ else {
+ __ fill_words(to, cnt_words, value);
+ }
// Remaining count is less than 8 bytes. Fill it by a single store.
// Note that the total length is no less than 8 bytes.
@@ -2163,6 +2214,8 @@
generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
+ StubRoutines::aarch64::_zero_longs = generate_zero_longs(r10, r11);
+
//*** jbyte
// Always need aligned and unaligned versions
StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry,
--- a/hotspot/src/cpu/aarch64/vm/stubRoutines_aarch64.cpp Thu Apr 28 17:36:37 2016 +0200
+++ b/hotspot/src/cpu/aarch64/vm/stubRoutines_aarch64.cpp Thu Apr 28 13:26:29 2016 +0000
@@ -43,6 +43,7 @@
address StubRoutines::aarch64::_float_sign_flip = NULL;
address StubRoutines::aarch64::_double_sign_mask = NULL;
address StubRoutines::aarch64::_double_sign_flip = NULL;
+address StubRoutines::aarch64::_zero_longs = NULL;
/**
* crc_table[] from jdk/src/share/native/java/util/zip/zlib-1.2.5/crc32.h
--- a/hotspot/src/cpu/aarch64/vm/stubRoutines_aarch64.hpp Thu Apr 28 17:36:37 2016 +0200
+++ b/hotspot/src/cpu/aarch64/vm/stubRoutines_aarch64.hpp Thu Apr 28 13:26:29 2016 +0000
@@ -61,6 +61,8 @@
static address _double_sign_mask;
static address _double_sign_flip;
+ static address _zero_longs;
+
public:
static address get_previous_fp_entry()
@@ -113,6 +115,11 @@
return _double_sign_flip;
}
+ static address get_zero_longs() // entry address of the zero_longs stub; NULL until the stub generator assigns it
+ {
+ return _zero_longs;
+ }
+
private:
static juint _crc_table[];
--- a/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp Thu Apr 28 17:36:37 2016 +0200
+++ b/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp Thu Apr 28 13:26:29 2016 +0000
@@ -71,6 +71,7 @@
int VM_Version::_variant;
int VM_Version::_revision;
int VM_Version::_stepping;
+VM_Version::PsrInfo VM_Version::_psr_info = { 0, };
static BufferBlob* stub_blob;
static const int stub_size = 550;
@@ -95,13 +96,16 @@
__ c_stub_prolog(1, 0, MacroAssembler::ret_type_void);
#endif
- // void getPsrInfo(VM_Version::CpuidInfo* cpuid_info);
+ // void getPsrInfo(VM_Version::PsrInfo* psr_info);
address entry = __ pc();
- // TODO : redefine fields in CpuidInfo and generate
- // code to fill them in
+ __ enter();
+ __ get_dczid_el0(rscratch1);
+ __ strw(rscratch1, Address(c_rarg0, in_bytes(VM_Version::dczid_el0_offset())));
+
+ __ leave();
__ ret(lr);
# undef __
@@ -118,6 +122,8 @@
_supports_atomic_getset8 = true;
_supports_atomic_getadd8 = true;
+ getPsrInfo_stub(&_psr_info);
+
if (FLAG_IS_DEFAULT(AllocatePrefetchDistance))
FLAG_SET_DEFAULT(AllocatePrefetchDistance, 256);
if (FLAG_IS_DEFAULT(AllocatePrefetchStepSize))
@@ -285,6 +291,18 @@
FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
}
+  if (is_zva_enabled()) {
+    if (FLAG_IS_DEFAULT(UseBlockZeroing)) {
+      FLAG_SET_DEFAULT(UseBlockZeroing, true);
+    }
+    if (FLAG_IS_DEFAULT(BlockZeroingLowLimit)) {
+      FLAG_SET_DEFAULT(BlockZeroingLowLimit, 4 * VM_Version::zva_length());
+    }
+  } else if (UseBlockZeroing) {  // DC ZVA unusable; the flag defaults to true, so warn only on an explicit request
+    if (!FLAG_IS_DEFAULT(UseBlockZeroing)) warning("DC ZVA is not available on this CPU");
+    FLAG_SET_DEFAULT(UseBlockZeroing, false);
+  }
+
// This machine allows unaligned memory accesses
if (FLAG_IS_DEFAULT(UseUnalignedAccesses)) {
FLAG_SET_DEFAULT(UseUnalignedAccesses, true);
--- a/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.hpp Thu Apr 28 17:36:37 2016 +0200
+++ b/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.hpp Thu Apr 28 13:26:29 2016 +0000
@@ -40,6 +40,10 @@
static int _revision;
static int _stepping;
+ struct PsrInfo { // raw system-register values captured by the getPsrInfo stub
+ uint32_t dczid_el0; // value of DCZID_EL0 as read with mrs
+ };
+ static PsrInfo _psr_info; // filled in once via getPsrInfo_stub(&_psr_info)
static void get_processor_features();
public:
@@ -83,6 +87,17 @@
static int cpu_model2() { return _model2; }
static int cpu_variant() { return _variant; }
static int cpu_revision() { return _revision; }
+ static ByteSize dczid_el0_offset() { return byte_offset_of(PsrInfo, dczid_el0); } // lets generated code store into PsrInfo::dczid_el0
+ static bool is_zva_enabled() { // true when the captured DCZID_EL0 value permits DC ZVA
+ // Check the DZP bit (bit 4) of dczid_el0 is zero
+ // and block size (bit 0~3) is not zero.
+ return ((_psr_info.dczid_el0 & 0x10) == 0 &&
+ (_psr_info.dczid_el0 & 0xf) != 0);
+ }
+ static int zva_length() { // DC ZVA block size in bytes; only valid when is_zva_enabled()
+ assert(is_zva_enabled(), "ZVA not available");
+ return 4 << (_psr_info.dczid_el0 & 0xf); // low 4 bits: log2 of the block size in 4-byte words
+ }
};
#endif // CPU_AARCH64_VM_VM_VERSION_AARCH64_HPP