8155617: aarch64: ClearArray does not use DC ZVA
authorenevill
Thu, 28 Apr 2016 13:26:29 +0000
changeset 38143 3b732f17ea7d
parent 38142 e16b23089599
child 38144 0976c0c5c5d3
child 38217 7ecc47b7df3d
child 38221 0a7813e6b50e
8155617: aarch64: ClearArray does not use DC ZVA Summary: Implement block zero using DC ZVA Reviewed-by: aph Contributed-by: long.chen@linaro.org, edward.nevill@gmail.com
hotspot/src/cpu/aarch64/vm/aarch64.ad
hotspot/src/cpu/aarch64/vm/assembler_aarch64.hpp
hotspot/src/cpu/aarch64/vm/globals_aarch64.hpp
hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp
hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp
hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp
hotspot/src/cpu/aarch64/vm/stubRoutines_aarch64.cpp
hotspot/src/cpu/aarch64/vm/stubRoutines_aarch64.hpp
hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp
hotspot/src/cpu/aarch64/vm/vm_version_aarch64.hpp
--- a/hotspot/src/cpu/aarch64/vm/aarch64.ad	Thu Apr 28 17:36:37 2016 +0200
+++ b/hotspot/src/cpu/aarch64/vm/aarch64.ad	Thu Apr 28 13:26:29 2016 +0000
@@ -13470,9 +13470,10 @@
   ins_pipe(pipe_class_memory);
 %}
 
-instruct clearArray_imm_reg(immL cnt, iRegP base, Universe dummy, rFlagsReg cr)
+instruct clearArray_imm_reg(immL cnt, iRegP_R10 base, iRegL_R11 tmp, Universe dummy, rFlagsReg cr)
 %{
   match(Set dummy (ClearArray cnt base));
+  effect(USE_KILL base, TEMP tmp);
 
   ins_cost(4 * INSN_COST);
   format %{ "ClearArray $cnt, $base" %}
--- a/hotspot/src/cpu/aarch64/vm/assembler_aarch64.hpp	Thu Apr 28 17:36:37 2016 +0200
+++ b/hotspot/src/cpu/aarch64/vm/assembler_aarch64.hpp	Thu Apr 28 13:26:29 2016 +0000
@@ -1032,12 +1032,28 @@
     system(0b00, 0b011, 0b00011, SY, 0b110);
   }
 
-  void dc(Register Rt) {
-    system(0b01, 0b011, 0b0111, 0b1011, 0b001, Rt);
+  void sys(int op1, int CRn, int CRm, int op2,
+           Register rt = (Register)0b11111) {
+    system(0b01, op1, CRn, CRm, op2, rt);
   }
 
-  void ic(Register Rt) {
-    system(0b01, 0b011, 0b0111, 0b0101, 0b001, Rt);
+  // Only implement operations accessible from EL0 or higher, i.e.,
+  //            op1    CRn    CRm    op2
+  // IC IVAU     3      7      5      1
+  // DC CVAC     3      7      10     1
+  // DC CVAU     3      7      11     1
+  // DC CIVAC    3      7      14     1
+  // DC ZVA      3      7      4      1
+  // So only deal with the CRm field.
+  enum icache_maintenance {IVAU = 0b0101};
+  enum dcache_maintenance {CVAC = 0b1010, CVAU = 0b1011, CIVAC = 0b1110, ZVA = 0b100};
+
+  void dc(dcache_maintenance cm, Register Rt) {
+    sys(0b011, 0b0111, cm, 0b001, Rt);
+  }
+
+  void ic(icache_maintenance cm, Register Rt) {
+    sys(0b011, 0b0111, cm, 0b001, Rt);
   }
 
   // A more convenient access to dmb for our purposes
--- a/hotspot/src/cpu/aarch64/vm/globals_aarch64.hpp	Thu Apr 28 17:36:37 2016 +0200
+++ b/hotspot/src/cpu/aarch64/vm/globals_aarch64.hpp	Thu Apr 28 13:26:29 2016 +0000
@@ -132,6 +132,11 @@
           "Use SIMD instructions in generated memory move code")        \
   product(bool, UseLSE, false,                                          \
           "Use LSE instructions")                                       \
+  product(bool, UseBlockZeroing, true,                                  \
+          "Use DC ZVA for block zeroing")                               \
+  product(intx, BlockZeroingLowLimit, 256,                              \
+          "Minimum size in bytes when block zeroing will be used")      \
+          range(1, max_jint)                                            \
   product(bool, TraceTraps, false, "Trace all traps the signal handler")
 
 #endif
--- a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Thu Apr 28 17:36:37 2016 +0200
+++ b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Thu Apr 28 13:26:29 2016 +0000
@@ -4670,24 +4670,35 @@
   BLOCK_COMMENT(is_string ? "} string_equals" : "} array_equals");
 }
 
-// base:   Address of a buffer to be zeroed, 8 bytes aligned.
-// cnt:    Count in 8-byte unit.
+
+// base:     Address of a buffer to be zeroed, 8 bytes aligned.
+// cnt:      Count in HeapWords.
+// is_large: True when 'cnt' is known to be >= BlockZeroingLowLimit.
 void MacroAssembler::zero_words(Register base, Register cnt)
 {
-  fill_words(base, cnt, zr);
+  if (UseBlockZeroing) {
+    block_zero(base, cnt);
+  } else {
+    fill_words(base, cnt, zr);
+  }
 }
 
-// base:   Address of a buffer to be zeroed, 8 bytes aligned.
-// cnt:    Immediate count in 8-byte unit.
+// r10 = base:   Address of a buffer to be zeroed, 8 bytes aligned.
+// cnt:          Immediate count in HeapWords.
+// r11 = tmp:    For use as cnt if we need to call out
 #define ShortArraySize (18 * BytesPerLong)
 void MacroAssembler::zero_words(Register base, u_int64_t cnt)
 {
+  Register tmp = r11;
   int i = cnt & 1;  // store any odd word to start
   if (i) str(zr, Address(base));
 
   if (cnt <= ShortArraySize / BytesPerLong) {
     for (; i < (int)cnt; i += 2)
       stp(zr, zr, Address(base, i * wordSize));
+  } else if (UseBlockZeroing && cnt >= (u_int64_t)(BlockZeroingLowLimit >> LogBytesPerWord)) {
+    mov(tmp, cnt);
+    block_zero(base, tmp, true);
   } else {
     const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
     int remainder = cnt % (2 * unroll);
@@ -4739,24 +4750,95 @@
 
   assert_different_registers(base, cnt, value, rscratch1, rscratch2);
 
-  Label entry, loop;
-  const int unroll = 8; // Number of str instructions we'll unroll
-
-  andr(rscratch1, cnt, unroll - 1);  // tmp1 = cnt % unroll
-  cbz(rscratch1, entry);
-  sub(cnt, cnt, rscratch1);          // cnt -= tmp1
-  // base always points to the end of the region we're about to fill
+  Label fini, skip, entry, loop;
+  const int unroll = 8; // Number of stp instructions we'll unroll
+
+  cbz(cnt, fini);
+  tbz(base, 3, skip);
+  str(value, Address(post(base, 8)));
+  sub(cnt, cnt, 1);
+  bind(skip);
+
+  andr(rscratch1, cnt, (unroll-1) * 2);
+  sub(cnt, cnt, rscratch1);
   add(base, base, rscratch1, Assembler::LSL, 3);
   adr(rscratch2, entry);
-  sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
+  sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
   br(rscratch2);
+
   bind(loop);
-  add(base, base, unroll * 8);
-  sub(cnt, cnt, unroll);
   for (int i = -unroll; i < 0; i++)
-    str(value, Address(base, i * 8));
+    stp(value, value, Address(base, i * 16));
   bind(entry);
-  cbnz(cnt, loop);
+  subs(cnt, cnt, unroll * 2);
+  add(base, base, unroll * 16);
+  br(Assembler::GE, loop);
+
+  tbz(cnt, 0, fini);
+  str(value, Address(base, -unroll * 16));
+  bind(fini);
+}
+
+// Use DC ZVA to do fast zeroing.
+// base:   Address of a buffer to be zeroed, 8 bytes aligned.
+// cnt:    Count in HeapWords.
+// is_large: True when 'cnt' is known to be >= BlockZeroingLowLimit.
+void MacroAssembler::block_zero(Register base, Register cnt, bool is_large)
+{
+  Label small;
+  Label store_pair, loop_store_pair, done;
+  Label base_aligned;
+
+  assert_different_registers(base, cnt, rscratch1);
+
+  Register tmp = rscratch1;
+  Register tmp2 = rscratch2;
+  int zva_length = VM_Version::zva_length();
+
+  // Ensure ZVA length can be divided by 16. This is required by
+  // the subsequent operations.
+  assert (zva_length % 16 == 0, "Unexpected ZVA Length");
+
+  if (!is_large) cbz(cnt, done);
+  tbz(base, 3, base_aligned);
+  str(zr, Address(post(base, 8)));
+  sub(cnt, cnt, 1);
+  bind(base_aligned);
+
+  // Ensure count >= zva_length * 2 so that it still deserves a zva after
+  // alignment.
+  if (!is_large || !(BlockZeroingLowLimit >= zva_length * 2)) {
+    int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
+    cmp(cnt, low_limit >> 3);
+    br(Assembler::LT, small);
+  }
+
+  far_call(StubRoutines::aarch64::get_zero_longs());
+
+  bind(small);
+
+  const int unroll = 8; // Number of stp instructions we'll unroll
+  Label small_loop, small_table_end;
+
+  andr(tmp, cnt, (unroll-1) * 2);
+  sub(cnt, cnt, tmp);
+  add(base, base, tmp, Assembler::LSL, 3);
+  adr(tmp2, small_table_end);
+  sub(tmp2, tmp2, tmp, Assembler::LSL, 1);
+  br(tmp2);
+
+  bind(small_loop);
+  for (int i = -unroll; i < 0; i++)
+    stp(zr, zr, Address(base, i * 16));
+  bind(small_table_end);
+  subs(cnt, cnt, unroll * 2);
+  add(base, base, unroll * 16);
+  br(Assembler::GE, small_loop);
+
+  tbz(cnt, 0, done);
+  str(zr, Address(base, -unroll * 16));
+
+  bind(done);
 }
 
 // encode char[] to byte[] in ISO_8859_1
--- a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Thu Apr 28 17:36:37 2016 +0200
+++ b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Thu Apr 28 13:26:29 2016 +0000
@@ -536,6 +536,15 @@
     msr(0b011, 0b0100, 0b0100, 0b001, zr);
   }
 
+  // DCZID_EL0: op1 == 011
+  //            CRn == 0000
+  //            CRm == 0000
+  //            op2 == 111
+  inline void get_dczid_el0(Register reg)
+  {
+    mrs(0b011, 0b0000, 0b0000, 0b111, reg);
+  }
+
   // idiv variant which deals with MINLONG as dividend and -1 as divisor
   int corrected_idivl(Register result, Register ra, Register rb,
                       bool want_remainder, Register tmp = rscratch1);
@@ -1185,8 +1194,9 @@
                      int elem_size, bool is_string);
 
   void fill_words(Register base, Register cnt, Register value);
+  void zero_words(Register base, u_int64_t cnt);
   void zero_words(Register base, Register cnt);
-  void zero_words(Register base, u_int64_t cnt);
+  void block_zero(Register base, Register cnt, bool is_large = false);
 
   void encode_iso_array(Register src, Register dst,
                         Register len, Register result,
--- a/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp	Thu Apr 28 17:36:37 2016 +0200
+++ b/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp	Thu Apr 28 13:26:29 2016 +0000
@@ -719,6 +719,43 @@
     }
   }
 
+  address generate_zero_longs(Register base, Register cnt) {
+    Register tmp = rscratch1;
+    Register tmp2 = rscratch2;
+    int zva_length = VM_Version::zva_length();
+    Label initial_table_end, loop_zva;
+
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "zero_longs");
+    address start = __ pc();
+
+    // Align base with ZVA length.
+    __ neg(tmp, base);
+    __ andr(tmp, tmp, zva_length - 1);
+
+    // tmp: the number of bytes to be filled to align the base with ZVA length.
+    __ add(base, base, tmp);
+    __ sub(cnt, cnt, tmp, Assembler::ASR, 3);
+    __ adr(tmp2, initial_table_end);
+    __ sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
+    __ br(tmp2);
+
+    for (int i = -zva_length + 16; i < 0; i += 16)
+      __ stp(zr, zr, Address(base, i));
+    __ bind(initial_table_end);
+
+    __ sub(cnt, cnt, zva_length >> 3);
+    __ bind(loop_zva);
+    __ dc(Assembler::ZVA, base);
+    __ subs(cnt, cnt, zva_length >> 3);
+    __ add(base, base, zva_length);
+    __ br(Assembler::GE, loop_zva);
+    __ add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
+    __ ret(lr);
+
+    return start;
+  }
+
   typedef enum {
     copy_forwards = 1,
     copy_backwards = -1
@@ -2104,7 +2141,21 @@
     __ lsrw(cnt_words, count, 3 - shift); // number of words
     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
-    __ fill_words(to, cnt_words, value);
+    if (UseBlockZeroing) {
+      Label non_block_zeroing, rest;
+      // count >= BlockZeroingLowLimit && value == 0
+      __ cmp(cnt_words, BlockZeroingLowLimit >> 3);
+      __ ccmp(value, 0 /* comparing value */, 0 /* NZCV */, Assembler::GE);
+      __ br(Assembler::NE, non_block_zeroing);
+      __ block_zero(to, cnt_words, true);
+      __ b(rest);
+      __ bind(non_block_zeroing);
+      __ fill_words(to, cnt_words, value);
+      __ bind(rest);
+    }
+    else {
+      __ fill_words(to, cnt_words, value);
+    }
 
     // Remaining count is less than 8 bytes. Fill it by a single store.
     // Note that the total length is no less than 8 bytes.
@@ -2163,6 +2214,8 @@
     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
 
+    StubRoutines::aarch64::_zero_longs = generate_zero_longs(r10, r11);
+
     //*** jbyte
     // Always need aligned and unaligned versions
     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
--- a/hotspot/src/cpu/aarch64/vm/stubRoutines_aarch64.cpp	Thu Apr 28 17:36:37 2016 +0200
+++ b/hotspot/src/cpu/aarch64/vm/stubRoutines_aarch64.cpp	Thu Apr 28 13:26:29 2016 +0000
@@ -43,6 +43,7 @@
 address StubRoutines::aarch64::_float_sign_flip = NULL;
 address StubRoutines::aarch64::_double_sign_mask = NULL;
 address StubRoutines::aarch64::_double_sign_flip = NULL;
+address StubRoutines::aarch64::_zero_longs = NULL;
 
 /**
  *  crc_table[] from jdk/src/share/native/java/util/zip/zlib-1.2.5/crc32.h
--- a/hotspot/src/cpu/aarch64/vm/stubRoutines_aarch64.hpp	Thu Apr 28 17:36:37 2016 +0200
+++ b/hotspot/src/cpu/aarch64/vm/stubRoutines_aarch64.hpp	Thu Apr 28 13:26:29 2016 +0000
@@ -61,6 +61,8 @@
   static address _double_sign_mask;
   static address _double_sign_flip;
 
+  static address _zero_longs;
+
  public:
 
   static address get_previous_fp_entry()
@@ -113,6 +115,11 @@
     return _double_sign_flip;
   }
 
+  static address get_zero_longs()
+  {
+    return _zero_longs;
+  }
+
  private:
   static juint    _crc_table[];
 
--- a/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp	Thu Apr 28 17:36:37 2016 +0200
+++ b/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp	Thu Apr 28 13:26:29 2016 +0000
@@ -71,6 +71,7 @@
 int VM_Version::_variant;
 int VM_Version::_revision;
 int VM_Version::_stepping;
+VM_Version::PsrInfo VM_Version::_psr_info   = { 0, };
 
 static BufferBlob* stub_blob;
 static const int stub_size = 550;
@@ -95,13 +96,16 @@
     __ c_stub_prolog(1, 0, MacroAssembler::ret_type_void);
 #endif
 
-    // void getPsrInfo(VM_Version::CpuidInfo* cpuid_info);
+    // void getPsrInfo(VM_Version::PsrInfo* psr_info);
 
     address entry = __ pc();
 
-    // TODO : redefine fields in CpuidInfo and generate
-    // code to fill them in
+    __ enter();
 
+    __ get_dczid_el0(rscratch1);
+    __ strw(rscratch1, Address(c_rarg0, in_bytes(VM_Version::dczid_el0_offset())));
+
+    __ leave();
     __ ret(lr);
 
 #   undef __
@@ -118,6 +122,8 @@
   _supports_atomic_getset8 = true;
   _supports_atomic_getadd8 = true;
 
+  getPsrInfo_stub(&_psr_info);
+
   if (FLAG_IS_DEFAULT(AllocatePrefetchDistance))
     FLAG_SET_DEFAULT(AllocatePrefetchDistance, 256);
   if (FLAG_IS_DEFAULT(AllocatePrefetchStepSize))
@@ -285,6 +291,18 @@
     FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
   }
 
+  if (is_zva_enabled()) {
+    if (FLAG_IS_DEFAULT(UseBlockZeroing)) {
+      FLAG_SET_DEFAULT(UseBlockZeroing, true);
+    }
+    if (FLAG_IS_DEFAULT(BlockZeroingLowLimit)) {
+      FLAG_SET_DEFAULT(BlockZeroingLowLimit, 4 * VM_Version::zva_length());
+    }
+  } else if (UseBlockZeroing) {
+    warning("DC ZVA is not available on this CPU");
+    FLAG_SET_DEFAULT(UseBlockZeroing, false);
+  }
+
   // This machine allows unaligned memory accesses
   if (FLAG_IS_DEFAULT(UseUnalignedAccesses)) {
     FLAG_SET_DEFAULT(UseUnalignedAccesses, true);
--- a/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.hpp	Thu Apr 28 17:36:37 2016 +0200
+++ b/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.hpp	Thu Apr 28 13:26:29 2016 +0000
@@ -40,6 +40,10 @@
   static int _revision;
   static int _stepping;
 
+  struct PsrInfo {
+    uint32_t dczid_el0;
+  };
+  static PsrInfo _psr_info;
   static void get_processor_features();
 
 public:
@@ -83,6 +87,17 @@
   static int cpu_model2()                     { return _model2; }
   static int cpu_variant()                    { return _variant; }
   static int cpu_revision()                   { return _revision; }
+  static ByteSize dczid_el0_offset() { return byte_offset_of(PsrInfo, dczid_el0); }
+  static bool is_zva_enabled() {
+    // Check the DZP bit (bit 4) of dczid_el0 is zero
+    // and block size (bit 0~3) is not zero.
+    return ((_psr_info.dczid_el0 & 0x10) == 0 &&
+            (_psr_info.dczid_el0 & 0xf) != 0);
+  }
+  static int zva_length() {
+    assert(is_zva_enabled(), "ZVA not available");
+    return 4 << (_psr_info.dczid_el0 & 0xf);
+  }
 };
 
 #endif // CPU_AARCH64_VM_VM_VERSION_AARCH64_HPP