8179444: AArch64: Put zero_words on a diet
authoraph
Tue, 09 May 2017 16:48:16 +0100
changeset 45054 c09733aaf97f
parent 45053 bd6d9a8a0b61
child 45055 1c7bbc426b1e
child 45057 a93a8a7f862f
8179444: AArch64: Put zero_words on a diet Reviewed-by: roland
hotspot/src/cpu/aarch64/vm/aarch64.ad
hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp
hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp
hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp
hotspot/src/cpu/aarch64/vm/stubRoutines_aarch64.cpp
hotspot/src/cpu/aarch64/vm/stubRoutines_aarch64.hpp
--- a/hotspot/src/cpu/aarch64/vm/aarch64.ad	Fri May 05 17:29:57 2017 +0100
+++ b/hotspot/src/cpu/aarch64/vm/aarch64.ad	Tue May 09 16:48:16 2017 +0100
@@ -14021,10 +14021,12 @@
   ins_pipe(pipe_class_memory);
 %}
 
-instruct clearArray_imm_reg(immL cnt, iRegP_R10 base, iRegL_R11 tmp, Universe dummy, rFlagsReg cr)
-%{
+instruct clearArray_imm_reg(immL cnt, iRegP_R10 base, Universe dummy, rFlagsReg cr)
+%{
+  predicate((u_int64_t)n->in(2)->get_long()
+            < (u_int64_t)(BlockZeroingLowLimit >> LogBytesPerWord));
   match(Set dummy (ClearArray cnt base));
-  effect(USE_KILL base, TEMP tmp);
+  effect(USE_KILL base);
 
   ins_cost(4 * INSN_COST);
   format %{ "ClearArray $cnt, $base" %}
--- a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Fri May 05 17:29:57 2017 +0100
+++ b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Tue May 09 16:48:16 2017 +0100
@@ -698,6 +698,7 @@
 // trampolines won't be emitted.
 
 address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
+  assert(JavaThread::current()->is_Compiler_thread(), "just checking");
   assert(entry.rspec().type() == relocInfo::runtime_call_type
          || entry.rspec().type() == relocInfo::opt_virtual_call_type
          || entry.rspec().type() == relocInfo::static_call_type
@@ -4944,34 +4945,67 @@
 }
 
 
-// base:     Address of a buffer to be zeroed, 8 bytes aligned.
-// cnt:      Count in HeapWords.
-// is_large: True when 'cnt' is known to be >= BlockZeroingLowLimit.
-void MacroAssembler::zero_words(Register base, Register cnt)
+// The size of the blocks erased by the zero_blocks stub.  We must
+// handle anything smaller than this ourselves in zero_words().
+const int MacroAssembler::zero_words_block_size = 8;
+
+// zero_words() is used by C2 ClearArray patterns.  It is as small as
+// possible, handling small word counts locally and delegating
+// anything larger to the zero_blocks stub.  It is expanded many times
+// in compiled code, so it is important to keep it short.
+
+// ptr:   Address of a buffer to be zeroed.
+// cnt:   Count in HeapWords.
+//
+// ptr, cnt, rscratch1, and rscratch2 are clobbered.
+void MacroAssembler::zero_words(Register ptr, Register cnt)
 {
-  if (UseBlockZeroing) {
-    block_zero(base, cnt);
-  } else {
-    fill_words(base, cnt, zr);
+  assert(is_power_of_2(zero_words_block_size), "adjust this");
+  assert(ptr == r10 && cnt == r11, "mismatch in register usage");
+
+  BLOCK_COMMENT("zero_words {");
+  cmp(cnt, zero_words_block_size);
+  Label around, done, done16;
+  br(LO, around);
+  {
+    RuntimeAddress zero_blocks =  RuntimeAddress(StubRoutines::aarch64::zero_blocks());
+    assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
+    if (StubRoutines::aarch64::complete()) {
+      trampoline_call(zero_blocks);
+    } else {
+      bl(zero_blocks);
+    }
   }
+  bind(around);
+  for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
+    Label l;
+    tbz(cnt, exact_log2(i), l);
+    for (int j = 0; j < i; j += 2) {
+      stp(zr, zr, post(ptr, 16));
+    }
+    bind(l);
+  }
+  {
+    Label l;
+    tbz(cnt, 0, l);
+    str(zr, Address(ptr));
+    bind(l);
+  }
+  BLOCK_COMMENT("} zero_words");
 }
 
-// r10 = base:   Address of a buffer to be zeroed, 8 bytes aligned.
+// base:         Address of a buffer to be zeroed, 8 bytes aligned.
 // cnt:          Immediate count in HeapWords.
-// r11 = tmp:    For use as cnt if we need to call out
-#define ShortArraySize (18 * BytesPerLong)
+#define SmallArraySize (18 * BytesPerLong)
 void MacroAssembler::zero_words(Register base, u_int64_t cnt)
 {
-  Register tmp = r11;
+  BLOCK_COMMENT("zero_words {");
   int i = cnt & 1;  // store any odd word to start
   if (i) str(zr, Address(base));
 
-  if (cnt <= ShortArraySize / BytesPerLong) {
+  if (cnt <= SmallArraySize / BytesPerLong) {
     for (; i < (int)cnt; i += 2)
       stp(zr, zr, Address(base, i * wordSize));
-  } else if (UseBlockZeroing && cnt >= (u_int64_t)(BlockZeroingLowLimit >> LogBytesPerWord)) {
-    mov(tmp, cnt);
-    block_zero(base, tmp, true);
   } else {
     const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
     int remainder = cnt % (2 * unroll);
@@ -4992,6 +5026,51 @@
     stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
     cbnz(cnt_reg, loop);
   }
+  BLOCK_COMMENT("} zero_words");
+}
+
+// Zero blocks of memory by using DC ZVA.
+//
+// Aligns the base address first sufficently for DC ZVA, then uses
+// DC ZVA repeatedly for every full block.  cnt is the size to be
+// zeroed in HeapWords.  Returns the count of words left to be zeroed
+// in cnt.
+//
+// NOTE: This is intended to be used in the zero_blocks() stub.  If
+// you want to use it elsewhere, note that cnt must be >= 2*zva_length.
+void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
+  Register tmp = rscratch1;
+  Register tmp2 = rscratch2;
+  int zva_length = VM_Version::zva_length();
+  Label initial_table_end, loop_zva;
+  Label fini;
+
+  // Base must be 16 byte aligned. If not just return and let caller handle it
+  tst(base, 0x0f);
+  br(Assembler::NE, fini);
+  // Align base with ZVA length.
+  neg(tmp, base);
+  andr(tmp, tmp, zva_length - 1);
+
+  // tmp: the number of bytes to be filled to align the base with ZVA length.
+  add(base, base, tmp);
+  sub(cnt, cnt, tmp, Assembler::ASR, 3);
+  adr(tmp2, initial_table_end);
+  sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
+  br(tmp2);
+
+  for (int i = -zva_length + 16; i < 0; i += 16)
+    stp(zr, zr, Address(base, i));
+  bind(initial_table_end);
+
+  sub(cnt, cnt, zva_length >> 3);
+  bind(loop_zva);
+  dc(Assembler::ZVA, base);
+  subs(cnt, cnt, zva_length >> 3);
+  add(base, base, zva_length);
+  br(Assembler::GE, loop_zva);
+  add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
+  bind(fini);
 }
 
 // base:   Address of a buffer to be filled, 8 bytes aligned.
@@ -5052,69 +5131,6 @@
   bind(fini);
 }
 
-// Use DC ZVA to do fast zeroing.
-// base:   Address of a buffer to be zeroed, 8 bytes aligned.
-// cnt:    Count in HeapWords.
-// is_large: True when 'cnt' is known to be >= BlockZeroingLowLimit.
-void MacroAssembler::block_zero(Register base, Register cnt, bool is_large)
-{
-  Label small;
-  Label store_pair, loop_store_pair, done;
-  Label base_aligned;
-
-  assert_different_registers(base, cnt, rscratch1);
-  guarantee(base == r10 && cnt == r11, "fix register usage");
-
-  Register tmp = rscratch1;
-  Register tmp2 = rscratch2;
-  int zva_length = VM_Version::zva_length();
-
-  // Ensure ZVA length can be divided by 16. This is required by
-  // the subsequent operations.
-  assert (zva_length % 16 == 0, "Unexpected ZVA Length");
-
-  if (!is_large) cbz(cnt, done);
-  tbz(base, 3, base_aligned);
-  str(zr, Address(post(base, 8)));
-  sub(cnt, cnt, 1);
-  bind(base_aligned);
-
-  // Ensure count >= zva_length * 2 so that it still deserves a zva after
-  // alignment.
-  if (!is_large || !(BlockZeroingLowLimit >= zva_length * 2)) {
-    int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
-    subs(tmp, cnt, low_limit >> 3);
-    br(Assembler::LT, small);
-  }
-
-  far_call(StubRoutines::aarch64::get_zero_longs());
-
-  bind(small);
-
-  const int unroll = 8; // Number of stp instructions we'll unroll
-  Label small_loop, small_table_end;
-
-  andr(tmp, cnt, (unroll-1) * 2);
-  sub(cnt, cnt, tmp);
-  add(base, base, tmp, Assembler::LSL, 3);
-  adr(tmp2, small_table_end);
-  sub(tmp2, tmp2, tmp, Assembler::LSL, 1);
-  br(tmp2);
-
-  bind(small_loop);
-  add(base, base, unroll * 16);
-  for (int i = -unroll; i < 0; i++)
-    stp(zr, zr, Address(base, i * 16));
-  bind(small_table_end);
-  subs(cnt, cnt, unroll * 2);
-  br(Assembler::GE, small_loop);
-
-  tbz(cnt, 0, done);
-  str(zr, Address(post(base, 8)));
-
-  bind(done);
-}
-
 // Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
 // java/lang/StringUTF16.compress.
 void MacroAssembler::encode_iso_array(Register src, Register dst,
--- a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Fri May 05 17:29:57 2017 +0100
+++ b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Tue May 09 16:48:16 2017 +0100
@@ -1213,8 +1213,10 @@
 
   void fill_words(Register base, Register cnt, Register value);
   void zero_words(Register base, u_int64_t cnt);
-  void zero_words(Register base, Register cnt);
-  void block_zero(Register base, Register cnt, bool is_large = false);
+  void zero_words(Register ptr, Register cnt);
+  void zero_dcache_blocks(Register base, Register cnt);
+
+  static const int zero_words_block_size;
 
   void byte_array_inflate(Register src, Register dst, Register len,
                           FloatRegister vtmp1, FloatRegister vtmp2,
--- a/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp	Fri May 05 17:29:57 2017 +0100
+++ b/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp	Tue May 09 16:48:16 2017 +0100
@@ -719,48 +719,74 @@
     }
   }
 
-  address generate_zero_longs(Register base, Register cnt) {
-    Register tmp = rscratch1;
-    Register tmp2 = rscratch2;
-    int zva_length = VM_Version::zva_length();
-    Label initial_table_end, loop_zva;
-    Label fini;
+  // The inner part of zero_words().  This is the bulk operation,
+  // zeroing words in blocks, possibly using DC ZVA to do it.  The
+  // caller is responsible for zeroing the last few words.
+  //
+  // Inputs:
+  // r10: the HeapWord-aligned base address of an array to zero.
+  // r11: the count in HeapWords, r11 > 0.
+  //
+  // Returns r10 and r11, adjusted for the caller to clear.
+  // r10: the base address of the tail of words left to clear.
+  // r11: the number of words in the tail.
+  //      r11 < MacroAssembler::zero_words_block_size.
+
+  address generate_zero_blocks() {
+    Label store_pair, loop_store_pair, done;
+    Label base_aligned;
+
+    Register base = r10, cnt = r11;
 
     __ align(CodeEntryAlignment);
-    StubCodeMark mark(this, "StubRoutines", "zero_longs");
+    StubCodeMark mark(this, "StubRoutines", "zero_blocks");
     address start = __ pc();
 
-    // Base must be 16 byte aligned. If not just return and let caller handle it
-    __ tst(base, 0x0f);
-    __ br(Assembler::NE, fini);
-    // Align base with ZVA length.
-    __ neg(tmp, base);
-    __ andr(tmp, tmp, zva_length - 1);
-
-    // tmp: the number of bytes to be filled to align the base with ZVA length.
-    __ add(base, base, tmp);
-    __ sub(cnt, cnt, tmp, Assembler::ASR, 3);
-    __ adr(tmp2, initial_table_end);
-    __ sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
-    __ br(tmp2);
-
-    for (int i = -zva_length + 16; i < 0; i += 16)
-      __ stp(zr, zr, Address(base, i));
-    __ bind(initial_table_end);
-
-    __ sub(cnt, cnt, zva_length >> 3);
-    __ bind(loop_zva);
-    __ dc(Assembler::ZVA, base);
-    __ subs(cnt, cnt, zva_length >> 3);
-    __ add(base, base, zva_length);
-    __ br(Assembler::GE, loop_zva);
-    __ add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
-    __ bind(fini);
+    if (UseBlockZeroing) {
+      int zva_length = VM_Version::zva_length();
+
+      // Ensure ZVA length can be divided by 16. This is required by
+      // the subsequent operations.
+      assert (zva_length % 16 == 0, "Unexpected ZVA Length");
+
+      __ tbz(base, 3, base_aligned);
+      __ str(zr, Address(__ post(base, 8)));
+      __ sub(cnt, cnt, 1);
+      __ bind(base_aligned);
+
+      // Ensure count >= zva_length * 2 so that it still deserves a zva after
+      // alignment.
+      Label small;
+      int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
+      __ cmp(cnt, low_limit >> 3);
+      __ br(Assembler::LT, small);
+      __ zero_dcache_blocks(base, cnt);
+      __ bind(small);
+    }
+
+    {
+      // Number of stp instructions we'll unroll
+      const int unroll =
+        MacroAssembler::zero_words_block_size / 2;
+      // Clear the remaining blocks.
+      Label loop;
+      __ subs(cnt, cnt, unroll * 2);
+      __ br(Assembler::LT, done);
+      __ bind(loop);
+      for (int i = 0; i < unroll; i++)
+        __ stp(zr, zr, __ post(base, 16));
+      __ subs(cnt, cnt, unroll * 2);
+      __ br(Assembler::GE, loop);
+      __ bind(done);
+      __ add(cnt, cnt, unroll * 2);
+    }
+
     __ ret(lr);
 
     return start;
   }
 
+
   typedef enum {
     copy_forwards = 1,
     copy_backwards = -1
@@ -2346,20 +2372,16 @@
     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
     if (UseBlockZeroing) {
       Label non_block_zeroing, rest;
-      Register tmp = rscratch1;
-      // count >= BlockZeroingLowLimit && value == 0
-      __ subs(tmp, cnt_words, BlockZeroingLowLimit >> 3);
-      __ ccmp(value, 0 /* comparing value */, 0 /* NZCV */, Assembler::GE);
-      __ br(Assembler::NE, non_block_zeroing);
+      // If the fill value is zero we can use the fast zero_words().
+      __ cbnz(value, non_block_zeroing);
       __ mov(bz_base, to);
-      __ block_zero(bz_base, cnt_words, true);
-      __ mov(to, bz_base);
+      __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
+      __ zero_words(bz_base, cnt_words);
       __ b(rest);
       __ bind(non_block_zeroing);
       __ fill_words(to, cnt_words, value);
       __ bind(rest);
-    }
-    else {
+    } else {
       __ fill_words(to, cnt_words, value);
     }
 
@@ -2420,7 +2442,7 @@
     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
 
-    StubRoutines::aarch64::_zero_longs = generate_zero_longs(r10, r11);
+    StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
 
     //*** jbyte
     // Always need aligned and unaligned versions
@@ -4769,6 +4791,7 @@
                                                        &StubRoutines::_safefetchN_fault_pc,
                                                        &StubRoutines::_safefetchN_continuation_pc);
 #endif
+    StubRoutines::aarch64::set_completed();
   }
 
  public:
--- a/hotspot/src/cpu/aarch64/vm/stubRoutines_aarch64.cpp	Fri May 05 17:29:57 2017 +0100
+++ b/hotspot/src/cpu/aarch64/vm/stubRoutines_aarch64.cpp	Tue May 09 16:48:16 2017 +0100
@@ -43,7 +43,8 @@
 address StubRoutines::aarch64::_float_sign_flip = NULL;
 address StubRoutines::aarch64::_double_sign_mask = NULL;
 address StubRoutines::aarch64::_double_sign_flip = NULL;
-address StubRoutines::aarch64::_zero_longs = NULL;
+address StubRoutines::aarch64::_zero_blocks = NULL;
+bool StubRoutines::aarch64::_completed = false;
 
 /**
  *  crc_table[] from jdk/src/share/native/java/util/zip/zlib-1.2.5/crc32.h
--- a/hotspot/src/cpu/aarch64/vm/stubRoutines_aarch64.hpp	Fri May 05 17:29:57 2017 +0100
+++ b/hotspot/src/cpu/aarch64/vm/stubRoutines_aarch64.hpp	Tue May 09 16:48:16 2017 +0100
@@ -61,7 +61,8 @@
   static address _double_sign_mask;
   static address _double_sign_flip;
 
-  static address _zero_longs;
+  static address _zero_blocks;
+  static bool _completed;
 
  public:
 
@@ -115,12 +116,19 @@
     return _double_sign_flip;
   }
 
-  static address get_zero_longs()
-  {
-    return _zero_longs;
+  static address zero_blocks() {
+    return _zero_blocks;
   }
 
- private:
+  static bool complete() {
+    return _completed;
+  }
+
+  static void set_completed() {
+    _completed = true;
+  }
+
+private:
   static juint    _crc_table[];
 
 };