hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp
changeset 45054 c09733aaf97f
parent 43439 5e03c9ba74f3
child 46625 edefffab74e2
--- a/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp	Fri May 05 17:29:57 2017 +0100
+++ b/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp	Tue May 09 16:48:16 2017 +0100
@@ -719,48 +719,74 @@
     }
   }
 
-  address generate_zero_longs(Register base, Register cnt) {
-    Register tmp = rscratch1;
-    Register tmp2 = rscratch2;
-    int zva_length = VM_Version::zva_length();
-    Label initial_table_end, loop_zva;
-    Label fini;
+  // The inner part of zero_words().  This is the bulk operation,
+  // zeroing words in blocks, possibly using DC ZVA to do it.  The
+  // caller is responsible for zeroing the last few words.
+  //
+  // Inputs:
+  // r10: the HeapWord-aligned base address of an array to zero.
+  // r11: the count in HeapWords, r11 > 0.
+  //
+  // Returns r10 and r11, adjusted for the caller to clear.
+  // r10: the base address of the tail of words left to clear.
+  // r11: the number of words in the tail.
+  //      r11 < MacroAssembler::zero_words_block_size.
+
+  address generate_zero_blocks() {
+    Label store_pair, loop_store_pair, done;
+    Label base_aligned;
+
+    Register base = r10, cnt = r11;
 
     __ align(CodeEntryAlignment);
-    StubCodeMark mark(this, "StubRoutines", "zero_longs");
+    StubCodeMark mark(this, "StubRoutines", "zero_blocks");
     address start = __ pc();
 
-    // Base must be 16 byte aligned. If not just return and let caller handle it
-    __ tst(base, 0x0f);
-    __ br(Assembler::NE, fini);
-    // Align base with ZVA length.
-    __ neg(tmp, base);
-    __ andr(tmp, tmp, zva_length - 1);
-
-    // tmp: the number of bytes to be filled to align the base with ZVA length.
-    __ add(base, base, tmp);
-    __ sub(cnt, cnt, tmp, Assembler::ASR, 3);
-    __ adr(tmp2, initial_table_end);
-    __ sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
-    __ br(tmp2);
-
-    for (int i = -zva_length + 16; i < 0; i += 16)
-      __ stp(zr, zr, Address(base, i));
-    __ bind(initial_table_end);
-
-    __ sub(cnt, cnt, zva_length >> 3);
-    __ bind(loop_zva);
-    __ dc(Assembler::ZVA, base);
-    __ subs(cnt, cnt, zva_length >> 3);
-    __ add(base, base, zva_length);
-    __ br(Assembler::GE, loop_zva);
-    __ add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
-    __ bind(fini);
+    if (UseBlockZeroing) {
+      int zva_length = VM_Version::zva_length();
+
+      // Ensure ZVA length can be divided by 16. This is required by
+      // the subsequent operations.
+      assert (zva_length % 16 == 0, "Unexpected ZVA Length");
+
+      __ tbz(base, 3, base_aligned);
+      __ str(zr, Address(__ post(base, 8)));
+      __ sub(cnt, cnt, 1);
+      __ bind(base_aligned);
+
+      // Ensure count >= zva_length * 2 so that it still deserves a zva after
+      // alignment.
+      Label small;
+      int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
+      __ cmp(cnt, low_limit >> 3);
+      __ br(Assembler::LT, small);
+      __ zero_dcache_blocks(base, cnt);
+      __ bind(small);
+    }
+
+    {
+      // Number of stp instructions we'll unroll
+      const int unroll =
+        MacroAssembler::zero_words_block_size / 2;
+      // Clear the remaining blocks.
+      Label loop;
+      __ subs(cnt, cnt, unroll * 2);
+      __ br(Assembler::LT, done);
+      __ bind(loop);
+      for (int i = 0; i < unroll; i++)
+        __ stp(zr, zr, __ post(base, 16));
+      __ subs(cnt, cnt, unroll * 2);
+      __ br(Assembler::GE, loop);
+      __ bind(done);
+      __ add(cnt, cnt, unroll * 2);
+    }
+
     __ ret(lr);
 
     return start;
   }
 
+
   typedef enum {
     copy_forwards = 1,
     copy_backwards = -1
@@ -2346,20 +2372,16 @@
     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
     if (UseBlockZeroing) {
       Label non_block_zeroing, rest;
-      Register tmp = rscratch1;
-      // count >= BlockZeroingLowLimit && value == 0
-      __ subs(tmp, cnt_words, BlockZeroingLowLimit >> 3);
-      __ ccmp(value, 0 /* comparing value */, 0 /* NZCV */, Assembler::GE);
-      __ br(Assembler::NE, non_block_zeroing);
+      // If the fill value is zero we can use the fast zero_words().
+      __ cbnz(value, non_block_zeroing);
       __ mov(bz_base, to);
-      __ block_zero(bz_base, cnt_words, true);
-      __ mov(to, bz_base);
+      __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
+      __ zero_words(bz_base, cnt_words);
       __ b(rest);
       __ bind(non_block_zeroing);
       __ fill_words(to, cnt_words, value);
       __ bind(rest);
-    }
-    else {
+    } else {
       __ fill_words(to, cnt_words, value);
     }
 
@@ -2420,7 +2442,7 @@
     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
 
-    StubRoutines::aarch64::_zero_longs = generate_zero_longs(r10, r11);
+    StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
 
     //*** jbyte
     // Always need aligned and unaligned versions
@@ -4769,6 +4791,7 @@
                                                        &StubRoutines::_safefetchN_fault_pc,
                                                        &StubRoutines::_safefetchN_continuation_pc);
 #endif
+    StubRoutines::aarch64::set_completed();
   }
 
  public: