8150082: aarch64: optimise small array copy
author enevill
date Sat, 20 Feb 2016 15:11:42 +0000
changeset 36563 0b48c2c8ad13
parent 36562 4d1e93624d6a
child 36564 9442bb67de26
8150082: aarch64: optimise small array copy Reviewed-by: aph
hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp
--- a/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp	Tue Mar 08 14:39:50 2016 +0000
+++ b/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp	Sat Feb 20 15:11:42 2016 +0000
@@ -729,7 +729,7 @@
   //
   // count is a count of words.
   //
-  // Precondition: count >= 2
+  // Precondition: count >= 8
   //
   // Postconditions:
   //
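
With the precondition raised from 2 to 8 words, the stub no longer has to cope with short copies at all; the caller is expected to deal with anything under 8 words before reaching it (see the inline paths added further down). A minimal plain-C++ sketch of that contract, with illustrative names that are not taken from the patch:

    #include <cassert>
    #include <cstddef>
    #include <cstdint>

    // Stand-in for the generated stub: it is only ever entered with count >= 8 words.
    static void copy_longs(const uint64_t* s, uint64_t* d, size_t count) {
      assert(count >= 8 && "copy_longs called with < 8 words");
      for (size_t i = 0; i < count; i++) d[i] = s[i];    // bulk copy placeholder
    }

    // Stand-in for the caller (copy_memory): small copies are expanded inline
    // and never reach the stub.
    static void copy_words(const uint64_t* s, uint64_t* d, size_t count) {
      if (count < 8) {
        for (size_t i = 0; i < count; i++) d[i] = s[i];  // inline small path
      } else {
        copy_longs(s, d, count);
      }
    }
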
@@ -750,7 +750,7 @@
     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
     assert_different_registers(s, d, count, rscratch1);
 
-    Label again, large, small;
+    Label again, drain;
     const char *stub_name;
     if (direction == copy_forwards)
       stub_name = "foward_copy_longs";
@@ -759,51 +759,21 @@
     StubCodeMark mark(this, "StubRoutines", stub_name);
     __ align(CodeEntryAlignment);
     __ bind(start);
-    __ cmp(count, 8);
-    __ br(Assembler::LO, small);
     if (direction == copy_forwards) {
       __ sub(s, s, 2 * wordSize);
       __ sub(d, d, 2 * wordSize);
     }
-    __ subs(count, count, 16);
-    __ br(Assembler::GE, large);
-
-    // 8 <= count < 16 words.  Copy 8.
-    __ ldp(t0, t1, Address(s, 2 * unit));
-    __ ldp(t2, t3, Address(s, 4 * unit));
-    __ ldp(t4, t5, Address(s, 6 * unit));
-    __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
-
-    __ stp(t0, t1, Address(d, 2 * unit));
-    __ stp(t2, t3, Address(d, 4 * unit));
-    __ stp(t4, t5, Address(d, 6 * unit));
-    __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
-
-    if (direction == copy_forwards) {
-      __ add(s, s, 2 * wordSize);
-      __ add(d, d, 2 * wordSize);
-    }
-
+
+#ifdef ASSERT
+    // Make sure we are never given < 8 words
     {
-      Label L1, L2;
-      __ bind(small);
-      __ tbz(count, exact_log2(4), L1);
-      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
-      __ ldp(t2, t3, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
-      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
-      __ stp(t2, t3, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
-      __ bind(L1);
-
-      __ tbz(count, 1, L2);
-      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
-      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
-      __ bind(L2);
+      Label L;
+      __ cmp(count, 8);
+      __ br(Assembler::GE, L);
+      __ stop("genrate_copy_longs called with < 8 words");
+      __ bind(L);
     }
-
-    __ ret(lr);
-
-    __ align(CodeEntryAlignment);
-    __ bind(large);
+#endif
 
     // Fill 8 registers
     __ ldp(t0, t1, Address(s, 2 * unit));
@@ -811,6 +781,9 @@
     __ ldp(t4, t5, Address(s, 6 * unit));
     __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 
+    __ subs(count, count, 16);
+    __ br(Assembler::LO, drain);
+
     int prefetch = PrefetchCopyIntervalInBytes;
     bool use_stride = false;
     if (direction == copy_backwards) {
@@ -837,6 +810,7 @@
     __ br(Assembler::HS, again);
 
     // Drain
+    __ bind(drain);
     __ stp(t0, t1, Address(d, 2 * unit));
     __ stp(t2, t3, Address(d, 4 * unit));
     __ stp(t4, t5, Address(d, 6 * unit));
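
The restructured stub always has one block of 8 words in flight: it is loaded before the count check, the main loop stores the previous block while loading the next, and the new drain label is the single place where the last in-flight block is written back, whether or not the loop ran. A rough plain-C++ model of that shape (illustrative only; the residual-word handling that follows the drain is outside this hunk and not modelled):

    #include <cstddef>
    #include <cstdint>

    static void copy_longs_model(const uint64_t* s, uint64_t* d, size_t count) {
      uint64_t t[8];                                   // models t0..t7
      for (int i = 0; i < 8; i++) t[i] = s[i];         // fill 8 registers up front
      s += 8;
      ptrdiff_t remaining = (ptrdiff_t)count - 16;     // subs(count, count, 16)
      while (remaining >= 0) {                         // br(LO, drain) skips the loop entirely
        for (int i = 0; i < 8; i++) d[i] = t[i];       // store the block already in flight
        for (int i = 0; i < 8; i++) t[i] = s[i];       // load the next block
        s += 8;
        d += 8;
        remaining -= 8;                                // subs(count, count, 8); br(HS, again)
      }
      for (int i = 0; i < 8; i++) d[i] = t[i];         // drain: store the final in-flight block
    }
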
@@ -931,16 +905,119 @@
     int granularity = uabs(step);
     const Register t0 = r3, t1 = r4;
 
+    // <= 96 bytes do inline. Direction doesn't matter because we always
+    // load all the data before writing anything
+    Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
+    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
+    const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
+    const Register send = r17, dend = r18;
+
+    if (PrefetchCopyIntervalInBytes > 0)
+      __ prfm(Address(s, 0), PLDL1KEEP);
+
+    __ cmp(count, 80/granularity);
+    __ br(Assembler::HI, copy_big);
+
+    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
+    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
+
+    __ cmp(count, 16/granularity);
+    __ br(Assembler::LS, copy16);
+
+    __ cmp(count, 64/granularity);
+    __ br(Assembler::HI, copy80);
+
+    __ cmp(count, 32/granularity);
+    __ br(Assembler::LS, copy32);
+
+    // 33..64 bytes
+    __ ldp(t0, t1, Address(s, 0));
+    __ ldp(t2, t3, Address(s, 16));
+    __ ldp(t4, t5, Address(send, -32));
+    __ ldp(t6, t7, Address(send, -16));
+
+    __ stp(t0, t1, Address(d, 0));
+    __ stp(t2, t3, Address(d, 16));
+    __ stp(t4, t5, Address(dend, -32));
+    __ stp(t6, t7, Address(dend, -16));
+    __ b(finish);
+
+    // 17..32 bytes
+    __ bind(copy32);
+    __ ldp(t0, t1, Address(s, 0));
+    __ ldp(t2, t3, Address(send, -16));
+    __ stp(t0, t1, Address(d, 0));
+    __ stp(t2, t3, Address(dend, -16));
+    __ b(finish);
+
+    // 65..80 bytes
+    __ bind(copy80);
+    __ ldp(t0, t1, Address(s, 0));
+    __ ldp(t2, t3, Address(s, 16));
+    __ ldp(t4, t5, Address(s, 32));
+    __ ldp(t6, t7, Address(s, 48));
+    __ ldp(t8, t9, Address(send, -16));
+
+    __ stp(t0, t1, Address(d, 0));
+    __ stp(t2, t3, Address(d, 16));
+    __ stp(t4, t5, Address(d, 32));
+    __ stp(t6, t7, Address(d, 48));
+    __ stp(t8, t9, Address(dend, -16));
+    __ b(finish);
+
+    // 0..16 bytes
+    __ bind(copy16);
+    __ cmp(count, 8/granularity);
+    __ br(Assembler::LO, copy8);
+
+    // 8..16 bytes
+    __ ldr(t0, Address(s, 0));
+    __ ldr(t1, Address(send, -8));
+    __ str(t0, Address(d, 0));
+    __ str(t1, Address(dend, -8));
+    __ b(finish);
+
+    if (granularity < 8) {
+      // 4..7 bytes
+      __ bind(copy8);
+      __ tbz(count, 2 - exact_log2(granularity), copy4);
+      __ ldrw(t0, Address(s, 0));
+      __ ldrw(t1, Address(send, -4));
+      __ strw(t0, Address(d, 0));
+      __ strw(t1, Address(dend, -4));
+      __ b(finish);
+      if (granularity < 4) {
+        // 0..3 bytes
+        __ bind(copy4);
+        __ cbz(count, finish); // get rid of 0 case
+        if (granularity == 2) {
+          __ ldrh(t0, Address(s, 0));
+          __ strh(t0, Address(d, 0));
+        } else { // granularity == 1
+          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
+          // the first and last byte.
+          // Handle the 3 byte case by loading and storing base + count/2
+          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
+          // This does mean in the 1 byte case we load/store the same
+          // byte 3 times.
+          __ lsr(count, count, 1);
+          __ ldrb(t0, Address(s, 0));
+          __ ldrb(t1, Address(send, -1));
+          __ ldrb(t2, Address(s, count));
+          __ strb(t0, Address(d, 0));
+          __ strb(t1, Address(dend, -1));
+          __ strb(t2, Address(d, count));
+        }
+        __ b(finish);
+      }
+    }
+
+    __ bind(copy_big);
     if (is_backwards) {
       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
     }
 
-    Label tail;
-
-    __ cmp(count, 16/granularity);
-    __ br(Assembler::LO, tail);
-
     // Now we've got the small case out of the way we can align the
     // source address on a 2-word boundary.
 
@@ -986,8 +1063,6 @@
 #endif
     }
 
-    __ cmp(count, 16/granularity);
-    __ br(Assembler::LT, tail);
     __ bind(aligned);
 
     // s is now 2-word-aligned.
@@ -1001,9 +1076,11 @@
       __ bl(copy_b);
 
     // And the tail.
-
-    __ bind(tail);
     copy_memory_small(s, d, count, tmp, step);
+
+    if (granularity >= 8) __ bind(copy8);
+    if (granularity >= 4) __ bind(copy4);
+    __ bind(finish);
   }
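
Binding copy8 and copy4 only for the larger element sizes is what keeps the earlier branches unconditional: for an 8-byte element a count below 8/granularity can only be zero, so the branch to copy8 has to land on code that copies nothing, i.e. straight on finish, and likewise for copy4 when the element size is 4 or more. A plain-C++ sketch of the same tail-dispatch shape, parameterised by element size (illustrative only, not the stub's code):

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    template <size_t granularity>        // element size in bytes: 1, 2, 4 or 8
    static void copy_0_to_16(const uint8_t* s, uint8_t* d, size_t bytes) {
      if (bytes >= 8) {                  // 8..16 bytes: overlapping 8-byte head and tail
        uint64_t head, tail;
        memcpy(&head, s, 8);
        memcpy(&tail, s + bytes - 8, 8);
        memcpy(d, &head, 8);
        memcpy(d + bytes - 8, &tail, 8);
      } else if (granularity < 8 && bytes >= 4) {   // "copy8": only reachable for sub-8-byte elements
        uint32_t head, tail;
        memcpy(&head, s, 4);
        memcpy(&tail, s + bytes - 4, 4);
        memcpy(d, &head, 4);
        memcpy(d + bytes - 4, &tail, 4);
      } else if (granularity < 4 && bytes != 0) {   // "copy4": 1..3 bytes remain (one halfword when granularity == 2)
        d[0]          = s[0];
        d[bytes - 1]  = s[bytes - 1];
        d[bytes >> 1] = s[bytes >> 1];
      }
      // otherwise fall through to "finish" with nothing left to copy
    }
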