6940677: Use 64 bytes chunk copy for arraycopy on Sparc
authorkvn
Tue, 06 Apr 2010 15:18:10 -0700
changeset 5248 5eddf88cc3c9
parent 5247 c2b4e525b3e5
child 5249 5cac34e6fe54
6940677: Use 64 bytes chunk copy for arraycopy on Sparc Summary: For large arrays we should use 64 bytes chunks copy. Reviewed-by: twisti
hotspot/src/cpu/sparc/vm/stubGenerator_sparc.cpp
--- a/hotspot/src/cpu/sparc/vm/stubGenerator_sparc.cpp	Tue Apr 06 13:39:52 2010 +0200
+++ b/hotspot/src/cpu/sparc/vm/stubGenerator_sparc.cpp	Tue Apr 06 15:18:10 2010 -0700
@@ -2000,6 +2000,27 @@
   //      to:    O1
   //      count: O2 treated as signed
   //
+  // count -= 2;
+  // if ( count >= 0 ) { // >= 2 elements
+  //   if ( count > 6) { // >= 8 elements
+  //     count -= 6; // original count - 8
+  //     do {
+  //       copy_8_elements;
+  //       count -= 8;
+  //     } while ( count >= 0 );
+  //     count += 6;
+  //   }
+  //   if ( count >= 0 ) { // >= 2 elements
+  //     do {
+  //       copy_2_elements;
+  //     } while ( (count=count-2) >= 0 );
+  //   }
+  // }
+  // count += 2;
+  // if ( count != 0 ) { // 1 element left
+  //   copy_1_element;
+  // }
+  //
   void generate_disjoint_long_copy_core(bool aligned) {
     Label L_copy_8_bytes, L_copy_16_bytes, L_exit;
     const Register from    = O0;  // source array address
@@ -2012,6 +2033,38 @@
       __ mov(G0, offset0);   // offset from start of arrays (0)
       __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
       __ delayed()->add(offset0, 8, offset8);
+
+    // Copy by 64 bytes chunks
+    Label L_copy_64_bytes;
+    const Register from64 = O3;  // source address
+    const Register to64   = G3;  // destination address
+      __ subcc(count, 6, O3);
+      __ brx(Assembler::negative, false, Assembler::pt, L_copy_16_bytes );
+      __ delayed()->mov(to,   to64);
+      // Now we can use O4(offset0), O5(offset8) as temps
+      __ mov(O3, count);
+      __ mov(from, from64);
+
+      __ align(16);
+    __ BIND(L_copy_64_bytes);
+      for( int off = 0; off < 64; off += 16 ) {
+        __ ldx(from64,  off+0, O4);
+        __ ldx(from64,  off+8, O5);
+        __ stx(O4, to64,  off+0);
+        __ stx(O5, to64,  off+8);
+      }
+      __ deccc(count, 8);
+      __ inc(from64, 64);
+      __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_64_bytes);
+      __ delayed()->inc(to64, 64);
+
+      // Restore O4(offset0), O5(offset8)
+      __ sub(from64, from, offset0);
+      __ inccc(count, 6);
+      __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
+      __ delayed()->add(offset0, 8, offset8);
+
+      // Copy by 16 bytes chunks
       __ align(16);
     __ BIND(L_copy_16_bytes);
       __ ldx(from, offset0, O3);
@@ -2023,6 +2076,7 @@
       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes);
       __ delayed()->inc(offset8, 16);
 
+      // Copy last 8 bytes
     __ BIND(L_copy_8_bytes);
       __ inccc(count, 2);
       __ brx(Assembler::zero, true, Assembler::pn, L_exit );