8166684: PPC64: implement intrinsic code with vector instructions for Unsafe.copyMemory()
author mdoerr
Fri, 21 Oct 2016 10:27:32 +0200
changeset 42034 f1e6a21095f5
parent 42033 401c7013b053
child 42035 10e6e31dc1aa
8166684: PPC64: implement intrinsic code with vector instructions for Unsafe.copyMemory()
Reviewed-by: simonis, mdoerr
Contributed-by: Michihiro Horie <horie@jp.ibm.com>
hotspot/src/cpu/ppc/vm/assembler_ppc.hpp
hotspot/src/cpu/ppc/vm/assembler_ppc.inline.hpp
hotspot/src/cpu/ppc/vm/stubGenerator_ppc.cpp
hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp
--- a/hotspot/src/cpu/ppc/vm/assembler_ppc.hpp	Thu Oct 20 17:05:26 2016 -0700
+++ b/hotspot/src/cpu/ppc/vm/assembler_ppc.hpp	Fri Oct 21 10:27:32 2016 +0200
@@ -2102,7 +2102,9 @@
   inline void mfvscr(   VectorRegister d);
 
   // Vector-Scalar (VSX) instructions.
+  inline void lxvd2x(   VectorSRegister d, Register a);
   inline void lxvd2x(   VectorSRegister d, Register a, Register b);
+  inline void stxvd2x(  VectorSRegister d, Register a);
   inline void stxvd2x(  VectorSRegister d, Register a, Register b);
   inline void mtvrd(    VectorRegister  d, Register a);
   inline void mfvrd(    Register        a, VectorRegister d);
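The new one-register overloads are declared here and given their encodings in assembler_ppc.inline.hpp below with the RA field hard-wired to 0. In the Power ISA the effective address of lxvd2x/stxvd2x is (RA|0) + (RB), so an RA field of 0 contributes nothing and the single register operand is the whole address; the two-register forms now use ra0mem() for the base, matching other memory-access encodings such as lvsr. A minimal C++ model of the addressing rule (illustrative only, not HotSpot code):

  #include <cstdint>
  #include <cstring>

  struct VSRegister128 { uint64_t dw[2]; };  // stand-in for one 128-bit VSX register

  // Model of lxvd2x: load 16 bytes from EA = (RA|0) + (RB),
  // ignoring the doubleword-ordering details of the real instruction.
  static VSRegister128 model_lxvd2x(uint64_t ra_or_zero, uint64_t rb) {
    VSRegister128 v;
    std::memcpy(&v, reinterpret_cast<const void*>(ra_or_zero + rb), 16);
    return v;
  }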
--- a/hotspot/src/cpu/ppc/vm/assembler_ppc.inline.hpp	Thu Oct 20 17:05:26 2016 -0700
+++ b/hotspot/src/cpu/ppc/vm/assembler_ppc.inline.hpp	Fri Oct 21 10:27:32 2016 +0200
@@ -734,8 +734,10 @@
 inline void Assembler::lvsr(  VectorRegister d, Register s1, Register s2) { emit_int32( LVSR_OPCODE   | vrt(d) | ra0mem(s1) | rb(s2)); }
 
 // Vector-Scalar (VSX) instructions.
-inline void Assembler::lxvd2x (VectorSRegister d, Register s1, Register s2) { emit_int32( LXVD2X_OPCODE  | vsrt(d) | ra(s1) | rb(s2)); }
-inline void Assembler::stxvd2x(VectorSRegister d, Register s1, Register s2) { emit_int32( STXVD2X_OPCODE | vsrt(d) | ra(s1) | rb(s2)); }
+inline void Assembler::lxvd2x (VectorSRegister d, Register s1) { emit_int32( LXVD2X_OPCODE  | vsrt(d) | ra(0) | rb(s1)); }
+inline void Assembler::lxvd2x (VectorSRegister d, Register s1, Register s2) { emit_int32( LXVD2X_OPCODE  | vsrt(d) | ra0mem(s1) | rb(s2)); }
+inline void Assembler::stxvd2x(VectorSRegister d, Register s1) { emit_int32( STXVD2X_OPCODE | vsrt(d) | ra(0) | rb(s1)); }
+inline void Assembler::stxvd2x(VectorSRegister d, Register s1, Register s2) { emit_int32( STXVD2X_OPCODE | vsrt(d) | ra0mem(s1) | rb(s2)); }
 inline void Assembler::mtvrd(  VectorRegister  d, Register a)               { emit_int32( MTVSRD_OPCODE  | vrt(d)  | ra(a)  | 1u); } // 1u: d is treated as Vector (VMX/Altivec).
 inline void Assembler::mfvrd(  Register        a, VectorRegister d)         { emit_int32( MFVSRD_OPCODE  | vrt(d)  | ra(a)  | 1u); } // 1u: d is treated as Vector (VMX/Altivec).
 
--- a/hotspot/src/cpu/ppc/vm/stubGenerator_ppc.cpp	Thu Oct 20 17:05:26 2016 -0700
+++ b/hotspot/src/cpu/ppc/vm/stubGenerator_ppc.cpp	Fri Oct 21 10:27:32 2016 +0200
@@ -1220,8 +1220,8 @@
       __ bind(l_10);
       // Use loop with VSX load/store instructions to
       // copy 32 elements a time.
-      __ lxvd2x(tmp_vsr1, 0, R3_ARG1);     // Load src
-      __ stxvd2x(tmp_vsr1, 0, R4_ARG2);    // Store to dst
+      __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
+      __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
       __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
       __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
       __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
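Instead of passing a literal 0 for the RA operand, the loop body now uses the one-register forms, making the implicit RA=0 addressing explicit. Each iteration still moves 32 bytes: two 16-byte vector transfers followed by the pointer updates (the matching dst update is outside the lines shown in this hunk). A C++ sketch of one iteration (illustrative only):

  #include <cstdint>
  #include <cstring>

  // One iteration of the forward VSX copy loop, modeled with memcpy.
  static void copy32_forward(const uint8_t*& src, uint8_t*& dst) {
    std::memcpy(dst,      src,      16);  // lxvd2x/stxvd2x at offset 0
    std::memcpy(dst + 16, src + 16, 16);  // lxvd2x/stxvd2x at offset 16 (tmp1)
    src += 32;                            // addi src += 32
    dst += 32;                            // addi dst += 32
  }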
@@ -1486,8 +1486,8 @@
         __ bind(l_9);
         // Use loop with VSX load/store instructions to
         // copy 16 elements a time.
-        __ lxvd2x(tmp_vsr1, 0, R3_ARG1);     // Load from src.
-        __ stxvd2x(tmp_vsr1, 0, R4_ARG2);    // Store to dst.
+        __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load from src.
+        __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst.
         __ lxvd2x(tmp_vsr2, R3_ARG1, tmp1);  // Load from src + 16.
         __ stxvd2x(tmp_vsr2, R4_ARG2, tmp1); // Store to dst + 16.
         __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32.
@@ -1677,8 +1677,8 @@
       __ bind(l_7);
       // Use loop with VSX load/store instructions to
       // copy 8 elements a time.
-      __ lxvd2x(tmp_vsr1, 0, R3_ARG1);     // Load src
-      __ stxvd2x(tmp_vsr1, 0, R4_ARG2);    // Store to dst
+      __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
+      __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
       __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
       __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
       __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
@@ -1745,13 +1745,16 @@
     // Do reverse copy.  We assume the case of actual overlap is rare enough
     // that we don't have to optimize it.
 
-    Label l_1, l_2, l_3, l_4, l_5, l_6;
+    Label l_1, l_2, l_3, l_4, l_5, l_6, l_7;
 
     Register tmp1 = R6_ARG4;
     Register tmp2 = R7_ARG5;
     Register tmp3 = R8_ARG6;
     Register tmp4 = R0;
 
+    VectorSRegister tmp_vsr1  = VSR1;
+    VectorSRegister tmp_vsr2  = VSR2;
+
     { // FasterArrayCopy
       __ cmpwi(CCR0, R5_ARG3, 0);
       __ beq(CCR0, l_6);
@@ -1761,6 +1764,25 @@
       __ add(R4_ARG2, R4_ARG2, R5_ARG3);
       __ srdi(R5_ARG3, R5_ARG3, 2);
 
+      if (!aligned) {
+        // check if arrays have same alignment mod 8.
+        __ xorr(tmp1, R3_ARG1, R4_ARG2);
+        __ andi_(R0, tmp1, 7);
+        // Not the same alignment, but ld and std just need to be 4 byte aligned.
+        __ bne(CCR0, l_7); // to OR from is 8 byte aligned -> copy 2 at a time
+
+        // copy 1 element to align to and from on an 8 byte boundary
+        __ andi_(R0, R3_ARG1, 7);
+        __ beq(CCR0, l_7);
+
+        __ addi(R3_ARG1, R3_ARG1, -4);
+        __ addi(R4_ARG2, R4_ARG2, -4);
+        __ addi(R5_ARG3, R5_ARG3, -1);
+        __ lwzx(tmp2, R3_ARG1);
+        __ stwx(tmp2, R4_ARG2);
+        __ bind(l_7);
+      }
+
       __ cmpwi(CCR0, R5_ARG3, 7);
       __ ble(CCR0, l_5); // copy 1 at a time if less than 8 elements remain
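The new prologue peels off at most one 4-byte element before the reverse-copy main loop: if src and dst share the same alignment modulo 8 but are not yet 8-byte aligned, one word is copied from the end so the rest of the copy runs on 8-byte-aligned addresses; if their alignments differ, it branches straight to l_7. The same decision in C++, assuming src and dst already point one past the region and count is in 4-byte elements (illustrative only):

  #include <cstddef>
  #include <cstdint>

  // Peel one trailing word so the main loop sees 8-byte-aligned pointers.
  static void peel_alignment_word(uintptr_t& src, uintptr_t& dst, size_t& count) {
    if (((src ^ dst) & 7) == 0 && (src & 7) != 0) {   // the xorr/andi_ tests above
      src -= 4; dst -= 4; --count;                    // the three addi's
      *reinterpret_cast<uint32_t*>(dst) =
          *reinterpret_cast<const uint32_t*>(src);    // lwzx / stwx
    }
  }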
 
@@ -1768,6 +1790,7 @@
       __ andi(R5_ARG3, R5_ARG3, 7);
       __ mtctr(tmp1);
 
+     if (!VM_Version::has_vsx()) {
       __ bind(l_4);
       // Use unrolled version for mass copying (copy 4 elements a time).
       // Load feeding store gets zero latency on Power6, however not on Power5.
@@ -1783,6 +1806,40 @@
       __ std(tmp2, 8, R4_ARG2);
       __ std(tmp1, 0, R4_ARG2);
       __ bdnz(l_4);
+     } else {  // Processor supports VSX, so use it to mass copy.
+      // Prefetch the data into the L2 cache.
+      __ dcbt(R3_ARG1, 0);
+
+      // If supported set DSCR pre-fetch to deepest.
+      if (VM_Version::has_mfdscr()) {
+        __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
+        __ mtdscr(tmp2);
+      }
+
+      __ li(tmp1, 16);
+
+      // Backbranch target aligned to 32-byte. Not 16-byte align as
+      // loop contains < 8 instructions that fit inside a single
+      // i-cache sector.
+      __ align(32);
+
+      __ bind(l_4);
+      // Use loop with VSX load/store instructions to
+      // copy 8 elements a time.
+      __ addi(R3_ARG1, R3_ARG1, -32);      // Update src-=32
+      __ addi(R4_ARG2, R4_ARG2, -32);      // Update dst-=32
+      __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src+16
+      __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
+      __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst+16
+      __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
+      __ bdnz(l_4);
+
+      // Restore DSCR pre-fetch value.
+      if (VM_Version::has_mfdscr()) {
+        __ load_const_optimized(tmp2, VM_Version::_dscr_val);
+        __ mtdscr(tmp2);
+      }
+     }
 
       __ cmpwi(CCR0, R5_ARG3, 0);
       __ beq(CCR0, l_6);
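The reverse main loop mirrors the forward VSX loops: both pointers are pre-decremented by 32, then two 16-byte transfers are issued, with the DSCR prefetch depth raised around the loop when mfdscr is available (see the sketch after the second VSX hunk below). One iteration in C++ (illustrative only):

  #include <cstdint>
  #include <cstring>

  // One iteration of the backward VSX copy loop, modeled with memcpy.
  static void copy32_backward(const uint8_t*& src, uint8_t*& dst) {
    src -= 32;                            // addi src -= 32
    dst -= 32;                            // addi dst -= 32
    std::memcpy(dst + 16, src + 16, 16);  // lxvd2x/stxvd2x at offset 16 (tmp1)
    std::memcpy(dst,      src,      16);  // lxvd2x/stxvd2x at offset 0
  }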
@@ -1892,8 +1949,8 @@
       __ bind(l_5);
       // Use loop with VSX load/store instructions to
       // copy 4 elements a time.
-      __ lxvd2x(tmp_vsr1, 0, R3_ARG1);     // Load src
-      __ stxvd2x(tmp_vsr1, 0, R4_ARG2);    // Store to dst
+      __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
+      __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
       __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
       __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
       __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
@@ -1962,6 +2019,9 @@
     Register tmp3 = R8_ARG6;
     Register tmp4 = R0;
 
+    VectorSRegister tmp_vsr1  = VSR1;
+    VectorSRegister tmp_vsr2  = VSR2;
+
     Label l_1, l_2, l_3, l_4, l_5;
 
     __ cmpwi(CCR0, R5_ARG3, 0);
@@ -1980,6 +2040,7 @@
       __ andi(R5_ARG3, R5_ARG3, 3);
       __ mtctr(tmp1);
 
+     if (!VM_Version::has_vsx()) {
       __ bind(l_4);
       // Use unrolled version for mass copying (copy 4 elements a time).
       // Load feeding store gets zero latency on Power6, however not on Power5.
@@ -1995,6 +2056,40 @@
       __ std(tmp2, 8, R4_ARG2);
       __ std(tmp1, 0, R4_ARG2);
       __ bdnz(l_4);
+     } else { // Processor supports VSX, so use it to mass copy.
+      // Prefetch the data into the L2 cache.
+      __ dcbt(R3_ARG1, 0);
+
+      // If supported set DSCR pre-fetch to deepest.
+      if (VM_Version::has_mfdscr()) {
+        __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
+        __ mtdscr(tmp2);
+      }
+
+      __ li(tmp1, 16);
+
+      // Backbranch target aligned to 32-byte. Not 16-byte align as
+      // loop contains < 8 instructions that fit inside a single
+      // i-cache sector.
+      __ align(32);
+
+      __ bind(l_4);
+      // Use loop with VSX load/store instructions to
+      // copy 4 elements a time.
+      __ addi(R3_ARG1, R3_ARG1, -32);      // Update src-=32
+      __ addi(R4_ARG2, R4_ARG2, -32);      // Update dst-=32
+      __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src+16
+      __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
+      __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst+16
+      __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
+      __ bdnz(l_4);
+
+      // Restore DSCR pre-fetch value.
+      if (VM_Version::has_mfdscr()) {
+        __ load_const_optimized(tmp2, VM_Version::_dscr_val);
+        __ mtdscr(tmp2);
+      }
+     }
 
       __ cmpwi(CCR0, R5_ARG3, 0);
       __ beq(CCR0, l_1);
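Both VSX branches wrap their copy loop in the same DSCR handling: when mfdscr/mtdscr are available, the prefetch setting is pushed to the deepest level (_dscr_val | 7, per the stub comments) for the duration of the loop, and the value saved in VM_Version::_dscr_val is restored afterwards. A sketch of that bracket, where the hypothetical set_dscr() stands in for load_const_optimized + mtdscr and run_vsx_copy_loop() for the generated bdnz loop (illustrative only):

  // Raise the data-stream prefetch depth around the vector loop, then restore it.
  const uint64_t saved_dscr = VM_Version::_dscr_val;          // captured at VM startup
  if (VM_Version::has_mfdscr()) set_dscr(saved_dscr | 7);     // "deepest" prefetch setting
  run_vsx_copy_loop();                                        // the lxvd2x/stxvd2x loop above
  if (VM_Version::has_mfdscr()) set_dscr(saved_dscr);         // restore the original value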
--- a/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp	Thu Oct 20 17:05:26 2016 -0700
+++ b/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp	Fri Oct 21 10:27:32 2016 +0200
@@ -656,7 +656,7 @@
   a->vpmsumb(VR0, VR1, VR2);                   // code[11] -> vpmsumb
   a->tcheck(0);                                // code[12] -> tcheck
   a->mfdscr(R0);                               // code[13] -> mfdscr
-  a->lxvd2x(VSR0, 0, R3_ARG1);                 // code[14] -> vsx
+  a->lxvd2x(VSR0, R3_ARG1);                    // code[14] -> vsx
   a->blr();
 
   // Emit function to set one cache line to zero. Emit function descriptor and get pointer to it.