8154156: PPC64: improve array copy stubs by using vector instructions
author mdoerr
Mon, 23 May 2016 10:35:51 -0300
changeset 39236 c9d756fa846e
parent 39235 0a8a53391399
child 39237 1674c59db516
child 39240 3e5aba838d96
8154156: PPC64: improve array copy stubs by using vector instructions Reviewed-by: goetz, mdoerr Contributed-by: Gustavo Romero <gromero@linux.vnet.ibm.com>
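In outline, the change (1) teaches the PPC64 assembler the VSX instructions lxvd2x/stxvd2x, (2) introduces a VectorSRegister handle type for VSR0-VSR31, (3) probes for VSX at startup and exposes it as VM_Version::has_vsx(), and (4) uses the new instructions in the arraycopy stub when the CPU supports them. The core substitution, condensed from the stubGenerator_ppc.cpp hunk below (an illustrative sketch, not additional patch content):

    // before: four 8-byte integer load/store pairs per 32-byte iteration
    __ ld(tmp1, 0, R3_ARG1);          __ std(tmp1, 0, R4_ARG2);
    // after: two 16-byte VSX load/store pairs per 32-byte iteration
    __ lxvd2x(tmp_vsr1, 0, R3_ARG1);  __ stxvd2x(tmp_vsr1, 0, R4_ARG2);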
hotspot/src/cpu/ppc/vm/assembler_ppc.hpp
hotspot/src/cpu/ppc/vm/assembler_ppc.inline.hpp
hotspot/src/cpu/ppc/vm/register_ppc.cpp
hotspot/src/cpu/ppc/vm/register_ppc.hpp
hotspot/src/cpu/ppc/vm/stubGenerator_ppc.cpp
hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp
hotspot/src/cpu/ppc/vm/vm_version_ppc.hpp
--- a/hotspot/src/cpu/ppc/vm/assembler_ppc.hpp	Fri May 27 17:12:10 2016 +0300
+++ b/hotspot/src/cpu/ppc/vm/assembler_ppc.hpp	Mon May 23 10:35:51 2016 -0300
@@ -503,6 +503,10 @@
     LVSL_OPCODE    = (31u << OPCODE_SHIFT |    6u << 1),
     LVSR_OPCODE    = (31u << OPCODE_SHIFT |   38u << 1),
 
+    // Vector-Scalar (VSX) instruction support.
+    LXVD2X_OPCODE  = (31u << OPCODE_SHIFT |  844u << 1),
+    STXVD2X_OPCODE = (31u << OPCODE_SHIFT |  972u << 1),
+
     // Vector Permute and Formatting
     VPKPX_OPCODE   = (4u  << OPCODE_SHIFT |  782u     ),
     VPKSHSS_OPCODE = (4u  << OPCODE_SHIFT |  398u     ),
@@ -1085,6 +1089,19 @@
   static int vrs(   VectorRegister r)  { return  vrs(r->encoding());}
   static int vrt(   VectorRegister r)  { return  vrt(r->encoding());}
 
+  // Support Vector-Scalar (VSX) instructions.
+  static int vsra(      int         x)  { return  opp_u_field(x,            15, 11); }
+  static int vsrb(      int         x)  { return  opp_u_field(x,            20, 16); }
+  static int vsrc(      int         x)  { return  opp_u_field(x,            25, 21); }
+  static int vsrs(      int         x)  { return  opp_u_field(x,            10,  6); }
+  static int vsrt(      int         x)  { return  opp_u_field(x,            10,  6); }
+
+  static int vsra(   VectorSRegister r)  { return  vsra(r->encoding());}
+  static int vsrb(   VectorSRegister r)  { return  vsrb(r->encoding());}
+  static int vsrc(   VectorSRegister r)  { return  vsrc(r->encoding());}
+  static int vsrs(   VectorSRegister r)  { return  vsrs(r->encoding());}
+  static int vsrt(   VectorSRegister r)  { return  vsrt(r->encoding());}
+
   static int vsplt_uim( int        x)  { return  opp_u_field(x,             15, 12); } // for vsplt* instructions
   static int vsplti_sim(int        x)  { return  opp_u_field(x,             15, 11); } // for vsplti* instructions
   static int vsldoi_shb(int        x)  { return  opp_u_field(x,             25, 22); } // for vsldoi instruction
@@ -2065,6 +2082,10 @@
   inline void mtvscr(   VectorRegister b);
   inline void mfvscr(   VectorRegister d);
 
+  // Vector-Scalar (VSX) instructions.
+  inline void lxvd2x(   VectorSRegister d, Register a, Register b);
+  inline void stxvd2x(  VectorSRegister d, Register a, Register b);
+
   // AES (introduced with Power 8)
   inline void vcipher(     VectorRegister d, VectorRegister a, VectorRegister b);
   inline void vcipherlast( VectorRegister d, VectorRegister a, VectorRegister b);
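Both new opcode constants follow the X-form pattern of the neighbouring entries: primary opcode 31 in the top six bits (OPCODE_SHIFT is 26 in this file) and a 10-bit extended opcode shifted left by one. Worked out as a check, not part of the patch:

    LXVD2X_OPCODE  = (31u << 26) | (844u << 1) = 0x7C000698
    STXVD2X_OPCODE = (31u << 26) | (972u << 1) = 0x7C000798

The '<< 1' leaves the instruction's last bit free for the VSX TX/SX bit that would select VSR32-VSR63; it stays zero here since only VSR0-VSR31 are declared. Likewise, vsrs and vsrt both encode at bits 6-10 (opp_u_field(x, 10, 6)): source and target occupy the same field position and differ only in role.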
--- a/hotspot/src/cpu/ppc/vm/assembler_ppc.inline.hpp	Fri May 27 17:12:10 2016 +0300
+++ b/hotspot/src/cpu/ppc/vm/assembler_ppc.inline.hpp	Mon May 23 10:35:51 2016 -0300
@@ -721,6 +721,10 @@
 inline void Assembler::lvsl(  VectorRegister d, Register s1, Register s2) { emit_int32( LVSL_OPCODE   | vrt(d) | ra0mem(s1) | rb(s2)); }
 inline void Assembler::lvsr(  VectorRegister d, Register s1, Register s2) { emit_int32( LVSR_OPCODE   | vrt(d) | ra0mem(s1) | rb(s2)); }
 
+// Vector-Scalar (VSX) instructions.
+inline void Assembler::lxvd2x (VectorSRegister d, Register s1, Register s2) { emit_int32( LXVD2X_OPCODE  | vsrt(d) | ra(s1) | rb(s2)); }
+inline void Assembler::stxvd2x(VectorSRegister d, Register s1, Register s2) { emit_int32( STXVD2X_OPCODE | vsrt(d) | ra(s1) | rb(s2)); }
+
 inline void Assembler::vpkpx(   VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VPKPX_OPCODE   | vrt(d) | vra(a) | vrb(b)); }
 inline void Assembler::vpkshss( VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VPKSHSS_OPCODE | vrt(d) | vra(a) | vrb(b)); }
 inline void Assembler::vpkswss( VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VPKSWSS_OPCODE | vrt(d) | vra(a) | vrb(b)); }
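With these inlines, one 16-byte block moves through a VSX register in two instructions. The effective address is RA + RB, and a 0 in the RA slot is read by the hardware as a literal zero base (the stub code below relies on this when it passes 0 as the second argument). A minimal usage sketch, assuming the source and destination addresses are already in R3 and R4:

    __ lxvd2x (VSR0, 0, R3);   // VSR0 <- 16 bytes at 0 + R3
    __ stxvd2x(VSR0, 0, R4);   // 16 bytes at 0 + R4 <- VSR0

Because the same doubleword ordering applies to the load and the matching store, the pair round-trips bytes unchanged regardless of endianness, which is all a copy stub needs.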
--- a/hotspot/src/cpu/ppc/vm/register_ppc.cpp	Fri May 27 17:12:10 2016 +0300
+++ b/hotspot/src/cpu/ppc/vm/register_ppc.cpp	Mon May 23 10:35:51 2016 -0300
@@ -75,3 +75,14 @@
   };
   return is_valid() ? names[encoding()] : "vnoreg";
 }
+
+const char* VectorSRegisterImpl::name() const {
+  const char* names[number_of_registers] = {
+    "VSR0",  "VSR1",  "VSR2",  "VSR3",  "VSR4",  "VSR5",  "VSR6",  "VSR7",
+    "VSR8",  "VSR9",  "VSR10", "VSR11", "VSR12", "VSR13", "VSR14", "VSR15",
+    "VSR16", "VSR17", "VSR18", "VSR19", "VSR20", "VSR21", "VSR22", "VSR23",
+    "VSR24", "VSR25", "VSR26", "VSR27", "VSR28", "VSR29", "VSR30", "VSR31"
+  };
+  return is_valid() ? names[encoding()] : "vsnoreg";
+}
+
--- a/hotspot/src/cpu/ppc/vm/register_ppc.hpp	Fri May 27 17:12:10 2016 +0300
+++ b/hotspot/src/cpu/ppc/vm/register_ppc.hpp	Mon May 23 10:35:51 2016 -0300
@@ -491,6 +491,106 @@
 #endif // DONT_USE_REGISTER_DEFINES
 
 
+// Use VectorSRegister as a shortcut.
+class VectorSRegisterImpl;
+typedef VectorSRegisterImpl* VectorSRegister;
+
+inline VectorSRegister as_VectorSRegister(int encoding) {
+  return (VectorSRegister)(intptr_t)encoding;
+}
+
+// The implementation of Vector-Scalar (VSX) registers on POWER architecture.
+class VectorSRegisterImpl: public AbstractRegisterImpl {
+ public:
+  enum {
+    number_of_registers = 32
+  };
+
+  // construction
+  inline friend VectorSRegister as_VectorSRegister(int encoding);
+
+  // accessors
+  int encoding() const { assert(is_valid(), "invalid register"); return value(); }
+
+  // testers
+  bool is_valid() const { return 0 <=  value() &&  value() < number_of_registers; }
+
+  const char* name() const;
+};
+
+// The Vector-Scalar (VSX) registers of the POWER architecture.
+
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, vsnoreg, (-1));
+
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR0,  ( 0));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR1,  ( 1));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR2,  ( 2));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR3,  ( 3));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR4,  ( 4));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR5,  ( 5));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR6,  ( 6));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR7,  ( 7));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR8,  ( 8));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR9,  ( 9));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR10, (10));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR11, (11));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR12, (12));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR13, (13));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR14, (14));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR15, (15));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR16, (16));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR17, (17));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR18, (18));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR19, (19));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR20, (20));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR21, (21));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR22, (22));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR23, (23));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR24, (24));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR25, (25));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR26, (26));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR27, (27));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR28, (28));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR29, (29));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR30, (30));
+CONSTANT_REGISTER_DECLARATION(VectorSRegister, VSR31, (31));
+
+#ifndef DONT_USE_REGISTER_DEFINES
+#define vsnoregi ((VectorSRegister)(vsnoreg_VectorSRegisterEnumValue))
+#define VSR0    ((VectorSRegister)(   VSR0_VectorSRegisterEnumValue))
+#define VSR1    ((VectorSRegister)(   VSR1_VectorSRegisterEnumValue))
+#define VSR2    ((VectorSRegister)(   VSR2_VectorSRegisterEnumValue))
+#define VSR3    ((VectorSRegister)(   VSR3_VectorSRegisterEnumValue))
+#define VSR4    ((VectorSRegister)(   VSR4_VectorSRegisterEnumValue))
+#define VSR5    ((VectorSRegister)(   VSR5_VectorSRegisterEnumValue))
+#define VSR6    ((VectorSRegister)(   VSR6_VectorSRegisterEnumValue))
+#define VSR7    ((VectorSRegister)(   VSR7_VectorSRegisterEnumValue))
+#define VSR8    ((VectorSRegister)(   VSR8_VectorSRegisterEnumValue))
+#define VSR9    ((VectorSRegister)(   VSR9_VectorSRegisterEnumValue))
+#define VSR10   ((VectorSRegister)(  VSR10_VectorSRegisterEnumValue))
+#define VSR11   ((VectorSRegister)(  VSR11_VectorSRegisterEnumValue))
+#define VSR12   ((VectorSRegister)(  VSR12_VectorSRegisterEnumValue))
+#define VSR13   ((VectorSRegister)(  VSR13_VectorSRegisterEnumValue))
+#define VSR14   ((VectorSRegister)(  VSR14_VectorSRegisterEnumValue))
+#define VSR15   ((VectorSRegister)(  VSR15_VectorSRegisterEnumValue))
+#define VSR16   ((VectorSRegister)(  VSR16_VectorSRegisterEnumValue))
+#define VSR17   ((VectorSRegister)(  VSR17_VectorSRegisterEnumValue))
+#define VSR18   ((VectorSRegister)(  VSR18_VectorSRegisterEnumValue))
+#define VSR19   ((VectorSRegister)(  VSR19_VectorSRegisterEnumValue))
+#define VSR20   ((VectorSRegister)(  VSR20_VectorSRegisterEnumValue))
+#define VSR21   ((VectorSRegister)(  VSR21_VectorSRegisterEnumValue))
+#define VSR22   ((VectorSRegister)(  VSR22_VectorSRegisterEnumValue))
+#define VSR23   ((VectorSRegister)(  VSR23_VectorSRegisterEnumValue))
+#define VSR24   ((VectorSRegister)(  VSR24_VectorSRegisterEnumValue))
+#define VSR25   ((VectorSRegister)(  VSR25_VectorSRegisterEnumValue))
+#define VSR26   ((VectorSRegister)(  VSR26_VectorSRegisterEnumValue))
+#define VSR27   ((VectorSRegister)(  VSR27_VectorSRegisterEnumValue))
+#define VSR28   ((VectorSRegister)(  VSR28_VectorSRegisterEnumValue))
+#define VSR29   ((VectorSRegister)(  VSR29_VectorSRegisterEnumValue))
+#define VSR30   ((VectorSRegister)(  VSR30_VectorSRegisterEnumValue))
+#define VSR31   ((VectorSRegister)(  VSR31_VectorSRegisterEnumValue))
+#endif // DONT_USE_REGISTER_DEFINES
+
 // Maximum number of incoming arguments that can be passed in i registers.
 const int PPC_ARGS_IN_REGS_NUM = 8;
 
--- a/hotspot/src/cpu/ppc/vm/stubGenerator_ppc.cpp	Fri May 27 17:12:10 2016 +0300
+++ b/hotspot/src/cpu/ppc/vm/stubGenerator_ppc.cpp	Mon May 23 10:35:51 2016 -0300
@@ -1341,10 +1341,13 @@
     Register tmp3 = R8_ARG6;
     Register tmp4 = R9_ARG7;
 
+    VectorSRegister tmp_vsr1  = VSR1;
+    VectorSRegister tmp_vsr2  = VSR2;
+
     address start = __ function_entry();
     assert_positive_int(R5_ARG3);
 
-      Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8;
+    Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9;
 
     // don't try anything fancy if arrays don't have many elements
     __ li(tmp3, 0);
@@ -1403,22 +1406,60 @@
       __ andi_(R5_ARG3, R5_ARG3, 15);
       __ mtctr(tmp1);
 
-      __ bind(l_8);
-      // Use unrolled version for mass copying (copy 16 elements a time).
-      // Load feeding store gets zero latency on Power6, however not on Power5.
-      // Therefore, the following sequence is made for the good of both.
-      __ ld(tmp1, 0, R3_ARG1);
-      __ ld(tmp2, 8, R3_ARG1);
-      __ ld(tmp3, 16, R3_ARG1);
-      __ ld(tmp4, 24, R3_ARG1);
-      __ std(tmp1, 0, R4_ARG2);
-      __ std(tmp2, 8, R4_ARG2);
-      __ std(tmp3, 16, R4_ARG2);
-      __ std(tmp4, 24, R4_ARG2);
-      __ addi(R3_ARG1, R3_ARG1, 32);
-      __ addi(R4_ARG2, R4_ARG2, 32);
-      __ bdnz(l_8);
-    }
+      if (!VM_Version::has_vsx()) {
+
+        __ bind(l_8);
+        // Use unrolled version for mass copying (copy 16 elements at a time).
+        // Load feeding store gets zero latency on Power6, however not on Power5.
+        // Therefore, the following sequence is made for the good of both.
+        __ ld(tmp1, 0, R3_ARG1);
+        __ ld(tmp2, 8, R3_ARG1);
+        __ ld(tmp3, 16, R3_ARG1);
+        __ ld(tmp4, 24, R3_ARG1);
+        __ std(tmp1, 0, R4_ARG2);
+        __ std(tmp2, 8, R4_ARG2);
+        __ std(tmp3, 16, R4_ARG2);
+        __ std(tmp4, 24, R4_ARG2);
+        __ addi(R3_ARG1, R3_ARG1, 32);
+        __ addi(R4_ARG2, R4_ARG2, 32);
+        __ bdnz(l_8);
+
+      } else { // Processor supports VSX, so use it to mass copy.
+
+        // Prefetch src data into L2 cache.
+        __ dcbt(R3_ARG1, 0);
+
+        // If supported, set DSCR prefetch depth to deepest.
+        if (VM_Version::has_mfdscr()) {
+          __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
+          __ mtdscr(tmp2);
+        }
+        __ li(tmp1, 16);
+
+        // Backbranch target aligned to 32 bytes. It is not 16-byte
+        // aligned because the loop contains < 8 instructions that fit
+        // inside a single i-cache sector.
+        __ align(32);
+
+        __ bind(l_9);
+        // Use a loop with VSX load/store instructions to
+        // copy 16 elements at a time.
+        __ lxvd2x(tmp_vsr1, 0, R3_ARG1);     // Load from src.
+        __ stxvd2x(tmp_vsr1, 0, R4_ARG2);    // Store to dst.
+        __ lxvd2x(tmp_vsr2, R3_ARG1, tmp1);  // Load from src + 16.
+        __ stxvd2x(tmp_vsr2, R4_ARG2, tmp1); // Store to dst + 16.
+        __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32.
+        __ addi(R4_ARG2, R4_ARG2, 32);       // Update dst+=32.
+        __ bdnz(l_9);                        // Dec CTR and loop if not zero.
+
+        // Restore DSCR prefetch value.
+        if (VM_Version::has_mfdscr()) {
+          __ load_const_optimized(tmp2, VM_Version::_dscr_val);
+          __ mtdscr(tmp2);
+        }
+
+      }
+    } // FasterArrayCopy
     __ bind(l_6);
 
     // copy 2 elements at a time
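Each pass through l_9 moves 32 bytes -- the same 16 jshort elements per iteration as the scalar l_8 version -- but in two 16-byte vector transfers instead of four 8-byte integer pairs. tmp1, already consumed by the earlier mtctr, is reloaded with the constant 16 and serves as the index register for the second pair. In rough C terms (a sketch of the loop's semantics, not generated code):

    while (ctr-- != 0) {               // bdnz: decrement CTR, branch if nonzero
      memcpy(dst,      src,      16);  // lxvd2x/stxvd2x pair at offset 0
      memcpy(dst + 16, src + 16, 16);  // lxvd2x/stxvd2x pair at offset 16 (tmp1)
      src += 32;  dst += 32;           // the two addi updates
    }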
--- a/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp	Fri May 27 17:12:10 2016 +0300
+++ b/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp	Mon May 23 10:35:51 2016 -0300
@@ -38,7 +38,7 @@
 # include <sys/sysinfo.h>
 
 bool VM_Version::_is_determine_features_test_running = false;
-
+uint64_t VM_Version::_dscr_val = 0;
 
 #define MSG(flag)   \
   if (flag && !FLAG_IS_DEFAULT(flag))                                  \
@@ -111,7 +111,7 @@
   // Create and print feature-string.
   char buf[(num_features+1) * 16]; // Max 16 chars per feature.
   jio_snprintf(buf, sizeof(buf),
-               "ppc64%s%s%s%s%s%s%s%s%s%s%s%s%s",
+               "ppc64%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
                (has_fsqrt()   ? " fsqrt"   : ""),
                (has_isel()    ? " isel"    : ""),
                (has_lxarxeh() ? " lxarxeh" : ""),
@@ -125,7 +125,8 @@
                (has_vcipher() ? " aes"     : ""),
                (has_vpmsumb() ? " vpmsumb" : ""),
                (has_tcheck()  ? " tcheck"  : ""),
-               (has_mfdscr()  ? " mfdscr"  : "")
+               (has_mfdscr()  ? " mfdscr"  : ""),
+               (has_vsx()     ? " vsx"     : "")
                // Make sure number of %s matches num_features!
               );
   _features_string = os::strdup(buf);
@@ -643,6 +644,7 @@
   a->vpmsumb(VR0, VR1, VR2);                   // code[11] -> vpmsumb
   a->tcheck(0);                                // code[12] -> tcheck
   a->mfdscr(R0);                               // code[13] -> mfdscr
+  a->lxvd2x(VSR0, 0, R3_ARG1);                 // code[14] -> vsx
   a->blr();
 
   // Emit function to set one cache line to zero. Emit function descriptor and get pointer to it.
@@ -691,6 +693,7 @@
   if (code[feature_cntr++]) features |= vpmsumb_m;
   if (code[feature_cntr++]) features |= tcheck_m;
   if (code[feature_cntr++]) features |= mfdscr_m;
+  if (code[feature_cntr++]) features |= vsx_m;
 
   // Print the detection code.
   if (PrintAssembly) {
@@ -733,31 +736,31 @@
   }
 
   // Apply the configuration if needed.
-  uint64_t dscr_val = (*get_dscr)();
+  _dscr_val = (*get_dscr)();
   if (Verbose) {
-    tty->print_cr("dscr value was 0x%lx" , dscr_val);
+    tty->print_cr("dscr value was 0x%lx" , _dscr_val);
   }
   bool change_requested = false;
   if (DSCR_PPC64 != (uintx)-1) {
-    dscr_val = DSCR_PPC64;
+    _dscr_val = DSCR_PPC64;
     change_requested = true;
   }
   if (DSCR_DPFD_PPC64 <= 7) {
     uint64_t mask = 0x7;
-    if ((dscr_val & mask) != DSCR_DPFD_PPC64) {
-      dscr_val = (dscr_val & ~mask) | (DSCR_DPFD_PPC64);
+    if ((_dscr_val & mask) != DSCR_DPFD_PPC64) {
+      _dscr_val = (_dscr_val & ~mask) | (DSCR_DPFD_PPC64);
       change_requested = true;
     }
   }
   if (DSCR_URG_PPC64 <= 7) {
     uint64_t mask = 0x7 << 6;
-    if ((dscr_val & mask) != DSCR_DPFD_PPC64 << 6) {
-      dscr_val = (dscr_val & ~mask) | (DSCR_URG_PPC64 << 6);
+    if ((_dscr_val & mask) != DSCR_DPFD_PPC64 << 6) {
+      _dscr_val = (_dscr_val & ~mask) | (DSCR_URG_PPC64 << 6);
       change_requested = true;
     }
   }
   if (change_requested) {
-    (*set_dscr)(dscr_val);
+    (*set_dscr)(_dscr_val);
     if (Verbose) {
       tty->print_cr("dscr was set to 0x%lx" , (*get_dscr)());
     }
--- a/hotspot/src/cpu/ppc/vm/vm_version_ppc.hpp	Fri May 27 17:12:10 2016 +0300
+++ b/hotspot/src/cpu/ppc/vm/vm_version_ppc.hpp	Mon May 23 10:35:51 2016 -0300
@@ -46,6 +46,7 @@
     vpmsumb,
     tcheck,
     mfdscr,
+    vsx,
     num_features // last entry to count features
   };
   enum Feature_Flag_Set {
@@ -64,6 +65,7 @@
     vpmsumb_m             = (1 << vpmsumb),
     tcheck_m              = (1 << tcheck ),
     mfdscr_m              = (1 << mfdscr ),
+    vsx_m                 = (1 << vsx    ),
     all_features_m        = (unsigned long)-1
   };
 
@@ -97,10 +99,14 @@
   static bool has_vpmsumb() { return (_features & vpmsumb_m) != 0; }
   static bool has_tcheck()  { return (_features & tcheck_m) != 0; }
   static bool has_mfdscr()  { return (_features & mfdscr_m) != 0; }
+  static bool has_vsx()     { return (_features & vsx_m) != 0; }
 
   // Assembler testing
   static void allow_all();
   static void revert();
+
+  // POWER 8: DSCR current value.
+  static uint64_t _dscr_val;
 };
 
 #endif // CPU_PPC_VM_VM_VERSION_PPC_HPP
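End to end: the lxvd2x probe emitted as code[14] in vm_version_ppc.cpp records whether the instruction executed, the feature walk turns that slot into vsx_m, has_vsx() reduces to a single mask test against vsx_m == 1 << vsx, and " vsx" is appended to the printed feature string. A hypothetical consumer, mirroring the stub generator hunk above:

    if (VM_Version::has_vsx()) {
      // emit the lxvd2x/stxvd2x copy loop (VSX arrived with POWER7)
    } else {
      // keep the unrolled ld/std loop for pre-VSX CPUs
    }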