8216060: [PPC64] Vector CRC implementation should be used by interpreter and be faster for short arrays
author: mdoerr
Mon, 21 Jan 2019 09:44:27 +0100
changeset 53408 e17d7ffad9e3
parent 53407 690aed53fef0
child 53409 99aac2161586
8216060: [PPC64] Vector CRC implementation should be used by interpreter and be faster for short arrays Reviewed-by: gromero, goetz
src/hotspot/cpu/ppc/macroAssembler_ppc.cpp
src/hotspot/cpu/ppc/macroAssembler_ppc.hpp
src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
src/hotspot/cpu/ppc/stubRoutines_ppc.hpp
src/hotspot/cpu/ppc/stubRoutines_ppc_64.cpp
src/hotspot/cpu/ppc/templateInterpreterGenerator_ppc.cpp
--- a/src/hotspot/cpu/ppc/macroAssembler_ppc.cpp	Sun Jan 20 12:20:37 2019 -0500
+++ b/src/hotspot/cpu/ppc/macroAssembler_ppc.cpp	Mon Jan 21 09:44:27 2019 +0100
@@ -1,6 +1,6 @@
 /*
- * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2018, SAP SE. All rights reserved.
+ * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2019, SAP SE. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -3974,7 +3974,7 @@
  * Emits code to update CRC-32 with a 4-byte value according to constants in table
  * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
  */
-// A not on the lookup table address(es):
+// A note on the lookup table address(es):
 // The lookup table consists of two sets of four columns each.
 // The columns {0..3} are used for little-endian machines.
 // The columns {4..7} are used for big-endian machines.
@@ -4147,57 +4147,50 @@
  * @param len             register containing number of bytes
  * @param table           register pointing to CRC table
  * @param constants       register pointing to CRC table for 128-bit aligned memory
- * @param barretConstants register pointing to table for barrett reduction
- * @param t0-t4           temp registers
+ * @param t0-t5           temp registers
  */
-void MacroAssembler::kernel_crc32_1word_vpmsum(Register crc, Register buf, Register len, Register table,
-                                               Register constants, Register barretConstants,
-                                               Register t0, Register t1, Register t2, Register t3, Register t4,
-                                               bool invertCRC) {
+void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register table,
+                                         Register constants, Register t0, Register t1, Register t2,
+                                         Register t3, Register t4, Register t5, bool invertCRC) {
   assert_different_registers(crc, buf, len, table);
 
-  Label L_alignedHead, L_tail;
-
-  BLOCK_COMMENT("kernel_crc32_1word_vpmsum {");
-
-  // 1. ~c
+  Label L_tail;
+
+  BLOCK_COMMENT("kernel_crc32_vpmsum {");
+
   if (invertCRC) {
     nand(crc, crc, crc);                      // 1s complement of crc
   }
 
-  // 2. use kernel_crc32_1word for short len
+  // Enforce 32 bit.
   clrldi(len, len, 32);
-  cmpdi(CCR0, len, 512);
-  blt(CCR0, L_tail);
-
-  // 3. calculate from 0 to first aligned address
-  const int alignment = 16;
+
+  // Align if we have enough bytes for the fast version.
+  const int alignment = 16,
+            threshold = 32;
   Register prealign = t0;
 
-  andi_(prealign, buf, alignment - 1);
-  beq(CCR0, L_alignedHead);
-  subfic(prealign, prealign, alignment);
+  neg(prealign, buf);
+  addi(t1, len, -threshold);
+  andi(prealign, prealign, alignment - 1);
+  cmpw(CCR0, t1, prealign);
+  blt(CCR0, L_tail); // len - prealign < threshold?
 
   subf(len, prealign, len);
   update_byteLoop_crc32(crc, buf, prealign, table, t2, false);
 
-  // 4. calculate from first aligned address as far as possible
-  BIND(L_alignedHead);
-  kernel_crc32_1word_aligned(crc, buf, len, constants, barretConstants, t0, t1, t2, t3, t4);
-
-  // 5. remaining bytes
+  // Calculate from first aligned address as far as possible.
+  kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5);
+
+  // Remaining bytes.
   BIND(L_tail);
-  Register tc0 = t4;
-  Register tc1 = constants;
-  Register tc2 = barretConstants;
-  kernel_crc32_1word(crc, buf, len, table, t0, t1, t2, t3, tc0, tc1, tc2, table, false);
-
-  // 6. ~c
+  update_byteLoop_crc32(crc, buf, len, table, t2, false);
+
   if (invertCRC) {
     nand(crc, crc, crc);                      // 1s complement of crc
   }
 
-  BLOCK_COMMENT("} kernel_crc32_1word_vpmsum");
+  BLOCK_COMMENT("} kernel_crc32_vpmsum");
 }
 
 /**
@@ -4205,13 +4198,10 @@
  * @param buf             register pointing to input byte buffer (byte*)
  * @param len             register containing number of bytes (will get updated to remaining bytes)
  * @param constants       register pointing to CRC table for 128-bit aligned memory
- * @param barretConstants register pointing to table for barrett reduction
- * @param t0-t4           temp registers
- * Precondition: len should be >= 512. Otherwise, nothing will be done.
+ * @param t0-t5           temp registers
  */
-void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Register len,
-    Register constants, Register barretConstants,
-    Register t0, Register t1, Register t2, Register t3, Register t4) {
+void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len,
+    Register constants, Register t0, Register t1, Register t2, Register t3, Register t4, Register t5) {
 
   // Save non-volatile vector registers (frameless).
   Register offset = t1;
@@ -4228,7 +4218,6 @@
   offsetInt -= 8; std(R14, offsetInt, R1_SP);
   offsetInt -= 8; std(R15, offsetInt, R1_SP);
   offsetInt -= 8; std(R16, offsetInt, R1_SP);
-  offsetInt -= 8; std(R17, offsetInt, R1_SP);
 
   // Implementation uses an inner loop which uses between 256 and 16 * unroll_factor
   // bytes per iteration. The basic scheme is:
@@ -4239,14 +4228,17 @@
   // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors.
 
   // Using 16 * unroll_factor / unroll_factor_2 bytes for constants.
-  const int unroll_factor = 2048;
-  const int unroll_factor2 = 8;
+  const int unroll_factor = CRC32_UNROLL_FACTOR,
+            unroll_factor2 = CRC32_UNROLL_FACTOR2;
+
+  const int outer_consts_size = (unroll_factor2 - 1) * 16,
+            inner_consts_size = (unroll_factor / unroll_factor2) * 16;
 
   // Support registers.
-  Register offs[] = { noreg, t0, t1, t2, t3, t4, crc /* will live in VCRC */, R14 };
-  Register num_bytes = R15,
-           loop_count = R16,
-           cur_const = R17;
+  Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, crc /* will live in VCRC */ };
+  Register num_bytes = R14,
+           loop_count = R15,
+           cur_const = R16;
   // Constant array for outer loop: unroll_factor2 - 1 registers,
   // Constant array for inner loop: unroll_factor / unroll_factor2 registers.
   VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
@@ -4268,7 +4260,7 @@
     mtdscr(t0);
   }
 
-  mtvrwz(VCRC, crc); // crc lives lives in VCRC, now
+  mtvrwz(VCRC, crc); // crc lives in VCRC, now
 
   for (int i = 1; i < unroll_factor2; ++i) {
     li(offs[i], 16 * i);
@@ -4279,10 +4271,8 @@
   for (int i = 1; i < unroll_factor2 - 1; ++i) {
     lvx(consts0[i], offs[i], constants);
   }
-  addi(constants, constants, (unroll_factor2 - 1) * 16);
 
   load_const_optimized(num_bytes, 16 * unroll_factor);
-  load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off.
 
   // Reuse data registers outside of the loop.
   VectorRegister Vtmp = data1[0];
@@ -4310,13 +4300,15 @@
   cmpd(CCR0, len, num_bytes);
   blt(CCR0, L_last);
 
+  addi(cur_const, constants, outer_consts_size); // Point to consts for inner loop
+  load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off.
+
   // ********** Main loop start **********
   align(32);
   bind(L_outer_loop);
 
   // Begin of unrolled first iteration (no xor).
   lvx(data1[0], buf);
-  mr(cur_const, constants);
   for (int i = 1; i < unroll_factor2 / 2; ++i) {
     lvx(data1[i], offs[i], buf);
   }
@@ -4369,6 +4361,8 @@
   }
   bdnz(L_inner_loop);
 
+  addi(cur_const, constants, outer_consts_size); // Reset
+
   // Tail of last iteration (no loads).
   for (int i = 0; i < unroll_factor2 / 2; ++i) {
     BE_swap_bytes(data1[i + unroll_factor2 / 2]);
@@ -4397,15 +4391,15 @@
   // Last chance with lower num_bytes.
   bind(L_last);
   srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations.
-  add_const_optimized(constants, constants, 16 * (unroll_factor / unroll_factor2)); // Point behind last one.
+  // Point behind last const for inner loop.
+  add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
   sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used.
   clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2));
-  subf(constants, R0, constants); // Point to constant to be used first.
+  subf(cur_const, R0, cur_const); // Point to constant to be used first.
 
   addic_(loop_count, loop_count, -1); // One double-iteration peeled off.
   bgt(CCR0, L_outer_loop);
   // ********** Main loop end **********
-#undef BE_swap_bytes
 
   // Restore DSCR pre-fetch value.
   if (VM_Version::has_mfdscr()) {
@@ -4413,13 +4407,45 @@
     mtdscr(t0);
   }
 
+  // ********** Simple loop for remaining 16 byte blocks **********
+  {
+    Label L_loop, L_done;
+
+    srdi_(t0, len, 4); // 16 bytes per iteration
+    clrldi(len, len, 64-4);
+    beq(CCR0, L_done);
+
+    // Point to const (same as last const for inner loop).
+    add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size - 16);
+    mtctr(t0);
+    lvx(Vtmp2, cur_const);
+
+    align(32);
+    bind(L_loop);
+
+    lvx(Vtmp, buf);
+    addi(buf, buf, 16);
+    vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
+    BE_swap_bytes(Vtmp);
+    vxor(VCRC, VCRC, Vtmp);
+    vpmsumw(VCRC, VCRC, Vtmp2);
+    bdnz(L_loop);
+
+    bind(L_done);
+  }
+  // ********** Simple loop end **********
+#undef BE_swap_bytes
+
+  // Point to Barrett constants
+  add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
+
   vspltisb(zeroes, 0);
 
   // Combine to 64 bit result.
   vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
 
   // Reduce to 32 bit CRC: Remainder by multiply-high.
-  lvx(Vtmp, barretConstants);
+  lvx(Vtmp, cur_const);
   vsldoi(Vtmp2, zeroes, VCRC, 12);  // Extract high 32 bit.
   vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply by inverse long poly.
   vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit.
@@ -4445,7 +4471,20 @@
   offsetInt -= 8;  ld(R14, offsetInt, R1_SP);
   offsetInt -= 8;  ld(R15, offsetInt, R1_SP);
   offsetInt -= 8;  ld(R16, offsetInt, R1_SP);
-  offsetInt -= 8;  ld(R17, offsetInt, R1_SP);
+}
+
+void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2,
+                           Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) {
+  load_const_optimized(t0, is_crc32c ? StubRoutines::crc32c_table_addr()
+                                     : StubRoutines::crc_table_addr()   , R0);
+
+  if (VM_Version::has_vpmsumb()) {
+    load_const_optimized(t1, is_crc32c ? StubRoutines::ppc64::crc32c_constants()
+                                       : StubRoutines::ppc64::crc_constants()   , R0);
+    kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c);
+  } else {
+    kernel_crc32_1word(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, t0, !is_crc32c);
+  }
 }
 
 void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp, bool invertCRC) {
--- a/src/hotspot/cpu/ppc/macroAssembler_ppc.hpp	Sun Jan 20 12:20:37 2019 -0500
+++ b/src/hotspot/cpu/ppc/macroAssembler_ppc.hpp	Mon Jan 21 09:44:27 2019 +0100
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2002, 2019, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2018, SAP SE. All rights reserved.
+ * Copyright (c) 2012, 2019, SAP SE. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -842,13 +842,14 @@
   void kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
                           Register t0,  Register t1,  Register t2,  Register t3,
                           bool invertCRC);
-  void kernel_crc32_1word_vpmsum(Register crc, Register buf, Register len, Register table,
-                          Register constants, Register barretConstants,
-                          Register t0,  Register t1, Register t2, Register t3, Register t4,
+  void kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register table, Register constants,
+                          Register t0, Register t1, Register t2, Register t3, Register t4, Register t5,
                           bool invertCRC);
-  void kernel_crc32_1word_aligned(Register crc, Register buf, Register len,
-                          Register constants, Register barretConstants,
-                          Register t0, Register t1, Register t2, Register t3, Register t4);
+  void kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len, Register constants,
+                                   Register t0, Register t1, Register t2, Register t3, Register t4, Register t5);
+  // Version which internally decides what to use.
+  void crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2,
+             Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c);
 
   void kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp,
                                bool invertCRC);
--- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp	Sun Jan 20 12:20:37 2019 -0500
+++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp	Mon Jan 21 09:44:27 2019 +0100
@@ -1,6 +1,6 @@
 /*
- * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2018, SAP SE. All rights reserved.
+ * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2019, SAP SE. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -3186,35 +3186,6 @@
     return start;
   }
 
-
-  // Compute CRC32/CRC32C function.
-  void generate_CRC_updateBytes(const char* name, Register table, bool invertCRC) {
-
-      // arguments to kernel_crc32:
-      const Register crc     = R3_ARG1;  // Current checksum, preset by caller or result from previous call.
-      const Register data    = R4_ARG2;  // source byte array
-      const Register dataLen = R5_ARG3;  // #bytes to process
-
-      const Register t0      = R2;
-      const Register t1      = R7;
-      const Register t2      = R8;
-      const Register t3      = R9;
-      const Register tc0     = R10;
-      const Register tc1     = R11;
-      const Register tc2     = R12;
-
-      BLOCK_COMMENT("Stub body {");
-      assert_different_registers(crc, data, dataLen, table);
-
-      __ kernel_crc32_1word(crc, data, dataLen, table, t0, t1, t2, t3, tc0, tc1, tc2, table, invertCRC);
-
-      BLOCK_COMMENT("return");
-      __ mr_if_needed(R3_RET, crc);      // Updated crc is function result. No copying required (R3_ARG1 == R3_RET).
-      __ blr();
-
-      BLOCK_COMMENT("} Stub body");
-  }
-
   /**
   *  Arguments:
   *
@@ -3492,111 +3463,15 @@
    *   R3_RET     - int   crc result
    */
   // Compute CRC32 function.
-  address generate_CRC32_updateBytes(const char* name) {
+  address generate_CRC32_updateBytes(bool is_crc32c) {
     __ align(CodeEntryAlignment);
-    StubCodeMark mark(this, "StubRoutines", name);
+    StubCodeMark mark(this, "StubRoutines", is_crc32c ? "CRC32C_updateBytes" : "CRC32_updateBytes");
     address start = __ function_entry();  // Remember stub start address (is rtn value).
-
-    const Register table   = R6;       // crc table address
-
-    // arguments to kernel_crc32:
-    const Register crc     = R3_ARG1;  // Current checksum, preset by caller or result from previous call.
-    const Register data    = R4_ARG2;  // source byte array
-    const Register dataLen = R5_ARG3;  // #bytes to process
-
-    if (VM_Version::has_vpmsumb()) {
-      const Register constants    = R2;  // constants address
-      const Register bconstants   = R8;  // barret table address
-
-      const Register t0      = R9;
-      const Register t1      = R10;
-      const Register t2      = R11;
-      const Register t3      = R12;
-      const Register t4      = R7;
-
-      BLOCK_COMMENT("Stub body {");
-      assert_different_registers(crc, data, dataLen, table);
-
-      StubRoutines::ppc64::generate_load_crc_table_addr(_masm, table);
-      StubRoutines::ppc64::generate_load_crc_constants_addr(_masm, constants);
-      StubRoutines::ppc64::generate_load_crc_barret_constants_addr(_masm, bconstants);
-
-      __ kernel_crc32_1word_vpmsum(crc, data, dataLen, table, constants, bconstants, t0, t1, t2, t3, t4, true);
-
-      BLOCK_COMMENT("return");
-      __ mr_if_needed(R3_RET, crc);      // Updated crc is function result. No copying required (R3_ARG1 == R3_RET).
-      __ blr();
-
-      BLOCK_COMMENT("} Stub body");
-    } else {
-      StubRoutines::ppc64::generate_load_crc_table_addr(_masm, table);
-      generate_CRC_updateBytes(name, table, true);
-    }
-
+    __ crc32(R3_ARG1, R4_ARG2, R5_ARG3, R2, R6, R7, R8, R9, R10, R11, R12, is_crc32c);
+    __ blr();
     return start;
   }
 
-
-  /**
-   * Arguments:
-   *
-   * Inputs:
-   *   R3_ARG1    - int   crc
-   *   R4_ARG2    - byte* buf
-   *   R5_ARG3    - int   length (of buffer)
-   *
-   * scratch:
-   *   R2, R6-R12
-   *
-   * Ouput:
-   *   R3_RET     - int   crc result
-   */
-  // Compute CRC32C function.
-  address generate_CRC32C_updateBytes(const char* name) {
-    __ align(CodeEntryAlignment);
-    StubCodeMark mark(this, "StubRoutines", name);
-    address start = __ function_entry();  // Remember stub start address (is rtn value).
-
-    const Register table   = R6;       // crc table address
-
-    // arguments to kernel_crc32:
-    const Register crc     = R3_ARG1;  // Current checksum, preset by caller or result from previous call.
-    const Register data    = R4_ARG2;  // source byte array
-    const Register dataLen = R5_ARG3;  // #bytes to process
-
-    if (VM_Version::has_vpmsumb()) {
-      const Register constants    = R2;  // constants address
-      const Register bconstants   = R8;  // barret table address
-
-      const Register t0      = R9;
-      const Register t1      = R10;
-      const Register t2      = R11;
-      const Register t3      = R12;
-      const Register t4      = R7;
-
-      BLOCK_COMMENT("Stub body {");
-      assert_different_registers(crc, data, dataLen, table);
-
-      StubRoutines::ppc64::generate_load_crc32c_table_addr(_masm, table);
-      StubRoutines::ppc64::generate_load_crc32c_constants_addr(_masm, constants);
-      StubRoutines::ppc64::generate_load_crc32c_barret_constants_addr(_masm, bconstants);
-
-      __ kernel_crc32_1word_vpmsum(crc, data, dataLen, table, constants, bconstants, t0, t1, t2, t3, t4, false);
-
-      BLOCK_COMMENT("return");
-      __ mr_if_needed(R3_RET, crc);      // Updated crc is function result. No copying required (R3_ARG1 == R3_RET).
-      __ blr();
-
-      BLOCK_COMMENT("} Stub body");
-    } else {
-      StubRoutines::ppc64::generate_load_crc32c_table_addr(_masm, table);
-      generate_CRC_updateBytes(name, table, false);
-    }
-
-    return start;
-  }
-
-
   // Initialization
   void generate_initial() {
     // Generates all stubs and initializes the entry points
@@ -3621,14 +3496,20 @@
 
     // CRC32 Intrinsics.
     if (UseCRC32Intrinsics) {
-      StubRoutines::_crc_table_adr    = (address)StubRoutines::ppc64::_crc_table;
-      StubRoutines::_updateBytesCRC32 = generate_CRC32_updateBytes("CRC32_updateBytes");
+      StubRoutines::_crc_table_adr = (address)StubRoutines::ppc64::_crc_table;
+      if (VM_Version::has_vpmsumb()) {
+        StubRoutines::ppc64::_crc_constants = StubRoutines::ppc64::generate_crc_constants(REVERSE_CRC32_POLY);
+      }
+      StubRoutines::_updateBytesCRC32 = generate_CRC32_updateBytes(false);
     }
 
     // CRC32C Intrinsics.
     if (UseCRC32CIntrinsics) {
       StubRoutines::_crc32c_table_addr = (address)StubRoutines::ppc64::_crc32c_table;
-      StubRoutines::_updateBytesCRC32C = generate_CRC32C_updateBytes("CRC32C_updateBytes");
+      if (VM_Version::has_vpmsumb()) {
+        StubRoutines::ppc64::_crc32c_constants = StubRoutines::ppc64::generate_crc_constants(REVERSE_CRC32C_POLY);
+      }
+      StubRoutines::_updateBytesCRC32C = generate_CRC32_updateBytes(true);
     }
   }
 
--- a/src/hotspot/cpu/ppc/stubRoutines_ppc.hpp	Sun Jan 20 12:20:37 2019 -0500
+++ b/src/hotspot/cpu/ppc/stubRoutines_ppc.hpp	Mon Jan 21 09:44:27 2019 +0100
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2002, 2019, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2018, SAP SE. All rights reserved.
+ * Copyright (c) 2012, 2019, SAP SE. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -45,8 +45,14 @@
 #else
   #define CRC32_TABLES 1
 #endif
-#define CRC32_CONSTANTS_SIZE 1084
-#define CRC32_BARRET_CONSTANTS 10
+
+#define REVERSE_CRC32_POLY  0xEDB88320
+#define REVERSE_CRC32C_POLY 0x82F63B78
+#define INVERSE_REVERSE_CRC32_POLY  0x1aab14226ull
+#define INVERSE_REVERSE_CRC32C_POLY 0x105fd79bdull
+#define CRC32_UNROLL_FACTOR 2048
+#define CRC32_UNROLL_FACTOR2 8
+
 
 class ppc64 {
  friend class StubGenerator;
@@ -56,20 +62,15 @@
   // CRC32 Intrinsics.
   static juint _crc_table[CRC32_TABLES][CRC32_COLUMN_SIZE];
   static juint _crc32c_table[CRC32_TABLES][CRC32_COLUMN_SIZE];
-  static juint *_crc_constants, *_crc_barret_constants;
-  static juint *_crc32c_constants, *_crc32c_barret_constants;
+  static juint *_crc_constants;
+  static juint *_crc32c_constants;
 
  public:
 
   // CRC32 Intrinsics.
-  static void generate_load_crc_table_addr(MacroAssembler* masm, Register table);
-  static void generate_load_crc_constants_addr(MacroAssembler* masm, Register table);
-  static void generate_load_crc_barret_constants_addr(MacroAssembler* masm, Register table);
-  static void generate_load_crc32c_table_addr(MacroAssembler* masm, Register table);
-  static void generate_load_crc32c_constants_addr(MacroAssembler* masm, Register table);
-  static void generate_load_crc32c_barret_constants_addr(MacroAssembler* masm, Register table);
+  static address crc_constants() { return (address)_crc_constants; }
+  static address crc32c_constants() { return (address)_crc32c_constants; }
   static juint* generate_crc_constants(juint reverse_poly);
-  static juint* generate_crc_barret_constants(juint reverse_poly);
 };
 
 #endif // CPU_PPC_STUBROUTINES_PPC_HPP
--- a/src/hotspot/cpu/ppc/stubRoutines_ppc_64.cpp	Sun Jan 20 12:20:37 2019 -0500
+++ b/src/hotspot/cpu/ppc/stubRoutines_ppc_64.cpp	Mon Jan 21 09:44:27 2019 +0100
@@ -1,6 +1,6 @@
 /*
- * Copyright (c) 2002, 2018, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2018, SAP SE. All rights reserved.
+ * Copyright (c) 2002, 2019, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2019, SAP SE. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -33,39 +33,7 @@
 
 #define __ masm->
 
-// CRC32(C) Intrinsics.
-void StubRoutines::ppc64::generate_load_crc_table_addr(MacroAssembler* masm, Register table) {
-  __ load_const_optimized(table, StubRoutines::_crc_table_adr, R0);
-}
-
-void StubRoutines::ppc64::generate_load_crc_constants_addr(MacroAssembler* masm, Register table) {
-  __ load_const_optimized(table, (address)StubRoutines::ppc64::_crc_constants, R0);
-}
-
-void StubRoutines::ppc64::generate_load_crc_barret_constants_addr(MacroAssembler* masm, Register table) {
-  __ load_const_optimized(table, (address)StubRoutines::ppc64::_crc_barret_constants, R0);
-}
-
-void StubRoutines::ppc64::generate_load_crc32c_table_addr(MacroAssembler* masm, Register table) {
-  __ load_const_optimized(table, StubRoutines::_crc32c_table_addr, R0);
-}
-
-void StubRoutines::ppc64::generate_load_crc32c_constants_addr(MacroAssembler* masm, Register table) {
-  __ load_const_optimized(table, (address)StubRoutines::ppc64::_crc32c_constants, R0);
-}
-
-void StubRoutines::ppc64::generate_load_crc32c_barret_constants_addr(MacroAssembler* masm, Register table) {
-  __ load_const_optimized(table, (address)StubRoutines::ppc64::_crc32c_barret_constants, R0);
-}
-
-// CRC constants and compute functions
-#define REVERSE_CRC32_POLY  0xEDB88320
-#define REVERSE_CRC32C_POLY 0x82F63B78
-#define INVERSE_REVERSE_CRC32_POLY  0x1aab14226ull
-#define INVERSE_REVERSE_CRC32C_POLY 0x105fd79bdull
-#define UNROLL_FACTOR 2048
-#define UNROLL_FACTOR2 8
-
+// CRC constant compute functions
 static juint fold_word(juint w, juint reverse_poly) {
   for (int i = 0; i < 32; i++) {
     int poly_if_odd = (-(w & 1)) & reverse_poly;
@@ -98,13 +66,13 @@
 
 // Constants to fold n words as needed by macroAssembler.
 juint* StubRoutines::ppc64::generate_crc_constants(juint reverse_poly) {
-  juint* ptr = (juint*) malloc(sizeof(juint) * 4 * (UNROLL_FACTOR2 - 1 + UNROLL_FACTOR / UNROLL_FACTOR2));
+  juint* ptr = (juint*) malloc(sizeof(juint) * 4 * (CRC32_UNROLL_FACTOR2 + CRC32_UNROLL_FACTOR / CRC32_UNROLL_FACTOR2));
   guarantee(((intptr_t)ptr & 0xF) == 0, "16-byte alignment needed");
   guarantee(ptr != NULL, "allocation error of a crc table");
 
   // Generate constants for outer loop
   juint v0, v1, v2, v3 = 1;
-  for (int i = 0; i < UNROLL_FACTOR2 - 1; ++i) {
+  for (int i = 0; i < CRC32_UNROLL_FACTOR2 - 1; ++i) {
     v0 = fold_word(v3, reverse_poly);
     v1 = fold_word(v0, reverse_poly);
     v2 = fold_word(v1, reverse_poly);
@@ -123,15 +91,15 @@
   }
 
   // Generate constants for inner loop
-  juint* ptr2 = ptr + 4 * (UNROLL_FACTOR2 - 1);
+  juint* ptr2 = ptr + 4 * (CRC32_UNROLL_FACTOR2 - 1);
   v3 = 1; // Restart from scratch.
-  for (int i = 0; i < UNROLL_FACTOR; ++i) {
+  for (int i = 0; i < CRC32_UNROLL_FACTOR; ++i) {
     v0 = fold_word(v3, reverse_poly);
     v1 = fold_word(v0, reverse_poly);
     v2 = fold_word(v1, reverse_poly);
     v3 = fold_word(v2, reverse_poly);
-    if (i % UNROLL_FACTOR2 == 0) {
-      int idx = UNROLL_FACTOR / UNROLL_FACTOR2 - 1 - i / UNROLL_FACTOR2;
+    if (i % CRC32_UNROLL_FACTOR2 == 0) {
+      int idx = CRC32_UNROLL_FACTOR / CRC32_UNROLL_FACTOR2 - 1 - i / CRC32_UNROLL_FACTOR2;
       for (int j = 0; j < 4; ++j) {
 #ifdef VM_LITTLE_ENDIAN
         ptr2[4*idx  ] = v3;
@@ -148,16 +116,9 @@
     }
   }
 
-  return ptr;
-}
-
-// Constants to reduce 64 to 32 bit as needed by macroAssembler.
-juint* StubRoutines::ppc64::generate_crc_barret_constants(juint reverse_poly) {
-  juint* ptr = (juint*) malloc(sizeof(juint) * CRC32_BARRET_CONSTANTS);
-  guarantee(((intptr_t)ptr & 0xF) == 0, "16-byte alignment needed");
-  guarantee(ptr != NULL, "allocation error of a crc table");
-
-  julong* c = (julong*)ptr;
+  // Constants to reduce 64 to 32 bit as needed by macroAssembler.
+  juint* ptr3 = ptr2 + 4 * (CRC32_UNROLL_FACTOR / CRC32_UNROLL_FACTOR2);
+  julong* c = (julong*)ptr3;
   julong long_poly = (((julong)reverse_poly) << 1) | 1;
   julong inverse_long_poly = compute_inverse_poly(long_poly);
 #ifdef VM_LITTLE_ENDIAN
@@ -177,6 +138,7 @@
 #endif
 
   //printf("inv poly: 0x%016llx\n", (long long unsigned int)inverse_long_poly);
+
   return ptr;
 }
 
@@ -772,8 +734,5 @@
   #endif
   };
 
-juint* StubRoutines::ppc64::_crc_constants    = StubRoutines::ppc64::generate_crc_constants(REVERSE_CRC32_POLY);
-juint* StubRoutines::ppc64::_crc32c_constants = StubRoutines::ppc64::generate_crc_constants(REVERSE_CRC32C_POLY);
-
-juint* StubRoutines::ppc64::_crc_barret_constants    = StubRoutines::ppc64::generate_crc_barret_constants(REVERSE_CRC32_POLY);
-juint* StubRoutines::ppc64::_crc32c_barret_constants = StubRoutines::ppc64::generate_crc_barret_constants(REVERSE_CRC32C_POLY);
+juint* StubRoutines::ppc64::_crc_constants = NULL;
+juint* StubRoutines::ppc64::_crc32c_constants = NULL;
--- a/src/hotspot/cpu/ppc/templateInterpreterGenerator_ppc.cpp	Sun Jan 20 12:20:37 2019 -0500
+++ b/src/hotspot/cpu/ppc/templateInterpreterGenerator_ppc.cpp	Mon Jan 21 09:44:27 2019 +0100
@@ -1,6 +1,6 @@
 /*
- * Copyright (c) 2014, 2018, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2015, 2018, SAP SE. All rights reserved.
+ * Copyright (c) 2014, 2019, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, 2019, SAP SE. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -1832,7 +1832,7 @@
 #endif
     __ lwz(crc,  2*wordSize, argP);    // Current crc state, zero extend to 64 bit to have a clean register.
 
-    StubRoutines::ppc64::generate_load_crc_table_addr(_masm, table);
+    __ load_const_optimized(table, StubRoutines::crc_table_addr(), R0);
     __ kernel_crc32_singleByte(crc, data, dataLen, table, tmp, true);
 
     // Restore caller sp for c2i case (from compiled) and for resized sender frame (from interpreted).
@@ -1873,19 +1873,7 @@
     const Register crc     = R3_ARG1;  // crc value
     const Register data    = R4_ARG2;  // address of java byte array
     const Register dataLen = R5_ARG3;  // source data len
-    const Register table   = R6_ARG4;  // address of crc32 table
-
-    const Register t0      = R9;       // scratch registers for crc calculation
-    const Register t1      = R10;
-    const Register t2      = R11;
-    const Register t3      = R12;
-
-    const Register tc0     = R2;       // registers to hold pre-calculated column addresses
-    const Register tc1     = R7;
-    const Register tc2     = R8;
-    const Register tc3     = table;    // table address is reconstructed at the end of kernel_crc32_* emitters
-
-    const Register tmp     = t0;       // Only used very locally to calculate byte buffer address.
+    const Register tmp     = R11_scratch1;
 
     // Arguments are reversed on java expression stack.
     // Calculate address of start element.
@@ -1916,12 +1904,7 @@
       __ addi(data, data, arrayOopDesc::base_offset_in_bytes(T_BYTE));
     }
 
-    StubRoutines::ppc64::generate_load_crc_table_addr(_masm, table);
-
-    // Performance measurements show the 1word and 2word variants to be almost equivalent,
-    // with very light advantages for the 1word variant. We chose the 1word variant for
-    // code compactness.
-    __ kernel_crc32_1word(crc, data, dataLen, table, t0, t1, t2, t3, tc0, tc1, tc2, tc3, true);
+    __ crc32(crc, data, dataLen, R2, R6, R7, R8, R9, R10, R11, R12, false);
 
     // Restore caller sp for c2i case (from compiled) and for resized sender frame (from interpreted).
     __ resize_frame_absolute(R21_sender_SP, R11_scratch1, R0);
@@ -1959,19 +1942,7 @@
     const Register crc     = R3_ARG1;  // crc value
     const Register data    = R4_ARG2;  // address of java byte array
     const Register dataLen = R5_ARG3;  // source data len
-    const Register table   = R6_ARG4;  // address of crc32c table
-
-    const Register t0      = R9;       // scratch registers for crc calculation
-    const Register t1      = R10;
-    const Register t2      = R11;
-    const Register t3      = R12;
-
-    const Register tc0     = R2;       // registers to hold pre-calculated column addresses
-    const Register tc1     = R7;
-    const Register tc2     = R8;
-    const Register tc3     = table;    // table address is reconstructed at the end of kernel_crc32_* emitters
-
-    const Register tmp     = t0;       // Only used very locally to calculate byte buffer address.
+    const Register tmp     = R11_scratch1;
 
     // Arguments are reversed on java expression stack.
     // Calculate address of start element.
@@ -2004,12 +1975,7 @@
       __ addi(data, data, arrayOopDesc::base_offset_in_bytes(T_BYTE));
     }
 
-    StubRoutines::ppc64::generate_load_crc32c_table_addr(_masm, table);
-
-    // Performance measurements show the 1word and 2word variants to be almost equivalent,
-    // with very light advantages for the 1word variant. We chose the 1word variant for
-    // code compactness.
-    __ kernel_crc32_1word(crc, data, dataLen, table, t0, t1, t2, t3, tc0, tc1, tc2, tc3, false);
+    __ crc32(crc, data, dataLen, R2, R6, R7, R8, R9, R10, R11, R12, true);
 
     // Restore caller sp for c2i case (from compiled) and for resized sender frame (from interpreted).
     __ resize_frame_absolute(R21_sender_SP, R11_scratch1, R0);