src/hotspot/cpu/ppc/macroAssembler_ppc.cpp
changeset 49376 7cd503c499a0
parent 49347 edb65305d3ac
child 49449 ef5d5d343e2a
--- a/src/hotspot/cpu/ppc/macroAssembler_ppc.cpp	Mon Mar 12 09:46:12 2018 +0100
+++ b/src/hotspot/cpu/ppc/macroAssembler_ppc.cpp	Mon Mar 12 12:02:20 2018 +0100
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2017, SAP SE. All rights reserved.
+ * Copyright (c) 2012, 2018, SAP SE. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -4451,561 +4451,304 @@
  * @param table           register pointing to CRC table
  * @param constants       register pointing to CRC table for 128-bit aligned memory
  * @param barretConstants register pointing to table for barrett reduction
- * @param t0              volatile register
- * @param t1              volatile register
- * @param t2              volatile register
- * @param t3              volatile register
+ * @param t0-t4           temp registers
  */
-void MacroAssembler::kernel_crc32_1word_vpmsumd(Register crc, Register buf, Register len, Register table,
-                                                Register constants,  Register barretConstants,
-                                                Register t0,  Register t1, Register t2, Register t3, Register t4,
-                                                bool invertCRC) {
+void MacroAssembler::kernel_crc32_1word_vpmsum(Register crc, Register buf, Register len, Register table,
+                                               Register constants, Register barretConstants,
+                                               Register t0, Register t1, Register t2, Register t3, Register t4,
+                                               bool invertCRC) {
   assert_different_registers(crc, buf, len, table);
 
-  Label L_alignedHead, L_tail, L_alignTail, L_start, L_end;
-
-  Register  prealign     = t0;
-  Register  postalign    = t0;
-
-  BLOCK_COMMENT("kernel_crc32_1word_vpmsumb {");
-
-  // 1. use kernel_crc32_1word for shorter than 384bit
+  Label L_alignedHead, L_tail;
+
+  BLOCK_COMMENT("kernel_crc32_1word_vpmsum {");
+
+  // 1. ~c
+  if (invertCRC) {
+    nand(crc, crc, crc);                      // 1s complement of crc
+  }
+
+  // 2. use kernel_crc32_1word for short len
   clrldi(len, len, 32);
-  cmpdi(CCR0, len, 384);
-  bge(CCR0, L_start);
-
-    Register tc0 = t4;
-    Register tc1 = constants;
-    Register tc2 = barretConstants;
-    kernel_crc32_1word(crc, buf, len, table,t0, t1, t2, t3, tc0, tc1, tc2, table, invertCRC);
-    b(L_end);
-
-  BIND(L_start);
-
-    // 2. ~c
-    if (invertCRC) {
-      nand(crc, crc, crc);                      // 1s complement of crc
-    }
-
-    // 3. calculate from 0 to first 128bit-aligned address
-    clrldi_(prealign, buf, 57);
-    beq(CCR0, L_alignedHead);
-
-    subfic(prealign, prealign, 128);
-
-    subf(len, prealign, len);
-    update_byteLoop_crc32(crc, buf, prealign, table, t2, false);
-
-    // 4. calculate from first 128bit-aligned address to last 128bit-aligned address
-    BIND(L_alignedHead);
-
-    clrldi(postalign, len, 57);
-    subf(len, postalign, len);
-
-    // len must be more than 256bit
-    kernel_crc32_1word_aligned(crc, buf, len, constants, barretConstants, t1, t2, t3);
-
-    // 5. calculate remaining
-    cmpdi(CCR0, postalign, 0);
-    beq(CCR0, L_tail);
-
-    update_byteLoop_crc32(crc, buf, postalign, table, t2, false);
-
-    BIND(L_tail);
-
-    // 6. ~c
-    if (invertCRC) {
-      nand(crc, crc, crc);                      // 1s complement of crc
-    }
-
-  BIND(L_end);
-
-  BLOCK_COMMENT("} kernel_crc32_1word_vpmsumb");
+  cmpdi(CCR0, len, 512);
+  blt(CCR0, L_tail);
+
+  // 3. calculate from 0 to first aligned address
+  const int alignment = 16;
+  Register prealign = t0;
+
+  andi_(prealign, buf, alignment - 1);
+  beq(CCR0, L_alignedHead);
+  subfic(prealign, prealign, alignment);
+
+  subf(len, prealign, len);
+  update_byteLoop_crc32(crc, buf, prealign, table, t2, false);
+
+  // 4. calculate from first aligned address as far as possible
+  BIND(L_alignedHead);
+  kernel_crc32_1word_aligned(crc, buf, len, constants, barretConstants, t0, t1, t2, t3, t4);
+
+  // 5. remaining bytes
+  BIND(L_tail);
+  Register tc0 = t4;
+  Register tc1 = constants;
+  Register tc2 = barretConstants;
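+  // Pass invertCRC = false here: the one's complement (if requested) was already
+  // applied in step 1 and gets re-applied in step 6 below.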
+  kernel_crc32_1word(crc, buf, len, table, t0, t1, t2, t3, tc0, tc1, tc2, table, false);
+
+  // 6. ~c
+  if (invertCRC) {
+    nand(crc, crc, crc);                      // 1s complement of crc
+  }
+
+  BLOCK_COMMENT("} kernel_crc32_1word_vpmsum");
 }
 
 /**
  * @param crc             register containing existing CRC (32-bit)
  * @param buf             register pointing to input byte buffer (byte*)
- * @param len             register containing number of bytes
+ * @param len             register containing number of bytes (will get updated to remaining bytes)
  * @param constants       register pointing to CRC table for 128-bit aligned memory
  * @param barretConstants register pointing to table for barrett reduction
- * @param t0              volatile register
- * @param t1              volatile register
- * @param t2              volatile register
+ * @param t0-t4           temp registers
+ * Precondition: len should be >= 512. Otherwise, nothing will be done.
  */
 void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Register len,
-    Register constants, Register barretConstants, Register t0, Register t1, Register t2) {
-  Label L_mainLoop, L_tail, L_alignTail, L_barrett_reduction, L_end, L_first_warm_up_done, L_first_cool_down, L_second_cool_down, L_XOR, L_test;
-  Label L_lv0, L_lv1, L_lv2, L_lv3, L_lv4, L_lv5, L_lv6, L_lv7, L_lv8, L_lv9, L_lv10, L_lv11, L_lv12, L_lv13, L_lv14, L_lv15;
-  Label L_1, L_2, L_3, L_4;
-
-  Register  rLoaded      = t0;
-  Register  rTmp1        = t1;
-  Register  rTmp2        = t2;
-  Register  off16        = R22;
-  Register  off32        = R23;
-  Register  off48        = R24;
-  Register  off64        = R25;
-  Register  off80        = R26;
-  Register  off96        = R27;
-  Register  off112       = R28;
-  Register  rIdx         = R29;
-  Register  rMax         = R30;
-  Register  constantsPos = R31;
-
-  VectorRegister mask_32bit = VR24;
-  VectorRegister mask_64bit = VR25;
-  VectorRegister zeroes     = VR26;
-  VectorRegister const1     = VR27;
-  VectorRegister const2     = VR28;
+    Register constants, Register barretConstants,
+    Register t0, Register t1, Register t2, Register t3, Register t4) {
 
   // Save non-volatile vector registers (frameless).
-  Register offset = t1;   int offsetInt = 0;
-  offsetInt -= 16; li(offset, -16);           stvx(VR20, offset, R1_SP);
-  offsetInt -= 16; addi(offset, offset, -16); stvx(VR21, offset, R1_SP);
-  offsetInt -= 16; addi(offset, offset, -16); stvx(VR22, offset, R1_SP);
-  offsetInt -= 16; addi(offset, offset, -16); stvx(VR23, offset, R1_SP);
-  offsetInt -= 16; addi(offset, offset, -16); stvx(VR24, offset, R1_SP);
-  offsetInt -= 16; addi(offset, offset, -16); stvx(VR25, offset, R1_SP);
-  offsetInt -= 16; addi(offset, offset, -16); stvx(VR26, offset, R1_SP);
-  offsetInt -= 16; addi(offset, offset, -16); stvx(VR27, offset, R1_SP);
-  offsetInt -= 16; addi(offset, offset, -16); stvx(VR28, offset, R1_SP);
-  offsetInt -= 8; std(R22, offsetInt, R1_SP);
-  offsetInt -= 8; std(R23, offsetInt, R1_SP);
-  offsetInt -= 8; std(R24, offsetInt, R1_SP);
-  offsetInt -= 8; std(R25, offsetInt, R1_SP);
-  offsetInt -= 8; std(R26, offsetInt, R1_SP);
-  offsetInt -= 8; std(R27, offsetInt, R1_SP);
-  offsetInt -= 8; std(R28, offsetInt, R1_SP);
-  offsetInt -= 8; std(R29, offsetInt, R1_SP);
-  offsetInt -= 8; std(R30, offsetInt, R1_SP);
-  offsetInt -= 8; std(R31, offsetInt, R1_SP);
-
-  // Set constants
-  li(off16, 16);
-  li(off32, 32);
-  li(off48, 48);
-  li(off64, 64);
-  li(off80, 80);
-  li(off96, 96);
-  li(off112, 112);
-
-  clrldi(crc, crc, 32);
-
-  vxor(zeroes, zeroes, zeroes);
-  vspltisw(VR0, -1);
-
-  vsldoi(mask_32bit, zeroes, VR0, 4);
-  vsldoi(mask_64bit, zeroes, VR0, 8);
-
-  // Get the initial value into v8
-  vxor(VR8, VR8, VR8);
-  mtvrd(VR8, crc);
-  vsldoi(VR8, zeroes, VR8, 8); // shift into bottom 32 bits
-
-  li (rLoaded, 0);
-
-  rldicr(rIdx, len, 0, 56);
-
-  {
-    BIND(L_1);
-    // Checksum in blocks of MAX_SIZE (32768)
-    lis(rMax, 0);
-    ori(rMax, rMax, 32768);
-    mr(rTmp2, rMax);
-    cmpd(CCR0, rIdx, rMax);
-    bgt(CCR0, L_2);
-    mr(rMax, rIdx);
-
-    BIND(L_2);
-    subf(rIdx, rMax, rIdx);
-
-    // our main loop does 128 bytes at a time
-    srdi(rMax, rMax, 7);
-
-    /*
-     * Work out the offset into the constants table to start at. Each
-     * constant is 16 bytes, and it is used against 128 bytes of input
-     * data - 128 / 16 = 8
-     */
-    sldi(rTmp1, rMax, 4);
-    srdi(rTmp2, rTmp2, 3);
-    subf(rTmp1, rTmp1, rTmp2);
-
-    // We reduce our final 128 bytes in a separate step
-    addi(rMax, rMax, -1);
-    mtctr(rMax);
-
-    // Find the start of our constants
-    add(constantsPos, constants, rTmp1);
-
-    // zero VR0-v7 which will contain our checksums
-    vxor(VR0, VR0, VR0);
-    vxor(VR1, VR1, VR1);
-    vxor(VR2, VR2, VR2);
-    vxor(VR3, VR3, VR3);
-    vxor(VR4, VR4, VR4);
-    vxor(VR5, VR5, VR5);
-    vxor(VR6, VR6, VR6);
-    vxor(VR7, VR7, VR7);
-
-    lvx(const1, constantsPos);
-
-    /*
-     * If we are looping back to consume more data we use the values
-     * already in VR16-v23.
-     */
-    cmpdi(CCR0, rLoaded, 1);
-    beq(CCR0, L_3);
-    {
-
-      // First warm up pass
-      lvx(VR16, buf);
-      lvx(VR17, off16, buf);
-      lvx(VR18, off32, buf);
-      lvx(VR19, off48, buf);
-      lvx(VR20, off64, buf);
-      lvx(VR21, off80, buf);
-      lvx(VR22, off96, buf);
-      lvx(VR23, off112, buf);
-      addi(buf, buf, 8*16);
-
-      // xor in initial value
-      vxor(VR16, VR16, VR8);
-    }
-
-    BIND(L_3);
-    bdz(L_first_warm_up_done);
-
-    addi(constantsPos, constantsPos, 16);
-    lvx(const2, constantsPos);
-
-    // Second warm up pass
-    vpmsumd(VR8, VR16, const1);
-    lvx(VR16, buf);
-
-    vpmsumd(VR9, VR17, const1);
-    lvx(VR17, off16, buf);
-
-    vpmsumd(VR10, VR18, const1);
-    lvx(VR18, off32, buf);
-
-    vpmsumd(VR11, VR19, const1);
-    lvx(VR19, off48, buf);
-
-    vpmsumd(VR12, VR20, const1);
-    lvx(VR20, off64, buf);
-
-    vpmsumd(VR13, VR21, const1);
-    lvx(VR21, off80, buf);
-
-    vpmsumd(VR14, VR22, const1);
-    lvx(VR22, off96, buf);
-
-    vpmsumd(VR15, VR23, const1);
-    lvx(VR23, off112, buf);
-
-    addi(buf, buf, 8 * 16);
-
-    bdz(L_first_cool_down);
-
-    /*
-     * main loop. We modulo schedule it such that it takes three iterations
-     * to complete - first iteration load, second iteration vpmsum, third
-     * iteration xor.
-     */
-    {
-      BIND(L_4);
-      lvx(const1, constantsPos); addi(constantsPos, constantsPos, 16);
-
-      vxor(VR0, VR0, VR8);
-      vpmsumd(VR8, VR16, const2);
-      lvx(VR16, buf);
-
-      vxor(VR1, VR1, VR9);
-      vpmsumd(VR9, VR17, const2);
-      lvx(VR17, off16, buf);
-
-      vxor(VR2, VR2, VR10);
-      vpmsumd(VR10, VR18, const2);
-      lvx(VR18, off32, buf);
-
-      vxor(VR3, VR3, VR11);
-      vpmsumd(VR11, VR19, const2);
-      lvx(VR19, off48, buf);
-      lvx(const2, constantsPos);
-
-      vxor(VR4, VR4, VR12);
-      vpmsumd(VR12, VR20, const1);
-      lvx(VR20, off64, buf);
-
-      vxor(VR5, VR5, VR13);
-      vpmsumd(VR13, VR21, const1);
-      lvx(VR21, off80, buf);
-
-      vxor(VR6, VR6, VR14);
-      vpmsumd(VR14, VR22, const1);
-      lvx(VR22, off96, buf);
-
-      vxor(VR7, VR7, VR15);
-      vpmsumd(VR15, VR23, const1);
-      lvx(VR23, off112, buf);
-
-      addi(buf, buf, 8 * 16);
-
-      bdnz(L_4);
+  Register offset = t1;
+  int offsetInt = 0;
+  offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP);
+  offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP);
+  offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP);
+  offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP);
+  offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP);
+  offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP);
+#ifndef VM_LITTLE_ENDIAN
+  offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP);
+#endif
+  offsetInt -= 8; std(R14, offsetInt, R1_SP);
+  offsetInt -= 8; std(R15, offsetInt, R1_SP);
+  offsetInt -= 8; std(R16, offsetInt, R1_SP);
+  offsetInt -= 8; std(R17, offsetInt, R1_SP);
+
+  // The implementation uses an inner loop which consumes between 256 and 16 * unroll_factor
+  // bytes per iteration. The basic scheme is:
+  // lvx: load vector (Big Endian needs reversal)
+  // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift
+  // vxor: xor partial results together to get unroll_factor2 vectors
+
+  // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors.
+
+  // Using 16 * unroll_factor / unroll_factor2 bytes for constants.
+  const int unroll_factor = 2048;
+  const int unroll_factor2 = 8;
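+  // With unroll_factor = 2048 and unroll_factor2 = 8, one outer loop iteration
+  // consumes 16 * 2048 = 32768 input bytes and uses 2048 / 8 = 256 inner loop
+  // constants (4096 bytes) plus unroll_factor2 - 1 = 7 outer loop constants.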
+
+  // Support registers.
+  Register offs[] = { noreg, t0, t1, t2, t3, t4, crc /* will live in VCRC */, R14 };
+  Register num_bytes = R15,
+           loop_count = R16,
+           cur_const = R17;
+  // Constant array for outer loop: unroll_factor2 - 1 registers,
+  // Constant array for inner loop: unroll_factor / unroll_factor2 registers.
+  VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
+                 consts1[] = { VR23, VR24 };
+  // Data register arrays: 2 arrays with unroll_factor2 registers.
+  VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 },
+                 data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 };
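+  // Roughly: data0 serve as accumulators for the partial CRCs while data1 hold the
+  // freshly loaded input vectors and the vpmsumw products xor-ed into data0.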
+
+  VectorRegister VCRC = data0[0];
+  VectorRegister Vc = VR25;
+  VectorRegister swap_bytes = VR26; // Only for Big Endian.
+
+  // We have at least 1 iteration (ensured by caller).
+  Label L_outer_loop, L_inner_loop, L_last;
+
+  // If supported, set DSCR pre-fetch to deepest.
+  if (VM_Version::has_mfdscr()) {
+    load_const_optimized(t0, VM_Version::_dscr_val | 7);
+    mtdscr(t0);
+  }
+
+  mtvrwz(VCRC, crc); // crc lives in VCRC now
+
+  for (int i = 1; i < unroll_factor2; ++i) {
+    li(offs[i], 16 * i);
+  }
+
+  // Load consts for outer loop
+  lvx(consts0[0], constants);
+  for (int i = 1; i < unroll_factor2 - 1; ++i) {
+    lvx(consts0[i], offs[i], constants);
+  }
+  addi(constants, constants, (unroll_factor2 - 1) * 16);
+
+  load_const_optimized(num_bytes, 16 * unroll_factor);
+  load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off.
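+  // With the default factors: 2048 / (2 * 8) - 1 = 127 counted inner loop trips of
+  // 256 bytes each, plus the peeled 256-byte double-iteration below.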
+
+  // Reuse data registers outside of the loop.
+  VectorRegister Vtmp = data1[0];
+  VectorRegister Vtmp2 = data1[1];
+  VectorRegister zeroes = data1[2];
+
+  vspltisb(Vtmp, 0);
+  vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC.
+
+  // Load vector for vpermxor (to xor both 64 bit parts together)
+  lvsl(Vtmp, buf);   // 000102030405060708090a0b0c0d0e0f
+  vspltisb(Vc, 4);
+  vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0
+  xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0);
+  vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f
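+  // Vc pairs each byte of the high half with the corresponding byte of the low half,
+  // so vpermxor(x, x, x, Vc) leaves high64(x) ^ low64(x) in the low doubleword.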
+
+#ifdef VM_LITTLE_ENDIAN
+#define BE_swap_bytes(x)
+#else
+  vspltisb(Vtmp2, 0xf);
+  vxor(swap_bytes, Vtmp, Vtmp2);
+#define BE_swap_bytes(x) vperm(x, x, x, swap_bytes)
+#endif
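+  // On Big Endian, swap_bytes is 0f0e0d...00, so BE_swap_bytes reverses the byte order
+  // of each loaded vector; on Little Endian it expands to nothing.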
+
+  cmpd(CCR0, len, num_bytes);
+  blt(CCR0, L_last);
+
+  // ********** Main loop start **********
+  align(32);
+  bind(L_outer_loop);
+
+  // Start of unrolled first iteration (no xor).
+  lvx(data1[0], buf);
+  mr(cur_const, constants);
+  for (int i = 1; i < unroll_factor2 / 2; ++i) {
+    lvx(data1[i], offs[i], buf);
+  }
+  vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
+  lvx(consts1[0], cur_const);
+  mtctr(loop_count);
+  for (int i = 0; i < unroll_factor2 / 2; ++i) {
+    BE_swap_bytes(data1[i]);
+    if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC.
+    lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
+    vpmsumw(data0[i], data1[i], consts1[0]);
+  }
+  addi(buf, buf, 16 * unroll_factor2);
+  subf(len, num_bytes, len);
+  lvx(consts1[1], offs[1], cur_const);
+  addi(cur_const, cur_const, 32);
+  // Start of unrolled second iteration (head).
+  for (int i = 0; i < unroll_factor2 / 2; ++i) {
+    BE_swap_bytes(data1[i + unroll_factor2 / 2]);
+    if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); }
+    vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]);
+  }
+  for (int i = 0; i < unroll_factor2 / 2; ++i) {
+    BE_swap_bytes(data1[i]);
+    lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
+    vpmsumw(data1[i], data1[i], consts1[1]);
+  }
+  addi(buf, buf, 16 * unroll_factor2);
+
+  // Generate the most performance-relevant code. The loads and half of the vpmsumw
+  // instructions have already been generated above. The double-iteration allows using
+  // the 2 constant registers alternately.
+  align(32);
+  bind(L_inner_loop);
+  for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling.
+    if (j & 1) {
+      lvx(consts1[0], cur_const);
+    } else {
+      lvx(consts1[1], offs[1], cur_const);
+      addi(cur_const, cur_const, 32);
     }
-
-    BIND(L_first_cool_down);
-
-    // First cool down pass
-    lvx(const1, constantsPos);
-    addi(constantsPos, constantsPos, 16);
-
-    vxor(VR0, VR0, VR8);
-    vpmsumd(VR8, VR16, const1);
-
-    vxor(VR1, VR1, VR9);
-    vpmsumd(VR9, VR17, const1);
-
-    vxor(VR2, VR2, VR10);
-    vpmsumd(VR10, VR18, const1);
-
-    vxor(VR3, VR3, VR11);
-    vpmsumd(VR11, VR19, const1);
-
-    vxor(VR4, VR4, VR12);
-    vpmsumd(VR12, VR20, const1);
-
-    vxor(VR5, VR5, VR13);
-    vpmsumd(VR13, VR21, const1);
-
-    vxor(VR6, VR6, VR14);
-    vpmsumd(VR14, VR22, const1);
-
-    vxor(VR7, VR7, VR15);
-    vpmsumd(VR15, VR23, const1);
-
-    BIND(L_second_cool_down);
-    // Second cool down pass
-    vxor(VR0, VR0, VR8);
-    vxor(VR1, VR1, VR9);
-    vxor(VR2, VR2, VR10);
-    vxor(VR3, VR3, VR11);
-    vxor(VR4, VR4, VR12);
-    vxor(VR5, VR5, VR13);
-    vxor(VR6, VR6, VR14);
-    vxor(VR7, VR7, VR15);
-
-    /*
-     * vpmsumd produces a 96 bit result in the least significant bits
-     * of the register. Since we are bit reflected we have to shift it
-     * left 32 bits so it occupies the least significant bits in the
-     * bit reflected domain.
-     */
-    vsldoi(VR0, VR0, zeroes, 4);
-    vsldoi(VR1, VR1, zeroes, 4);
-    vsldoi(VR2, VR2, zeroes, 4);
-    vsldoi(VR3, VR3, zeroes, 4);
-    vsldoi(VR4, VR4, zeroes, 4);
-    vsldoi(VR5, VR5, zeroes, 4);
-    vsldoi(VR6, VR6, zeroes, 4);
-    vsldoi(VR7, VR7, zeroes, 4);
-
-    // xor with last 1024 bits
-    lvx(VR8, buf);
-    lvx(VR9, off16, buf);
-    lvx(VR10, off32, buf);
-    lvx(VR11, off48, buf);
-    lvx(VR12, off64, buf);
-    lvx(VR13, off80, buf);
-    lvx(VR14, off96, buf);
-    lvx(VR15, off112, buf);
-    addi(buf, buf, 8 * 16);
-
-    vxor(VR16, VR0, VR8);
-    vxor(VR17, VR1, VR9);
-    vxor(VR18, VR2, VR10);
-    vxor(VR19, VR3, VR11);
-    vxor(VR20, VR4, VR12);
-    vxor(VR21, VR5, VR13);
-    vxor(VR22, VR6, VR14);
-    vxor(VR23, VR7, VR15);
-
-    li(rLoaded, 1);
-    cmpdi(CCR0, rIdx, 0);
-    addi(rIdx, rIdx, 128);
-    bne(CCR0, L_1);
-  }
-
-  // Work out how many bytes we have left
-  andi_(len, len, 127);
-
-  // Calculate where in the constant table we need to start
-  subfic(rTmp1, len, 128);
-  add(constantsPos, constantsPos, rTmp1);
-
-  // How many 16 byte chunks are in the tail
-  srdi(rIdx, len, 4);
-  mtctr(rIdx);
-
-  /*
-   * Reduce the previously calculated 1024 bits to 64 bits, shifting
-   * 32 bits to include the trailing 32 bits of zeros
-   */
-  lvx(VR0, constantsPos);
-  lvx(VR1, off16, constantsPos);
-  lvx(VR2, off32, constantsPos);
-  lvx(VR3, off48, constantsPos);
-  lvx(VR4, off64, constantsPos);
-  lvx(VR5, off80, constantsPos);
-  lvx(VR6, off96, constantsPos);
-  lvx(VR7, off112, constantsPos);
-  addi(constantsPos, constantsPos, 8 * 16);
-
-  vpmsumw(VR0, VR16, VR0);
-  vpmsumw(VR1, VR17, VR1);
-  vpmsumw(VR2, VR18, VR2);
-  vpmsumw(VR3, VR19, VR3);
-  vpmsumw(VR4, VR20, VR4);
-  vpmsumw(VR5, VR21, VR5);
-  vpmsumw(VR6, VR22, VR6);
-  vpmsumw(VR7, VR23, VR7);
-
-  // Now reduce the tail (0 - 112 bytes)
-  cmpdi(CCR0, rIdx, 0);
-  beq(CCR0, L_XOR);
-
-  lvx(VR16, buf); addi(buf, buf, 16);
-  lvx(VR17, constantsPos);
-  vpmsumw(VR16, VR16, VR17);
-  vxor(VR0, VR0, VR16);
-  beq(CCR0, L_XOR);
-
-  lvx(VR16, buf); addi(buf, buf, 16);
-  lvx(VR17, off16, constantsPos);
-  vpmsumw(VR16, VR16, VR17);
-  vxor(VR0, VR0, VR16);
-  beq(CCR0, L_XOR);
-
-  lvx(VR16, buf); addi(buf, buf, 16);
-  lvx(VR17, off32, constantsPos);
-  vpmsumw(VR16, VR16, VR17);
-  vxor(VR0, VR0, VR16);
-  beq(CCR0, L_XOR);
-
-  lvx(VR16, buf); addi(buf, buf, 16);
-  lvx(VR17, off48,constantsPos);
-  vpmsumw(VR16, VR16, VR17);
-  vxor(VR0, VR0, VR16);
-  beq(CCR0, L_XOR);
-
-  lvx(VR16, buf); addi(buf, buf, 16);
-  lvx(VR17, off64, constantsPos);
-  vpmsumw(VR16, VR16, VR17);
-  vxor(VR0, VR0, VR16);
-  beq(CCR0, L_XOR);
-
-  lvx(VR16, buf); addi(buf, buf, 16);
-  lvx(VR17, off80, constantsPos);
-  vpmsumw(VR16, VR16, VR17);
-  vxor(VR0, VR0, VR16);
-  beq(CCR0, L_XOR);
-
-  lvx(VR16, buf); addi(buf, buf, 16);
-  lvx(VR17, off96, constantsPos);
-  vpmsumw(VR16, VR16, VR17);
-  vxor(VR0, VR0, VR16);
-
-  // Now xor all the parallel chunks together
-  BIND(L_XOR);
-  vxor(VR0, VR0, VR1);
-  vxor(VR2, VR2, VR3);
-  vxor(VR4, VR4, VR5);
-  vxor(VR6, VR6, VR7);
-
-  vxor(VR0, VR0, VR2);
-  vxor(VR4, VR4, VR6);
-
-  vxor(VR0, VR0, VR4);
-
-  b(L_barrett_reduction);
-
-  BIND(L_first_warm_up_done);
-  lvx(const1, constantsPos);
-  addi(constantsPos, constantsPos, 16);
-  vpmsumd(VR8,  VR16, const1);
-  vpmsumd(VR9,  VR17, const1);
-  vpmsumd(VR10, VR18, const1);
-  vpmsumd(VR11, VR19, const1);
-  vpmsumd(VR12, VR20, const1);
-  vpmsumd(VR13, VR21, const1);
-  vpmsumd(VR14, VR22, const1);
-  vpmsumd(VR15, VR23, const1);
-  b(L_second_cool_down);
-
-  BIND(L_barrett_reduction);
-
-  lvx(const1, barretConstants);
-  addi(barretConstants, barretConstants, 16);
-  lvx(const2, barretConstants);
-
-  vsldoi(VR1, VR0, VR0, 8);
-  vxor(VR0, VR0, VR1);    // xor two 64 bit results together
-
-  // shift left one bit
-  vspltisb(VR1, 1);
-  vsl(VR0, VR0, VR1);
-
-  vand(VR0, VR0, mask_64bit);
-
-  /*
-   * The reflected version of Barrett reduction. Instead of bit
-   * reflecting our data (which is expensive to do), we bit reflect our
-   * constants and our algorithm, which means the intermediate data in
-   * our vector registers goes from 0-63 instead of 63-0. We can reflect
-   * the algorithm because we don't carry in mod 2 arithmetic.
-   */
-  vand(VR1, VR0, mask_32bit);  // bottom 32 bits of a
-  vpmsumd(VR1, VR1, const1);   // ma
-  vand(VR1, VR1, mask_32bit);  // bottom 32bits of ma
-  vpmsumd(VR1, VR1, const2);   // qn */
-  vxor(VR0, VR0, VR1);         // a - qn, subtraction is xor in GF(2)
-
-  /*
-   * Since we are bit reflected, the result (ie the low 32 bits) is in
-   * the high 32 bits. We just need to shift it left 4 bytes
-   * V0 [ 0 1 X 3 ]
-   * V0 [ 0 X 2 3 ]
-   */
-  vsldoi(VR0, VR0, zeroes, 4);    // shift result into top 64 bits of
-
-  // Get it into r3
-  mfvrd(crc, VR0);
-
-  BIND(L_end);
-
+    for (int i = 0; i < unroll_factor2; ++i) {
+      int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input.
+      if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; }
+      BE_swap_bytes(data1[idx]);
+      vxor(data0[i], data0[i], data1[i]);
+      if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf);
+      vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]);
+    }
+    addi(buf, buf, 16 * unroll_factor2);
+  }
+  bdnz(L_inner_loop);
+
+  // Tail of last iteration (no loads).
+  for (int i = 0; i < unroll_factor2 / 2; ++i) {
+    BE_swap_bytes(data1[i + unroll_factor2 / 2]);
+    vxor(data0[i], data0[i], data1[i]);
+    vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]);
+  }
+  for (int i = 0; i < unroll_factor2 / 2; ++i) {
+    vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts.
+    vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]);
+  }
+
+  // The last data register is already correct; the other ones need the fixup shift.
+  for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) {
+    vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]);
+  }
+
+  // Combine to 128 bit result vector VCRC = data0[0].
+  for (int i = 1; i < unroll_factor2; i<<=1) {
+    for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) {
+      vxor(data0[j], data0[j], data0[j+i]);
+    }
+  }
+  cmpd(CCR0, len, num_bytes);
+  bge(CCR0, L_outer_loop);
+
+  // One more pass with a smaller num_bytes for the remaining full double-iterations.
+  bind(L_last);
+  srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations.
+  add_const_optimized(constants, constants, 16 * (unroll_factor / unroll_factor2)); // Point behind last one.
+  sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used.
+  clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2));
+  subf(constants, R0, constants); // Point to constant to be used first.
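+  // The remaining full 256-byte double-iterations reuse the last loop_count * 32 bytes
+  // of the inner loop constants table.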
+
+  addic_(loop_count, loop_count, -1); // One double-iteration peeled off.
+  bgt(CCR0, L_outer_loop);
+  // ********** Main loop end **********
+#undef BE_swap_bytes
+
+  // Restore DSCR pre-fetch value.
+  if (VM_Version::has_mfdscr()) {
+    load_const_optimized(t0, VM_Version::_dscr_val);
+    mtdscr(t0);
+  }
+
+  vspltisb(zeroes, 0);
+
+  // Combine to 64 bit result.
+  vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
+
+  // Reduce to 32 bit CRC: Remainder by multiply-high.
+  lvx(Vtmp, barretConstants);
+  vsldoi(Vtmp2, zeroes, VCRC, 12);  // Extract high 32 bit.
+  vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply by inverse long poly.
+  vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit.
+  vsldoi(Vtmp, zeroes, Vtmp, 8);
+  vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply quotient by long poly.
+  vxor(VCRC, VCRC, Vtmp2);          // Remainder fits into 32 bit.
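+  // Barrett-style step: the upper part times the precomputed inverse estimates the
+  // quotient, the quotient times the long polynomial is xor-ed back (subtraction in
+  // GF(2) is xor), leaving just the 32 bit remainder in VCRC.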
+
+  // Move result. len is already updated.
+  vsldoi(VCRC, VCRC, zeroes, 8);
+  mfvrd(crc, VCRC);
+
+  // Restore non-volatile Vector registers (frameless).
   offsetInt = 0;
-  // Restore non-volatile Vector registers (frameless).
-  offsetInt -= 16; li(offset, -16);           lvx(VR20, offset, R1_SP);
-  offsetInt -= 16; addi(offset, offset, -16); lvx(VR21, offset, R1_SP);
-  offsetInt -= 16; addi(offset, offset, -16); lvx(VR22, offset, R1_SP);
-  offsetInt -= 16; addi(offset, offset, -16); lvx(VR23, offset, R1_SP);
-  offsetInt -= 16; addi(offset, offset, -16); lvx(VR24, offset, R1_SP);
-  offsetInt -= 16; addi(offset, offset, -16); lvx(VR25, offset, R1_SP);
-  offsetInt -= 16; addi(offset, offset, -16); lvx(VR26, offset, R1_SP);
-  offsetInt -= 16; addi(offset, offset, -16); lvx(VR27, offset, R1_SP);
-  offsetInt -= 16; addi(offset, offset, -16); lvx(VR28, offset, R1_SP);
-  offsetInt -= 8;  ld(R22, offsetInt, R1_SP);
-  offsetInt -= 8;  ld(R23, offsetInt, R1_SP);
-  offsetInt -= 8;  ld(R24, offsetInt, R1_SP);
-  offsetInt -= 8;  ld(R25, offsetInt, R1_SP);
-  offsetInt -= 8;  ld(R26, offsetInt, R1_SP);
-  offsetInt -= 8;  ld(R27, offsetInt, R1_SP);
-  offsetInt -= 8;  ld(R28, offsetInt, R1_SP);
-  offsetInt -= 8;  ld(R29, offsetInt, R1_SP);
-  offsetInt -= 8;  ld(R30, offsetInt, R1_SP);
-  offsetInt -= 8;  ld(R31, offsetInt, R1_SP);
+  offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP);
+  offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP);
+  offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP);
+  offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP);
+  offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP);
+  offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP);
+#ifndef VM_LITTLE_ENDIAN
+  offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP);
+#endif
+  offsetInt -= 8;  ld(R14, offsetInt, R1_SP);
+  offsetInt -= 8;  ld(R15, offsetInt, R1_SP);
+  offsetInt -= 8;  ld(R16, offsetInt, R1_SP);
+  offsetInt -= 8;  ld(R17, offsetInt, R1_SP);
 }
 
 void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp, bool invertCRC) {