--- a/src/hotspot/cpu/ppc/macroAssembler_ppc.cpp Mon Mar 12 09:46:12 2018 +0100
+++ b/src/hotspot/cpu/ppc/macroAssembler_ppc.cpp Mon Mar 12 12:02:20 2018 +0100
@@ -1,6 +1,6 @@
/*
* Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2017, SAP SE. All rights reserved.
+ * Copyright (c) 2012, 2018, SAP SE. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -4451,561 +4451,304 @@
* @param table register pointing to CRC table
* @param constants register pointing to CRC table for 128-bit aligned memory
* @param barretConstants register pointing to table for barrett reduction
- * @param t0 volatile register
- * @param t1 volatile register
- * @param t2 volatile register
- * @param t3 volatile register
+ * @param t0-t4 temp registers
*/
-void MacroAssembler::kernel_crc32_1word_vpmsumd(Register crc, Register buf, Register len, Register table,
- Register constants, Register barretConstants,
- Register t0, Register t1, Register t2, Register t3, Register t4,
- bool invertCRC) {
+void MacroAssembler::kernel_crc32_1word_vpmsum(Register crc, Register buf, Register len, Register table,
+ Register constants, Register barretConstants,
+ Register t0, Register t1, Register t2, Register t3, Register t4,
+ bool invertCRC) {
assert_different_registers(crc, buf, len, table);
- Label L_alignedHead, L_tail, L_alignTail, L_start, L_end;
-
- Register prealign = t0;
- Register postalign = t0;
-
- BLOCK_COMMENT("kernel_crc32_1word_vpmsumb {");
-
- // 1. use kernel_crc32_1word for shorter than 384bit
+ Label L_alignedHead, L_tail;
+
+ BLOCK_COMMENT("kernel_crc32_1word_vpmsum {");
+
+ // 1. ~c
+ if (invertCRC) {
+ nand(crc, crc, crc); // 1s complement of crc
+ }
+
+ // 2. use kernel_crc32_1word for short len
clrldi(len, len, 32);
- cmpdi(CCR0, len, 384);
- bge(CCR0, L_start);
-
- Register tc0 = t4;
- Register tc1 = constants;
- Register tc2 = barretConstants;
- kernel_crc32_1word(crc, buf, len, table,t0, t1, t2, t3, tc0, tc1, tc2, table, invertCRC);
- b(L_end);
-
- BIND(L_start);
-
- // 2. ~c
- if (invertCRC) {
- nand(crc, crc, crc); // 1s complement of crc
- }
-
- // 3. calculate from 0 to first 128bit-aligned address
- clrldi_(prealign, buf, 57);
- beq(CCR0, L_alignedHead);
-
- subfic(prealign, prealign, 128);
-
- subf(len, prealign, len);
- update_byteLoop_crc32(crc, buf, prealign, table, t2, false);
-
- // 4. calculate from first 128bit-aligned address to last 128bit-aligned address
- BIND(L_alignedHead);
-
- clrldi(postalign, len, 57);
- subf(len, postalign, len);
-
- // len must be more than 256bit
- kernel_crc32_1word_aligned(crc, buf, len, constants, barretConstants, t1, t2, t3);
-
- // 5. calculate remaining
- cmpdi(CCR0, postalign, 0);
- beq(CCR0, L_tail);
-
- update_byteLoop_crc32(crc, buf, postalign, table, t2, false);
-
- BIND(L_tail);
-
- // 6. ~c
- if (invertCRC) {
- nand(crc, crc, crc); // 1s complement of crc
- }
-
- BIND(L_end);
-
- BLOCK_COMMENT("} kernel_crc32_1word_vpmsumb");
+ cmpdi(CCR0, len, 512);
+ blt(CCR0, L_tail);
+
+ // 3. calculate from 0 to first aligned address
+ const int alignment = 16;
+ Register prealign = t0;
+
+ andi_(prealign, buf, alignment - 1);
+ beq(CCR0, L_alignedHead);
+ subfic(prealign, prealign, alignment);
+
+ subf(len, prealign, len);
+ update_byteLoop_crc32(crc, buf, prealign, table, t2, false);
+
+ // 4. calculate from first aligned address as far as possible
+ BIND(L_alignedHead);
+ kernel_crc32_1word_aligned(crc, buf, len, constants, barretConstants, t0, t1, t2, t3, t4);
+
+ // 5. remaining bytes
+ BIND(L_tail);
+ Register tc0 = t4;
+ Register tc1 = constants;
+ Register tc2 = barretConstants;
+ kernel_crc32_1word(crc, buf, len, table, t0, t1, t2, t3, tc0, tc1, tc2, table, false);
+
+ // 6. ~c
+ if (invertCRC) {
+ nand(crc, crc, crc); // 1s complement of crc
+ }
+
+ BLOCK_COMMENT("} kernel_crc32_1word_vpmsum");
}
/**
* @param crc register containing existing CRC (32-bit)
* @param buf register pointing to input byte buffer (byte*)
- * @param len register containing number of bytes
+ * @param len register containing number of bytes (will get updated to remaining bytes)
* @param constants register pointing to CRC table for 128-bit aligned memory
* @param barretConstants register pointing to table for barrett reduction
- * @param t0 volatile register
- * @param t1 volatile register
- * @param t2 volatile register
+ * @param t0-t4 temp registers
+ * Precondition: len should be >= 512. Otherwise, nothing will be done.
*/
void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Register len,
- Register constants, Register barretConstants, Register t0, Register t1, Register t2) {
- Label L_mainLoop, L_tail, L_alignTail, L_barrett_reduction, L_end, L_first_warm_up_done, L_first_cool_down, L_second_cool_down, L_XOR, L_test;
- Label L_lv0, L_lv1, L_lv2, L_lv3, L_lv4, L_lv5, L_lv6, L_lv7, L_lv8, L_lv9, L_lv10, L_lv11, L_lv12, L_lv13, L_lv14, L_lv15;
- Label L_1, L_2, L_3, L_4;
-
- Register rLoaded = t0;
- Register rTmp1 = t1;
- Register rTmp2 = t2;
- Register off16 = R22;
- Register off32 = R23;
- Register off48 = R24;
- Register off64 = R25;
- Register off80 = R26;
- Register off96 = R27;
- Register off112 = R28;
- Register rIdx = R29;
- Register rMax = R30;
- Register constantsPos = R31;
-
- VectorRegister mask_32bit = VR24;
- VectorRegister mask_64bit = VR25;
- VectorRegister zeroes = VR26;
- VectorRegister const1 = VR27;
- VectorRegister const2 = VR28;
+ Register constants, Register barretConstants,
+ Register t0, Register t1, Register t2, Register t3, Register t4) {
// Save non-volatile vector registers (frameless).
- Register offset = t1; int offsetInt = 0;
- offsetInt -= 16; li(offset, -16); stvx(VR20, offset, R1_SP);
- offsetInt -= 16; addi(offset, offset, -16); stvx(VR21, offset, R1_SP);
- offsetInt -= 16; addi(offset, offset, -16); stvx(VR22, offset, R1_SP);
- offsetInt -= 16; addi(offset, offset, -16); stvx(VR23, offset, R1_SP);
- offsetInt -= 16; addi(offset, offset, -16); stvx(VR24, offset, R1_SP);
- offsetInt -= 16; addi(offset, offset, -16); stvx(VR25, offset, R1_SP);
- offsetInt -= 16; addi(offset, offset, -16); stvx(VR26, offset, R1_SP);
- offsetInt -= 16; addi(offset, offset, -16); stvx(VR27, offset, R1_SP);
- offsetInt -= 16; addi(offset, offset, -16); stvx(VR28, offset, R1_SP);
- offsetInt -= 8; std(R22, offsetInt, R1_SP);
- offsetInt -= 8; std(R23, offsetInt, R1_SP);
- offsetInt -= 8; std(R24, offsetInt, R1_SP);
- offsetInt -= 8; std(R25, offsetInt, R1_SP);
- offsetInt -= 8; std(R26, offsetInt, R1_SP);
- offsetInt -= 8; std(R27, offsetInt, R1_SP);
- offsetInt -= 8; std(R28, offsetInt, R1_SP);
- offsetInt -= 8; std(R29, offsetInt, R1_SP);
- offsetInt -= 8; std(R30, offsetInt, R1_SP);
- offsetInt -= 8; std(R31, offsetInt, R1_SP);
-
- // Set constants
- li(off16, 16);
- li(off32, 32);
- li(off48, 48);
- li(off64, 64);
- li(off80, 80);
- li(off96, 96);
- li(off112, 112);
-
- clrldi(crc, crc, 32);
-
- vxor(zeroes, zeroes, zeroes);
- vspltisw(VR0, -1);
-
- vsldoi(mask_32bit, zeroes, VR0, 4);
- vsldoi(mask_64bit, zeroes, VR0, 8);
-
- // Get the initial value into v8
- vxor(VR8, VR8, VR8);
- mtvrd(VR8, crc);
- vsldoi(VR8, zeroes, VR8, 8); // shift into bottom 32 bits
-
- li (rLoaded, 0);
-
- rldicr(rIdx, len, 0, 56);
-
- {
- BIND(L_1);
- // Checksum in blocks of MAX_SIZE (32768)
- lis(rMax, 0);
- ori(rMax, rMax, 32768);
- mr(rTmp2, rMax);
- cmpd(CCR0, rIdx, rMax);
- bgt(CCR0, L_2);
- mr(rMax, rIdx);
-
- BIND(L_2);
- subf(rIdx, rMax, rIdx);
-
- // our main loop does 128 bytes at a time
- srdi(rMax, rMax, 7);
-
- /*
- * Work out the offset into the constants table to start at. Each
- * constant is 16 bytes, and it is used against 128 bytes of input
- * data - 128 / 16 = 8
- */
- sldi(rTmp1, rMax, 4);
- srdi(rTmp2, rTmp2, 3);
- subf(rTmp1, rTmp1, rTmp2);
-
- // We reduce our final 128 bytes in a separate step
- addi(rMax, rMax, -1);
- mtctr(rMax);
-
- // Find the start of our constants
- add(constantsPos, constants, rTmp1);
-
- // zero VR0-v7 which will contain our checksums
- vxor(VR0, VR0, VR0);
- vxor(VR1, VR1, VR1);
- vxor(VR2, VR2, VR2);
- vxor(VR3, VR3, VR3);
- vxor(VR4, VR4, VR4);
- vxor(VR5, VR5, VR5);
- vxor(VR6, VR6, VR6);
- vxor(VR7, VR7, VR7);
-
- lvx(const1, constantsPos);
-
- /*
- * If we are looping back to consume more data we use the values
- * already in VR16-v23.
- */
- cmpdi(CCR0, rLoaded, 1);
- beq(CCR0, L_3);
- {
-
- // First warm up pass
- lvx(VR16, buf);
- lvx(VR17, off16, buf);
- lvx(VR18, off32, buf);
- lvx(VR19, off48, buf);
- lvx(VR20, off64, buf);
- lvx(VR21, off80, buf);
- lvx(VR22, off96, buf);
- lvx(VR23, off112, buf);
- addi(buf, buf, 8*16);
-
- // xor in initial value
- vxor(VR16, VR16, VR8);
- }
-
- BIND(L_3);
- bdz(L_first_warm_up_done);
-
- addi(constantsPos, constantsPos, 16);
- lvx(const2, constantsPos);
-
- // Second warm up pass
- vpmsumd(VR8, VR16, const1);
- lvx(VR16, buf);
-
- vpmsumd(VR9, VR17, const1);
- lvx(VR17, off16, buf);
-
- vpmsumd(VR10, VR18, const1);
- lvx(VR18, off32, buf);
-
- vpmsumd(VR11, VR19, const1);
- lvx(VR19, off48, buf);
-
- vpmsumd(VR12, VR20, const1);
- lvx(VR20, off64, buf);
-
- vpmsumd(VR13, VR21, const1);
- lvx(VR21, off80, buf);
-
- vpmsumd(VR14, VR22, const1);
- lvx(VR22, off96, buf);
-
- vpmsumd(VR15, VR23, const1);
- lvx(VR23, off112, buf);
-
- addi(buf, buf, 8 * 16);
-
- bdz(L_first_cool_down);
-
- /*
- * main loop. We modulo schedule it such that it takes three iterations
- * to complete - first iteration load, second iteration vpmsum, third
- * iteration xor.
- */
- {
- BIND(L_4);
- lvx(const1, constantsPos); addi(constantsPos, constantsPos, 16);
-
- vxor(VR0, VR0, VR8);
- vpmsumd(VR8, VR16, const2);
- lvx(VR16, buf);
-
- vxor(VR1, VR1, VR9);
- vpmsumd(VR9, VR17, const2);
- lvx(VR17, off16, buf);
-
- vxor(VR2, VR2, VR10);
- vpmsumd(VR10, VR18, const2);
- lvx(VR18, off32, buf);
-
- vxor(VR3, VR3, VR11);
- vpmsumd(VR11, VR19, const2);
- lvx(VR19, off48, buf);
- lvx(const2, constantsPos);
-
- vxor(VR4, VR4, VR12);
- vpmsumd(VR12, VR20, const1);
- lvx(VR20, off64, buf);
-
- vxor(VR5, VR5, VR13);
- vpmsumd(VR13, VR21, const1);
- lvx(VR21, off80, buf);
-
- vxor(VR6, VR6, VR14);
- vpmsumd(VR14, VR22, const1);
- lvx(VR22, off96, buf);
-
- vxor(VR7, VR7, VR15);
- vpmsumd(VR15, VR23, const1);
- lvx(VR23, off112, buf);
-
- addi(buf, buf, 8 * 16);
-
- bdnz(L_4);
+ Register offset = t1;
+ int offsetInt = 0;
+ offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP);
+ offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP);
+ offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP);
+ offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP);
+ offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP);
+ offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP);
+#ifndef VM_LITTLE_ENDIAN
+ offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP);
+#endif
+ offsetInt -= 8; std(R14, offsetInt, R1_SP);
+ offsetInt -= 8; std(R15, offsetInt, R1_SP);
+ offsetInt -= 8; std(R16, offsetInt, R1_SP);
+ offsetInt -= 8; std(R17, offsetInt, R1_SP);
+
+ // Implementation uses an inner loop which uses between 256 and 16 * unroll_factor
+ // bytes per iteration. The basic scheme is:
+ // lvx: load vector (Big Endian needs reversal)
+ // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift
+ // vxor: xor partial results together to get unroll_factor2 vectors
+
+ // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors.
+
+ // Using 16 * unroll_factor / unroll_factor2 bytes for constants.
+ const int unroll_factor = 2048;
+ const int unroll_factor2 = 8;
+
+ // Support registers.
+ Register offs[] = { noreg, t0, t1, t2, t3, t4, crc /* will live in VCRC */, R14 };
+ Register num_bytes = R15,
+ loop_count = R16,
+ cur_const = R17;
+ // Constant array for outer loop: unroll_factor2 - 1 registers,
+ // Constant array for inner loop: unroll_factor / unroll_factor2 registers.
+ VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
+ consts1[] = { VR23, VR24 };
+ // Data register arrays: 2 arrays with unroll_factor2 registers.
+ VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 },
+ data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 };
+
+ VectorRegister VCRC = data0[0];
+ VectorRegister Vc = VR25;
+ VectorRegister swap_bytes = VR26; // Only for Big Endian.
+
+ // We have at least 1 iteration (ensured by caller).
+ Label L_outer_loop, L_inner_loop, L_last;
+
+ // If supported set DSCR pre-fetch to deepest.
+ if (VM_Version::has_mfdscr()) {
+ load_const_optimized(t0, VM_Version::_dscr_val | 7);
+ mtdscr(t0);
+ }
+
+ mtvrwz(VCRC, crc); // crc lives in VCRC, now
+
+ for (int i = 1; i < unroll_factor2; ++i) {
+ li(offs[i], 16 * i);
+ }
+
+ // Load consts for outer loop
+ lvx(consts0[0], constants);
+ for (int i = 1; i < unroll_factor2 - 1; ++i) {
+ lvx(consts0[i], offs[i], constants);
+ }
+ addi(constants, constants, (unroll_factor2 - 1) * 16);
+
+ load_const_optimized(num_bytes, 16 * unroll_factor);
+ load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off.
+
+ // Reuse data registers outside of the loop.
+ VectorRegister Vtmp = data1[0];
+ VectorRegister Vtmp2 = data1[1];
+ VectorRegister zeroes = data1[2];
+
+ vspltisb(Vtmp, 0);
+ vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC.
+
+ // Load vector for vpermxor (to xor both 64 bit parts together)
+ lvsl(Vtmp, buf); // 000102030405060708090a0b0c0d0e0f
+ vspltisb(Vc, 4);
+ vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0
+ xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0);
+ vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f
+
+#ifdef VM_LITTLE_ENDIAN
+#define BE_swap_bytes(x)
+#else
+ vspltisb(Vtmp2, 0xf);
+ vxor(swap_bytes, Vtmp, Vtmp2);
+#define BE_swap_bytes(x) vperm(x, x, x, swap_bytes)
+#endif
+
+ cmpd(CCR0, len, num_bytes);
+ blt(CCR0, L_last);
+
+ // ********** Main loop start **********
+ align(32);
+ bind(L_outer_loop);
+
+ // Begin of unrolled first iteration (no xor).
+ lvx(data1[0], buf);
+ mr(cur_const, constants);
+ for (int i = 1; i < unroll_factor2 / 2; ++i) {
+ lvx(data1[i], offs[i], buf);
+ }
+ vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
+ lvx(consts1[0], cur_const);
+ mtctr(loop_count);
+ for (int i = 0; i < unroll_factor2 / 2; ++i) {
+ BE_swap_bytes(data1[i]);
+ if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC.
+ lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
+ vpmsumw(data0[i], data1[i], consts1[0]);
+ }
+ addi(buf, buf, 16 * unroll_factor2);
+ subf(len, num_bytes, len);
+ lvx(consts1[1], offs[1], cur_const);
+ addi(cur_const, cur_const, 32);
+ // Begin of unrolled second iteration (head).
+ for (int i = 0; i < unroll_factor2 / 2; ++i) {
+ BE_swap_bytes(data1[i + unroll_factor2 / 2]);
+ if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); }
+ vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]);
+ }
+ for (int i = 0; i < unroll_factor2 / 2; ++i) {
+ BE_swap_bytes(data1[i]);
+ lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
+ vpmsumw(data1[i], data1[i], consts1[1]);
+ }
+ addi(buf, buf, 16 * unroll_factor2);
+
+ // Generate the most performance-relevant code. Loads + half of the vpmsumw have been generated.
+ // Double-iteration allows using the 2 constant registers alternatingly.
+ align(32);
+ bind(L_inner_loop);
+ for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling.
+ if (j & 1) {
+ lvx(consts1[0], cur_const);
+ } else {
+ lvx(consts1[1], offs[1], cur_const);
+ addi(cur_const, cur_const, 32);
}
-
- BIND(L_first_cool_down);
-
- // First cool down pass
- lvx(const1, constantsPos);
- addi(constantsPos, constantsPos, 16);
-
- vxor(VR0, VR0, VR8);
- vpmsumd(VR8, VR16, const1);
-
- vxor(VR1, VR1, VR9);
- vpmsumd(VR9, VR17, const1);
-
- vxor(VR2, VR2, VR10);
- vpmsumd(VR10, VR18, const1);
-
- vxor(VR3, VR3, VR11);
- vpmsumd(VR11, VR19, const1);
-
- vxor(VR4, VR4, VR12);
- vpmsumd(VR12, VR20, const1);
-
- vxor(VR5, VR5, VR13);
- vpmsumd(VR13, VR21, const1);
-
- vxor(VR6, VR6, VR14);
- vpmsumd(VR14, VR22, const1);
-
- vxor(VR7, VR7, VR15);
- vpmsumd(VR15, VR23, const1);
-
- BIND(L_second_cool_down);
- // Second cool down pass
- vxor(VR0, VR0, VR8);
- vxor(VR1, VR1, VR9);
- vxor(VR2, VR2, VR10);
- vxor(VR3, VR3, VR11);
- vxor(VR4, VR4, VR12);
- vxor(VR5, VR5, VR13);
- vxor(VR6, VR6, VR14);
- vxor(VR7, VR7, VR15);
-
- /*
- * vpmsumd produces a 96 bit result in the least significant bits
- * of the register. Since we are bit reflected we have to shift it
- * left 32 bits so it occupies the least significant bits in the
- * bit reflected domain.
- */
- vsldoi(VR0, VR0, zeroes, 4);
- vsldoi(VR1, VR1, zeroes, 4);
- vsldoi(VR2, VR2, zeroes, 4);
- vsldoi(VR3, VR3, zeroes, 4);
- vsldoi(VR4, VR4, zeroes, 4);
- vsldoi(VR5, VR5, zeroes, 4);
- vsldoi(VR6, VR6, zeroes, 4);
- vsldoi(VR7, VR7, zeroes, 4);
-
- // xor with last 1024 bits
- lvx(VR8, buf);
- lvx(VR9, off16, buf);
- lvx(VR10, off32, buf);
- lvx(VR11, off48, buf);
- lvx(VR12, off64, buf);
- lvx(VR13, off80, buf);
- lvx(VR14, off96, buf);
- lvx(VR15, off112, buf);
- addi(buf, buf, 8 * 16);
-
- vxor(VR16, VR0, VR8);
- vxor(VR17, VR1, VR9);
- vxor(VR18, VR2, VR10);
- vxor(VR19, VR3, VR11);
- vxor(VR20, VR4, VR12);
- vxor(VR21, VR5, VR13);
- vxor(VR22, VR6, VR14);
- vxor(VR23, VR7, VR15);
-
- li(rLoaded, 1);
- cmpdi(CCR0, rIdx, 0);
- addi(rIdx, rIdx, 128);
- bne(CCR0, L_1);
- }
-
- // Work out how many bytes we have left
- andi_(len, len, 127);
-
- // Calculate where in the constant table we need to start
- subfic(rTmp1, len, 128);
- add(constantsPos, constantsPos, rTmp1);
-
- // How many 16 byte chunks are in the tail
- srdi(rIdx, len, 4);
- mtctr(rIdx);
-
- /*
- * Reduce the previously calculated 1024 bits to 64 bits, shifting
- * 32 bits to include the trailing 32 bits of zeros
- */
- lvx(VR0, constantsPos);
- lvx(VR1, off16, constantsPos);
- lvx(VR2, off32, constantsPos);
- lvx(VR3, off48, constantsPos);
- lvx(VR4, off64, constantsPos);
- lvx(VR5, off80, constantsPos);
- lvx(VR6, off96, constantsPos);
- lvx(VR7, off112, constantsPos);
- addi(constantsPos, constantsPos, 8 * 16);
-
- vpmsumw(VR0, VR16, VR0);
- vpmsumw(VR1, VR17, VR1);
- vpmsumw(VR2, VR18, VR2);
- vpmsumw(VR3, VR19, VR3);
- vpmsumw(VR4, VR20, VR4);
- vpmsumw(VR5, VR21, VR5);
- vpmsumw(VR6, VR22, VR6);
- vpmsumw(VR7, VR23, VR7);
-
- // Now reduce the tail (0 - 112 bytes)
- cmpdi(CCR0, rIdx, 0);
- beq(CCR0, L_XOR);
-
- lvx(VR16, buf); addi(buf, buf, 16);
- lvx(VR17, constantsPos);
- vpmsumw(VR16, VR16, VR17);
- vxor(VR0, VR0, VR16);
- beq(CCR0, L_XOR);
-
- lvx(VR16, buf); addi(buf, buf, 16);
- lvx(VR17, off16, constantsPos);
- vpmsumw(VR16, VR16, VR17);
- vxor(VR0, VR0, VR16);
- beq(CCR0, L_XOR);
-
- lvx(VR16, buf); addi(buf, buf, 16);
- lvx(VR17, off32, constantsPos);
- vpmsumw(VR16, VR16, VR17);
- vxor(VR0, VR0, VR16);
- beq(CCR0, L_XOR);
-
- lvx(VR16, buf); addi(buf, buf, 16);
- lvx(VR17, off48,constantsPos);
- vpmsumw(VR16, VR16, VR17);
- vxor(VR0, VR0, VR16);
- beq(CCR0, L_XOR);
-
- lvx(VR16, buf); addi(buf, buf, 16);
- lvx(VR17, off64, constantsPos);
- vpmsumw(VR16, VR16, VR17);
- vxor(VR0, VR0, VR16);
- beq(CCR0, L_XOR);
-
- lvx(VR16, buf); addi(buf, buf, 16);
- lvx(VR17, off80, constantsPos);
- vpmsumw(VR16, VR16, VR17);
- vxor(VR0, VR0, VR16);
- beq(CCR0, L_XOR);
-
- lvx(VR16, buf); addi(buf, buf, 16);
- lvx(VR17, off96, constantsPos);
- vpmsumw(VR16, VR16, VR17);
- vxor(VR0, VR0, VR16);
-
- // Now xor all the parallel chunks together
- BIND(L_XOR);
- vxor(VR0, VR0, VR1);
- vxor(VR2, VR2, VR3);
- vxor(VR4, VR4, VR5);
- vxor(VR6, VR6, VR7);
-
- vxor(VR0, VR0, VR2);
- vxor(VR4, VR4, VR6);
-
- vxor(VR0, VR0, VR4);
-
- b(L_barrett_reduction);
-
- BIND(L_first_warm_up_done);
- lvx(const1, constantsPos);
- addi(constantsPos, constantsPos, 16);
- vpmsumd(VR8, VR16, const1);
- vpmsumd(VR9, VR17, const1);
- vpmsumd(VR10, VR18, const1);
- vpmsumd(VR11, VR19, const1);
- vpmsumd(VR12, VR20, const1);
- vpmsumd(VR13, VR21, const1);
- vpmsumd(VR14, VR22, const1);
- vpmsumd(VR15, VR23, const1);
- b(L_second_cool_down);
-
- BIND(L_barrett_reduction);
-
- lvx(const1, barretConstants);
- addi(barretConstants, barretConstants, 16);
- lvx(const2, barretConstants);
-
- vsldoi(VR1, VR0, VR0, 8);
- vxor(VR0, VR0, VR1); // xor two 64 bit results together
-
- // shift left one bit
- vspltisb(VR1, 1);
- vsl(VR0, VR0, VR1);
-
- vand(VR0, VR0, mask_64bit);
-
- /*
- * The reflected version of Barrett reduction. Instead of bit
- * reflecting our data (which is expensive to do), we bit reflect our
- * constants and our algorithm, which means the intermediate data in
- * our vector registers goes from 0-63 instead of 63-0. We can reflect
- * the algorithm because we don't carry in mod 2 arithmetic.
- */
- vand(VR1, VR0, mask_32bit); // bottom 32 bits of a
- vpmsumd(VR1, VR1, const1); // ma
- vand(VR1, VR1, mask_32bit); // bottom 32bits of ma
- vpmsumd(VR1, VR1, const2); // qn */
- vxor(VR0, VR0, VR1); // a - qn, subtraction is xor in GF(2)
-
- /*
- * Since we are bit reflected, the result (ie the low 32 bits) is in
- * the high 32 bits. We just need to shift it left 4 bytes
- * V0 [ 0 1 X 3 ]
- * V0 [ 0 X 2 3 ]
- */
- vsldoi(VR0, VR0, zeroes, 4); // shift result into top 64 bits of
-
- // Get it into r3
- mfvrd(crc, VR0);
-
- BIND(L_end);
-
+ for (int i = 0; i < unroll_factor2; ++i) {
+ int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input.
+ if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; }
+ BE_swap_bytes(data1[idx]);
+ vxor(data0[i], data0[i], data1[i]);
+ if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf);
+ vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]);
+ }
+ addi(buf, buf, 16 * unroll_factor2);
+ }
+ bdnz(L_inner_loop);
+
+ // Tail of last iteration (no loads).
+ for (int i = 0; i < unroll_factor2 / 2; ++i) {
+ BE_swap_bytes(data1[i + unroll_factor2 / 2]);
+ vxor(data0[i], data0[i], data1[i]);
+ vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]);
+ }
+ for (int i = 0; i < unroll_factor2 / 2; ++i) {
+ vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts.
+ vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]);
+ }
+
+ // Last data register is ok, other ones need fixup shift.
+ for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) {
+ vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]);
+ }
+
+ // Combine to 128 bit result vector VCRC = data0[0].
+ for (int i = 1; i < unroll_factor2; i<<=1) {
+ for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) {
+ vxor(data0[j], data0[j], data0[j+i]);
+ }
+ }
+ cmpd(CCR0, len, num_bytes);
+ bge(CCR0, L_outer_loop);
+
+ // Last chance with lower num_bytes.
+ bind(L_last);
+ srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations.
+ add_const_optimized(constants, constants, 16 * (unroll_factor / unroll_factor2)); // Point behind last one.
+ sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used.
+ clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2));
+ subf(constants, R0, constants); // Point to constant to be used first.
+
+ addic_(loop_count, loop_count, -1); // One double-iteration peeled off.
+ bgt(CCR0, L_outer_loop);
+ // ********** Main loop end **********
+#undef BE_swap_bytes
+
+ // Restore DSCR pre-fetch value.
+ if (VM_Version::has_mfdscr()) {
+ load_const_optimized(t0, VM_Version::_dscr_val);
+ mtdscr(t0);
+ }
+
+ vspltisb(zeroes, 0);
+
+ // Combine to 64 bit result.
+ vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
+
+ // Reduce to 32 bit CRC: Remainder by multiply-high.
+ lvx(Vtmp, barretConstants);
+ vsldoi(Vtmp2, zeroes, VCRC, 12); // Extract high 32 bit.
+ vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply by inverse long poly.
+ vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit.
+ vsldoi(Vtmp, zeroes, Vtmp, 8);
+ vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply quotient by long poly.
+ vxor(VCRC, VCRC, Vtmp2); // Remainder fits into 32 bit.
+
+ // Move result. len is already updated.
+ vsldoi(VCRC, VCRC, zeroes, 8);
+ mfvrd(crc, VCRC);
+
+ // Restore non-volatile Vector registers (frameless).
offsetInt = 0;
- // Restore non-volatile Vector registers (frameless).
- offsetInt -= 16; li(offset, -16); lvx(VR20, offset, R1_SP);
- offsetInt -= 16; addi(offset, offset, -16); lvx(VR21, offset, R1_SP);
- offsetInt -= 16; addi(offset, offset, -16); lvx(VR22, offset, R1_SP);
- offsetInt -= 16; addi(offset, offset, -16); lvx(VR23, offset, R1_SP);
- offsetInt -= 16; addi(offset, offset, -16); lvx(VR24, offset, R1_SP);
- offsetInt -= 16; addi(offset, offset, -16); lvx(VR25, offset, R1_SP);
- offsetInt -= 16; addi(offset, offset, -16); lvx(VR26, offset, R1_SP);
- offsetInt -= 16; addi(offset, offset, -16); lvx(VR27, offset, R1_SP);
- offsetInt -= 16; addi(offset, offset, -16); lvx(VR28, offset, R1_SP);
- offsetInt -= 8; ld(R22, offsetInt, R1_SP);
- offsetInt -= 8; ld(R23, offsetInt, R1_SP);
- offsetInt -= 8; ld(R24, offsetInt, R1_SP);
- offsetInt -= 8; ld(R25, offsetInt, R1_SP);
- offsetInt -= 8; ld(R26, offsetInt, R1_SP);
- offsetInt -= 8; ld(R27, offsetInt, R1_SP);
- offsetInt -= 8; ld(R28, offsetInt, R1_SP);
- offsetInt -= 8; ld(R29, offsetInt, R1_SP);
- offsetInt -= 8; ld(R30, offsetInt, R1_SP);
- offsetInt -= 8; ld(R31, offsetInt, R1_SP);
+ offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP);
+ offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP);
+ offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP);
+ offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP);
+ offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP);
+ offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP);
+#ifndef VM_LITTLE_ENDIAN
+ offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP);
+#endif
+ offsetInt -= 8; ld(R14, offsetInt, R1_SP);
+ offsetInt -= 8; ld(R15, offsetInt, R1_SP);
+ offsetInt -= 8; ld(R16, offsetInt, R1_SP);
+ offsetInt -= 8; ld(R17, offsetInt, R1_SP);
}
void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp, bool invertCRC) {