hotspot/src/cpu/sparc/vm/macroAssembler_sparc.cpp
changeset 34205 9ec51d30a11e
parent 33628 09241459a8b8
child 34211 d25c2fc1e248
--- a/hotspot/src/cpu/sparc/vm/macroAssembler_sparc.cpp	Fri Nov 20 10:09:42 2015 +0100
+++ b/hotspot/src/cpu/sparc/vm/macroAssembler_sparc.cpp	Fri Nov 20 08:29:10 2015 -0800
@@ -4771,3 +4771,243 @@
   movdtox(src, tmp1);
   reverse_bytes_32(tmp1, dst, tmp2);
 }
+
+void MacroAssembler::fold_128bit_crc32(Register xcrc_hi, Register xcrc_lo, Register xK_hi, Register xK_lo, Register xtmp_hi, Register xtmp_lo, Register buf, int offset) {
+  xmulx(xcrc_hi, xK_hi, xtmp_lo);
+  xmulxhi(xcrc_hi, xK_hi, xtmp_hi);
+  xmulxhi(xcrc_lo, xK_lo, xcrc_hi);
+  xmulx(xcrc_lo, xK_lo, xcrc_lo);
+  xor3(xcrc_lo, xtmp_lo, xcrc_lo);
+  xor3(xcrc_hi, xtmp_hi, xcrc_hi);
+  ldxl(buf, G0, xtmp_lo);
+  inc(buf, 8);
+  ldxl(buf, G0, xtmp_hi);
+  inc(buf, 8);
+  xor3(xcrc_lo, xtmp_lo, xcrc_lo);
+  xor3(xcrc_hi, xtmp_hi, xcrc_hi);
+}
+
+void MacroAssembler::fold_128bit_crc32(Register xcrc_hi, Register xcrc_lo, Register xK_hi, Register xK_lo, Register xtmp_hi, Register xtmp_lo, Register xbuf_hi, Register xbuf_lo) {
+  mov(xcrc_lo, xtmp_lo);
+  mov(xcrc_hi, xtmp_hi);
+  xmulx(xtmp_hi, xK_hi, xtmp_lo);
+  xmulxhi(xtmp_hi, xK_hi, xtmp_hi);
+  xmulxhi(xcrc_lo, xK_lo, xcrc_hi);
+  xmulx(xcrc_lo, xK_lo, xcrc_lo);
+  xor3(xcrc_lo, xbuf_lo, xcrc_lo);
+  xor3(xcrc_hi, xbuf_hi, xcrc_hi);
+  xor3(xcrc_lo, xtmp_lo, xcrc_lo);
+  xor3(xcrc_hi, xtmp_hi, xcrc_hi);
+}
+
+void MacroAssembler::fold_8bit_crc32(Register xcrc, Register table, Register xtmp, Register tmp) {
+  and3(xcrc, 0xFF, tmp);
+  sllx(tmp, 2, tmp);
+  lduw(table, tmp, xtmp);
+  srlx(xcrc, 8, xcrc);
+  xor3(xtmp, xcrc, xcrc);
+}
+
+void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
+  and3(crc, 0xFF, tmp);
+  srlx(crc, 8, crc);
+  sllx(tmp, 2, tmp);
+  lduw(table, tmp, tmp);
+  xor3(tmp, crc, crc);
+}
+
+#define CRC32_TMP_REG_NUM 18
+
+#define CRC32_CONST_64  0x163cd6124
+#define CRC32_CONST_96  0x0ccaa009e
+#define CRC32_CONST_160 0x1751997d0
+#define CRC32_CONST_480 0x1c6e41596
+#define CRC32_CONST_544 0x154442bd4
+
+void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table) {
+
+  Label L_cleanup_loop, L_cleanup_check, L_align_loop, L_align_check;
+  Label L_main_loop_prologue;
+  Label L_fold_512b, L_fold_512b_loop, L_fold_128b;
+  Label L_fold_tail, L_fold_tail_loop;
+  Label L_8byte_fold_loop, L_8byte_fold_check;
+
+  const Register tmp[CRC32_TMP_REG_NUM] = {L0, L1, L2, L3, L4, L5, L6, G1, I0, I1, I2, I3, I4, I5, I7, O4, O5, G3};
+
+  Register const_64  = tmp[CRC32_TMP_REG_NUM-1];
+  Register const_96  = tmp[CRC32_TMP_REG_NUM-1];
+  Register const_160 = tmp[CRC32_TMP_REG_NUM-2];
+  Register const_480 = tmp[CRC32_TMP_REG_NUM-1];
+  Register const_544 = tmp[CRC32_TMP_REG_NUM-2];
+
+  set(ExternalAddress(StubRoutines::crc_table_addr()), table);
+
+  not1(crc); // ~c
+  clruwu(crc); // clear upper 32 bits of crc
+
+  // Check if below cutoff, proceed directly to cleanup code
+  mov(31, G4);
+  cmp_and_br_short(len, G4, Assembler::lessEqualUnsigned, Assembler::pt, L_cleanup_check);
+
+  // Align buffer to 8 byte boundry
+  mov(8, O5);
+  and3(buf, 0x7, O4);
+  sub(O5, O4, O5);
+  and3(O5, 0x7, O5);
+  sub(len, O5, len);
+  ba(L_align_check);
+  delayed()->nop();
+
+  // Alignment loop, table look up method for up to 7 bytes
+  bind(L_align_loop);
+  ldub(buf, 0, O4);
+  inc(buf);
+  dec(O5);
+  xor3(O4, crc, O4);
+  and3(O4, 0xFF, O4);
+  sllx(O4, 2, O4);
+  lduw(table, O4, O4);
+  srlx(crc, 8, crc);
+  xor3(O4, crc, crc);
+  bind(L_align_check);
+  nop();
+  cmp_and_br_short(O5, 0, Assembler::notEqual, Assembler::pt, L_align_loop);
+
+  // Aligned on 64-bit (8-byte) boundry at this point
+  // Check if still above cutoff (31-bytes)
+  mov(31, G4);
+  cmp_and_br_short(len, G4, Assembler::lessEqualUnsigned, Assembler::pt, L_cleanup_check);
+  // At least 32 bytes left to process
+
+  // Free up registers by storing them to FP registers
+  for (int i = 0; i < CRC32_TMP_REG_NUM; i++) {
+    movxtod(tmp[i], as_FloatRegister(2*i));
+  }
+
+  // Determine which loop to enter
+  // Shared prologue
+  ldxl(buf, G0, tmp[0]);
+  inc(buf, 8);
+  ldxl(buf, G0, tmp[1]);
+  inc(buf, 8);
+  xor3(tmp[0], crc, tmp[0]); // Fold CRC into first few bytes
+  and3(crc, 0, crc); // Clear out the crc register
+  // Main loop needs 128-bytes at least
+  mov(128, G4);
+  mov(64, tmp[2]);
+  cmp_and_br_short(len, G4, Assembler::greaterEqualUnsigned, Assembler::pt, L_main_loop_prologue);
+  // Less than 64 bytes
+  nop();
+  cmp_and_br_short(len, tmp[2], Assembler::lessUnsigned, Assembler::pt, L_fold_tail);
+  // Between 64 and 127 bytes
+  set64(CRC32_CONST_96,  const_96,  tmp[8]);
+  set64(CRC32_CONST_160, const_160, tmp[9]);
+  fold_128bit_crc32(tmp[1], tmp[0], const_96, const_160, tmp[2], tmp[3], buf, 0);
+  fold_128bit_crc32(tmp[1], tmp[0], const_96, const_160, tmp[4], tmp[5], buf, 16);
+  fold_128bit_crc32(tmp[1], tmp[0], const_96, const_160, tmp[6], tmp[7], buf, 32);
+  dec(len, 48);
+  ba(L_fold_tail);
+  delayed()->nop();
+
+  bind(L_main_loop_prologue);
+  for (int i = 2; i < 8; i++) {
+    ldxl(buf, G0, tmp[i]);
+    inc(buf, 8);
+  }
+
+  // Fold total 512 bits of polynomial on each iteration,
+  // 128 bits per each of 4 parallel streams
+  set64(CRC32_CONST_480, const_480, tmp[8]);
+  set64(CRC32_CONST_544, const_544, tmp[9]);
+
+  mov(128, G4);
+  bind(L_fold_512b_loop);
+  fold_128bit_crc32(tmp[1], tmp[0], const_480, const_544, tmp[9],  tmp[8],  buf,  0);
+  fold_128bit_crc32(tmp[3], tmp[2], const_480, const_544, tmp[11], tmp[10], buf, 16);
+  fold_128bit_crc32(tmp[5], tmp[4], const_480, const_544, tmp[13], tmp[12], buf, 32);
+  fold_128bit_crc32(tmp[7], tmp[6], const_480, const_544, tmp[15], tmp[14], buf, 64);
+  dec(len, 64);
+  cmp_and_br_short(len, G4, Assembler::greaterEqualUnsigned, Assembler::pt, L_fold_512b_loop);
+
+  // Fold 512 bits to 128 bits
+  bind(L_fold_512b);
+  set64(CRC32_CONST_96,  const_96,  tmp[8]);
+  set64(CRC32_CONST_160, const_160, tmp[9]);
+
+  fold_128bit_crc32(tmp[1], tmp[0], const_96, const_160, tmp[8], tmp[9], tmp[3], tmp[2]);
+  fold_128bit_crc32(tmp[1], tmp[0], const_96, const_160, tmp[8], tmp[9], tmp[5], tmp[4]);
+  fold_128bit_crc32(tmp[1], tmp[0], const_96, const_160, tmp[8], tmp[9], tmp[7], tmp[6]);
+  dec(len, 48);
+
+  // Fold the rest of 128 bits data chunks
+  bind(L_fold_tail);
+  mov(32, G4);
+  cmp_and_br_short(len, G4, Assembler::lessEqualUnsigned, Assembler::pt, L_fold_128b);
+
+  set64(CRC32_CONST_96,  const_96,  tmp[8]);
+  set64(CRC32_CONST_160, const_160, tmp[9]);
+
+  bind(L_fold_tail_loop);
+  fold_128bit_crc32(tmp[1], tmp[0], const_96, const_160, tmp[2], tmp[3], buf, 0);
+  sub(len, 16, len);
+  cmp_and_br_short(len, G4, Assembler::greaterEqualUnsigned, Assembler::pt, L_fold_tail_loop);
+
+  // Fold the 128 bits in tmps 0 - 1 into tmp 1
+  bind(L_fold_128b);
+
+  set64(CRC32_CONST_64, const_64, tmp[4]);
+
+  xmulx(const_64, tmp[0], tmp[2]);
+  xmulxhi(const_64, tmp[0], tmp[3]);
+
+  srl(tmp[2], G0, tmp[4]);
+  xmulx(const_64, tmp[4], tmp[4]);
+
+  srlx(tmp[2], 32, tmp[2]);
+  sllx(tmp[3], 32, tmp[3]);
+  or3(tmp[2], tmp[3], tmp[2]);
+
+  xor3(tmp[4], tmp[1], tmp[4]);
+  xor3(tmp[4], tmp[2], tmp[1]);
+  dec(len, 8);
+
+  // Use table lookup for the 8 bytes left in tmp[1]
+  dec(len, 8);
+
+  // 8 8-bit folds to compute 32-bit CRC.
+  for (int j = 0; j < 4; j++) {
+    fold_8bit_crc32(tmp[1], table, tmp[2], tmp[3]);
+  }
+  srl(tmp[1], G0, crc); // move 32 bits to general register
+  for (int j = 0; j < 4; j++) {
+    fold_8bit_crc32(crc, table, tmp[3]);
+  }
+
+  bind(L_8byte_fold_check);
+
+  // Restore int registers saved in FP registers
+  for (int i = 0; i < CRC32_TMP_REG_NUM; i++) {
+    movdtox(as_FloatRegister(2*i), tmp[i]);
+  }
+
+  ba(L_cleanup_check);
+  delayed()->nop();
+
+  // Table look-up method for the remaining few bytes
+  bind(L_cleanup_loop);
+  ldub(buf, 0, O4);
+  inc(buf);
+  dec(len);
+  xor3(O4, crc, O4);
+  and3(O4, 0xFF, O4);
+  sllx(O4, 2, O4);
+  lduw(table, O4, O4);
+  srlx(crc, 8, crc);
+  xor3(O4, crc, crc);
+  bind(L_cleanup_check);
+  nop();
+  cmp_and_br_short(len, 0, Assembler::greaterUnsigned, Assembler::pt, L_cleanup_loop);
+
+  not1(crc);
+}
+