hotspot/src/cpu/sparc/vm/macroAssembler_sparc.cpp
changeset 38237 d972e3a2df53
parent 38142 e16b23089599
child 38241 32eab2eb41fd
--- a/hotspot/src/cpu/sparc/vm/macroAssembler_sparc.cpp	Fri Apr 29 17:24:16 2016 +0200
+++ b/hotspot/src/cpu/sparc/vm/macroAssembler_sparc.cpp	Wed May 04 15:30:21 2016 -0700
@@ -4837,21 +4837,21 @@
 
 // Reverse byte order of lower 32 bits, assuming upper 32 bits all zeros
 void MacroAssembler::reverse_bytes_32(Register src, Register dst, Register tmp) {
-  srlx(src, 24, dst);
-
-  sllx(src, 32+8, tmp);
-  srlx(tmp, 32+24, tmp);
-  sllx(tmp, 8, tmp);
-  or3(dst, tmp, dst);
-
-  sllx(src, 32+16, tmp);
-  srlx(tmp, 32+24, tmp);
-  sllx(tmp, 16, tmp);
-  or3(dst, tmp, dst);
-
-  sllx(src, 32+24, tmp);
-  srlx(tmp, 32, tmp);
-  or3(dst, tmp, dst);
+    srlx(src, 24, dst);
+
+    sllx(src, 32+8, tmp);
+    srlx(tmp, 32+24, tmp);
+    sllx(tmp, 8, tmp);
+    or3(dst, tmp, dst);
+
+    sllx(src, 32+16, tmp);
+    srlx(tmp, 32+24, tmp);
+    sllx(tmp, 16, tmp);
+    or3(dst, tmp, dst);
+
+    sllx(src, 32+24, tmp);
+    srlx(tmp, 32, tmp);
+    or3(dst, tmp, dst);
 }
 
 void MacroAssembler::movitof_revbytes(Register src, FloatRegister dst, Register tmp1, Register tmp2) {
@@ -5103,3 +5103,176 @@
   not1(crc);
 }
 
+#define CHUNK_LEN   128          /* 128 x 8B = 1KB */
+#define CHUNK_K1    0x1307a0206  /* reverseBits(pow(x, CHUNK_LEN*8*8*3 - 32) mod P(x)) << 1 */
+#define CHUNK_K2    0x1a0f717c4  /* reverseBits(pow(x, CHUNK_LEN*8*8*2 - 32) mod P(x)) << 1 */
+#define CHUNK_K3    0x0170076fa  /* reverseBits(pow(x, CHUNK_LEN*8*8*1 - 32) mod P(x)) << 1 */
+
+void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len, Register table) {
+
+  Label L_crc32c_head, L_crc32c_aligned;
+  Label L_crc32c_parallel, L_crc32c_parallel_loop;
+  Label L_crc32c_serial, L_crc32c_x32_loop, L_crc32c_x8, L_crc32c_x8_loop;
+  Label L_crc32c_done, L_crc32c_tail, L_crc32c_return;
+
+  set(ExternalAddress(StubRoutines::crc32c_table_addr()), table);
+
+  cmp_and_br_short(len, 0, Assembler::lessEqual, Assembler::pn, L_crc32c_return);
+
+  // clear upper 32 bits of crc
+  clruwu(crc);
+
+  and3(buf, 7, G4);
+  cmp_and_brx_short(G4, 0, Assembler::equal, Assembler::pt, L_crc32c_aligned);
+
+  mov(8, G1);
+  sub(G1, G4, G4);
+
+  // ------ process the misaligned head (7 bytes or less) ------
+  bind(L_crc32c_head);
+
+  // crc = (crc >>> 8) ^ byteTable[(crc ^ b) & 0xFF];
+  ldub(buf, 0, G1);
+  update_byte_crc32(crc, G1, table);
+
+  inc(buf);
+  dec(len);
+  cmp_and_br_short(len, 0, Assembler::equal, Assembler::pn, L_crc32c_return);
+  dec(G4);
+  cmp_and_br_short(G4, 0, Assembler::greater, Assembler::pt, L_crc32c_head);
+
+  // ------ process the 8-byte-aligned body ------
+  bind(L_crc32c_aligned);
+  nop();
+  cmp_and_br_short(len, 8, Assembler::less, Assembler::pn, L_crc32c_tail);
+
+  // reverse the byte order of lower 32 bits to big endian, and move to FP side
+  movitof_revbytes(crc, F0, G1, G3);
+
+  set(CHUNK_LEN*8*4, G4);
+  cmp_and_br_short(len, G4, Assembler::less, Assembler::pt, L_crc32c_serial);
+
+  // ------ process four 1KB chunks in parallel ------
+  bind(L_crc32c_parallel);
+
+  fzero(FloatRegisterImpl::D, F2);
+  fzero(FloatRegisterImpl::D, F4);
+  fzero(FloatRegisterImpl::D, F6);
+
+  mov(CHUNK_LEN - 1, G4);
+  bind(L_crc32c_parallel_loop);
+  // schedule ldf's ahead of crc32c's to hide the load-use latency
+  ldf(FloatRegisterImpl::D, buf, 0,            F8);
+  ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*8,  F10);
+  ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*16, F12);
+  ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*24, F14);
+  crc32c(F0, F8,  F0);
+  crc32c(F2, F10, F2);
+  crc32c(F4, F12, F4);
+  crc32c(F6, F14, F6);
+  inc(buf, 8);
+  dec(G4);
+  cmp_and_br_short(G4, 0, Assembler::greater, Assembler::pt, L_crc32c_parallel_loop);
+
+  ldf(FloatRegisterImpl::D, buf, 0,            F8);
+  ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*8,  F10);
+  ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*16, F12);
+  crc32c(F0, F8,  F0);
+  crc32c(F2, F10, F2);
+  crc32c(F4, F12, F4);
+
+  inc(buf, CHUNK_LEN*24);
+  ldfl(FloatRegisterImpl::D, buf, G0, F14);  // load in little endian
+  inc(buf, 8);
+
+  prefetch(buf, 0,            Assembler::severalReads);
+  prefetch(buf, CHUNK_LEN*8,  Assembler::severalReads);
+  prefetch(buf, CHUNK_LEN*16, Assembler::severalReads);
+  prefetch(buf, CHUNK_LEN*24, Assembler::severalReads);
+
+  // move to INT side, and reverse the byte order of lower 32 bits to little endian
+  movftoi_revbytes(F0, O4, G1, G4);
+  movftoi_revbytes(F2, O5, G1, G4);
+  movftoi_revbytes(F4, G5, G1, G4);
+
+  // combine the results of 4 chunks
+  set64(CHUNK_K1, G3, G1);
+  xmulx(O4, G3, O4);
+  set64(CHUNK_K2, G3, G1);
+  xmulx(O5, G3, O5);
+  set64(CHUNK_K3, G3, G1);
+  xmulx(G5, G3, G5);
+
+  movdtox(F14, G4);
+  xor3(O4, O5, O5);
+  xor3(G5, O5, O5);
+  xor3(G4, O5, O5);
+
+  // reverse the byte order to big endian, via stack, and move to FP side
+  // TODO: use new revb instruction
+  add(SP, -8, G1);
+  srlx(G1, 3, G1);
+  sllx(G1, 3, G1);
+  stx(O5, G1, G0);
+  ldfl(FloatRegisterImpl::D, G1, G0, F2);  // load in little endian
+
+  crc32c(F6, F2, F0);
+
+  set(CHUNK_LEN*8*4, G4);
+  sub(len, G4, len);
+  cmp_and_br_short(len, G4, Assembler::greaterEqual, Assembler::pt, L_crc32c_parallel);
+  nop();
+  cmp_and_br_short(len, 0, Assembler::equal, Assembler::pt, L_crc32c_done);
+
+  bind(L_crc32c_serial);
+
+  mov(32, G4);
+  cmp_and_br_short(len, G4, Assembler::less, Assembler::pn, L_crc32c_x8);
+
+  // ------ process 32B chunks ------
+  bind(L_crc32c_x32_loop);
+  ldf(FloatRegisterImpl::D, buf, 0, F2);
+  crc32c(F0, F2, F0);
+  ldf(FloatRegisterImpl::D, buf, 8, F2);
+  crc32c(F0, F2, F0);
+  ldf(FloatRegisterImpl::D, buf, 16, F2);
+  crc32c(F0, F2, F0);
+  ldf(FloatRegisterImpl::D, buf, 24, F2);
+  inc(buf, 32);
+  crc32c(F0, F2, F0);
+  dec(len, 32);
+  cmp_and_br_short(len, G4, Assembler::greaterEqual, Assembler::pt, L_crc32c_x32_loop);
+
+  bind(L_crc32c_x8);
+  nop();
+  cmp_and_br_short(len, 8, Assembler::less, Assembler::pt, L_crc32c_done);
+
+  // ------ process 8B chunks ------
+  bind(L_crc32c_x8_loop);
+  ldf(FloatRegisterImpl::D, buf, 0, F2);
+  inc(buf, 8);
+  crc32c(F0, F2, F0);
+  dec(len, 8);
+  cmp_and_br_short(len, 8, Assembler::greaterEqual, Assembler::pt, L_crc32c_x8_loop);
+
+  bind(L_crc32c_done);
+
+  // move to INT side, and reverse the byte order of lower 32 bits to little endian
+  movftoi_revbytes(F0, crc, G1, G3);
+
+  cmp_and_br_short(len, 0, Assembler::equal, Assembler::pt, L_crc32c_return);
+
+  // ------ process the misaligned tail (7 bytes or less) ------
+  bind(L_crc32c_tail);
+
+  // crc = (crc >>> 8) ^ byteTable[(crc ^ b) & 0xFF];
+  ldub(buf, 0, G1);
+  update_byte_crc32(crc, G1, table);
+
+  inc(buf);
+  dec(len);
+  cmp_and_br_short(len, 0, Assembler::greater, Assembler::pt, L_crc32c_tail);
+
+  bind(L_crc32c_return);
+  nop();
+}