hotspot/src/cpu/sparc/vm/stubGenerator_sparc.cpp
changeset 38237 d972e3a2df53
parent 37466 287c4ebd11b0
child 38241 32eab2eb41fd
--- a/hotspot/src/cpu/sparc/vm/stubGenerator_sparc.cpp	Fri Apr 29 17:24:16 2016 +0200
+++ b/hotspot/src/cpu/sparc/vm/stubGenerator_sparc.cpp	Wed May 04 15:30:21 2016 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -4909,11 +4909,6 @@
       return start;
   }
 
-#define CHUNK_LEN   128          /* 128 x 8B = 1KB */
-#define CHUNK_K1    0x1307a0206  /* reverseBits(pow(x, CHUNK_LEN*8*8*3 - 32) mod P(x)) << 1 */
-#define CHUNK_K2    0x1a0f717c4  /* reverseBits(pow(x, CHUNK_LEN*8*8*2 - 32) mod P(x)) << 1 */
-#define CHUNK_K3    0x0170076fa  /* reverseBits(pow(x, CHUNK_LEN*8*8*1 - 32) mod P(x)) << 1 */
-
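
These folding constants move out of this file together with the kernel. As the removed comments state, each one is reverseBits(x^(CHUNK_LEN*8*8*i - 32) mod P(x)) << 1 for the CRC32C polynomial P(x). A minimal software sketch of that derivation, assuming a 32-bit reverseBits and the Castagnoli polynomial 0x1EDC6F41, illustrative only and not part of the patch:

// Sketch: compute reverseBits(x^n mod P(x)) << 1 for CRC32C's P(x).
// Bit i of 'r' is the coefficient of x^i; 0x1EDC6F41 is P(x) without the x^32 term.
static uint64_t chunk_constant(uint64_t n) {          // e.g. n = 128*8*8*3 - 32
  uint32_t r = 1;                                      // start from x^0
  for (uint64_t i = 0; i < n; i++) {
    bool carry = (r & 0x80000000u) != 0;
    r <<= 1;                                           // multiply by x
    if (carry) r ^= 0x1EDC6F41u;                       // reduce modulo P(x)
  }
  uint32_t rev = 0;                                    // reverseBits over 32 bits
  for (int i = 0; i < 32; i++) rev |= ((r >> i) & 1u) << (31 - i);
  return (uint64_t)rev << 1;
}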
   /**
    *  Arguments:
    *
@@ -4938,171 +4933,8 @@
     const Register len   = O2;  // number of bytes
     const Register table = O3;  // byteTable
 
-    Label L_crc32c_head, L_crc32c_aligned;
-    Label L_crc32c_parallel, L_crc32c_parallel_loop;
-    Label L_crc32c_serial, L_crc32c_x32_loop, L_crc32c_x8, L_crc32c_x8_loop;
-    Label L_crc32c_done, L_crc32c_tail, L_crc32c_return;
-
-    __ cmp_and_br_short(len, 0, Assembler::lessEqual, Assembler::pn, L_crc32c_return);
-
-    // clear upper 32 bits of crc
-    __ clruwu(crc);
-
-    __ and3(buf, 7, G4);
-    __ cmp_and_brx_short(G4, 0, Assembler::equal, Assembler::pt, L_crc32c_aligned);
-
-    __ mov(8, G1);
-    __ sub(G1, G4, G4);
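
The alignment computation above (and3/sub) corresponds to this scalar logic, with names chosen only for illustration:

// Number of head bytes to consume so that 'buf' becomes 8-byte aligned.
size_t misalign = (size_t)buf & 7;                       // G4 = buf & 7
size_t head_len = (misalign == 0) ? 0 : 8 - misalign;    // taken only when misaligned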
-
-    // ------ process the misaligned head (7 bytes or less) ------
-    __ BIND(L_crc32c_head);
-
-    // crc = (crc >>> 8) ^ byteTable[(crc ^ b) & 0xFF];
-    __ ldub(buf, 0, G1);
-    __ update_byte_crc32(crc, G1, table);
-
-    __ inc(buf);
-    __ dec(len);
-    __ cmp_and_br_short(len, 0, Assembler::equal, Assembler::pn, L_crc32c_return);
-    __ dec(G4);
-    __ cmp_and_br_short(G4, 0, Assembler::greater, Assembler::pt, L_crc32c_head);
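
The head loop (and the tail loop further down) implements the table-driven update quoted in the comment; in plain C it is one line per byte, shown here with a hypothetical helper name:

// One byte of reflected, table-driven CRC32C, as in the comment above:
// crc = (crc >>> 8) ^ byteTable[(crc ^ b) & 0xFF]
static inline uint32_t crc32c_update_byte(uint32_t crc, uint8_t b,
                                           const uint32_t byteTable[256]) {
  return (crc >> 8) ^ byteTable[(crc ^ b) & 0xFF];
}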
-
-    // ------ process the 8-byte-aligned body ------
-    __ BIND(L_crc32c_aligned);
-    __ nop();
-    __ cmp_and_br_short(len, 8, Assembler::less, Assembler::pn, L_crc32c_tail);
-
-    // reverse the byte order of lower 32 bits to big endian, and move to FP side
-    __ movitof_revbytes(crc, F0, G1, G3);
-
-    __ set(CHUNK_LEN*8*4, G4);
-    __ cmp_and_br_short(len, G4, Assembler::less, Assembler::pt, L_crc32c_serial);
-
-    // ------ process four 1KB chunks in parallel ------
-    __ BIND(L_crc32c_parallel);
-
-    __ fzero(FloatRegisterImpl::D, F2);
-    __ fzero(FloatRegisterImpl::D, F4);
-    __ fzero(FloatRegisterImpl::D, F6);
-
-    __ mov(CHUNK_LEN - 1, G4);
-    __ BIND(L_crc32c_parallel_loop);
-    // schedule ldf's ahead of crc32c's to hide the load-use latency
-    __ ldf(FloatRegisterImpl::D, buf, 0,            F8);
-    __ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*8,  F10);
-    __ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*16, F12);
-    __ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*24, F14);
-    __ crc32c(F0, F8,  F0);
-    __ crc32c(F2, F10, F2);
-    __ crc32c(F4, F12, F4);
-    __ crc32c(F6, F14, F6);
-    __ inc(buf, 8);
-    __ dec(G4);
-    __ cmp_and_br_short(G4, 0, Assembler::greater, Assembler::pt, L_crc32c_parallel_loop);
-
-    __ ldf(FloatRegisterImpl::D, buf, 0,            F8);
-    __ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*8,  F10);
-    __ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*16, F12);
-    __ crc32c(F0, F8,  F0);
-    __ crc32c(F2, F10, F2);
-    __ crc32c(F4, F12, F4);
-
-    __ inc(buf, CHUNK_LEN*24);
-    __ ldfl(FloatRegisterImpl::D, buf, G0, F14);  // load in little endian
-    __ inc(buf, 8);
-
-    __ prefetch(buf, 0,            Assembler::severalReads);
-    __ prefetch(buf, CHUNK_LEN*8,  Assembler::severalReads);
-    __ prefetch(buf, CHUNK_LEN*16, Assembler::severalReads);
-    __ prefetch(buf, CHUNK_LEN*24, Assembler::severalReads);
-
-    // move to INT side, and reverse the byte order of lower 32 bits to little endian
-    __ movftoi_revbytes(F0, O4, G1, G4);
-    __ movftoi_revbytes(F2, O5, G1, G4);
-    __ movftoi_revbytes(F4, G5, G1, G4);
-
-    // combine the results of 4 chunks
-    __ set64(CHUNK_K1, G3, G1);
-    __ xmulx(O4, G3, O4);
-    __ set64(CHUNK_K2, G3, G1);
-    __ xmulx(O5, G3, O5);
-    __ set64(CHUNK_K3, G3, G1);
-    __ xmulx(G5, G3, G5);
-
-    __ movdtox(F14, G4);
-    __ xor3(O4, O5, O5);
-    __ xor3(G5, O5, O5);
-    __ xor3(G4, O5, O5);
-
-    // reverse the byte order to big endian, via stack, and move to FP side
-    __ add(SP, -8, G1);
-    __ srlx(G1, 3, G1);
-    __ sllx(G1, 3, G1);
-    __ stx(O5, G1, G0);
-    __ ldfl(FloatRegisterImpl::D, G1, G0, F2);  // load in little endian
-
-    __ crc32c(F6, F2, F0);
-
-    __ set(CHUNK_LEN*8*4, G4);
-    __ sub(len, G4, len);
-    __ cmp_and_br_short(len, G4, Assembler::greaterEqual, Assembler::pt, L_crc32c_parallel);
-    __ nop();
-    __ cmp_and_br_short(len, 0, Assembler::equal, Assembler::pt, L_crc32c_done);
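
The parallel section above keeps four independent accumulators (F0, F2, F4, F6), one per 1KB chunk, so the crc32c latency of one chunk is hidden behind the other three. At the end of each 4KB block the first three partial CRCs are folded forward with carry-less multiplies (xmulx) by CHUNK_K1..K3, XORed with the last 8 bytes of the fourth chunk, and absorbed into the fourth accumulator with one more crc32c. A software stand-in for the xmulx step, with crc0..crc2 and last8 as illustrative names:

// GF(2) carry-less multiply, low 64 bits, mirroring what XMULX provides in hardware.
static uint64_t xmul64(uint64_t a, uint64_t b) {
  uint64_t r = 0;
  for (int i = 0; i < 64; i++) {
    if ((b >> i) & 1) r ^= a << i;
  }
  return r;
}

// Fold the first three chunk CRCs into the fourth chunk's stream (illustrative;
// crc0..crc2 are the byte-reversed accumulators, last8 the final little-endian
// dword of chunk 3):
uint64_t folded = xmul64(crc0, CHUNK_K1) ^ xmul64(crc1, CHUNK_K2)
                ^ xmul64(crc2, CHUNK_K3) ^ last8;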
-
-    __ BIND(L_crc32c_serial);
-
-    __ mov(32, G4);
-    __ cmp_and_br_short(len, G4, Assembler::less, Assembler::pn, L_crc32c_x8);
-
-    // ------ process 32B chunks ------
-    __ BIND(L_crc32c_x32_loop);
-    __ ldf(FloatRegisterImpl::D, buf, 0, F2);
-    __ inc(buf, 8);
-    __ crc32c(F0, F2, F0);
-    __ ldf(FloatRegisterImpl::D, buf, 0, F2);
-    __ inc(buf, 8);
-    __ crc32c(F0, F2, F0);
-    __ ldf(FloatRegisterImpl::D, buf, 0, F2);
-    __ inc(buf, 8);
-    __ crc32c(F0, F2, F0);
-    __ ldf(FloatRegisterImpl::D, buf, 0, F2);
-    __ inc(buf, 8);
-    __ crc32c(F0, F2, F0);
-    __ dec(len, 32);
-    __ cmp_and_br_short(len, G4, Assembler::greaterEqual, Assembler::pt, L_crc32c_x32_loop);
-
-    __ BIND(L_crc32c_x8);
-    __ nop();
-    __ cmp_and_br_short(len, 8, Assembler::less, Assembler::pt, L_crc32c_done);
-
-    // ------ process 8B chunks ------
-    __ BIND(L_crc32c_x8_loop);
-    __ ldf(FloatRegisterImpl::D, buf, 0, F2);
-    __ inc(buf, 8);
-    __ crc32c(F0, F2, F0);
-    __ dec(len, 8);
-    __ cmp_and_br_short(len, 8, Assembler::greaterEqual, Assembler::pt, L_crc32c_x8_loop);
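
Once fewer than 4KB remain, the serial path falls back to the unrolled 32-byte loop and then the 8-byte loop above, each step feeding one doubleword to the CRC32C instruction. Their scalar shape, assuming hypothetical crc32c_u64() and load_u64() helpers standing in for the hardware instruction and the doubleword load:

while (len >= 32) {                        // L_crc32c_x32_loop, 4x unrolled
  crc = crc32c_u64(crc, load_u64(buf));  buf += 8;
  crc = crc32c_u64(crc, load_u64(buf));  buf += 8;
  crc = crc32c_u64(crc, load_u64(buf));  buf += 8;
  crc = crc32c_u64(crc, load_u64(buf));  buf += 8;
  len -= 32;
}
while (len >= 8) {                         // L_crc32c_x8_loop
  crc = crc32c_u64(crc, load_u64(buf));  buf += 8;
  len -= 8;
}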
-
-    __ BIND(L_crc32c_done);
-
-    // move to INT side, and reverse the byte order of lower 32 bits to little endian
-    __ movftoi_revbytes(F0, crc, G1, G3);
-
-    __ cmp_and_br_short(len, 0, Assembler::equal, Assembler::pt, L_crc32c_return);
-
-    // ------ process the misaligned tail (7 bytes or less) ------
-    __ BIND(L_crc32c_tail);
-
-    // crc = (crc >>> 8) ^ byteTable[(crc ^ b) & 0xFF];
-    __ ldub(buf, 0, G1);
-    __ update_byte_crc32(crc, G1, table);
-
-    __ inc(buf);
-    __ dec(len);
-    __ cmp_and_br_short(len, 0, Assembler::greater, Assembler::pt, L_crc32c_tail);
-
-    __ BIND(L_crc32c_return);
-    __ nop();
+    __ kernel_crc32c(crc, buf, len, table);
+
     __ retl();
     __ delayed()->nop();
 
@@ -5366,6 +5198,12 @@
       StubRoutines::_crc_table_adr = (address)StubRoutines::Sparc::_crc_table;
       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
     }
+
+    if (UseCRC32CIntrinsics) {
+      // set table address before stub generation, which uses it
+      StubRoutines::_crc32c_table_addr = (address)StubRoutines::Sparc::_crc32c_table;
+      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
+    }
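
The registration now happens alongside the CRC32 stub in the initial stub group, and the table address is published first because the generated kernel is expected to embed it. A hedged illustration of that ordering dependency (hypothetical usage, not the actual kernel code):

// If kernel_crc32c materializes the table base as an immediate, the field must
// already be valid when the stub is emitted, e.g. something along the lines of:
//   __ set((intptr_t)StubRoutines::_crc32c_table_addr, table);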
   }
 
 
@@ -5425,12 +5263,6 @@
       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
     }
-
-    // generate CRC32C intrinsic code
-    if (UseCRC32CIntrinsics) {
-      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
-    }
-
     // generate Adler32 intrinsics code
     if (UseAdler32Intrinsics) {
       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();