jdk-sandbox: comparison hotspot/src/cpu/sparc/vm/stubGenerator

equal deleted inserted replaced

-:510f77046e00
+:d972e3a2df53
 /*
-* Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
+* Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 __ delayed()->restore();
 return start;
 }
-#define CHUNK_LEN   128          /* 128 x 8B = 1KB */
-#define CHUNK_K1    0x1307a0206  /* reverseBits(pow(x, CHUNK_LEN*8*8*3 - 32) mod P(x)) << 1 */
-#define CHUNK_K2    0x1a0f717c4  /* reverseBits(pow(x, CHUNK_LEN*8*8*2 - 32) mod P(x)) << 1 */
-#define CHUNK_K3    0x0170076fa  /* reverseBits(pow(x, CHUNK_LEN*8*8*1 - 32) mod P(x)) << 1 */
 /**
 *  Arguments:
 *
 * Inputs:
 *   O0   - int   crc
 const Register crc   = O0;  // crc
 const Register buf   = O1;  // source java byte array address
 const Register len   = O2;  // number of bytes
 const Register table = O3;  // byteTable
-Label L_crc32c_head, L_crc32c_aligned;
+__ kernel_crc32c(crc, buf, len, table);
-Label L_crc32c_parallel, L_crc32c_parallel_loop;
-Label L_crc32c_serial, L_crc32c_x32_loop, L_crc32c_x8, L_crc32c_x8_loop;
-Label L_crc32c_done, L_crc32c_tail, L_crc32c_return;
-__ cmp_and_br_short(len, 0, Assembler::lessEqual, Assembler::pn, L_crc32c_return);
-// clear upper 32 bits of crc
-__ clruwu(crc);
-__ and3(buf, 7, G4);
-__ cmp_and_brx_short(G4, 0, Assembler::equal, Assembler::pt, L_crc32c_aligned);
-__ mov(8, G1);
-__ sub(G1, G4, G4);
-// ------ process the misaligned head (7 bytes or less) ------
-__ BIND(L_crc32c_head);
-// crc = (crc >>> 8) ^ byteTable[(crc ^ b) & 0xFF];
-__ ldub(buf, 0, G1);
-__ update_byte_crc32(crc, G1, table);
-__ inc(buf);
-__ dec(len);
-__ cmp_and_br_short(len, 0, Assembler::equal, Assembler::pn, L_crc32c_return);
-__ dec(G4);
-__ cmp_and_br_short(G4, 0, Assembler::greater, Assembler::pt, L_crc32c_head);
-// ------ process the 8-byte-aligned body ------
-__ BIND(L_crc32c_aligned);
-__ nop();
-__ cmp_and_br_short(len, 8, Assembler::less, Assembler::pn, L_crc32c_tail);
-// reverse the byte order of lower 32 bits to big endian, and move to FP side
-__ movitof_revbytes(crc, F0, G1, G3);
-__ set(CHUNK_LEN*8*4, G4);
-__ cmp_and_br_short(len, G4, Assembler::less, Assembler::pt, L_crc32c_serial);
-// ------ process four 1KB chunks in parallel ------
-__ BIND(L_crc32c_parallel);
-__ fzero(FloatRegisterImpl::D, F2);
-__ fzero(FloatRegisterImpl::D, F4);
-__ fzero(FloatRegisterImpl::D, F6);
-__ mov(CHUNK_LEN - 1, G4);
-__ BIND(L_crc32c_parallel_loop);
-// schedule ldf's ahead of crc32c's to hide the load-use latency
-__ ldf(FloatRegisterImpl::D, buf, 0,            F8);
-__ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*8,  F10);
-__ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*16, F12);
-__ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*24, F14);
-__ crc32c(F0, F8,  F0);
-__ crc32c(F2, F10, F2);
-__ crc32c(F4, F12, F4);
-__ crc32c(F6, F14, F6);
-__ inc(buf, 8);
-__ dec(G4);
-__ cmp_and_br_short(G4, 0, Assembler::greater, Assembler::pt, L_crc32c_parallel_loop);
-__ ldf(FloatRegisterImpl::D, buf, 0,            F8);
-__ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*8,  F10);
-__ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*16, F12);
-__ crc32c(F0, F8,  F0);
-__ crc32c(F2, F10, F2);
-__ crc32c(F4, F12, F4);
-__ inc(buf, CHUNK_LEN*24);
-__ ldfl(FloatRegisterImpl::D, buf, G0, F14);  // load in little endian
-__ inc(buf, 8);
-__ prefetch(buf, 0,            Assembler::severalReads);
-__ prefetch(buf, CHUNK_LEN*8,  Assembler::severalReads);
-__ prefetch(buf, CHUNK_LEN*16, Assembler::severalReads);
-__ prefetch(buf, CHUNK_LEN*24, Assembler::severalReads);
-// move to INT side, and reverse the byte order of lower 32 bits to little endian
-__ movftoi_revbytes(F0, O4, G1, G4);
-__ movftoi_revbytes(F2, O5, G1, G4);
-__ movftoi_revbytes(F4, G5, G1, G4);
-// combine the results of 4 chunks
-__ set64(CHUNK_K1, G3, G1);
-__ xmulx(O4, G3, O4);
-__ set64(CHUNK_K2, G3, G1);
-__ xmulx(O5, G3, O5);
-__ set64(CHUNK_K3, G3, G1);
-__ xmulx(G5, G3, G5);
-__ movdtox(F14, G4);
-__ xor3(O4, O5, O5);
-__ xor3(G5, O5, O5);
-__ xor3(G4, O5, O5);
-// reverse the byte order to big endian, via stack, and move to FP side
-__ add(SP, -8, G1);
-__ srlx(G1, 3, G1);
-__ sllx(G1, 3, G1);
-__ stx(O5, G1, G0);
-__ ldfl(FloatRegisterImpl::D, G1, G0, F2);  // load in little endian
-__ crc32c(F6, F2, F0);
-__ set(CHUNK_LEN*8*4, G4);
-__ sub(len, G4, len);
-__ cmp_and_br_short(len, G4, Assembler::greaterEqual, Assembler::pt, L_crc32c_parallel);
-__ nop();
-__ cmp_and_br_short(len, 0, Assembler::equal, Assembler::pt, L_crc32c_done);
-__ BIND(L_crc32c_serial);
-__ mov(32, G4);
-__ cmp_and_br_short(len, G4, Assembler::less, Assembler::pn, L_crc32c_x8);
-// ------ process 32B chunks ------
-__ BIND(L_crc32c_x32_loop);
-__ ldf(FloatRegisterImpl::D, buf, 0, F2);
-__ inc(buf, 8);
-__ crc32c(F0, F2, F0);
-__ ldf(FloatRegisterImpl::D, buf, 0, F2);
-__ inc(buf, 8);
-__ crc32c(F0, F2, F0);
-__ ldf(FloatRegisterImpl::D, buf, 0, F2);
-__ inc(buf, 8);
-__ crc32c(F0, F2, F0);
-__ ldf(FloatRegisterImpl::D, buf, 0, F2);
-__ inc(buf, 8);
-__ crc32c(F0, F2, F0);
-__ dec(len, 32);
-__ cmp_and_br_short(len, G4, Assembler::greaterEqual, Assembler::pt, L_crc32c_x32_loop);
-__ BIND(L_crc32c_x8);
-__ nop();
-__ cmp_and_br_short(len, 8, Assembler::less, Assembler::pt, L_crc32c_done);
-// ------ process 8B chunks ------
-__ BIND(L_crc32c_x8_loop);
-__ ldf(FloatRegisterImpl::D, buf, 0, F2);
-__ inc(buf, 8);
-__ crc32c(F0, F2, F0);
-__ dec(len, 8);
-__ cmp_and_br_short(len, 8, Assembler::greaterEqual, Assembler::pt, L_crc32c_x8_loop);
-__ BIND(L_crc32c_done);
-// move to INT side, and reverse the byte order of lower 32 bits to little endian
-__ movftoi_revbytes(F0, crc, G1, G3);
-__ cmp_and_br_short(len, 0, Assembler::equal, Assembler::pt, L_crc32c_return);
-// ------ process the misaligned tail (7 bytes or less) ------
-__ BIND(L_crc32c_tail);
-// crc = (crc >>> 8) ^ byteTable[(crc ^ b) & 0xFF];
-__ ldub(buf, 0, G1);
-__ update_byte_crc32(crc, G1, table);
-__ inc(buf);
-__ dec(len);
-__ cmp_and_br_short(len, 0, Assembler::greater, Assembler::pt, L_crc32c_tail);
-__ BIND(L_crc32c_return);
-__ nop();
 __ retl();
 __ delayed()->nop();
 return start;
 }
 if (UseCRC32Intrinsics) {
 // set table address before stub generation which use it
 StubRoutines::_crc_table_adr = (address)StubRoutines::Sparc::_crc_table;
 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
 }
+if (UseCRC32CIntrinsics) {
+// set table address before stub generation which use it
+StubRoutines::_crc32c_table_addr = (address)StubRoutines::Sparc::_crc32c_table;
+StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
+}
 }
 void generate_all() {
 // Generates all stubs and initializes the entry points
 }
 if (UseSHA512Intrinsics) {
 StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
 StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
 }
-// generate CRC32C intrinsic code
-if (UseCRC32CIntrinsics) {
-StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
-}
 // generate Adler32 intrinsics code
 if (UseAdler32Intrinsics) {
 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
 }
 }

changeset 38237	d972e3a2df53
parent 37466	287c4ebd11b0
child 38241	32eab2eb41fd