8134553: CRC32C implementations for x86/x64 targets
Reviewed-by: kvn
Contributed-by: tomasz.wojtowicz@intel.com
--- a/hotspot/src/cpu/aarch64/vm/interpreterGenerator_aarch64.hpp Wed Sep 16 13:16:17 2015 -0700
+++ b/hotspot/src/cpu/aarch64/vm/interpreterGenerator_aarch64.hpp Wed Sep 16 15:54:32 2015 -0700
@@ -48,6 +48,7 @@
address generate_Reference_get_entry();
address generate_CRC32_update_entry();
address generate_CRC32_updateBytes_entry(AbstractInterpreter::MethodKind kind);
+ address generate_CRC32C_updateBytes_entry(AbstractInterpreter::MethodKind kind) { return NULL; } // aarch64: no CRC32C interpreter entry is generated here, always returns NULL
void lock_method(void);
void generate_stack_overflow_check(void);
--- a/hotspot/src/cpu/ppc/vm/interpreterGenerator_ppc.hpp Wed Sep 16 13:16:17 2015 -0700
+++ b/hotspot/src/cpu/ppc/vm/interpreterGenerator_ppc.hpp Wed Sep 16 15:54:32 2015 -0700
@@ -38,5 +38,6 @@
address generate_CRC32_update_entry();
address generate_CRC32_updateBytes_entry(AbstractInterpreter::MethodKind kind);
+ address generate_CRC32C_updateBytes_entry(AbstractInterpreter::MethodKind kind) { return NULL; } // ppc: no CRC32C interpreter entry is generated here, always returns NULL
#endif // CPU_PPC_VM_INTERPRETERGENERATOR_PPC_HPP
--- a/hotspot/src/cpu/sparc/vm/interpreterGenerator_sparc.hpp Wed Sep 16 13:16:17 2015 -0700
+++ b/hotspot/src/cpu/sparc/vm/interpreterGenerator_sparc.hpp Wed Sep 16 15:54:32 2015 -0700
@@ -48,4 +48,5 @@
// Not supported
address generate_CRC32_update_entry() { return NULL; }
address generate_CRC32_updateBytes_entry(AbstractInterpreter::MethodKind kind) { return NULL; }
+ address generate_CRC32C_updateBytes_entry(AbstractInterpreter::MethodKind kind) { return NULL; } // sparc: no CRC32C interpreter entry is generated here, always returns NULL
#endif // CPU_SPARC_VM_INTERPRETERGENERATOR_SPARC_HPP
--- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp Wed Sep 16 13:16:17 2015 -0700
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp Wed Sep 16 15:54:32 2015 -0700
@@ -1604,6 +1604,85 @@
emit_int8((unsigned char)0xA2);
}
+// Opcode / Instruction Op / En 64 - Bit Mode Compat / Leg Mode Description Implemented
+// F2 0F 38 F0 / r CRC32 r32, r / m8 RM Valid Valid Accumulate CRC32 on r / m8. v
+// F2 REX 0F 38 F0 / r CRC32 r32, r / m8* RM Valid N.E. Accumulate CRC32 on r / m8. -
+// F2 REX.W 0F 38 F0 / r CRC32 r64, r / m8 RM Valid N.E. Accumulate CRC32 on r / m8. -
+//
+// F2 0F 38 F1 / r CRC32 r32, r / m16 RM Valid Valid Accumulate CRC32 on r / m16. v
+//
+// F2 0F 38 F1 / r CRC32 r32, r / m32 RM Valid Valid Accumulate CRC32 on r / m32. v
+//
+// F2 REX.W 0F 38 F1 / r CRC32 r64, r / m64 RM Valid N.E. Accumulate CRC32 on r / m64. v
+void Assembler::crc32(Register crc, Register v, int8_t sizeInBytes) {
+ // Emits the register form of CRC32: accumulates the low sizeInBytes bytes of v
+ // into crc. sizeInBytes must be 1, 2, 4 or (64-bit VM only) 8.
+ // Note: the REX-only "r/m8*" form is not generated, so in a 64-bit VM a byte
+ // source must not be a register with encoding 4..7 (no REX would be emitted
+ // for it).
+ assert(VM_Version::supports_sse4_2(), "");
+ int8_t w = 0x01; // low opcode bit: 0 selects F0 (r/m8), 1 selects F1 (r/m16/32/64)
+ Prefix p = Prefix_EMPTY;
+
+ switch (sizeInBytes) {
+ case 1:
+ w = 0;
+ break;
+ case 2:
+ // The r/m16 form shares opcode F1 with r/m32; only the operand-size override
+ // prefix tells them apart. It is emitted ahead of the mandatory F2 prefix.
+ emit_int8(0x66);
+ break;
+ case 4:
+ break;
+ LP64_ONLY(case 8:)
+ // This instruction is not valid in 32 bits
+ // Note:
+ // http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
+ //
+ // Page B - 72 Vol. 2C says
+ // qwreg2 to qwreg 1111 0010 : 0100 1R0B : 0000 1111 : 0011 1000 : 1111 0000 : 11 qwreg1 qwreg2
+ // mem64 to qwreg 1111 0010 : 0100 1R0B : 0000 1111 : 0011 1000 : 1111 0000 : mod qwreg r / m
+ // F0!!!
+ // while 3 - 208 Vol. 2A
+ // F2 REX.W 0F 38 F1 / r CRC32 r64, r / m64 RM Valid N.E.Accumulate CRC32 on r / m64.
+ //
+ // the 0 on a last bit is reserved for a different flavor of this instruction :
+ // F2 REX.W 0F 38 F0 / r CRC32 r64, r / m8 RM Valid N.E.Accumulate CRC32 on r / m8.
+ p = REX_W;
+ break;
+ default:
+ assert(0, "Unsupported value for a sizeInBytes argument");
+ break;
+ }
+ // Mandatory F2 prefix; any REX byte emitted by prefix() below must sit between
+ // it and the 0F escape byte.
+ emit_int8((int8_t)0xF2);
+ LP64_ONLY(prefix(crc, v, p);)
+ emit_int8((int8_t)0x0F);
+ emit_int8(0x38);
+ emit_int8((int8_t)(0xF0 | w));
+ // ModRM: mod=11 (register direct), reg=crc, r/m=v
+ emit_int8(0xC0 | ((crc->encoding() & 0x7) << 3) | (v->encoding() & 7));
+}
+
+void Assembler::crc32(Register crc, Address adr, int8_t sizeInBytes) {
+ // Emits the memory form of CRC32: accumulates sizeInBytes bytes read from adr
+ // into crc. sizeInBytes must be 1, 2, 4 or (64-bit VM only) 8.
+ assert(VM_Version::supports_sse4_2(), "");
+ InstructionMark im(this);
+ int8_t w = 0x01; // low opcode bit: 0 selects F0 (m8), 1 selects F1 (m16/32/64)
+ Prefix p = Prefix_EMPTY;
+
+ switch (sizeInBytes) {
+ case 1:
+ w = 0;
+ break;
+ case 2:
+ // The m16 form shares opcode F1 with m32; without the operand-size override
+ // prefix the instruction would consume four bytes instead of two.
+ emit_int8(0x66);
+ break;
+ case 4:
+ break;
+ LP64_ONLY(case 8:)
+ // This instruction is not valid in 32 bits
+ p = REX_W;
+ break;
+ default:
+ assert(0, "Unsupported value for a sizeInBytes argument");
+ break;
+ }
+ // Mandatory F2 prefix; any REX byte emitted by prefix() below must sit between
+ // it and the 0F escape byte.
+ emit_int8((int8_t)0xF2);
+ LP64_ONLY(prefix(crc, adr, p);)
+ emit_int8((int8_t)0x0F);
+ emit_int8(0x38);
+ emit_int8((int8_t)(0xF0 | w));
+ emit_operand(crc, adr);
+}
+
void Assembler::cvtdq2pd(XMMRegister dst, XMMRegister src) {
NOT_LP64(assert(VM_Version::supports_sse2(), ""));
emit_simd_arith_nonds(0xE6, dst, src, VEX_SIMD_F3, /* no_mask_reg */ false, /* legacy_mode */ true);
@@ -6223,6 +6302,14 @@
emit_int8((unsigned char)(0xC0 | src->encoding() << 3 | dst->encoding()));
}
+// 0F A4 / r ib -- double-precision shift left of dst by an immediate count, register-direct operands only
+void Assembler::shldl(Register dst, Register src, int8_t imm8) {
+ emit_int8(0x0F);
+ emit_int8((unsigned char)0xA4);
+ emit_int8((unsigned char)(0xC0 | src->encoding() << 3 | dst->encoding())); // ModRM: mod=11, reg=src, r/m=dst; no REX prefix is emitted by this function
+ emit_int8(imm8); // ib: the shift count
+}
+
void Assembler::shrdl(Register dst, Register src) {
emit_int8(0x0F);
emit_int8((unsigned char)0xAD);
@@ -6408,6 +6495,40 @@
}
}
+void Assembler::prefix(Register dst, Register src, Prefix p) {
+ // Fold the register-extension bits of both operands into the requested
+ // prefix: REX_R for an extended dst, REX_B for an extended src.
+ int rex = p;
+ if (dst->encoding() >= 8) rex |= REX_R;
+ if (src->encoding() >= 8) rex |= REX_B;
+ // An empty prefix must not be generated at all.
+ if (rex == Prefix_EMPTY) return;
+ prefix((Prefix)rex);
+}
+
+void Assembler::prefix(Register dst, Address adr, Prefix p) {
+ // Emits at most ONE REX byte that combines the bits requested in p (e.g.
+ // REX_W) with REX_B for an extended base register and REX_R for an extended
+ // dst. Extended index registers (REX.X) are not supported.
+ if (adr.base_needs_rex()) {
+ if (adr.index_needs_rex()) {
+ assert(false, "prefix(Register dst, Address adr, Prefix p) does not support handling of an X");
+ } else {
+ // Merge into p instead of emitting REX_B on its own: only the REX byte that
+ // immediately precedes the opcode is honoured, so a second REX emitted
+ // below (for REX_W / REX_R) would discard the base-register extension.
+ p = (Prefix)(p | REX_B);
+ }
+ } else {
+ if (adr.index_needs_rex()) {
+ assert(false, "prefix(Register dst, Address adr, Prefix p) does not support handling of an X");
+ }
+ }
+ if (dst->encoding() >= 8) {
+ p = (Prefix)(p | REX_R);
+ }
+ if (p != Prefix_EMPTY) {
+ // do not generate an empty prefix
+ prefix(p);
+ }
+}
+
void Assembler::prefix(Address adr) {
if (adr.base_needs_rex()) {
if (adr.index_needs_rex()) {
--- a/hotspot/src/cpu/x86/vm/assembler_x86.hpp Wed Sep 16 13:16:17 2015 -0700
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.hpp Wed Sep 16 15:54:32 2015 -0700
@@ -506,7 +506,8 @@
VEX_3bytes = 0xC4,
VEX_2bytes = 0xC5,
- EVEX_4bytes = 0x62
+ EVEX_4bytes = 0x62,
+ Prefix_EMPTY = 0x0
};
enum VexPrefix {
@@ -615,6 +616,8 @@
int prefixq_and_encode(int dst_enc, int src_enc);
void prefix(Register reg);
+ void prefix(Register dst, Register src, Prefix p);
+ void prefix(Register dst, Address adr, Prefix p);
void prefix(Address adr);
void prefixq(Address adr);
@@ -1177,6 +1180,10 @@
// Identify processor type and features
void cpuid();
+ // CRC32C
+ void crc32(Register crc, Register v, int8_t sizeInBytes);
+ void crc32(Register crc, Address adr, int8_t sizeInBytes);
+
// Convert Scalar Double-Precision Floating-Point Value to Scalar Single-Precision Floating-Point Value
void cvtsd2ss(XMMRegister dst, XMMRegister src);
void cvtsd2ss(XMMRegister dst, Address src);
@@ -1783,6 +1790,7 @@
void setb(Condition cc, Register dst);
void shldl(Register dst, Register src);
+ void shldl(Register dst, Register src, int8_t imm8);
void shll(Register dst, int imm8);
void shll(Register dst);
--- a/hotspot/src/cpu/x86/vm/assembler_x86.inline.hpp Wed Sep 16 13:16:17 2015 -0700
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.inline.hpp Wed Sep 16 15:54:32 2015 -0700
@@ -37,6 +37,8 @@
inline int Assembler::prefixq_and_encode(int dst_enc, int src_enc) { return dst_enc << 3 | src_enc; }
inline void Assembler::prefix(Register reg) {}
+inline void Assembler::prefix(Register dst, Register src, Prefix p) {}
+inline void Assembler::prefix(Register dst, Address adr, Prefix p) {}
inline void Assembler::prefix(Address adr) {}
inline void Assembler::prefixq(Address adr) {}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/x86/vm/crc32c.h Wed Sep 16 15:54:32 2015 -0700
@@ -0,0 +1,66 @@
+/*
+* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+*
+* This code is free software; you can redistribute it and/or modify it
+* under the terms of the GNU General Public License version 2 only, as
+* published by the Free Software Foundation.
+*
+* This code is distributed in the hope that it will be useful, but WITHOUT
+* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+* version 2 for more details (a copy is included in the LICENSE file that
+* accompanied this code).
+*
+* You should have received a copy of the GNU General Public License version
+* 2 along with this work; if not, write to the Free Software Foundation,
+* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+*
+* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+* or visit www.oracle.com if you need additional information or have any
+* questions.
+*
+*/
+
+enum {
+ // S. Gueron / Information Processing Letters 112 (2012) 184
+ // shows than anything above 6K and below 32K is a good choice
+ // 32K does not deliver any further performance gains
+ // 6K=8*256 (*3 as we compute 3 blocks together)
+ //
+ // Thus selecting the smallest value so it could apply to the largest number
+ // of buffer sizes.
+ CRC32C_HIGH = 8 * 256,
+
+ // empirical
+ // based on ubench study using methodology described in
+ // V. Gopal et al. / Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction April 2011 8
+ //
+ // arbitrary value between 27 and 256
+ CRC32C_MIDDLE = 8 * 86,
+
+ // V. Gopal et al. / Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction April 2011 9
+ // shows that 240 and 1024 are equally good choices as the 216==8*27
+ //
+ // Selecting the smallest value which resulted in a significant performance improvement over
+ // sequential version
+ CRC32C_LOW = 8 * 27,
+
+ CRC32C_NUM_ChunkSizeInBytes = 3,
+
+ // We need to compute powers of 64N and 128N for each "chunk" size
+ CRC32C_NUM_PRECOMPUTED_CONSTANTS = ( 2 * CRC32C_NUM_ChunkSizeInBytes )
+};
+// Notes:
+// 1. Why we need to choose a "chunk" approach?
+// Overhead of computing a powers and powers of for an arbitrary buffer of size N is significant
+// (implementation approaches a library perf.)
+// 2. Why only 3 "chunks"?
+// Performance experiments results showed that a HIGH+LOW was not delivering a stable speedup
+// curve.
+//
+// Disclaimer:
+// If you ever decide to increase/decrease number of "chunks" be sure to modify
+// a) constants table generation (hotspot/src/cpu/x86/vm/stubRoutines_x86.cpp)
+// b) constant fetch from that table (macroAssembler_x86.cpp)
+// c) unrolled for loop (macroAssembler_x86.cpp)
--- a/hotspot/src/cpu/x86/vm/interpreterGenerator_x86.hpp Wed Sep 16 13:16:17 2015 -0700
+++ b/hotspot/src/cpu/x86/vm/interpreterGenerator_x86.hpp Wed Sep 16 15:54:32 2015 -0700
@@ -42,6 +42,7 @@
address generate_Reference_get_entry();
address generate_CRC32_update_entry();
address generate_CRC32_updateBytes_entry(AbstractInterpreter::MethodKind kind);
+ address generate_CRC32C_updateBytes_entry(AbstractInterpreter::MethodKind kind);
#ifndef _LP64
address generate_Float_intBitsToFloat_entry();
address generate_Float_floatToRawIntBits_entry();
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp Wed Sep 16 13:16:17 2015 -0700
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp Wed Sep 16 15:54:32 2015 -0700
@@ -45,6 +45,7 @@
#include "gc/g1/g1SATBCardTableModRefBS.hpp"
#include "gc/g1/heapRegion.hpp"
#endif // INCLUDE_ALL_GCS
+#include "crc32c.h"
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
@@ -8636,6 +8637,471 @@
notl(crc); // ~c
}
+#ifdef _LP64
+// S. Gueron / Information Processing Letters 112 (2012) 184
+// Algorithm 4: Computing carry-less multiplication using a precomputed lookup table.
+// Input: A 32 bit value B = [byte3, byte2, byte1, byte0] in register 'in'; n selects the sub-table (i.e. the CONST).
+// Output: the 64-bit carry-less product of B * CONST, left in 'in'; tmp1, tmp2 and tmp3 are clobbered.
+void MacroAssembler::crc32c_ipl_alg4(Register in, uint32_t n,
+ Register tmp1, Register tmp2, Register tmp3) {
+ lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr())); // tmp3 = start of the CRC32C lookup table
+ if (n > 0) {
+ addq(tmp3, n * 256 * 8); // advance to sub-table n: 256 entries of 8 bytes each
+ }
+ // Q1 = TABLEExt[n][B & 0xFF];
+ movl(tmp1, in);
+ andl(tmp1, 0x000000FF);
+ shll(tmp1, 3); // byte value -> offset of its 8-byte entry
+ addq(tmp1, tmp3);
+ movq(tmp1, Address(tmp1, 0)); // tmp1 = Q1
+
+ // Q2 = TABLEExt[n][B >> 8 & 0xFF];
+ movl(tmp2, in);
+ shrl(tmp2, 8);
+ andl(tmp2, 0x000000FF);
+ shll(tmp2, 3);
+ addq(tmp2, tmp3);
+ movq(tmp2, Address(tmp2, 0));
+
+ shlq(tmp2, 8);
+ xorq(tmp1, tmp2); // tmp1 = Q1 ^ Q2 << 8
+
+ // Q3 = TABLEExt[n][B >> 16 & 0xFF];
+ movl(tmp2, in);
+ shrl(tmp2, 16);
+ andl(tmp2, 0x000000FF);
+ shll(tmp2, 3);
+ addq(tmp2, tmp3);
+ movq(tmp2, Address(tmp2, 0));
+
+ shlq(tmp2, 16);
+ xorq(tmp1, tmp2); // tmp1 = Q1 ^ Q2 << 8 ^ Q3 << 16
+
+ // Q4 = TABLEExt[n][B >> 24 & 0xFF];
+ shrl(in, 24); // last use of B: 'in' itself is reused as the scratch register from here on
+ andl(in, 0x000000FF);
+ shll(in, 3);
+ addq(in, tmp3);
+ movq(in, Address(in, 0));
+
+ shlq(in, 24);
+ xorq(in, tmp1); // final result replaces the input value in 'in'
+ // return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
+}
+
+void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1, // 64-bit: replaces in_out with the carry-less product in_out * constant
+ Register in_out,
+ uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported, // the constant itself when PCLMULQDQ is used, otherwise a sub-table index for crc32c_ipl_alg4
+ XMMRegister w_xtmp2,
+ Register tmp1,
+ Register n_tmp2, Register n_tmp3) {
+ if (is_pclmulqdq_supported) {
+ movdl(w_xtmp1, in_out); // modified blindly
+
+ movl(tmp1, const_or_pre_comp_const_index); // materialize the constant in a GPR, then move it into an XMM register
+ movdl(w_xtmp2, tmp1);
+ pclmulqdq(w_xtmp1, w_xtmp2, 0); // imm8 0: multiply the low quadwords of both operands
+
+ movdq(in_out, w_xtmp1); // product back into the general purpose register
+ } else {
+ crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3); // table-driven fallback; result is also left in in_out
+ }
+}
+
+// Recombination Alternative 2: No bit-reflections
+// T1 = (CRC_A * U1) << 1
+// T2 = (CRC_B * U2) << 1
+// C1 = T1 >> 32
+// C2 = T2 >> 32
+// T1 = T1 & 0xFFFFFFFF
+// T2 = T2 & 0xFFFFFFFF
+// T1 = CRC32(0, T1)
+// T2 = CRC32(0, T2)
+// C1 = C1 ^ T1
+// C2 = C2 ^ T2
+// CRC = C1 ^ C2 ^ CRC_C (here: in_out = CRC_A and receives the result, in1 = CRC_B, in2 = CRC_C)
+void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
+ XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
+ Register tmp1, Register tmp2,
+ Register n_tmp3) {
+ crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3); // in_out = CRC_A * U1
+ crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3); // in1 = CRC_B * U2
+ shlq(in_out, 1); // T1
+ movl(tmp1, in_out); // tmp1 = low 32 bits of T1
+ shrq(in_out, 32); // in_out = C1
+ xorl(tmp2, tmp2);
+ crc32(tmp2, tmp1, 4); // tmp2 = CRC32(0, T1 & 0xFFFFFFFF)
+ xorl(in_out, tmp2); // we don't care about upper 32 bit contents here
+ shlq(in1, 1); // T2
+ movl(tmp1, in1);
+ shrq(in1, 32); // in1 = C2
+ xorl(tmp2, tmp2);
+ crc32(tmp2, tmp1, 4);
+ xorl(in1, tmp2);
+ xorl(in_out, in1); // C1 ^ C2
+ xorl(in_out, in2); // ... ^ CRC_C
+}
+
+// Set N to predefined value
+// Subtract from a length of a buffer
+// execute in a loop:
+// CRC_A = 0xFFFFFFFF, CRC_B = 0, CRC_C = 0
+// for i = 1 to N do
+// CRC_A = CRC32(CRC_A, A[i])
+// CRC_B = CRC32(CRC_B, B[i])
+// CRC_C = CRC32(CRC_C, C[i])
+// end for
+// Recombine
+void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
+ Register in_out1, Register in_out2, Register in_out3, // in_out1 = remaining length, in_out2 = current buffer position, in_out3 = running crc (CRC_A)
+ Register tmp1, Register tmp2, Register tmp3,
+ XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
+ Register tmp4, Register tmp5,
+ Register n_tmp6) {
+ Label L_processPartitions;
+ Label L_processPartition;
+ Label L_exit;
+
+ bind(L_processPartitions);
+ cmpl(in_out1, 3 * size); // need three full partitions of 'size' bytes, otherwise leave the rest to the caller
+ jcc(Assembler::less, L_exit);
+ xorl(tmp1, tmp1); // CRC_B = 0
+ xorl(tmp2, tmp2); // CRC_C = 0
+ movq(tmp3, in_out2);
+ addq(tmp3, size); // tmp3 = end of partition A
+
+ bind(L_processPartition);
+ crc32(in_out3, Address(in_out2, 0), 8); // three independent accumulations, 8 bytes from each partition per iteration
+ crc32(tmp1, Address(in_out2, size), 8);
+ crc32(tmp2, Address(in_out2, size * 2), 8);
+ addq(in_out2, 8);
+ cmpq(in_out2, tmp3);
+ jcc(Assembler::less, L_processPartition); // signed compare of the two addresses
+ crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
+ w_xtmp1, w_xtmp2, w_xtmp3,
+ tmp4, tmp5,
+ n_tmp6);
+ addq(in_out2, 2 * size); // skip partitions B and C, which were consumed alongside A
+ subl(in_out1, 3 * size);
+ jmp(L_processPartitions);
+
+ bind(L_exit);
+}
+#else
+void MacroAssembler::crc32c_ipl_alg4(Register in_out, uint32_t n, // 32-bit variant: the 64-bit product is accumulated in xtmp1; in_out, tmp1..tmp3 and xtmp2 are clobbered
+ Register tmp1, Register tmp2, Register tmp3,
+ XMMRegister xtmp1, XMMRegister xtmp2) {
+ lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr())); // tmp3 = start of the CRC32C lookup table
+ if (n > 0) {
+ addl(tmp3, n * 256 * 8); // advance to sub-table n: 256 entries of 8 bytes each
+ }
+ // Q1 = TABLEExt[n][B & 0xFF];
+ movl(tmp1, in_out);
+ andl(tmp1, 0x000000FF);
+ shll(tmp1, 3); // byte value -> offset of its 8-byte entry
+ addl(tmp1, tmp3);
+ movq(xtmp1, Address(tmp1, 0)); // xtmp1 = Q1
+
+ // Q2 = TABLEExt[n][B >> 8 & 0xFF];
+ movl(tmp2, in_out);
+ shrl(tmp2, 8);
+ andl(tmp2, 0x000000FF);
+ shll(tmp2, 3);
+ addl(tmp2, tmp3);
+ movq(xtmp2, Address(tmp2, 0));
+
+ psllq(xtmp2, 8);
+ pxor(xtmp1, xtmp2); // xtmp1 = Q1 ^ Q2 << 8
+
+ // Q3 = TABLEExt[n][B >> 16 & 0xFF];
+ movl(tmp2, in_out);
+ shrl(tmp2, 16);
+ andl(tmp2, 0x000000FF);
+ shll(tmp2, 3);
+ addl(tmp2, tmp3);
+ movq(xtmp2, Address(tmp2, 0));
+
+ psllq(xtmp2, 16);
+ pxor(xtmp1, xtmp2); // xtmp1 = Q1 ^ Q2 << 8 ^ Q3 << 16
+
+ // Q4 = TABLEExt[n][B >> 24 & 0xFF];
+ shrl(in_out, 24); // last use of B: in_out is reused as the address register
+ andl(in_out, 0x000000FF);
+ shll(in_out, 3);
+ addl(in_out, tmp3);
+ movq(xtmp2, Address(in_out, 0));
+
+ psllq(xtmp2, 24);
+ pxor(xtmp1, xtmp2); // Result in CXMM
+ // return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
+}
+
+void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1, // 32-bit: leaves the carry-less product in_out * constant in w_xtmp1
+ Register in_out,
+ uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported, // the constant itself when PCLMULQDQ is used, otherwise a sub-table index for crc32c_ipl_alg4
+ XMMRegister w_xtmp2,
+ Register tmp1,
+ Register n_tmp2, Register n_tmp3) {
+ if (is_pclmulqdq_supported) {
+ movdl(w_xtmp1, in_out);
+
+ movl(tmp1, const_or_pre_comp_const_index); // materialize the constant in a GPR, then move it into an XMM register
+ movdl(w_xtmp2, tmp1);
+ pclmulqdq(w_xtmp1, w_xtmp2, 0); // imm8 0: multiply the low quadwords of both operands
+ // Keep result in XMM since GPR is 32 bit in length
+ } else {
+ crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3, w_xtmp1, w_xtmp2); // table-driven fallback; result is likewise left in w_xtmp1
+ }
+}
+
+void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2, // 32-bit recombination: in_out = CRC_A (receives the result), in1 = CRC_B, in2 = CRC_C
+ XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
+ Register tmp1, Register tmp2,
+ Register n_tmp3) {
+ crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3); // w_xtmp1 = CRC_A * U1
+ crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3); // w_xtmp2 = CRC_B * U2
+
+ psllq(w_xtmp1, 1); // T1 = product << 1
+ movdl(tmp1, w_xtmp1); // tmp1 = low 32 bits of T1
+ psrlq(w_xtmp1, 32);
+ movdl(in_out, w_xtmp1); // in_out = C1 = T1 >> 32
+
+ xorl(tmp2, tmp2);
+ crc32(tmp2, tmp1, 4); // tmp2 = CRC32(0, low 32 bits of T1)
+ xorl(in_out, tmp2);
+
+ psllq(w_xtmp2, 1); // T2 = product << 1
+ movdl(tmp1, w_xtmp2);
+ psrlq(w_xtmp2, 32);
+ movdl(in1, w_xtmp2); // in1 = C2 = T2 >> 32
+
+ xorl(tmp2, tmp2);
+ crc32(tmp2, tmp1, 4);
+ xorl(in1, tmp2);
+ xorl(in_out, in1); // C1 ^ C2
+ xorl(in_out, in2); // ... ^ CRC_C
+}
+
+// 32-bit variant. While at least 3 * size bytes remain, runs three CRC32
+// accumulations in parallel over three consecutive partitions of 'size' bytes
+// and recombines them into in_out3.
+// in_out1 - remaining length (reduced by 3 * size per pass)
+// in_out2 - current buffer position (advanced by 3 * size per pass)
+// in_out3 - running crc
+// tmp1, tmp2, tmp3 - clobbered; tmp4, tmp5 and n_tmp6 are ignored as passed in
+// and re-bound to spilled registers below.
+void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
+ Register in_out1, Register in_out2, Register in_out3,
+ Register tmp1, Register tmp2, Register tmp3,
+ XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
+ Register tmp4, Register tmp5,
+ Register n_tmp6) {
+ Label L_processPartitions;
+ Label L_processPartition;
+ Label L_exit;
+
+ bind(L_processPartitions);
+ // The length is a signed int, so a signed compare is appropriate here.
+ cmpl(in_out1, 3 * size);
+ jcc(Assembler::less, L_exit);
+ xorl(tmp1, tmp1);
+ xorl(tmp2, tmp2);
+ movl(tmp3, in_out2);
+ addl(tmp3, size); // tmp3 = end of the first partition
+
+ bind(L_processPartition);
+ // 8 bytes of each partition per iteration, as two 4-byte steps.
+ crc32(in_out3, Address(in_out2, 0), 4);
+ crc32(tmp1, Address(in_out2, size), 4);
+ crc32(tmp2, Address(in_out2, size*2), 4);
+ crc32(in_out3, Address(in_out2, 0+4), 4);
+ crc32(tmp1, Address(in_out2, size+4), 4);
+ crc32(tmp2, Address(in_out2, size*2+4), 4);
+ addl(in_out2, 8);
+ cmpl(in_out2, tmp3);
+ // Addresses are unsigned: a signed compare would end the loop early for a
+ // buffer that straddles 0x80000000.
+ jcc(Assembler::below, L_processPartition);
+
+ // Not enough free registers on 32-bit: spill three live registers and lend
+ // them to the recombination step as its scratch registers.
+ push(tmp3);
+ push(in_out1);
+ push(in_out2);
+ tmp4 = tmp3;
+ tmp5 = in_out1;
+ n_tmp6 = in_out2;
+
+ crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
+ w_xtmp1, w_xtmp2, w_xtmp3,
+ tmp4, tmp5,
+ n_tmp6);
+
+ pop(in_out2);
+ pop(in_out1);
+ pop(tmp3);
+
+ addl(in_out2, 2 * size); // skip the two partitions consumed alongside the first
+ subl(in_out1, 3 * size);
+ jmp(L_processPartitions);
+
+ bind(L_exit);
+}
+#endif //LP64
+
+#ifdef _LP64
+// Algorithm 2: Pipelined usage of the CRC32 instruction.
+// Input: A buffer I of L bytes (in1 = buffer address, in2 = length, in_out = initial crc).
+// Output: the CRC32C value of the buffer, left in in_out.
+// Notations:
+// Write L = 24N + r, with N = floor (L/24).
+// r = L mod 24 (0 <= r < 24).
+// Consider I as the concatenation of A|B|C|R, where A, B, C, each,
+// N quadwords, and R consists of r bytes.
+// A[j] = I [8j+7:8j], j= 0, 1, ..., N-1
+// B[j] = I [N + 8j+7:N + 8j], j= 0, 1, ..., N-1
+// C[j] = I [2N + 8j+7:2N + 8j], j= 0, 1, ..., N-1
+// if r > 0 R[j] = I [3N +j], j= 0, 1, ...,r-1
+void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
+ Register tmp1, Register tmp2, Register tmp3,
+ Register tmp4, Register tmp5, Register tmp6,
+ XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
+ bool is_pclmulqdq_supported) {
+ uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
+ Label L_wordByWord;
+ Label L_byteByByteProlog;
+ Label L_byteByByte;
+ Label L_exit;
+
+ if (is_pclmulqdq_supported ) { // constants are read from the table now, at code-generation time, and become immediates
+ const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr; // note: each pair of table words is stored swapped
+ const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr+1);
+
+ const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
+ const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
+
+ const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
+ const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
+ assert((CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\"");
+ } else { // no PCLMULQDQ: the values are sub-table indices consumed by crc32c_ipl_alg4
+ const_or_pre_comp_const_index[0] = 1;
+ const_or_pre_comp_const_index[1] = 0;
+
+ const_or_pre_comp_const_index[2] = 3;
+ const_or_pre_comp_const_index[3] = 2;
+
+ const_or_pre_comp_const_index[4] = 5;
+ const_or_pre_comp_const_index[5] = 4;
+ }
+ crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported, // largest chunk size first, then progressively smaller ones
+ in2, in1, in_out,
+ tmp1, tmp2, tmp3,
+ w_xtmp1, w_xtmp2, w_xtmp3,
+ tmp4, tmp5,
+ tmp6);
+ crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
+ in2, in1, in_out,
+ tmp1, tmp2, tmp3,
+ w_xtmp1, w_xtmp2, w_xtmp3,
+ tmp4, tmp5,
+ tmp6);
+ crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
+ in2, in1, in_out,
+ tmp1, tmp2, tmp3,
+ w_xtmp1, w_xtmp2, w_xtmp3,
+ tmp4, tmp5,
+ tmp6);
+ movl(tmp1, in2); // tmp1 = remaining length rounded down to a multiple of 8 ...
+ andl(tmp1, 0x00000007);
+ negl(tmp1);
+ addl(tmp1, in2);
+ addq(tmp1, in1); // ... plus the current position = end of the part handled 4 bytes at a time
+
+ BIND(L_wordByWord);
+ cmpq(in1, tmp1);
+ jcc(Assembler::greaterEqual, L_byteByByteProlog); // signed compare of the two addresses
+ crc32(in_out, Address(in1, 0), 4);
+ addq(in1, 4);
+ jmp(L_wordByWord);
+
+ BIND(L_byteByByteProlog);
+ andl(in2, 0x00000007); // in2 = number of trailing bytes (0..7)
+ movl(tmp2, 1); // tmp2 = 1-based byte counter
+
+ BIND(L_byteByByte);
+ cmpl(tmp2, in2);
+ jccb(Assembler::greater, L_exit);
+ crc32(in_out, Address(in1, 0), 1);
+ incq(in1);
+ incl(tmp2);
+ jmp(L_byteByByte);
+
+ BIND(L_exit);
+}
+#else
+// 32-bit variant of the pipelined CRC32C computation.
+// in_out - initial crc on entry, resulting crc on exit
+// in1 - buffer address (advanced past the processed data)
+// in2 - length in bytes (clobbered)
+// tmp1..tmp3 - scratch; tmp4..tmp6 are only forwarded to crc32c_proc_chunk
+// The buffer is consumed in chunks of decreasing size (HIGH, MIDDLE, LOW), then
+// 4 bytes at a time up to the last multiple of 8, then byte by byte.
+void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
+ Register tmp1, Register tmp2, Register tmp3,
+ Register tmp4, Register tmp5, Register tmp6,
+ XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
+ bool is_pclmulqdq_supported) {
+ uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
+ Label L_wordByWord;
+ Label L_byteByByteProlog;
+ Label L_byteByByte;
+ Label L_exit;
+
+ if (is_pclmulqdq_supported) {
+ // The constants are read from the table now, at code-generation time, and
+ // end up as immediates. Each pair of table words is stored swapped.
+ const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
+ const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 1);
+
+ const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
+ const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
+
+ const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
+ const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
+ } else {
+ // Without PCLMULQDQ the values are sub-table indices for crc32c_ipl_alg4.
+ const_or_pre_comp_const_index[0] = 1;
+ const_or_pre_comp_const_index[1] = 0;
+
+ const_or_pre_comp_const_index[2] = 3;
+ const_or_pre_comp_const_index[3] = 2;
+
+ const_or_pre_comp_const_index[4] = 5;
+ const_or_pre_comp_const_index[5] = 4;
+ }
+ crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
+ in2, in1, in_out,
+ tmp1, tmp2, tmp3,
+ w_xtmp1, w_xtmp2, w_xtmp3,
+ tmp4, tmp5,
+ tmp6);
+ crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
+ in2, in1, in_out,
+ tmp1, tmp2, tmp3,
+ w_xtmp1, w_xtmp2, w_xtmp3,
+ tmp4, tmp5,
+ tmp6);
+ crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
+ in2, in1, in_out,
+ tmp1, tmp2, tmp3,
+ w_xtmp1, w_xtmp2, w_xtmp3,
+ tmp4, tmp5,
+ tmp6);
+ // tmp1 = in1 + (remaining length rounded down to a multiple of 8)
+ movl(tmp1, in2);
+ andl(tmp1, 0x00000007);
+ negl(tmp1);
+ addl(tmp1, in2);
+ addl(tmp1, in1);
+
+ BIND(L_wordByWord);
+ cmpl(in1, tmp1);
+ // Addresses are unsigned: a signed compare would misorder in1 and tmp1 when
+ // the buffer straddles 0x80000000.
+ jcc(Assembler::aboveEqual, L_byteByByteProlog);
+ crc32(in_out, Address(in1,0), 4);
+ addl(in1, 4);
+ jmp(L_wordByWord);
+
+ BIND(L_byteByByteProlog);
+ andl(in2, 0x00000007); // number of trailing bytes (0..7)
+ movl(tmp2, 1); // 1-based byte counter
+
+ BIND(L_byteByByte);
+ cmpl(tmp2, in2);
+ jccb(Assembler::greater, L_exit);
+ // The byte is loaded into a register first and the register form of crc32
+ // is used.
+ movb(tmp1, Address(in1, 0));
+ crc32(in_out, tmp1, 1);
+ incl(in1);
+ incl(tmp2);
+ jmp(L_byteByByte);
+
+ BIND(L_exit);
+}
+#endif // LP64
#undef BIND
#undef BLOCK_COMMENT
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp Wed Sep 16 13:16:17 2015 -0700
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp Wed Sep 16 15:54:32 2015 -0700
@@ -1278,9 +1278,42 @@
Register raxReg);
#endif
- // CRC32 code for java.util.zip.CRC32::updateBytes() instrinsic.
+ // CRC32 code for java.util.zip.CRC32::updateBytes() intrinsic.
void update_byte_crc32(Register crc, Register val, Register table);
void kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp);
+ // CRC32C code for java.util.zip.CRC32C::updateBytes() intrinsic
+ // Note on a naming convention:
+ // Prefix w = register only used on a Westmere+ architecture
+ // Prefix n = register only used on a Nehalem architecture
+#ifdef _LP64
+ void crc32c_ipl_alg4(Register in_out, uint32_t n,
+ Register tmp1, Register tmp2, Register tmp3);
+#else
+ void crc32c_ipl_alg4(Register in_out, uint32_t n,
+ Register tmp1, Register tmp2, Register tmp3,
+ XMMRegister xtmp1, XMMRegister xtmp2);
+#endif
+ void crc32c_pclmulqdq(XMMRegister w_xtmp1,
+ Register in_out,
+ uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
+ XMMRegister w_xtmp2,
+ Register tmp1,
+ Register n_tmp2, Register n_tmp3);
+ void crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
+ XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
+ Register tmp1, Register tmp2,
+ Register n_tmp3);
+ void crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
+ Register in_out1, Register in_out2, Register in_out3,
+ Register tmp1, Register tmp2, Register tmp3,
+ XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
+ Register tmp4, Register tmp5,
+ Register n_tmp6);
+ void crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
+ Register tmp1, Register tmp2, Register tmp3,
+ Register tmp4, Register tmp5, Register tmp6,
+ XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
+ bool is_pclmulqdq_supported);
// Fold 128-bit data chunk
void fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset);
void fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf);
--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp Wed Sep 16 13:16:17 2015 -0700
+++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp Wed Sep 16 15:54:32 2015 -0700
@@ -2991,6 +2991,63 @@
return start;
}
+ /**
+ * Generates the 32-bit updateBytesCRC32C stub.
+ *
+ * Arguments:
+ *
+ * Inputs:
+ * rsp(4) - int crc
+ * rsp(8) - byte* buf
+ * rsp(12) - int length
+ * rsp(16) - table_start - optional (present only when doing a library call,
+ * not used by x86 algorithm)
+ *
+ * Output:
+ * rax - int crc result
+ */
+ address generate_updateBytesCRC32C(bool is_pclmulqdq_supported) {
+ assert(UseCRC32CIntrinsics, "need SSE4_2");
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
+ address start = __ pc();
+ const Register crc = rax; // crc
+ const Register buf = rcx; // source java byte array address
+ const Register len = rdx; // length
+ const Register d = rbx;
+ const Register g = rsi;
+ const Register h = rdi;
+ // Placeholder for the tmp4..tmp6 arguments, which exist only to keep the
+ // crc32c_ipl_alg2_alt2 signature identical between 64 and 32 bit. The 32-bit
+ // crc32c_proc_chunk re-binds them before use, so they are never read.
+ // noreg is used rather than a literal 0, which would silently alias rax
+ // (the crc register).
+ const Register empty = noreg;
+ assert_different_registers(crc, buf, len, d, g, h);
+
+ BLOCK_COMMENT("Entry:");
+ __ enter(); // required for proper stackwalking of RuntimeStub frame
+ Address crc_arg(rsp, 4 + 4 + 0); // ESP+4 +
+ // we need to add additional 4 because __ enter
+ // have just pushed ebp on a stack
+ Address buf_arg(rsp, 4 + 4 + 4);
+ Address len_arg(rsp, 4 + 4 + 8);
+ // Load up:
+ __ movl(crc, crc_arg);
+ __ movl(buf, buf_arg);
+ __ movl(len, len_arg);
+ // d, g and h serve as scratch registers below; save and restore them
+ __ push(d);
+ __ push(g);
+ __ push(h);
+ __ crc32c_ipl_alg2_alt2(crc, buf, len,
+ d, g, h,
+ empty, empty, empty,
+ xmm0, xmm1, xmm2,
+ is_pclmulqdq_supported);
+ __ pop(h);
+ __ pop(g);
+ __ pop(d);
+ // the result is already in rax, since crc == rax
+ __ leave(); // required for proper stackwalking of RuntimeStub frame
+ __ ret(0);
+
+ return start;
+ }
+
// Safefetch stubs.
void generate_safefetch(const char* name, int size, address* entry,
address* fault_pc, address* continuation_pc) {
@@ -3204,6 +3261,13 @@
StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table;
StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
}
+
+ if (UseCRC32CIntrinsics) {
+ bool supports_clmul = VM_Version::supports_clmul();
+ StubRoutines::x86::generate_CRC32C_table(supports_clmul);
+ StubRoutines::_crc32c_table_addr = (address)StubRoutines::x86::_crc32c_table;
+ StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(supports_clmul);
+ }
}
--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp Wed Sep 16 13:16:17 2015 -0700
+++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp Wed Sep 16 15:54:32 2015 -0700
@@ -3958,6 +3958,64 @@
return start;
}
+ /**
+ * Generates the 64-bit updateBytesCRC32C stub.
+ *
+ * Arguments:
+ *
+ * Inputs:
+ * c_rarg0 - int crc
+ * c_rarg1 - byte* buf
+ * c_rarg2 - long length (NOTE(review): crc32c_ipl_alg2_alt2 handles it with 32-bit instructions - confirm the intended type)
+ * c_rarg3 - table_start - optional (present only when doing a library call,
+ * not used by x86 algorithm)
+ *
+ * Output:
+ * rax - int crc result
+ */
+ address generate_updateBytesCRC32C(bool is_pclmulqdq_supported) {
+ assert(UseCRC32CIntrinsics, "need SSE4_2");
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
+ address start = __ pc();
+ //reg.arg int#0 int#1 int#2 int#3 int#4 int#5 float regs
+ //Windows RCX RDX R8 R9 none none XMM0..XMM3
+ //Lin / Sol RDI RSI RDX RCX R8 R9 XMM0..XMM7
+ const Register crc = c_rarg0; // crc
+ const Register buf = c_rarg1; // source java byte array address
+ const Register len = c_rarg2; // length
+ const Register a = rax; // a, j, k, l, y, z: scratch registers handed to crc32c_ipl_alg2_alt2
+ const Register j = r9;
+ const Register k = r10;
+ const Register l = r11;
+#ifdef _WIN64
+ const Register y = rdi; // saved and restored around the computation below
+ const Register z = rsi;
+#else
+ const Register y = rcx;
+ const Register z = r8;
+#endif
+ assert_different_registers(crc, buf, len, a, j, k, l, y, z);
+
+ BLOCK_COMMENT("Entry:");
+ __ enter(); // required for proper stackwalking of RuntimeStub frame
+#ifdef _WIN64
+ __ push(y);
+ __ push(z);
+#endif
+ __ crc32c_ipl_alg2_alt2(crc, buf, len,
+ a, j, k,
+ l, y, z,
+ c_farg0, c_farg1, c_farg2,
+ is_pclmulqdq_supported);
+ __ movl(rax, crc); // move the result from the crc argument register into the return register
+#ifdef _WIN64
+ __ pop(z);
+ __ pop(y);
+#endif
+ __ leave(); // required for proper stackwalking of RuntimeStub frame
+ __ ret(0);
+
+ return start;
+ }
/**
* Arguments:
@@ -4302,6 +4360,13 @@
StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table;
StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
}
+
+ if (UseCRC32CIntrinsics) {
+ bool supports_clmul = VM_Version::supports_clmul();
+ StubRoutines::x86::generate_CRC32C_table(supports_clmul);
+ StubRoutines::_crc32c_table_addr = (address)StubRoutines::x86::_crc32c_table;
+ StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(supports_clmul);
+ }
}
void generate_all() {
--- a/hotspot/src/cpu/x86/vm/stubRoutines_x86.cpp Wed Sep 16 13:16:17 2015 -0700
+++ b/hotspot/src/cpu/x86/vm/stubRoutines_x86.cpp Wed Sep 16 15:54:32 2015 -0700
@@ -27,6 +27,7 @@
#include "runtime/frame.inline.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
+#include "crc32c.h"
// Implementation of the platform-specific part of StubRoutines - for
// a description of how to extend it, see the stubRoutines.hpp file.
@@ -130,3 +131,107 @@
0x5d681b02UL, 0x2a6f2b94UL, 0xb40bbe37UL, 0xc30c8ea1UL, 0x5a05df1bUL,
0x2d02ef8dUL
};
+
+#define D 32
+#define P 0x82F63B78 // Reflection of Castagnoli (0x11EDC6F41)
+
+#define TILL_CYCLE 31
+uint32_t _crc32c_pow_2k_table[TILL_CYCLE]; // because _crc32c_pow_2k_table[TILL_CYCLE == 31] == _crc32c_pow_2k_table[0]
+
+// A. Kadatch and B. Jenkins / Everything we know about CRC but afraid to forget September 3, 2010 8
+// Listing 1: Multiplication of normalized polynomials
+// "a" and "b" occupy D least significant bits. Returns (a * b) mod P.
+uint32_t crc32c_multiply(uint32_t a, uint32_t b) {
+  uint32_t product = 0;
+  uint32_t b_pow_x_table[D + 1]; // b_pow_x_table[k] = (b * x**k) mod P
+  b_pow_x_table[0] = b;
+  for (int k = 0; k < D; ++k) {
+    // If "a" has non-zero coefficient at x**k, add ((b * x**k) mod P) to the result.
+    // The mask is built from an unsigned 1: "1 << 31" on a signed int overflows.
+    if ((a & ((uint32_t)1 << (D - 1 - k))) != 0) product ^= b_pow_x_table[k];
+
+    // Compute b_pow_x_table[k+1] = (b * x**(k+1)) mod P.
+    if (b_pow_x_table[k] & 1) {
+      // If degree of (b_pow_x_table[k] * x) is D, then
+      // degree of (b_pow_x_table[k] * x - P) is less than D.
+      b_pow_x_table[k + 1] = (b_pow_x_table[k] >> 1) ^ P;
+    } else {
+      b_pow_x_table[k + 1] = b_pow_x_table[k] >> 1;
+    }
+  }
+  return product;
+}
+#undef D
+#undef P
+
+// A. Kadatch and B. Jenkins / Everything we know about CRC but afraid to forget September 3, 2010 9
+void crc32c_init_pow_2k(void) {
+  // Entry 0 holds x^(2^0) mod P(x) = x mod P(x) = x.
+  // All values are kept bit-reflected:
+  // x = 10b, reflect(x) = 0x40000000
+  uint32_t power = 0x40000000;
+  _crc32c_pow_2k_table[0] = power;
+
+  for (int idx = 1; idx < TILL_CYCLE; ++idx) {
+    // Entry idx is the square of entry idx - 1, i.e. x^(2^idx) mod P(x).
+    power = crc32c_multiply(power, power);
+    _crc32c_pow_2k_table[idx] = power;
+  }
+}
+
+// Computes x^n mod P(x) from the precomputed x^(2^k) table.
+uint32_t crc32c_f_pow_n(uint32_t n) {
+  // 0x80000000 is the polynomial "1" in the reflected representation.
+  uint32_t acc = 0x80000000;
+
+  // Walk the exponent from its least significant bit upwards; a set bit at
+  // position "bit" contributes the factor stored in _crc32c_pow_2k_table[bit].
+  for (uint32_t bit = 0; n != 0; n >>= 1, bit++) {
+    if ((n & 1) != 0) {
+      acc = crc32c_multiply(acc, _crc32c_pow_2k_table[bit]);
+    }
+  }
+
+  return acc;
+}
+
+juint *StubRoutines::x86::_crc32c_table;
+
+// Fills the precomputed CRC32C constants and publishes them via _crc32c_table:
+// the pow_n values themselves when is_pclmulqdq_table_supported is true,
+// otherwise one 256-entry carry-less multiplication table per pow_n value.
+void StubRoutines::x86::generate_CRC32C_table(bool is_pclmulqdq_table_supported) {
+  static juint pow_n[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
+
+  crc32c_init_pow_2k();
+  pow_n[0] = crc32c_f_pow_n(CRC32C_HIGH * 8);          // 8N * 8 = 64N
+  pow_n[1] = crc32c_f_pow_n(CRC32C_HIGH * 8 * 2);      // 128N
+  pow_n[2] = crc32c_f_pow_n(CRC32C_MIDDLE * 8);
+  pow_n[3] = crc32c_f_pow_n(CRC32C_MIDDLE * 8 * 2);
+  pow_n[4] = crc32c_f_pow_n(CRC32C_LOW * 8);
+  pow_n[CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1] =
+    crc32c_f_pow_n(CRC32C_LOW * 8 * 2);
+
+  if (is_pclmulqdq_table_supported) {
+    _crc32c_table = pow_n;
+  } else {
+    static julong pclmulqdq_table[CRC32C_NUM_PRECOMPUTED_CONSTANTS * 256];
+    for (int j = 0; j < CRC32C_NUM_PRECOMPUTED_CONSTANTS; j++) {
+      // Must not be "static": a static local is initialized only on the first
+      // pass, which would build every sub-table from pow_n[0].
+      const juint X_CONST = pow_n[j];
+      for (int64_t i = 0; i < 256; i++) { // to force 64 bit wide computations
+        // S. Gueron / Information Processing Letters 112 (2012) 184
+        // Algorithm 3: Generating a carry-less multiplication lookup table.
+        // Input: A 32-bit constant, X_CONST.
+        // Output: A table of 256 entries, each one is a 64-bit quadword,
+        // that can be used for computing "byte" * X_CONST, for a given byte.
+        pclmulqdq_table[j * 256 + i] =
+          ((i & 1) * X_CONST) ^ ((i & 2) * X_CONST) ^ ((i & 4) * X_CONST) ^
+          ((i & 8) * X_CONST) ^ ((i & 16) * X_CONST) ^ ((i & 32) * X_CONST) ^
+          ((i & 64) * X_CONST) ^ ((i & 128) * X_CONST);
+      }
+    }
+    _crc32c_table = (juint*)pclmulqdq_table;
+  }
+}
--- a/hotspot/src/cpu/x86/vm/stubRoutines_x86.hpp Wed Sep 16 13:16:17 2015 -0700
+++ b/hotspot/src/cpu/x86/vm/stubRoutines_x86.hpp Wed Sep 16 15:54:32 2015 -0700
@@ -36,6 +36,8 @@
// masks and table for CRC32
static uint64_t _crc_by128_masks[];
static juint _crc_table[];
+ // table for CRC32C
+ static juint* _crc32c_table;
// swap mask for ghash
static address _ghash_long_swap_mask_addr;
static address _ghash_byte_swap_mask_addr;
@@ -46,5 +48,6 @@
static address crc_by128_masks_addr() { return (address)_crc_by128_masks; }
static address ghash_long_swap_mask_addr() { return _ghash_long_swap_mask_addr; }
static address ghash_byte_swap_mask_addr() { return _ghash_byte_swap_mask_addr; }
+ static void generate_CRC32C_table(bool is_pclmulqdq_supported);
#endif // CPU_X86_VM_STUBROUTINES_X86_32_HPP
--- a/hotspot/src/cpu/x86/vm/templateInterpreter_x86_32.cpp Wed Sep 16 13:16:17 2015 -0700
+++ b/hotspot/src/cpu/x86/vm/templateInterpreter_x86_32.cpp Wed Sep 16 15:54:32 2015 -0700
@@ -790,18 +790,25 @@
const Register buf = rdx; // source java byte array address
const Register len = rdi; // length
+ // value x86_32
+ // interp. arg ptr ESP + 4
+ // int java.util.zip.CRC32.updateBytes(int crc, byte[] b, int off, int len)
+ // 3 2 1 0
+ // int java.util.zip.CRC32.updateByteBuffer(int crc, long buf, int off, int len)
+ // 4 2,3 1 0
+
// Arguments are reversed on java expression stack
- __ movl(len, Address(rsp, wordSize)); // Length
+ __ movl(len, Address(rsp, 4 + 0)); // Length
// Calculate address of start element
if (kind == Interpreter::java_util_zip_CRC32_updateByteBuffer) {
- __ movptr(buf, Address(rsp, 3*wordSize)); // long buf
- __ addptr(buf, Address(rsp, 2*wordSize)); // + offset
- __ movl(crc, Address(rsp, 5*wordSize)); // Initial CRC
+ __ movptr(buf, Address(rsp, 4 + 2 * wordSize)); // long buf
+ __ addptr(buf, Address(rsp, 4 + 1 * wordSize)); // + offset
+ __ movl(crc, Address(rsp, 4 + 4 * wordSize)); // Initial CRC
} else {
- __ movptr(buf, Address(rsp, 3*wordSize)); // byte[] array
+ __ movptr(buf, Address(rsp, 4 + 2 * wordSize)); // byte[] array
__ addptr(buf, arrayOopDesc::base_offset_in_bytes(T_BYTE)); // + header size
- __ addptr(buf, Address(rsp, 2*wordSize)); // + offset
- __ movl(crc, Address(rsp, 4*wordSize)); // Initial CRC
+ __ addptr(buf, Address(rsp, 4 + 1 * wordSize)); // + offset
+ __ movl(crc, Address(rsp, 4 + 3 * wordSize)); // Initial CRC
}
__ super_call_VM_leaf(CAST_FROM_FN_PTR(address, StubRoutines::updateBytesCRC32()), crc, buf, len);
@@ -823,6 +830,53 @@
}
/**
+* Method entry for static native methods:
+* int java.util.zip.CRC32C.updateBytes(int crc, byte[] b, int off, int end)
+* int java.util.zip.CRC32C.updateDirectByteBuffer(int crc, long address, int off, int end)
+*/
+address InterpreterGenerator::generate_CRC32C_updateBytes_entry(AbstractInterpreter::MethodKind kind) {
+  if (UseCRC32CIntrinsics) {
+    address entry = __ pc();
+    // Load parameters
+    const Register crc = rax;  // crc
+    const Register buf = rcx;  // source java byte array address
+    const Register len = rdx;  // length
+    const Register end = len;  // alias: "end" becomes the length in place
+
+    // value              x86_32
+    // interp. arg ptr    ESP + 4
+    // int java.util.zip.CRC32C.updateBytes(int crc, byte[] b, int off, int end)
+    //                                      3        2         1        0
+    // int java.util.zip.CRC32C.updateDirectByteBuffer(int crc, long address, int off, int end)
+    //                                                 4        2,3           1        0
+
+    // Arguments are reversed on java expression stack
+    __ movl(end, Address(rsp, 4 + 0));                   // end
+    __ subl(len, Address(rsp, 4 + 1 * wordSize));        // end - offset == length
+    // Calculate address of start element. The direct buffer variant has to be
+    // recognized by its CRC32C kind; the CRC32 kind never reaches this entry.
+    if (kind == Interpreter::java_util_zip_CRC32C_updateDirectByteBuffer) {
+      __ movptr(buf, Address(rsp, 4 + 2 * wordSize));    // long address
+      __ addptr(buf, Address(rsp, 4 + 1 * wordSize));    // + offset
+      __ movl(crc, Address(rsp, 4 + 4 * wordSize));      // Initial CRC
+    } else {
+      __ movptr(buf, Address(rsp, 4 + 2 * wordSize));    // byte[] array
+      __ addptr(buf, arrayOopDesc::base_offset_in_bytes(T_BYTE)); // + header size
+      __ addptr(buf, Address(rsp, 4 + 1 * wordSize));    // + offset
+      __ movl(crc, Address(rsp, 4 + 3 * wordSize));      // Initial CRC
+    }
+    __ super_call_VM_leaf(CAST_FROM_FN_PTR(address, StubRoutines::updateBytesCRC32C()), crc, buf, len);
+    // result in rax
+    // _areturn
+    __ pop(rdi);                                         // get return address
+    __ mov(rsp, rsi);                                    // set sp to sender sp
+    __ jmp(rdi);
+    return entry;
+  }
+  return generate_native_entry(false);
+}
+
+/**
* Method entry for static native method:
* java.lang.Float.intBitsToFloat(int bits)
*/
--- a/hotspot/src/cpu/x86/vm/templateInterpreter_x86_64.cpp Wed Sep 16 13:16:17 2015 -0700
+++ b/hotspot/src/cpu/x86/vm/templateInterpreter_x86_64.cpp Wed Sep 16 15:54:32 2015 -0700
@@ -804,6 +804,57 @@
return generate_native_entry(false);
}
+/**
+ * Method entry for static native methods:
+ *   int java.util.zip.CRC32C.updateBytes(int crc, byte[] b, int off, int end)
+ *   int java.util.zip.CRC32C.updateDirectByteBuffer(int crc, long address, int off, int end)
+ *
+ * Emits a specialized entry that calls the updateBytesCRC32C stub when
+ * UseCRC32CIntrinsics is set; otherwise returns a regular native entry.
+ */
+address InterpreterGenerator::generate_CRC32C_updateBytes_entry(AbstractInterpreter::MethodKind kind) {
+  if (!UseCRC32CIntrinsics) {
+    return generate_native_entry(false);
+  }
+
+  address entry = __ pc();
+
+  // Registers for the stub call; "end" shares c_rarg2 with "len" because
+  // end is converted into the length in place.
+  const Register crc = c_rarg0;
+  const Register buf = c_rarg1;
+  const Register len = c_rarg2;
+  const Register off = c_rarg3;
+  const Register end = len;
+
+  const bool is_direct_buffer =
+      (kind == Interpreter::java_util_zip_CRC32C_updateDirectByteBuffer);
+  // A long "address" takes two expression stack slots (JVMS 4.10.2.3), which
+  // pushes the crc argument one word further away than in the byte[] variant.
+  const int crc_slot = is_direct_buffer ? 5 : 4;
+
+  // Arguments are reversed on java expression stack
+  // Calculate address of start element
+  __ movptr(buf, Address(rsp, 3 * wordSize));                     // long address or byte[] array
+  if (!is_direct_buffer) {
+    __ addptr(buf, arrayOopDesc::base_offset_in_bytes(T_BYTE));   // + header size
+  }
+  __ movl2ptr(off, Address(rsp, 2 * wordSize));                   // offset
+  __ addq(buf, off);                                              // + offset
+  __ movl(crc, Address(rsp, crc_slot * wordSize));                // Initial CRC
+
+  __ movl(end, Address(rsp, wordSize));                           // end
+  __ subl(end, off);                                              // end - off == length
+  __ super_call_VM_leaf(CAST_FROM_FN_PTR(address, StubRoutines::updateBytesCRC32C()), crc, buf, len);
+  // result in rax
+  // _areturn
+  __ pop(rdi);                                                    // get return address
+  __ mov(rsp, r13);                                               // set sp to sender sp
+  __ jmp(rdi);
+
+  return entry;
+}
+
// Interpreter stub for calling a native method. (asm interpreter)
// This sets up a somewhat different looking stack for calling the
// native method than the typical interpreter frame setup.
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.cpp Wed Sep 16 13:16:17 2015 -0700
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.cpp Wed Sep 16 15:54:32 2015 -0700
@@ -661,6 +661,18 @@
FLAG_SET_DEFAULT(UseCRC32Intrinsics, false);
}
+ if (supports_sse4_2()) {
+ if (FLAG_IS_DEFAULT(UseCRC32CIntrinsics)) {
+ UseCRC32CIntrinsics = true;
+ }
+ }
+ else if (UseCRC32CIntrinsics) {
+ if (!FLAG_IS_DEFAULT(UseCRC32CIntrinsics)) {
+ warning("CRC32C intrinsics are not available on this CPU");
+ }
+ FLAG_SET_DEFAULT(UseCRC32CIntrinsics, false);
+ }
+
// The AES intrinsic stubs require AES instruction support (of course)
// but also require sse3 mode for instructions it use.
if (UseAES && (UseSSE > 2)) {
@@ -704,12 +716,6 @@
FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
}
- if (UseCRC32CIntrinsics) {
- if (!FLAG_IS_DEFAULT(UseCRC32CIntrinsics))
- warning("CRC32C intrinsics are not available on this CPU");
- FLAG_SET_DEFAULT(UseCRC32CIntrinsics, false);
- }
-
if (UseAdler32Intrinsics) {
warning("Adler32Intrinsics not available on this CPU.");
FLAG_SET_DEFAULT(UseAdler32Intrinsics, false);
--- a/hotspot/src/cpu/zero/vm/interpreterGenerator_zero.hpp Wed Sep 16 13:16:17 2015 -0700
+++ b/hotspot/src/cpu/zero/vm/interpreterGenerator_zero.hpp Wed Sep 16 15:54:32 2015 -0700
@@ -42,4 +42,5 @@
// Not supported
address generate_CRC32_update_entry() { return NULL; }
address generate_CRC32_updateBytes_entry(AbstractInterpreter::MethodKind kind) { return NULL; }
+ address generate_CRC32C_updateBytes_entry(AbstractInterpreter::MethodKind kind) { return NULL; }
#endif // CPU_ZERO_VM_INTERPRETERGENERATOR_ZERO_HPP
--- a/hotspot/src/share/vm/interpreter/abstractInterpreter.hpp Wed Sep 16 13:16:17 2015 -0700
+++ b/hotspot/src/share/vm/interpreter/abstractInterpreter.hpp Wed Sep 16 15:54:32 2015 -0700
@@ -90,6 +90,8 @@
java_util_zip_CRC32_update, // implementation of java.util.zip.CRC32.update()
java_util_zip_CRC32_updateBytes, // implementation of java.util.zip.CRC32.updateBytes()
java_util_zip_CRC32_updateByteBuffer, // implementation of java.util.zip.CRC32.updateByteBuffer()
+ java_util_zip_CRC32C_updateBytes, // implementation of java.util.zip.CRC32C.updateBytes(crc, b[], off, end)
+ java_util_zip_CRC32C_updateDirectByteBuffer, // implementation of java.util.zip.CRC32C.updateDirectByteBuffer(crc, address, off, end)
java_lang_Float_intBitsToFloat, // implementation of java.lang.Float.intBitsToFloat()
java_lang_Float_floatToRawIntBits, // implementation of java.lang.Float.floatToRawIntBits()
java_lang_Double_longBitsToDouble, // implementation of java.lang.Double.longBitsToDouble()
--- a/hotspot/src/share/vm/interpreter/interpreter.cpp Wed Sep 16 13:16:17 2015 -0700
+++ b/hotspot/src/share/vm/interpreter/interpreter.cpp Wed Sep 16 15:54:32 2015 -0700
@@ -234,6 +234,13 @@
case vmIntrinsics::_updateByteBufferCRC32 : return java_util_zip_CRC32_updateByteBuffer;
}
}
+ if (UseCRC32CIntrinsics) {
+ // Use optimized stub code for CRC32C methods.
+ switch (m->intrinsic_id()) {
+ case vmIntrinsics::_updateBytesCRC32C : return java_util_zip_CRC32C_updateBytes;
+ case vmIntrinsics::_updateDirectByteBufferCRC32C : return java_util_zip_CRC32C_updateDirectByteBuffer;
+ }
+ }
switch(m->intrinsic_id()) {
case vmIntrinsics::_intBitsToFloat: return java_lang_Float_intBitsToFloat;
@@ -349,6 +356,8 @@
case java_util_zip_CRC32_update : tty->print("java_util_zip_CRC32_update"); break;
case java_util_zip_CRC32_updateBytes : tty->print("java_util_zip_CRC32_updateBytes"); break;
case java_util_zip_CRC32_updateByteBuffer : tty->print("java_util_zip_CRC32_updateByteBuffer"); break;
+ case java_util_zip_CRC32C_updateBytes : tty->print("java_util_zip_CRC32C_updateBytes"); break;
+ case java_util_zip_CRC32C_updateDirectByteBuffer: tty->print("java_util_zip_CRC32C_updateDirectByteBuffer"); break;
default:
if (kind >= method_handle_invoke_FIRST &&
kind <= method_handle_invoke_LAST) {
@@ -567,6 +576,10 @@
: // fall thru
case Interpreter::java_util_zip_CRC32_updateByteBuffer
: entry_point = generate_CRC32_updateBytes_entry(kind); break;
+ case Interpreter::java_util_zip_CRC32C_updateBytes
+ : // fall thru
+ case Interpreter::java_util_zip_CRC32C_updateDirectByteBuffer
+ : entry_point = generate_CRC32C_updateBytes_entry(kind); break;
#if defined(TARGET_ARCH_x86) && !defined(_LP64)
// On x86_32 platforms, a special entry is generated for the following four methods.
// On other platforms the normal entry is used to enter these methods.
@@ -582,9 +595,9 @@
case Interpreter::java_lang_Float_intBitsToFloat:
case Interpreter::java_lang_Float_floatToRawIntBits:
case Interpreter::java_lang_Double_longBitsToDouble:
- case Interpreter::java_lang_Double_doubleToRawLongBits:
- entry_point = generate_native_entry(false);
- break;
+ case Interpreter::java_lang_Double_doubleToRawLongBits:
+ entry_point = generate_native_entry(false);
+ break;
#endif // defined(TARGET_ARCH_x86) && !defined(_LP64)
#endif // CC_INTERP
default:
--- a/hotspot/src/share/vm/interpreter/templateInterpreter.cpp Wed Sep 16 13:16:17 2015 -0700
+++ b/hotspot/src/share/vm/interpreter/templateInterpreter.cpp Wed Sep 16 15:54:32 2015 -0700
@@ -418,6 +418,11 @@
method_entry(java_util_zip_CRC32_updateByteBuffer)
}
+ if (UseCRC32CIntrinsics) {
+ method_entry(java_util_zip_CRC32C_updateBytes)
+ method_entry(java_util_zip_CRC32C_updateDirectByteBuffer)
+ }
+
method_entry(java_lang_Float_intBitsToFloat);
method_entry(java_lang_Float_floatToRawIntBits);
method_entry(java_lang_Double_longBitsToDouble);
--- a/hotspot/src/share/vm/runtime/stubRoutines.cpp Wed Sep 16 13:16:17 2015 -0700
+++ b/hotspot/src/share/vm/runtime/stubRoutines.cpp Wed Sep 16 15:54:32 2015 -0700
@@ -136,8 +136,9 @@
address StubRoutines::_sha512_implCompressMB = NULL;
address StubRoutines::_updateBytesCRC32 = NULL;
-address StubRoutines::_crc_table_adr = NULL;
+address StubRoutines::_crc_table_adr = NULL;
+address StubRoutines::_crc32c_table_addr = NULL;
address StubRoutines::_updateBytesCRC32C = NULL;
address StubRoutines::_updateBytesAdler32 = NULL;
--- a/hotspot/src/share/vm/runtime/stubRoutines.hpp Wed Sep 16 13:16:17 2015 -0700
+++ b/hotspot/src/share/vm/runtime/stubRoutines.hpp Wed Sep 16 15:54:32 2015 -0700
@@ -197,6 +197,7 @@
static address _updateBytesCRC32;
static address _crc_table_adr;
+ static address _crc32c_table_addr;
static address _updateBytesCRC32C;
static address _updateBytesAdler32;
@@ -364,6 +365,7 @@
static address updateBytesCRC32() { return _updateBytesCRC32; }
static address crc_table_addr() { return _crc_table_adr; }
+ static address crc32c_table_addr() { return _crc32c_table_addr; }
static address updateBytesCRC32C() { return _updateBytesCRC32C; }
static address updateBytesAdler32() { return _updateBytesAdler32; }
--- a/hotspot/src/share/vm/runtime/vmStructs.cpp Wed Sep 16 13:16:17 2015 -0700
+++ b/hotspot/src/share/vm/runtime/vmStructs.cpp Wed Sep 16 15:54:32 2015 -0700
@@ -832,6 +832,7 @@
static_field(StubRoutines, _ghash_processBlocks, address) \
static_field(StubRoutines, _updateBytesCRC32, address) \
static_field(StubRoutines, _crc_table_adr, address) \
+ static_field(StubRoutines, _crc32c_table_addr, address) \
static_field(StubRoutines, _updateBytesCRC32C, address) \
static_field(StubRoutines, _multiplyToLen, address) \
static_field(StubRoutines, _squareToLen, address) \