8143012: CRC32 Intrinsics support on SPARC
Reviewed-by: kvn, roland
Contributed-by: ahmed.khawaja@oracle.com
--- a/hotspot/src/cpu/sparc/vm/c1_LIRAssembler_sparc.cpp Fri Nov 20 10:09:42 2015 +0100
+++ b/hotspot/src/cpu/sparc/vm/c1_LIRAssembler_sparc.cpp Fri Nov 20 08:29:10 2015 -0800
@@ -2812,7 +2812,23 @@
}
void LIR_Assembler::emit_updatecrc32(LIR_OpUpdateCRC32* op) {
- fatal("CRC32 intrinsic is not implemented on this platform");
+ assert(op->crc()->is_single_cpu(), "crc must be register");
+ assert(op->val()->is_single_cpu(), "byte value must be register");
+ assert(op->result_opr()->is_single_cpu(), "result must be register");
+ Register crc = op->crc()->as_register();
+ Register val = op->val()->as_register();
+ Register table = op->result_opr()->as_register();
+ Register res = op->result_opr()->as_register();
+
+ assert_different_registers(val, crc, table);
+
+ __ set(ExternalAddress(StubRoutines::crc_table_addr()), table);
+ __ not1(crc);
+ __ clruwu(crc);
+ __ update_byte_crc32(crc, val, table);
+ __ not1(crc);
+
+ __ mov(crc, res);
}
void LIR_Assembler::emit_lock(LIR_OpLock* op) {
--- a/hotspot/src/cpu/sparc/vm/c1_LIRGenerator_sparc.cpp Fri Nov 20 10:09:42 2015 +0100
+++ b/hotspot/src/cpu/sparc/vm/c1_LIRGenerator_sparc.cpp Fri Nov 20 08:29:10 2015 -0800
@@ -786,7 +786,86 @@
}
void LIRGenerator::do_update_CRC32(Intrinsic* x) {
- fatal("CRC32 intrinsic is not implemented on this platform");
+ // Make all state_for calls early since they can emit code
+ LIR_Opr result = rlock_result(x);
+ int flags = 0;
+ switch (x->id()) {
+ case vmIntrinsics::_updateCRC32: {
+ LIRItem crc(x->argument_at(0), this);
+ LIRItem val(x->argument_at(1), this);
+ // val is destroyed by update_crc32
+ val.set_destroys_register();
+ crc.load_item();
+ val.load_item();
+ __ update_crc32(crc.result(), val.result(), result);
+ break;
+ }
+ case vmIntrinsics::_updateBytesCRC32:
+ case vmIntrinsics::_updateByteBufferCRC32: {
+
+ bool is_updateBytes = (x->id() == vmIntrinsics::_updateBytesCRC32);
+
+ LIRItem crc(x->argument_at(0), this);
+ LIRItem buf(x->argument_at(1), this);
+ LIRItem off(x->argument_at(2), this);
+ LIRItem len(x->argument_at(3), this);
+
+ buf.load_item();
+ off.load_nonconstant();
+
+ LIR_Opr index = off.result();
+ int offset = is_updateBytes ? arrayOopDesc::base_offset_in_bytes(T_BYTE) : 0;
+ if(off.result()->is_constant()) {
+ index = LIR_OprFact::illegalOpr;
+ offset += off.result()->as_jint();
+ }
+
+ LIR_Opr base_op = buf.result();
+
+ if (index->is_valid()) {
+ LIR_Opr tmp = new_register(T_LONG);
+ __ convert(Bytecodes::_i2l, index, tmp);
+ index = tmp;
+ if (index->is_constant()) {
+ offset += index->as_constant_ptr()->as_jint();
+ index = LIR_OprFact::illegalOpr;
+ } else if (index->is_register()) {
+ LIR_Opr tmp2 = new_register(T_LONG);
+ LIR_Opr tmp3 = new_register(T_LONG);
+ __ move(base_op, tmp2);
+ __ move(index, tmp3);
+ __ add(tmp2, tmp3, tmp2);
+ base_op = tmp2;
+ } else {
+ ShouldNotReachHere();
+ }
+ }
+
+ LIR_Address* a = new LIR_Address(base_op, offset, T_BYTE);
+
+ BasicTypeList signature(3);
+ signature.append(T_INT);
+ signature.append(T_ADDRESS);
+ signature.append(T_INT);
+ CallingConvention* cc = frame_map()->c_calling_convention(&signature);
+ const LIR_Opr result_reg = result_register_for(x->type());
+
+ LIR_Opr addr = new_pointer_register();
+ __ leal(LIR_OprFact::address(a), addr);
+
+ crc.load_item_force(cc->at(0));
+ __ move(addr, cc->at(1));
+ len.load_item_force(cc->at(2));
+
+ __ call_runtime_leaf(StubRoutines::updateBytesCRC32(), getThreadTemp(), result_reg, cc->args());
+ __ move(result_reg, result);
+
+ break;
+ }
+ default: {
+ ShouldNotReachHere();
+ }
+ }
}
// _i2l, _i2f, _i2d, _l2i, _l2f, _l2d, _f2i, _f2l, _f2d, _d2i, _d2l, _d2f
--- a/hotspot/src/cpu/sparc/vm/interpreterGenerator_sparc.hpp Fri Nov 20 10:09:42 2015 +0100
+++ b/hotspot/src/cpu/sparc/vm/interpreterGenerator_sparc.hpp Fri Nov 20 08:29:10 2015 -0800
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -43,8 +43,9 @@
void generate_counter_incr(Label* overflow, Label* profile_method, Label* profile_method_continue);
void generate_counter_overflow(Label& Lcontinue);
+ address generate_CRC32_update_entry();
+ address generate_CRC32_updateBytes_entry(AbstractInterpreter::MethodKind kind);
+
// Not supported
- address generate_CRC32_update_entry() { return NULL; }
- address generate_CRC32_updateBytes_entry(AbstractInterpreter::MethodKind kind) { return NULL; }
address generate_CRC32C_updateBytes_entry(AbstractInterpreter::MethodKind kind) { return NULL; }
#endif // CPU_SPARC_VM_INTERPRETERGENERATOR_SPARC_HPP
--- a/hotspot/src/cpu/sparc/vm/macroAssembler_sparc.cpp Fri Nov 20 10:09:42 2015 +0100
+++ b/hotspot/src/cpu/sparc/vm/macroAssembler_sparc.cpp Fri Nov 20 08:29:10 2015 -0800
@@ -4771,3 +4771,243 @@
movdtox(src, tmp1);
reverse_bytes_32(tmp1, dst, tmp2);
}
+
+void MacroAssembler::fold_128bit_crc32(Register xcrc_hi, Register xcrc_lo, Register xK_hi, Register xK_lo, Register xtmp_hi, Register xtmp_lo, Register buf, int offset) {
+ xmulx(xcrc_hi, xK_hi, xtmp_lo);
+ xmulxhi(xcrc_hi, xK_hi, xtmp_hi);
+ xmulxhi(xcrc_lo, xK_lo, xcrc_hi);
+ xmulx(xcrc_lo, xK_lo, xcrc_lo);
+ xor3(xcrc_lo, xtmp_lo, xcrc_lo);
+ xor3(xcrc_hi, xtmp_hi, xcrc_hi);
+ ldxl(buf, G0, xtmp_lo);
+ inc(buf, 8);
+ ldxl(buf, G0, xtmp_hi);
+ inc(buf, 8);
+ xor3(xcrc_lo, xtmp_lo, xcrc_lo);
+ xor3(xcrc_hi, xtmp_hi, xcrc_hi);
+}
+
+void MacroAssembler::fold_128bit_crc32(Register xcrc_hi, Register xcrc_lo, Register xK_hi, Register xK_lo, Register xtmp_hi, Register xtmp_lo, Register xbuf_hi, Register xbuf_lo) {
+ mov(xcrc_lo, xtmp_lo);
+ mov(xcrc_hi, xtmp_hi);
+ xmulx(xtmp_hi, xK_hi, xtmp_lo);
+ xmulxhi(xtmp_hi, xK_hi, xtmp_hi);
+ xmulxhi(xcrc_lo, xK_lo, xcrc_hi);
+ xmulx(xcrc_lo, xK_lo, xcrc_lo);
+ xor3(xcrc_lo, xbuf_lo, xcrc_lo);
+ xor3(xcrc_hi, xbuf_hi, xcrc_hi);
+ xor3(xcrc_lo, xtmp_lo, xcrc_lo);
+ xor3(xcrc_hi, xtmp_hi, xcrc_hi);
+}
+
+void MacroAssembler::fold_8bit_crc32(Register xcrc, Register table, Register xtmp, Register tmp) {
+ and3(xcrc, 0xFF, tmp);
+ sllx(tmp, 2, tmp);
+ lduw(table, tmp, xtmp);
+ srlx(xcrc, 8, xcrc);
+ xor3(xtmp, xcrc, xcrc);
+}
+
+void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
+ and3(crc, 0xFF, tmp);
+ srlx(crc, 8, crc);
+ sllx(tmp, 2, tmp);
+ lduw(table, tmp, tmp);
+ xor3(tmp, crc, crc);
+}
+
+#define CRC32_TMP_REG_NUM 18
+
+#define CRC32_CONST_64 0x163cd6124
+#define CRC32_CONST_96 0x0ccaa009e
+#define CRC32_CONST_160 0x1751997d0
+#define CRC32_CONST_480 0x1c6e41596
+#define CRC32_CONST_544 0x154442bd4
+
+void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table) {
+
+ Label L_cleanup_loop, L_cleanup_check, L_align_loop, L_align_check;
+ Label L_main_loop_prologue;
+ Label L_fold_512b, L_fold_512b_loop, L_fold_128b;
+ Label L_fold_tail, L_fold_tail_loop;
+ Label L_8byte_fold_loop, L_8byte_fold_check;
+
+ const Register tmp[CRC32_TMP_REG_NUM] = {L0, L1, L2, L3, L4, L5, L6, G1, I0, I1, I2, I3, I4, I5, I7, O4, O5, G3};
+
+ Register const_64 = tmp[CRC32_TMP_REG_NUM-1];
+ Register const_96 = tmp[CRC32_TMP_REG_NUM-1];
+ Register const_160 = tmp[CRC32_TMP_REG_NUM-2];
+ Register const_480 = tmp[CRC32_TMP_REG_NUM-1];
+ Register const_544 = tmp[CRC32_TMP_REG_NUM-2];
+
+ set(ExternalAddress(StubRoutines::crc_table_addr()), table);
+
+ not1(crc); // ~c
+ clruwu(crc); // clear upper 32 bits of crc
+
+ // Check if below cutoff, proceed directly to cleanup code
+ mov(31, G4);
+ cmp_and_br_short(len, G4, Assembler::lessEqualUnsigned, Assembler::pt, L_cleanup_check);
+
+ // Align buffer to 8 byte boundry
+ mov(8, O5);
+ and3(buf, 0x7, O4);
+ sub(O5, O4, O5);
+ and3(O5, 0x7, O5);
+ sub(len, O5, len);
+ ba(L_align_check);
+ delayed()->nop();
+
+ // Alignment loop, table look up method for up to 7 bytes
+ bind(L_align_loop);
+ ldub(buf, 0, O4);
+ inc(buf);
+ dec(O5);
+ xor3(O4, crc, O4);
+ and3(O4, 0xFF, O4);
+ sllx(O4, 2, O4);
+ lduw(table, O4, O4);
+ srlx(crc, 8, crc);
+ xor3(O4, crc, crc);
+ bind(L_align_check);
+ nop();
+ cmp_and_br_short(O5, 0, Assembler::notEqual, Assembler::pt, L_align_loop);
+
+ // Aligned on 64-bit (8-byte) boundry at this point
+ // Check if still above cutoff (31-bytes)
+ mov(31, G4);
+ cmp_and_br_short(len, G4, Assembler::lessEqualUnsigned, Assembler::pt, L_cleanup_check);
+ // At least 32 bytes left to process
+
+ // Free up registers by storing them to FP registers
+ for (int i = 0; i < CRC32_TMP_REG_NUM; i++) {
+ movxtod(tmp[i], as_FloatRegister(2*i));
+ }
+
+ // Determine which loop to enter
+ // Shared prologue
+ ldxl(buf, G0, tmp[0]);
+ inc(buf, 8);
+ ldxl(buf, G0, tmp[1]);
+ inc(buf, 8);
+ xor3(tmp[0], crc, tmp[0]); // Fold CRC into first few bytes
+ and3(crc, 0, crc); // Clear out the crc register
+ // Main loop needs 128-bytes at least
+ mov(128, G4);
+ mov(64, tmp[2]);
+ cmp_and_br_short(len, G4, Assembler::greaterEqualUnsigned, Assembler::pt, L_main_loop_prologue);
+ // Less than 64 bytes
+ nop();
+ cmp_and_br_short(len, tmp[2], Assembler::lessUnsigned, Assembler::pt, L_fold_tail);
+ // Between 64 and 127 bytes
+ set64(CRC32_CONST_96, const_96, tmp[8]);
+ set64(CRC32_CONST_160, const_160, tmp[9]);
+ fold_128bit_crc32(tmp[1], tmp[0], const_96, const_160, tmp[2], tmp[3], buf, 0);
+ fold_128bit_crc32(tmp[1], tmp[0], const_96, const_160, tmp[4], tmp[5], buf, 16);
+ fold_128bit_crc32(tmp[1], tmp[0], const_96, const_160, tmp[6], tmp[7], buf, 32);
+ dec(len, 48);
+ ba(L_fold_tail);
+ delayed()->nop();
+
+ bind(L_main_loop_prologue);
+ for (int i = 2; i < 8; i++) {
+ ldxl(buf, G0, tmp[i]);
+ inc(buf, 8);
+ }
+
+ // Fold total 512 bits of polynomial on each iteration,
+ // 128 bits per each of 4 parallel streams
+ set64(CRC32_CONST_480, const_480, tmp[8]);
+ set64(CRC32_CONST_544, const_544, tmp[9]);
+
+ mov(128, G4);
+ bind(L_fold_512b_loop);
+ fold_128bit_crc32(tmp[1], tmp[0], const_480, const_544, tmp[9], tmp[8], buf, 0);
+ fold_128bit_crc32(tmp[3], tmp[2], const_480, const_544, tmp[11], tmp[10], buf, 16);
+ fold_128bit_crc32(tmp[5], tmp[4], const_480, const_544, tmp[13], tmp[12], buf, 32);
+ fold_128bit_crc32(tmp[7], tmp[6], const_480, const_544, tmp[15], tmp[14], buf, 64);
+ dec(len, 64);
+ cmp_and_br_short(len, G4, Assembler::greaterEqualUnsigned, Assembler::pt, L_fold_512b_loop);
+
+ // Fold 512 bits to 128 bits
+ bind(L_fold_512b);
+ set64(CRC32_CONST_96, const_96, tmp[8]);
+ set64(CRC32_CONST_160, const_160, tmp[9]);
+
+ fold_128bit_crc32(tmp[1], tmp[0], const_96, const_160, tmp[8], tmp[9], tmp[3], tmp[2]);
+ fold_128bit_crc32(tmp[1], tmp[0], const_96, const_160, tmp[8], tmp[9], tmp[5], tmp[4]);
+ fold_128bit_crc32(tmp[1], tmp[0], const_96, const_160, tmp[8], tmp[9], tmp[7], tmp[6]);
+ dec(len, 48);
+
+ // Fold the rest of 128 bits data chunks
+ bind(L_fold_tail);
+ mov(32, G4);
+ cmp_and_br_short(len, G4, Assembler::lessEqualUnsigned, Assembler::pt, L_fold_128b);
+
+ set64(CRC32_CONST_96, const_96, tmp[8]);
+ set64(CRC32_CONST_160, const_160, tmp[9]);
+
+ bind(L_fold_tail_loop);
+ fold_128bit_crc32(tmp[1], tmp[0], const_96, const_160, tmp[2], tmp[3], buf, 0);
+ sub(len, 16, len);
+ cmp_and_br_short(len, G4, Assembler::greaterEqualUnsigned, Assembler::pt, L_fold_tail_loop);
+
+ // Fold the 128 bits in tmps 0 - 1 into tmp 1
+ bind(L_fold_128b);
+
+ set64(CRC32_CONST_64, const_64, tmp[4]);
+
+ xmulx(const_64, tmp[0], tmp[2]);
+ xmulxhi(const_64, tmp[0], tmp[3]);
+
+ srl(tmp[2], G0, tmp[4]);
+ xmulx(const_64, tmp[4], tmp[4]);
+
+ srlx(tmp[2], 32, tmp[2]);
+ sllx(tmp[3], 32, tmp[3]);
+ or3(tmp[2], tmp[3], tmp[2]);
+
+ xor3(tmp[4], tmp[1], tmp[4]);
+ xor3(tmp[4], tmp[2], tmp[1]);
+ dec(len, 8);
+
+ // Use table lookup for the 8 bytes left in tmp[1]
+ dec(len, 8);
+
+ // 8 8-bit folds to compute 32-bit CRC.
+ for (int j = 0; j < 4; j++) {
+ fold_8bit_crc32(tmp[1], table, tmp[2], tmp[3]);
+ }
+ srl(tmp[1], G0, crc); // move 32 bits to general register
+ for (int j = 0; j < 4; j++) {
+ fold_8bit_crc32(crc, table, tmp[3]);
+ }
+
+ bind(L_8byte_fold_check);
+
+ // Restore int registers saved in FP registers
+ for (int i = 0; i < CRC32_TMP_REG_NUM; i++) {
+ movdtox(as_FloatRegister(2*i), tmp[i]);
+ }
+
+ ba(L_cleanup_check);
+ delayed()->nop();
+
+ // Table look-up method for the remaining few bytes
+ bind(L_cleanup_loop);
+ ldub(buf, 0, O4);
+ inc(buf);
+ dec(len);
+ xor3(O4, crc, O4);
+ and3(O4, 0xFF, O4);
+ sllx(O4, 2, O4);
+ lduw(table, O4, O4);
+ srlx(crc, 8, crc);
+ xor3(O4, crc, crc);
+ bind(L_cleanup_check);
+ nop();
+ cmp_and_br_short(len, 0, Assembler::greaterUnsigned, Assembler::pt, L_cleanup_loop);
+
+ not1(crc);
+}
+
--- a/hotspot/src/cpu/sparc/vm/macroAssembler_sparc.hpp Fri Nov 20 10:09:42 2015 +0100
+++ b/hotspot/src/cpu/sparc/vm/macroAssembler_sparc.hpp Fri Nov 20 08:29:10 2015 -0800
@@ -904,7 +904,9 @@
inline void ldf(FloatRegisterImpl::Width w, const Address& a, FloatRegister d, int offset = 0);
// little-endian
- inline void ldxl(Register s1, Register s2, Register d) { ldxa(s1, s2, ASI_PRIMARY_LITTLE, d); }
+ inline void lduwl(Register s1, Register s2, Register d) { lduwa(s1, s2, ASI_PRIMARY_LITTLE, d); }
+ inline void ldswl(Register s1, Register s2, Register d) { ldswa(s1, s2, ASI_PRIMARY_LITTLE, d);}
+ inline void ldxl( Register s1, Register s2, Register d) { ldxa(s1, s2, ASI_PRIMARY_LITTLE, d); }
inline void ldfl(FloatRegisterImpl::Width w, Register s1, Register s2, FloatRegister d) { ldfa(w, s1, s2, ASI_PRIMARY_LITTLE, d); }
// membar psuedo instruction. takes into account target memory model.
@@ -1469,6 +1471,15 @@
void movitof_revbytes(Register src, FloatRegister dst, Register tmp1, Register tmp2);
void movftoi_revbytes(FloatRegister src, Register dst, Register tmp1, Register tmp2);
+ // CRC32 code for java.util.zip.CRC32::updateBytes0() instrinsic.
+ void kernel_crc32(Register crc, Register buf, Register len, Register table);
+ // Fold 128-bit data chunk
+ void fold_128bit_crc32(Register xcrc_hi, Register xcrc_lo, Register xK_hi, Register xK_lo, Register xtmp_hi, Register xtmp_lo, Register buf, int offset);
+ void fold_128bit_crc32(Register xcrc_hi, Register xcrc_lo, Register xK_hi, Register xK_lo, Register xtmp_hi, Register xtmp_lo, Register xbuf_hi, Register xbuf_lo);
+ // Fold 8-bit data
+ void fold_8bit_crc32(Register xcrc, Register table, Register xtmp, Register tmp);
+ void fold_8bit_crc32(Register crc, Register table, Register tmp);
+
#undef VIRTUAL
};
--- a/hotspot/src/cpu/sparc/vm/stubGenerator_sparc.cpp Fri Nov 20 10:09:42 2015 +0100
+++ b/hotspot/src/cpu/sparc/vm/stubGenerator_sparc.cpp Fri Nov 20 08:29:10 2015 -0800
@@ -5292,6 +5292,38 @@
return start;
}
+/**
+ * Arguments:
+ *
+ * Inputs:
+ * O0 - int crc
+ * O1 - byte* buf
+ * O2 - int len
+ * O3 - int* table
+ *
+ * Output:
+ * O0 - int crc result
+ */
+ address generate_updateBytesCRC32() {
+ assert(UseCRC32Intrinsics, "need VIS3 instructions");
+
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
+ address start = __ pc();
+
+ const Register crc = O0; // crc
+ const Register buf = O1; // source java byte array address
+ const Register len = O2; // length
+ const Register table = O3; // crc_table address (reuse register)
+
+ __ kernel_crc32(crc, buf, len, table);
+
+ __ retl();
+ __ delayed()->nop();
+
+ return start;
+ }
+
void generate_initial() {
// Generates all stubs and initializes the entry points
@@ -5324,6 +5356,12 @@
// Build this early so it's available for the interpreter.
StubRoutines::_throw_StackOverflowError_entry = generate_throw_exception("StackOverflowError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError));
+
+ if (UseCRC32Intrinsics) {
+ // set table address before stub generation which use it
+ StubRoutines::_crc_table_adr = (address)StubRoutines::Sparc::_crc_table;
+ StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
+ }
}
--- a/hotspot/src/cpu/sparc/vm/stubRoutines_sparc.cpp Fri Nov 20 10:09:42 2015 +0100
+++ b/hotspot/src/cpu/sparc/vm/stubRoutines_sparc.cpp Fri Nov 20 08:29:10 2015 -0800
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -52,3 +52,98 @@
address StubRoutines::Sparc::_flush_callers_register_windows_entry = CAST_FROM_FN_PTR(address, bootstrap_flush_windows);
address StubRoutines::Sparc::_partial_subtype_check = NULL;
+
+uint64_t StubRoutines::Sparc::_crc_by128_masks[] =
+{
+ /* The fields in this structure are arranged so that they can be
+ * picked up two at a time with 128-bit loads.
+ *
+ * Because of flipped bit order for this CRC polynomials
+ * the constant for X**N is left-shifted by 1. This is because
+ * a 64 x 64 polynomial multiply produces a 127-bit result
+ * but the highest term is always aligned to bit 0 in the container.
+ * Pre-shifting by one fixes this, at the cost of potentially making
+ * the 32-bit constant no longer fit in a 32-bit container (thus the
+ * use of uint64_t, though this is also the size used by the carry-
+ * less multiply instruction.
+ *
+ * In addition, the flipped bit order and highest-term-at-least-bit
+ * multiply changes the constants used. The 96-bit result will be
+ * aligned to the high-term end of the target 128-bit container,
+ * not the low-term end; that is, instead of a 512-bit or 576-bit fold,
+ * instead it is a 480 (=512-32) or 544 (=512+64-32) bit fold.
+ *
+ * This cause additional problems in the 128-to-64-bit reduction; see the
+ * code for details. By storing a mask in the otherwise unused half of
+ * a 128-bit constant, bits can be cleared before multiplication without
+ * storing and reloading. Note that staying on a 128-bit datapath means
+ * that some data is uselessly stored and some unused data is intersected
+ * with an irrelevant constant.
+ */
+
+ ((uint64_t) 0xffffffffUL), /* low of K_M_64 */
+ ((uint64_t) 0xb1e6b092U << 1), /* high of K_M_64 */
+ ((uint64_t) 0xba8ccbe8U << 1), /* low of K_160_96 */
+ ((uint64_t) 0x6655004fU << 1), /* high of K_160_96 */
+ ((uint64_t) 0xaa2215eaU << 1), /* low of K_544_480 */
+ ((uint64_t) 0xe3720acbU << 1) /* high of K_544_480 */
+};
+
+/**
+ * crc_table[] from jdk/src/java.base/share/native/libzip/zlib-1.2.8/crc32.h
+ */
+juint StubRoutines::Sparc::_crc_table[] =
+{
+ 0x00000000UL, 0x77073096UL, 0xee0e612cUL, 0x990951baUL, 0x076dc419UL,
+ 0x706af48fUL, 0xe963a535UL, 0x9e6495a3UL, 0x0edb8832UL, 0x79dcb8a4UL,
+ 0xe0d5e91eUL, 0x97d2d988UL, 0x09b64c2bUL, 0x7eb17cbdUL, 0xe7b82d07UL,
+ 0x90bf1d91UL, 0x1db71064UL, 0x6ab020f2UL, 0xf3b97148UL, 0x84be41deUL,
+ 0x1adad47dUL, 0x6ddde4ebUL, 0xf4d4b551UL, 0x83d385c7UL, 0x136c9856UL,
+ 0x646ba8c0UL, 0xfd62f97aUL, 0x8a65c9ecUL, 0x14015c4fUL, 0x63066cd9UL,
+ 0xfa0f3d63UL, 0x8d080df5UL, 0x3b6e20c8UL, 0x4c69105eUL, 0xd56041e4UL,
+ 0xa2677172UL, 0x3c03e4d1UL, 0x4b04d447UL, 0xd20d85fdUL, 0xa50ab56bUL,
+ 0x35b5a8faUL, 0x42b2986cUL, 0xdbbbc9d6UL, 0xacbcf940UL, 0x32d86ce3UL,
+ 0x45df5c75UL, 0xdcd60dcfUL, 0xabd13d59UL, 0x26d930acUL, 0x51de003aUL,
+ 0xc8d75180UL, 0xbfd06116UL, 0x21b4f4b5UL, 0x56b3c423UL, 0xcfba9599UL,
+ 0xb8bda50fUL, 0x2802b89eUL, 0x5f058808UL, 0xc60cd9b2UL, 0xb10be924UL,
+ 0x2f6f7c87UL, 0x58684c11UL, 0xc1611dabUL, 0xb6662d3dUL, 0x76dc4190UL,
+ 0x01db7106UL, 0x98d220bcUL, 0xefd5102aUL, 0x71b18589UL, 0x06b6b51fUL,
+ 0x9fbfe4a5UL, 0xe8b8d433UL, 0x7807c9a2UL, 0x0f00f934UL, 0x9609a88eUL,
+ 0xe10e9818UL, 0x7f6a0dbbUL, 0x086d3d2dUL, 0x91646c97UL, 0xe6635c01UL,
+ 0x6b6b51f4UL, 0x1c6c6162UL, 0x856530d8UL, 0xf262004eUL, 0x6c0695edUL,
+ 0x1b01a57bUL, 0x8208f4c1UL, 0xf50fc457UL, 0x65b0d9c6UL, 0x12b7e950UL,
+ 0x8bbeb8eaUL, 0xfcb9887cUL, 0x62dd1ddfUL, 0x15da2d49UL, 0x8cd37cf3UL,
+ 0xfbd44c65UL, 0x4db26158UL, 0x3ab551ceUL, 0xa3bc0074UL, 0xd4bb30e2UL,
+ 0x4adfa541UL, 0x3dd895d7UL, 0xa4d1c46dUL, 0xd3d6f4fbUL, 0x4369e96aUL,
+ 0x346ed9fcUL, 0xad678846UL, 0xda60b8d0UL, 0x44042d73UL, 0x33031de5UL,
+ 0xaa0a4c5fUL, 0xdd0d7cc9UL, 0x5005713cUL, 0x270241aaUL, 0xbe0b1010UL,
+ 0xc90c2086UL, 0x5768b525UL, 0x206f85b3UL, 0xb966d409UL, 0xce61e49fUL,
+ 0x5edef90eUL, 0x29d9c998UL, 0xb0d09822UL, 0xc7d7a8b4UL, 0x59b33d17UL,
+ 0x2eb40d81UL, 0xb7bd5c3bUL, 0xc0ba6cadUL, 0xedb88320UL, 0x9abfb3b6UL,
+ 0x03b6e20cUL, 0x74b1d29aUL, 0xead54739UL, 0x9dd277afUL, 0x04db2615UL,
+ 0x73dc1683UL, 0xe3630b12UL, 0x94643b84UL, 0x0d6d6a3eUL, 0x7a6a5aa8UL,
+ 0xe40ecf0bUL, 0x9309ff9dUL, 0x0a00ae27UL, 0x7d079eb1UL, 0xf00f9344UL,
+ 0x8708a3d2UL, 0x1e01f268UL, 0x6906c2feUL, 0xf762575dUL, 0x806567cbUL,
+ 0x196c3671UL, 0x6e6b06e7UL, 0xfed41b76UL, 0x89d32be0UL, 0x10da7a5aUL,
+ 0x67dd4accUL, 0xf9b9df6fUL, 0x8ebeeff9UL, 0x17b7be43UL, 0x60b08ed5UL,
+ 0xd6d6a3e8UL, 0xa1d1937eUL, 0x38d8c2c4UL, 0x4fdff252UL, 0xd1bb67f1UL,
+ 0xa6bc5767UL, 0x3fb506ddUL, 0x48b2364bUL, 0xd80d2bdaUL, 0xaf0a1b4cUL,
+ 0x36034af6UL, 0x41047a60UL, 0xdf60efc3UL, 0xa867df55UL, 0x316e8eefUL,
+ 0x4669be79UL, 0xcb61b38cUL, 0xbc66831aUL, 0x256fd2a0UL, 0x5268e236UL,
+ 0xcc0c7795UL, 0xbb0b4703UL, 0x220216b9UL, 0x5505262fUL, 0xc5ba3bbeUL,
+ 0xb2bd0b28UL, 0x2bb45a92UL, 0x5cb36a04UL, 0xc2d7ffa7UL, 0xb5d0cf31UL,
+ 0x2cd99e8bUL, 0x5bdeae1dUL, 0x9b64c2b0UL, 0xec63f226UL, 0x756aa39cUL,
+ 0x026d930aUL, 0x9c0906a9UL, 0xeb0e363fUL, 0x72076785UL, 0x05005713UL,
+ 0x95bf4a82UL, 0xe2b87a14UL, 0x7bb12baeUL, 0x0cb61b38UL, 0x92d28e9bUL,
+ 0xe5d5be0dUL, 0x7cdcefb7UL, 0x0bdbdf21UL, 0x86d3d2d4UL, 0xf1d4e242UL,
+ 0x68ddb3f8UL, 0x1fda836eUL, 0x81be16cdUL, 0xf6b9265bUL, 0x6fb077e1UL,
+ 0x18b74777UL, 0x88085ae6UL, 0xff0f6a70UL, 0x66063bcaUL, 0x11010b5cUL,
+ 0x8f659effUL, 0xf862ae69UL, 0x616bffd3UL, 0x166ccf45UL, 0xa00ae278UL,
+ 0xd70dd2eeUL, 0x4e048354UL, 0x3903b3c2UL, 0xa7672661UL, 0xd06016f7UL,
+ 0x4969474dUL, 0x3e6e77dbUL, 0xaed16a4aUL, 0xd9d65adcUL, 0x40df0b66UL,
+ 0x37d83bf0UL, 0xa9bcae53UL, 0xdebb9ec5UL, 0x47b2cf7fUL, 0x30b5ffe9UL,
+ 0xbdbdf21cUL, 0xcabac28aUL, 0x53b39330UL, 0x24b4a3a6UL, 0xbad03605UL,
+ 0xcdd70693UL, 0x54de5729UL, 0x23d967bfUL, 0xb3667a2eUL, 0xc4614ab8UL,
+ 0x5d681b02UL, 0x2a6f2b94UL, 0xb40bbe37UL, 0xc30c8ea1UL, 0x5a05df1bUL,
+ 0x2d02ef8dUL
+};
--- a/hotspot/src/cpu/sparc/vm/stubRoutines_sparc.hpp Fri Nov 20 10:09:42 2015 +0100
+++ b/hotspot/src/cpu/sparc/vm/stubRoutines_sparc.hpp Fri Nov 20 08:29:10 2015 -0800
@@ -53,6 +53,9 @@
static address _flush_callers_register_windows_entry;
static address _partial_subtype_check;
+ // masks and table for CRC32
+ static uint64_t _crc_by128_masks[];
+ static juint _crc_table[];
public:
// test assembler stop routine by setting registers
@@ -65,6 +68,8 @@
static intptr_t* (*flush_callers_register_windows_func())() { return CAST_TO_FN_PTR(intptr_t* (*)(void), _flush_callers_register_windows_entry); }
static address partial_subtype_check() { return _partial_subtype_check; }
+
+ static address crc_by128_masks_addr() { return (address)_crc_by128_masks; }
};
#endif // CPU_SPARC_VM_STUBROUTINES_SPARC_HPP
--- a/hotspot/src/cpu/sparc/vm/templateInterpreter_sparc.cpp Fri Nov 20 10:09:42 2015 +0100
+++ b/hotspot/src/cpu/sparc/vm/templateInterpreter_sparc.cpp Fri Nov 20 08:29:10 2015 -0800
@@ -803,6 +803,106 @@
return NULL;
}
+/**
+ * Method entry for static native methods:
+ * int java.util.zip.CRC32.update(int crc, int b)
+ */
+address InterpreterGenerator::generate_CRC32_update_entry() {
+
+ if (UseCRC32Intrinsics) {
+ address entry = __ pc();
+
+ Label L_slow_path;
+ // If we need a safepoint check, generate full interpreter entry.
+ ExternalAddress state(SafepointSynchronize::address_of_state());
+ __ set(ExternalAddress(SafepointSynchronize::address_of_state()), O2);
+ __ set(SafepointSynchronize::_not_synchronized, O3);
+ __ cmp_and_br_short(O2, O3, Assembler::notEqual, Assembler::pt, L_slow_path);
+
+ // Load parameters
+ const Register crc = O0; // initial crc
+ const Register val = O1; // byte to update with
+ const Register table = O2; // address of 256-entry lookup table
+
+ __ ldub(Gargs, 3, val);
+ __ lduw(Gargs, 8, crc);
+
+ __ set(ExternalAddress(StubRoutines::crc_table_addr()), table);
+
+ __ not1(crc); // ~crc
+ __ clruwu(crc);
+ __ update_byte_crc32(crc, val, table);
+ __ not1(crc); // ~crc
+
+ // result in O0
+ __ retl();
+ __ delayed()->nop();
+
+ // generate a vanilla native entry as the slow path
+ __ bind(L_slow_path);
+ __ jump_to_entry(Interpreter::entry_for_kind(Interpreter::native));
+ return entry;
+ }
+ return NULL;
+}
+
+/**
+ * Method entry for static native methods:
+ * int java.util.zip.CRC32.updateBytes(int crc, byte[] b, int off, int len)
+ * int java.util.zip.CRC32.updateByteBuffer(int crc, long buf, int off, int len)
+ */
+address InterpreterGenerator::generate_CRC32_updateBytes_entry(AbstractInterpreter::MethodKind kind) {
+
+ if (UseCRC32Intrinsics) {
+ address entry = __ pc();
+
+ Label L_slow_path;
+ // If we need a safepoint check, generate full interpreter entry.
+ ExternalAddress state(SafepointSynchronize::address_of_state());
+ __ set(ExternalAddress(SafepointSynchronize::address_of_state()), O2);
+ __ set(SafepointSynchronize::_not_synchronized, O3);
+ __ cmp_and_br_short(O2, O3, Assembler::notEqual, Assembler::pt, L_slow_path);
+
+ // Load parameters from the stack
+ const Register crc = O0; // initial crc
+ const Register buf = O1; // source java byte array address
+ const Register len = O2; // len
+ const Register offset = O3; // offset
+
+ // Arguments are reversed on java expression stack
+ // Calculate address of start element
+ if (kind == Interpreter::java_util_zip_CRC32_updateByteBuffer) {
+ __ lduw(Gargs, 0, len);
+ __ lduw(Gargs, 8, offset);
+ __ ldx( Gargs, 16, buf);
+ __ lduw(Gargs, 32, crc);
+ __ add(buf, offset, buf);
+ } else {
+ __ lduw(Gargs, 0, len);
+ __ lduw(Gargs, 8, offset);
+ __ ldx( Gargs, 16, buf);
+ __ lduw(Gargs, 24, crc);
+ __ add(buf, arrayOopDesc::base_offset_in_bytes(T_BYTE), buf); // account for the header size
+ __ add(buf ,offset, buf);
+ }
+
+ // Call the crc32 kernel
+ __ MacroAssembler::save_thread(L7_thread_cache);
+ __ kernel_crc32(crc, buf, len, O3);
+ __ MacroAssembler::restore_thread(L7_thread_cache);
+
+ // result in O0
+ __ retl();
+ __ delayed()->nop();
+
+ // generate a vanilla native entry as the slow path
+ __ bind(L_slow_path);
+ __ jump_to_entry(Interpreter::entry_for_kind(Interpreter::native));
+ return entry;
+ }
+ return NULL;
+}
+
//
// Interpreter stub for calling a native method. (asm interpreter)
// This sets up a somewhat different looking stack for calling the native method
--- a/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp Fri Nov 20 10:09:42 2015 +0100
+++ b/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp Fri Nov 20 08:29:10 2015 -0800
@@ -347,6 +347,15 @@
FLAG_SET_DEFAULT(UseAdler32Intrinsics, false);
}
+ if (UseVIS > 2) {
+ if (FLAG_IS_DEFAULT(UseCRC32Intrinsics)) {
+ FLAG_SET_DEFAULT(UseCRC32Intrinsics, true);
+ }
+ } else if (UseCRC32Intrinsics) {
+ warning("SPARC CRC32 intrinsics require VIS3 insructions support. Intriniscs will be disabled");
+ FLAG_SET_DEFAULT(UseCRC32Intrinsics, false);
+ }
+
if (FLAG_IS_DEFAULT(ContendedPaddingWidth) &&
(cache_line_size > ContendedPaddingWidth))
ContendedPaddingWidth = cache_line_size;
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/test/compiler/intrinsics/crc32/TestCRC32.java Fri Nov 20 08:29:10 2015 -0800
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/**
+ * @test
+ * @bug 8143012
+ * @summary CRC32 Intrinsics support on SPARC
+ *
+ * @run main/othervm/timeout=720 -Xbatch TestCRC32 -m
+ */
+
+import java.nio.ByteBuffer;
+import java.util.zip.Checksum;
+import java.util.zip.CRC32;
+
+public class TestCRC32 {
+ public static void main(String[] args) {
+ int offset = Integer.getInteger("offset", 0);
+ int msgSize = Integer.getInteger("msgSize", 512);
+ boolean multi = false;
+ int iters = 20000;
+ int warmupIters = 20000;
+
+ if (args.length > 0) {
+ if (args[0].equals("-m")) {
+ multi = true;
+ } else {
+ iters = Integer.valueOf(args[0]);
+ }
+ if (args.length > 1) {
+ warmupIters = Integer.valueOf(args[1]);
+ }
+ }
+
+ if (multi) {
+ test_multi(warmupIters);
+ return;
+ }
+
+ System.out.println(" offset = " + offset);
+ System.out.println("msgSize = " + msgSize + " bytes");
+ System.out.println(" iters = " + iters);
+
+ byte[] b = initializedBytes(msgSize, offset);
+
+ CRC32 crc0 = new CRC32();
+ CRC32 crc1 = new CRC32();
+ CRC32 crc2 = new CRC32();
+
+ crc0.update(b, offset, msgSize);
+
+ System.out.println("-------------------------------------------------------");
+
+ /* warm up */
+ for (int i = 0; i < warmupIters; i++) {
+ crc1.reset();
+ crc1.update(b, offset, msgSize);
+ }
+
+ /* measure performance */
+ long start = System.nanoTime();
+ for (int i = 0; i < iters; i++) {
+ crc1.reset();
+ crc1.update(b, offset, msgSize);
+ }
+ long end = System.nanoTime();
+ double total = (double)(end - start)/1e9; // in seconds
+ double thruput = (double)msgSize*iters/1e6/total; // in MB/s
+ System.out.println("CRC32.update(byte[]) runtime = " + total + " seconds");
+ System.out.println("CRC32.update(byte[]) throughput = " + thruput + " MB/s");
+
+ /* check correctness */
+ for (int i = 0; i < iters; i++) {
+ crc1.reset();
+ crc1.update(b, offset, msgSize);
+ if (!check(crc0, crc1)) break;
+ }
+ report("CRCs", crc0, crc1);
+
+ System.out.println("-------------------------------------------------------");
+
+ ByteBuffer buf = ByteBuffer.allocateDirect(msgSize);
+ buf.put(b, offset, msgSize);
+ buf.flip();
+
+ /* warm up */
+ for (int i = 0; i < warmupIters; i++) {
+ crc2.reset();
+ crc2.update(buf);
+ buf.rewind();
+ }
+
+ /* measure performance */
+ start = System.nanoTime();
+ for (int i = 0; i < iters; i++) {
+ crc2.reset();
+ crc2.update(buf);
+ buf.rewind();
+ }
+ end = System.nanoTime();
+ total = (double)(end - start)/1e9; // in seconds
+ thruput = (double)msgSize*iters/1e6/total; // in MB/s
+ System.out.println("CRC32.update(ByteBuffer) runtime = " + total + " seconds");
+ System.out.println("CRC32.update(ByteBuffer) throughput = " + thruput + " MB/s");
+
+ /* check correctness */
+ for (int i = 0; i < iters; i++) {
+ crc2.reset();
+ crc2.update(buf);
+ buf.rewind();
+ if (!check(crc0, crc2)) break;
+ }
+ report("CRCs", crc0, crc2);
+
+ System.out.println("-------------------------------------------------------");
+ }
+
+ private static void report(String s, Checksum crc0, Checksum crc1) {
+ System.out.printf("%s: crc0 = %08x, crc1 = %08x\n",
+ s, crc0.getValue(), crc1.getValue());
+ }
+
+ private static boolean check(Checksum crc0, Checksum crc1) {
+ if (crc0.getValue() != crc1.getValue()) {
+ System.err.printf("ERROR: crc0 = %08x, crc1 = %08x\n",
+ crc0.getValue(), crc1.getValue());
+ return false;
+ }
+ return true;
+ }
+
+ private static byte[] initializedBytes(int M, int offset) {
+ byte[] bytes = new byte[M + offset];
+ for (int i = 0; i < offset; i++) {
+ bytes[i] = (byte) i;
+ }
+ for (int i = offset; i < bytes.length; i++) {
+ bytes[i] = (byte) (i - offset);
+ }
+ return bytes;
+ }
+
+ private static void test_multi(int iters) {
+ int len1 = 8; // the 8B/iteration loop
+ int len2 = 32; // the 32B/iteration loop
+ int len3 = 4096; // the 4KB/iteration loop
+
+ byte[] b = initializedBytes(len3*16, 0);
+ int[] offsets = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 16, 32, 64, 128, 256, 512 };
+ int[] sizes = { 0, 1, 2, 3, 4, 5, 6, 7,
+ len1, len1+1, len1+2, len1+3, len1+4, len1+5, len1+6, len1+7,
+ len1*2, len1*2+1, len1*2+3, len1*2+5, len1*2+7,
+ len2, len2+1, len2+3, len2+5, len2+7,
+ len2*2, len2*4, len2*8, len2*16, len2*32, len2*64,
+ len3, len3+1, len3+3, len3+5, len3+7,
+ len3*2, len3*4, len3*8,
+ len1+len2, len1+len2+1, len1+len2+3, len1+len2+5, len1+len2+7,
+ len1+len3, len1+len3+1, len1+len3+3, len1+len3+5, len1+len3+7,
+ len2+len3, len2+len3+1, len2+len3+3, len2+len3+5, len2+len3+7,
+ len1+len2+len3, len1+len2+len3+1, len1+len2+len3+3,
+ len1+len2+len3+5, len1+len2+len3+7,
+ (len1+len2+len3)*2, (len1+len2+len3)*2+1, (len1+len2+len3)*2+3,
+ (len1+len2+len3)*2+5, (len1+len2+len3)*2+7,
+ (len1+len2+len3)*3, (len1+len2+len3)*3-1, (len1+len2+len3)*3-3,
+ (len1+len2+len3)*3-5, (len1+len2+len3)*3-7 };
+ CRC32[] crc0 = new CRC32[offsets.length*sizes.length];
+ CRC32[] crc1 = new CRC32[offsets.length*sizes.length];
+ int i, j, k;
+
+ System.out.printf("testing %d cases ...\n", offsets.length*sizes.length);
+
+ /* set the result from interpreter as reference */
+ for (i = 0; i < offsets.length; i++) {
+ for (j = 0; j < sizes.length; j++) {
+ crc0[i*sizes.length + j] = new CRC32();
+ crc1[i*sizes.length + j] = new CRC32();
+ crc0[i*sizes.length + j].update(b, offsets[i], sizes[j]);
+ }
+ }
+
+ /* warm up the JIT compiler and get result */
+ for (k = 0; k < iters; k++) {
+ for (i = 0; i < offsets.length; i++) {
+ for (j = 0; j < sizes.length; j++) {
+ crc1[i*sizes.length + j].reset();
+ crc1[i*sizes.length + j].update(b, offsets[i], sizes[j]);
+ }
+ }
+ }
+
+ /* check correctness */
+ for (i = 0; i < offsets.length; i++) {
+ for (j = 0; j < sizes.length; j++) {
+ if (!check(crc0[i*sizes.length + j], crc1[i*sizes.length + j])) {
+ System.out.printf("offsets[%d] = %d", i, offsets[i]);
+ System.out.printf("\tsizes[%d] = %d\n", j, sizes[j]);
+ }
+ }
+ }
+ }
+}