8073108: Use x86 and SPARC CPU instructions for GHASH acceleration
Reviewed-by: kvn, jrose
--- a/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp Tue Jun 16 16:10:36 2015 -0700
+++ b/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp Wed Jun 17 17:48:25 2015 -0700
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2015, Red Hat Inc. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
@@ -190,6 +190,11 @@
}
}
+ if (UseGHASHIntrinsics) {
+ warning("GHASH intrinsics are not available on this CPU");
+ FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
+ }
+
if (FLAG_IS_DEFAULT(UseCRC32Intrinsics)) {
UseCRC32Intrinsics = true;
}
--- a/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp Tue Jun 16 16:10:36 2015 -0700
+++ b/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp Wed Jun 17 17:48:25 2015 -0700
@@ -176,6 +176,11 @@
FLAG_SET_DEFAULT(UseAESIntrinsics, false);
}
+ if (UseGHASHIntrinsics) {
+ warning("GHASH intrinsics are not available on this CPU");
+ FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
+ }
+
if (UseSHA) {
warning("SHA instructions are not available on this CPU");
FLAG_SET_DEFAULT(UseSHA, false);
--- a/hotspot/src/cpu/sparc/vm/assembler_sparc.hpp Tue Jun 16 16:10:36 2015 -0700
+++ b/hotspot/src/cpu/sparc/vm/assembler_sparc.hpp Wed Jun 17 17:48:25 2015 -0700
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -129,6 +129,7 @@
flog3_op3 = 0x36,
edge_op3 = 0x36,
fsrc_op3 = 0x36,
+ xmulx_op3 = 0x36,
impdep2_op3 = 0x37,
stpartialf_op3 = 0x37,
jmpl_op3 = 0x38,
@@ -220,6 +221,8 @@
mdtox_opf = 0x110,
mstouw_opf = 0x111,
mstosw_opf = 0x113,
+ xmulx_opf = 0x115,
+ xmulxhi_opf = 0x116,
mxtod_opf = 0x118,
mwtos_opf = 0x119,
@@ -1212,6 +1215,9 @@
void movwtos( Register s, FloatRegister d ) { vis3_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::S) | op3(mftoi_op3) | opf(mwtos_opf) | rs2(s)); }
void movxtod( Register s, FloatRegister d ) { vis3_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(mftoi_op3) | opf(mxtod_opf) | rs2(s)); }
+ void xmulx(Register s1, Register s2, Register d) { vis3_only(); emit_int32( op(arith_op) | rd(d) | op3(xmulx_op3) | rs1(s1) | opf(xmulx_opf) | rs2(s2)); }
+ void xmulxhi(Register s1, Register s2, Register d) { vis3_only(); emit_int32( op(arith_op) | rd(d) | op3(xmulx_op3) | rs1(s1) | opf(xmulxhi_opf) | rs2(s2)); }
+
// Crypto SHA instructions
void sha1() { sha1_only(); emit_int32( op(arith_op) | op3(sha_op3) | opf(sha1_opf)); }
--- a/hotspot/src/cpu/sparc/vm/stubGenerator_sparc.cpp Tue Jun 16 16:10:36 2015 -0700
+++ b/hotspot/src/cpu/sparc/vm/stubGenerator_sparc.cpp Wed Jun 17 17:48:25 2015 -0700
@@ -4786,6 +4786,130 @@
return start;
}
+ /* Single and multi-block ghash operations */
+ address generate_ghash_processBlocks() {
+ __ align(CodeEntryAlignment);
+ Label L_ghash_loop, L_aligned, L_main;
+ StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
+ address start = __ pc();
+
+ Register state = I0;
+ Register subkeyH = I1;
+ Register data = I2;
+ Register len = I3;
+
+ __ save_frame(0);
+
+ __ ldx(state, 0, O0);
+ __ ldx(state, 8, O1);
+
+ // Loop label for multiblock operations
+ __ BIND(L_ghash_loop);
+
+ // Check if 'data' is unaligned
+ __ andcc(data, 7, G1);
+ __ br(Assembler::zero, false, Assembler::pt, L_aligned);
+ __ delayed()->nop();
+
+ Register left_shift = L1;
+ Register right_shift = L2;
+ Register data_ptr = L3;
+
+ // Get left and right shift values in bits
+ __ sll(G1, LogBitsPerByte, left_shift);
+ __ mov(64, right_shift);
+ __ sub(right_shift, left_shift, right_shift);
+
+ // Align to read 'data'
+ __ sub(data, G1, data_ptr);
+
+ // Load first 8 bytes of 'data'
+ __ ldx(data_ptr, 0, O4);
+ __ sllx(O4, left_shift, O4);
+ __ ldx(data_ptr, 8, O5);
+ __ srlx(O5, right_shift, G4);
+ __ bset(G4, O4);
+
+ // Load second 8 bytes of 'data'
+ __ sllx(O5, left_shift, O5);
+ __ ldx(data_ptr, 16, G4);
+ __ srlx(G4, right_shift, G4);
+ __ ba(L_main);
+ __ delayed()->bset(G4, O5);
+
+ // If 'data' is aligned, load normally
+ __ BIND(L_aligned);
+ __ ldx(data, 0, O4);
+ __ ldx(data, 8, O5);
+
+ __ BIND(L_main);
+ __ ldx(subkeyH, 0, O2);
+ __ ldx(subkeyH, 8, O3);
+
+ __ xor3(O0, O4, O0);
+ __ xor3(O1, O5, O1);
+
+ __ xmulxhi(O0, O3, G3);
+ __ xmulx(O0, O2, O5);
+ __ xmulxhi(O1, O2, G4);
+ __ xmulxhi(O1, O3, G5);
+ __ xmulx(O0, O3, G1);
+ __ xmulx(O1, O3, G2);
+ __ xmulx(O1, O2, O3);
+ __ xmulxhi(O0, O2, O4);
+
+ __ mov(0xE1, O0);
+ __ sllx(O0, 56, O0);
+
+ __ xor3(O5, G3, O5);
+ __ xor3(O5, G4, O5);
+ __ xor3(G5, G1, G1);
+ __ xor3(G1, O3, G1);
+ __ srlx(G2, 63, O1);
+ __ srlx(G1, 63, G3);
+ __ sllx(G2, 63, O3);
+ __ sllx(G2, 58, O2);
+ __ xor3(O3, O2, O2);
+
+ __ sllx(G1, 1, G1);
+ __ or3(G1, O1, G1);
+
+ __ xor3(G1, O2, G1);
+
+ __ sllx(G2, 1, G2);
+
+ __ xmulxhi(G1, O0, O1);
+ __ xmulx(G1, O0, O2);
+ __ xmulxhi(G2, O0, O3);
+ __ xmulx(G2, O0, G1);
+
+ __ xor3(O4, O1, O4);
+ __ xor3(O5, O2, O5);
+ __ xor3(O5, O3, O5);
+
+ __ sllx(O4, 1, O2);
+ __ srlx(O5, 63, O3);
+
+ __ or3(O2, O3, O0);
+
+ __ sllx(O5, 1, O1);
+ __ srlx(G1, 63, O2);
+ __ or3(O1, O2, O1);
+ __ xor3(O1, G3, O1);
+
+ __ deccc(len);
+ __ br(Assembler::notZero, true, Assembler::pt, L_ghash_loop);
+ __ delayed()->add(data, 16, data);
+
+ __ stx(O0, I0, 0);
+ __ stx(O1, I0, 8);
+
+ __ ret();
+ __ delayed()->restore();
+
+ return start;
+ }
+
void generate_initial() {
// Generates all stubs and initializes the entry points
@@ -4859,6 +4983,10 @@
StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
}
+ // generate GHASH intrinsics code
+ if (UseGHASHIntrinsics) {
+ StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
+ }
// generate SHA1/SHA256/SHA512 intrinsics code
if (UseSHA1Intrinsics) {
--- a/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp Tue Jun 16 16:10:36 2015 -0700
+++ b/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp Wed Jun 17 17:48:25 2015 -0700
@@ -300,6 +300,17 @@
}
}
+ // GHASH/GCM intrinsics
+ if (has_vis3() && (UseVIS > 2)) {
+ if (FLAG_IS_DEFAULT(UseGHASHIntrinsics)) {
+ UseGHASHIntrinsics = true;
+ }
+ } else if (UseGHASHIntrinsics) {
+ if (!FLAG_IS_DEFAULT(UseGHASHIntrinsics))
+ warning("GHASH intrinsics require VIS3 insructions support. Intriniscs will be disabled");
+ FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
+ }
+
// SHA1, SHA256, and SHA512 instructions were added to SPARC T-series at different times
if (has_sha1() || has_sha256() || has_sha512()) {
if (UseVIS > 0) { // SHA intrinsics use VIS1 instructions
--- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp Tue Jun 16 16:10:36 2015 -0700
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp Wed Jun 17 17:48:25 2015 -0700
@@ -3095,8 +3095,16 @@
void Assembler::psrldq(XMMRegister dst, int shift) {
// Shift 128 bit value in xmm register by number of bytes.
NOT_LP64(assert(VM_Version::supports_sse2(), ""));
- int encode = simd_prefix_and_encode(xmm3, dst, dst, VEX_SIMD_66, true, VEX_OPCODE_0F,
- false, AVX_128bit, (VM_Version::supports_avx512bw() == false));
+ int encode = simd_prefix_and_encode(xmm3, dst, dst, VEX_SIMD_66, true, VEX_OPCODE_0F, false, AVX_128bit, (VM_Version::supports_avx512bw() == false));
+ emit_int8(0x73);
+ emit_int8((unsigned char)(0xC0 | encode));
+ emit_int8(shift);
+}
+
+void Assembler::pslldq(XMMRegister dst, int shift) {
+ // Shift left 128 bit value in xmm register by number of bytes.
+ NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+ int encode = simd_prefix_and_encode(xmm7, dst, dst, VEX_SIMD_66, true, VEX_OPCODE_0F, false, AVX_128bit, (VM_Version::supports_avx512bw() == false));
emit_int8(0x73);
emit_int8((unsigned char)(0xC0 | encode));
emit_int8(shift);
--- a/hotspot/src/cpu/x86/vm/assembler_x86.hpp Tue Jun 16 16:10:36 2015 -0700
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.hpp Wed Jun 17 17:48:25 2015 -0700
@@ -1666,6 +1666,8 @@
// Shift Right by bytes Logical DoubleQuadword Immediate
void psrldq(XMMRegister dst, int shift);
+ // Shift Left by bytes Logical DoubleQuadword Immediate
+ void pslldq(XMMRegister dst, int shift);
// Logical Compare 128bit
void ptest(XMMRegister dst, XMMRegister src);
--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp Tue Jun 16 16:10:36 2015 -0700
+++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp Wed Jun 17 17:48:25 2015 -0700
@@ -2727,6 +2727,167 @@
return start;
}
+ // byte swap x86 long
+ address generate_ghash_long_swap_mask() {
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
+ address start = __ pc();
+ __ emit_data(0x0b0a0908, relocInfo::none, 0);
+ __ emit_data(0x0f0e0d0c, relocInfo::none, 0);
+ __ emit_data(0x03020100, relocInfo::none, 0);
+ __ emit_data(0x07060504, relocInfo::none, 0);
+
+ return start;
+ }
+
+ // byte swap x86 byte array
+ address generate_ghash_byte_swap_mask() {
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
+ address start = __ pc();
+ __ emit_data(0x0c0d0e0f, relocInfo::none, 0);
+ __ emit_data(0x08090a0b, relocInfo::none, 0);
+ __ emit_data(0x04050607, relocInfo::none, 0);
+ __ emit_data(0x00010203, relocInfo::none, 0);
+ return start;
+ }
+
+ /* Single and multi-block ghash operations */
+ address generate_ghash_processBlocks() {
+ assert(UseGHASHIntrinsics, "need GHASH intrinsics and CLMUL support");
+ __ align(CodeEntryAlignment);
+ Label L_ghash_loop, L_exit;
+ StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
+ address start = __ pc();
+
+ const Register state = rdi;
+ const Register subkeyH = rsi;
+ const Register data = rdx;
+ const Register blocks = rcx;
+
+ const Address state_param(rbp, 8+0);
+ const Address subkeyH_param(rbp, 8+4);
+ const Address data_param(rbp, 8+8);
+ const Address blocks_param(rbp, 8+12);
+
+ const XMMRegister xmm_temp0 = xmm0;
+ const XMMRegister xmm_temp1 = xmm1;
+ const XMMRegister xmm_temp2 = xmm2;
+ const XMMRegister xmm_temp3 = xmm3;
+ const XMMRegister xmm_temp4 = xmm4;
+ const XMMRegister xmm_temp5 = xmm5;
+ const XMMRegister xmm_temp6 = xmm6;
+ const XMMRegister xmm_temp7 = xmm7;
+
+ __ enter();
+
+ __ movptr(state, state_param);
+ __ movptr(subkeyH, subkeyH_param);
+ __ movptr(data, data_param);
+ __ movptr(blocks, blocks_param);
+
+ __ movdqu(xmm_temp0, Address(state, 0));
+ __ pshufb(xmm_temp0, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
+
+ __ movdqu(xmm_temp1, Address(subkeyH, 0));
+ __ pshufb(xmm_temp1, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
+
+ __ BIND(L_ghash_loop);
+ __ movdqu(xmm_temp2, Address(data, 0));
+ __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
+
+ __ pxor(xmm_temp0, xmm_temp2);
+
+ //
+ // Multiply with the hash key
+ //
+ __ movdqu(xmm_temp3, xmm_temp0);
+ __ pclmulqdq(xmm_temp3, xmm_temp1, 0); // xmm3 holds a0*b0
+ __ movdqu(xmm_temp4, xmm_temp0);
+ __ pclmulqdq(xmm_temp4, xmm_temp1, 16); // xmm4 holds a0*b1
+
+ __ movdqu(xmm_temp5, xmm_temp0);
+ __ pclmulqdq(xmm_temp5, xmm_temp1, 1); // xmm5 holds a1*b0
+ __ movdqu(xmm_temp6, xmm_temp0);
+ __ pclmulqdq(xmm_temp6, xmm_temp1, 17); // xmm6 holds a1*b1
+
+ __ pxor(xmm_temp4, xmm_temp5); // xmm4 holds a0*b1 + a1*b0
+
+ __ movdqu(xmm_temp5, xmm_temp4); // move the contents of xmm4 to xmm5
+ __ psrldq(xmm_temp4, 8); // shift by xmm4 64 bits to the right
+ __ pslldq(xmm_temp5, 8); // shift by xmm5 64 bits to the left
+ __ pxor(xmm_temp3, xmm_temp5);
+ __ pxor(xmm_temp6, xmm_temp4); // Register pair <xmm6:xmm3> holds the result
+ // of the carry-less multiplication of
+ // xmm0 by xmm1.
+
+ // We shift the result of the multiplication by one bit position
+ // to the left to cope for the fact that the bits are reversed.
+ __ movdqu(xmm_temp7, xmm_temp3);
+ __ movdqu(xmm_temp4, xmm_temp6);
+ __ pslld (xmm_temp3, 1);
+ __ pslld(xmm_temp6, 1);
+ __ psrld(xmm_temp7, 31);
+ __ psrld(xmm_temp4, 31);
+ __ movdqu(xmm_temp5, xmm_temp7);
+ __ pslldq(xmm_temp4, 4);
+ __ pslldq(xmm_temp7, 4);
+ __ psrldq(xmm_temp5, 12);
+ __ por(xmm_temp3, xmm_temp7);
+ __ por(xmm_temp6, xmm_temp4);
+ __ por(xmm_temp6, xmm_temp5);
+
+ //
+ // First phase of the reduction
+ //
+ // Move xmm3 into xmm4, xmm5, xmm7 in order to perform the shifts
+ // independently.
+ __ movdqu(xmm_temp7, xmm_temp3);
+ __ movdqu(xmm_temp4, xmm_temp3);
+ __ movdqu(xmm_temp5, xmm_temp3);
+ __ pslld(xmm_temp7, 31); // packed right shift shifting << 31
+ __ pslld(xmm_temp4, 30); // packed right shift shifting << 30
+ __ pslld(xmm_temp5, 25); // packed right shift shifting << 25
+ __ pxor(xmm_temp7, xmm_temp4); // xor the shifted versions
+ __ pxor(xmm_temp7, xmm_temp5);
+ __ movdqu(xmm_temp4, xmm_temp7);
+ __ pslldq(xmm_temp7, 12);
+ __ psrldq(xmm_temp4, 4);
+ __ pxor(xmm_temp3, xmm_temp7); // first phase of the reduction complete
+
+ //
+ // Second phase of the reduction
+ //
+ // Make 3 copies of xmm3 in xmm2, xmm5, xmm7 for doing these
+ // shift operations.
+ __ movdqu(xmm_temp2, xmm_temp3);
+ __ movdqu(xmm_temp7, xmm_temp3);
+ __ movdqu(xmm_temp5, xmm_temp3);
+ __ psrld(xmm_temp2, 1); // packed left shifting >> 1
+ __ psrld(xmm_temp7, 2); // packed left shifting >> 2
+ __ psrld(xmm_temp5, 7); // packed left shifting >> 7
+ __ pxor(xmm_temp2, xmm_temp7); // xor the shifted versions
+ __ pxor(xmm_temp2, xmm_temp5);
+ __ pxor(xmm_temp2, xmm_temp4);
+ __ pxor(xmm_temp3, xmm_temp2);
+ __ pxor(xmm_temp6, xmm_temp3); // the result is in xmm6
+
+ __ decrement(blocks);
+ __ jcc(Assembler::zero, L_exit);
+ __ movdqu(xmm_temp0, xmm_temp6);
+ __ addptr(data, 16);
+ __ jmp(L_ghash_loop);
+
+ __ BIND(L_exit);
+ // Byte swap 16-byte result
+ __ pshufb(xmm_temp6, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
+ __ movdqu(Address(state, 0), xmm_temp6); // store the result
+
+ __ leave();
+ __ ret(0);
+ return start;
+ }
+
/**
* Arguments:
*
@@ -3026,6 +3187,13 @@
StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
}
+ // Generate GHASH intrinsics code
+ if (UseGHASHIntrinsics) {
+ StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
+ StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
+ StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
+ }
+
// Safefetch stubs.
generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
&StubRoutines::_safefetch32_fault_pc,
--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp Tue Jun 16 16:10:36 2015 -0700
+++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp Wed Jun 17 17:48:25 2015 -0700
@@ -3681,6 +3681,175 @@
return start;
}
+
+ // byte swap x86 long
+ address generate_ghash_long_swap_mask() {
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
+ address start = __ pc();
+ __ emit_data64(0x0f0e0d0c0b0a0908, relocInfo::none );
+ __ emit_data64(0x0706050403020100, relocInfo::none );
+ return start;
+ }
+
+ // byte swap x86 byte array
+ address generate_ghash_byte_swap_mask() {
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
+ address start = __ pc();
+ __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none );
+ __ emit_data64(0x0001020304050607, relocInfo::none );
+ return start;
+ }
+
+ /* Single and multi-block ghash operations */
+ address generate_ghash_processBlocks() {
+ __ align(CodeEntryAlignment);
+ Label L_ghash_loop, L_exit;
+ StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
+ address start = __ pc();
+
+ const Register state = c_rarg0;
+ const Register subkeyH = c_rarg1;
+ const Register data = c_rarg2;
+ const Register blocks = c_rarg3;
+
+#ifdef _WIN64
+ const int XMM_REG_LAST = 10;
+#endif
+
+ const XMMRegister xmm_temp0 = xmm0;
+ const XMMRegister xmm_temp1 = xmm1;
+ const XMMRegister xmm_temp2 = xmm2;
+ const XMMRegister xmm_temp3 = xmm3;
+ const XMMRegister xmm_temp4 = xmm4;
+ const XMMRegister xmm_temp5 = xmm5;
+ const XMMRegister xmm_temp6 = xmm6;
+ const XMMRegister xmm_temp7 = xmm7;
+ const XMMRegister xmm_temp8 = xmm8;
+ const XMMRegister xmm_temp9 = xmm9;
+ const XMMRegister xmm_temp10 = xmm10;
+
+ __ enter();
+
+#ifdef _WIN64
+ // save the xmm registers which must be preserved 6-10
+ __ subptr(rsp, -rsp_after_call_off * wordSize);
+ for (int i = 6; i <= XMM_REG_LAST; i++) {
+ __ movdqu(xmm_save(i), as_XMMRegister(i));
+ }
+#endif
+
+ __ movdqu(xmm_temp10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
+
+ __ movdqu(xmm_temp0, Address(state, 0));
+ __ pshufb(xmm_temp0, xmm_temp10);
+
+
+ __ BIND(L_ghash_loop);
+ __ movdqu(xmm_temp2, Address(data, 0));
+ __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
+
+ __ movdqu(xmm_temp1, Address(subkeyH, 0));
+ __ pshufb(xmm_temp1, xmm_temp10);
+
+ __ pxor(xmm_temp0, xmm_temp2);
+
+ //
+ // Multiply with the hash key
+ //
+ __ movdqu(xmm_temp3, xmm_temp0);
+ __ pclmulqdq(xmm_temp3, xmm_temp1, 0); // xmm3 holds a0*b0
+ __ movdqu(xmm_temp4, xmm_temp0);
+ __ pclmulqdq(xmm_temp4, xmm_temp1, 16); // xmm4 holds a0*b1
+
+ __ movdqu(xmm_temp5, xmm_temp0);
+ __ pclmulqdq(xmm_temp5, xmm_temp1, 1); // xmm5 holds a1*b0
+ __ movdqu(xmm_temp6, xmm_temp0);
+ __ pclmulqdq(xmm_temp6, xmm_temp1, 17); // xmm6 holds a1*b1
+
+ __ pxor(xmm_temp4, xmm_temp5); // xmm4 holds a0*b1 + a1*b0
+
+ __ movdqu(xmm_temp5, xmm_temp4); // move the contents of xmm4 to xmm5
+ __ psrldq(xmm_temp4, 8); // shift by xmm4 64 bits to the right
+ __ pslldq(xmm_temp5, 8); // shift by xmm5 64 bits to the left
+ __ pxor(xmm_temp3, xmm_temp5);
+ __ pxor(xmm_temp6, xmm_temp4); // Register pair <xmm6:xmm3> holds the result
+ // of the carry-less multiplication of
+ // xmm0 by xmm1.
+
+ // We shift the result of the multiplication by one bit position
+ // to the left to cope for the fact that the bits are reversed.
+ __ movdqu(xmm_temp7, xmm_temp3);
+ __ movdqu(xmm_temp8, xmm_temp6);
+ __ pslld(xmm_temp3, 1);
+ __ pslld(xmm_temp6, 1);
+ __ psrld(xmm_temp7, 31);
+ __ psrld(xmm_temp8, 31);
+ __ movdqu(xmm_temp9, xmm_temp7);
+ __ pslldq(xmm_temp8, 4);
+ __ pslldq(xmm_temp7, 4);
+ __ psrldq(xmm_temp9, 12);
+ __ por(xmm_temp3, xmm_temp7);
+ __ por(xmm_temp6, xmm_temp8);
+ __ por(xmm_temp6, xmm_temp9);
+
+ //
+ // First phase of the reduction
+ //
+ // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
+ // independently.
+ __ movdqu(xmm_temp7, xmm_temp3);
+ __ movdqu(xmm_temp8, xmm_temp3);
+ __ movdqu(xmm_temp9, xmm_temp3);
+ __ pslld(xmm_temp7, 31); // packed right shift shifting << 31
+ __ pslld(xmm_temp8, 30); // packed right shift shifting << 30
+ __ pslld(xmm_temp9, 25); // packed right shift shifting << 25
+ __ pxor(xmm_temp7, xmm_temp8); // xor the shifted versions
+ __ pxor(xmm_temp7, xmm_temp9);
+ __ movdqu(xmm_temp8, xmm_temp7);
+ __ pslldq(xmm_temp7, 12);
+ __ psrldq(xmm_temp8, 4);
+ __ pxor(xmm_temp3, xmm_temp7); // first phase of the reduction complete
+
+ //
+ // Second phase of the reduction
+ //
+ // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
+ // shift operations.
+ __ movdqu(xmm_temp2, xmm_temp3);
+ __ movdqu(xmm_temp4, xmm_temp3);
+ __ movdqu(xmm_temp5, xmm_temp3);
+ __ psrld(xmm_temp2, 1); // packed left shifting >> 1
+ __ psrld(xmm_temp4, 2); // packed left shifting >> 2
+ __ psrld(xmm_temp5, 7); // packed left shifting >> 7
+ __ pxor(xmm_temp2, xmm_temp4); // xor the shifted versions
+ __ pxor(xmm_temp2, xmm_temp5);
+ __ pxor(xmm_temp2, xmm_temp8);
+ __ pxor(xmm_temp3, xmm_temp2);
+ __ pxor(xmm_temp6, xmm_temp3); // the result is in xmm6
+
+ __ decrement(blocks);
+ __ jcc(Assembler::zero, L_exit);
+ __ movdqu(xmm_temp0, xmm_temp6);
+ __ addptr(data, 16);
+ __ jmp(L_ghash_loop);
+
+ __ BIND(L_exit);
+ __ pshufb(xmm_temp6, xmm_temp10); // Byte swap 16-byte result
+ __ movdqu(Address(state, 0), xmm_temp6); // store the result
+
+#ifdef _WIN64
+ // restore xmm regs belonging to calling function
+ for (int i = 6; i <= XMM_REG_LAST; i++) {
+ __ movdqu(as_XMMRegister(i), xmm_save(i));
+ }
+#endif
+ __ leave();
+ __ ret(0);
+ return start;
+ }
+
/**
* Arguments:
*
@@ -4120,6 +4289,13 @@
StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
}
+ // Generate GHASH intrinsics code
+ if (UseGHASHIntrinsics) {
+ StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
+ StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
+ StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
+ }
+
// Safefetch stubs.
generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
&StubRoutines::_safefetch32_fault_pc,
--- a/hotspot/src/cpu/x86/vm/stubRoutines_x86.cpp Tue Jun 16 16:10:36 2015 -0700
+++ b/hotspot/src/cpu/x86/vm/stubRoutines_x86.cpp Wed Jun 17 17:48:25 2015 -0700
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -33,6 +33,8 @@
address StubRoutines::x86::_verify_mxcsr_entry = NULL;
address StubRoutines::x86::_key_shuffle_mask_addr = NULL;
+address StubRoutines::x86::_ghash_long_swap_mask_addr = NULL;
+address StubRoutines::x86::_ghash_byte_swap_mask_addr = NULL;
uint64_t StubRoutines::x86::_crc_by128_masks[] =
{
--- a/hotspot/src/cpu/x86/vm/stubRoutines_x86.hpp Tue Jun 16 16:10:36 2015 -0700
+++ b/hotspot/src/cpu/x86/vm/stubRoutines_x86.hpp Wed Jun 17 17:48:25 2015 -0700
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -36,10 +36,15 @@
// masks and table for CRC32
static uint64_t _crc_by128_masks[];
static juint _crc_table[];
+ // swap mask for ghash
+ static address _ghash_long_swap_mask_addr;
+ static address _ghash_byte_swap_mask_addr;
public:
static address verify_mxcsr_entry() { return _verify_mxcsr_entry; }
static address key_shuffle_mask_addr() { return _key_shuffle_mask_addr; }
static address crc_by128_masks_addr() { return (address)_crc_by128_masks; }
+ static address ghash_long_swap_mask_addr() { return _ghash_long_swap_mask_addr; }
+ static address ghash_byte_swap_mask_addr() { return _ghash_byte_swap_mask_addr; }
#endif // CPU_X86_VM_STUBROUTINES_X86_32_HPP
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.cpp Tue Jun 16 16:10:36 2015 -0700
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.cpp Wed Jun 17 17:48:25 2015 -0700
@@ -677,6 +677,17 @@
FLAG_SET_DEFAULT(UseAESIntrinsics, false);
}
+ // GHASH/GCM intrinsics
+ if (UseCLMUL && (UseSSE > 2)) {
+ if (FLAG_IS_DEFAULT(UseGHASHIntrinsics)) {
+ UseGHASHIntrinsics = true;
+ }
+ } else if (UseGHASHIntrinsics) {
+ if (!FLAG_IS_DEFAULT(UseGHASHIntrinsics))
+ warning("GHASH intrinsic requires CLMUL and SSE2 instructions on this CPU");
+ FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
+ }
+
if (UseSHA) {
warning("SHA instructions are not available on this CPU");
FLAG_SET_DEFAULT(UseSHA, false);
--- a/hotspot/src/share/vm/classfile/vmSymbols.hpp Tue Jun 16 16:10:36 2015 -0700
+++ b/hotspot/src/share/vm/classfile/vmSymbols.hpp Wed Jun 17 17:48:25 2015 -0700
@@ -846,6 +846,12 @@
do_name( implCompressMB_name, "implCompressMultiBlock") \
do_signature(implCompressMB_signature, "([BII)I") \
\
+ /* support for com.sun.crypto.provider.GHASH */ \
+ do_class(com_sun_crypto_provider_ghash, "com/sun/crypto/provider/GHASH") \
+ do_intrinsic(_ghash_processBlocks, com_sun_crypto_provider_ghash, processBlocks_name, ghash_processBlocks_signature, F_S) \
+ do_name(processBlocks_name, "processBlocks") \
+ do_signature(ghash_processBlocks_signature, "([BII[J[J)V") \
+ \
/* support for java.util.zip */ \
do_class(java_util_zip_CRC32, "java/util/zip/CRC32") \
do_intrinsic(_updateCRC32, java_util_zip_CRC32, update_name, int2_int_signature, F_SN) \
--- a/hotspot/src/share/vm/opto/escape.cpp Tue Jun 16 16:10:36 2015 -0700
+++ b/hotspot/src/share/vm/opto/escape.cpp Wed Jun 17 17:48:25 2015 -0700
@@ -966,6 +966,7 @@
strcmp(call->as_CallLeaf()->_name, "aescrypt_decryptBlock") == 0 ||
strcmp(call->as_CallLeaf()->_name, "cipherBlockChaining_encryptAESCrypt") == 0 ||
strcmp(call->as_CallLeaf()->_name, "cipherBlockChaining_decryptAESCrypt") == 0 ||
+ strcmp(call->as_CallLeaf()->_name, "ghash_processBlocks") == 0 ||
strcmp(call->as_CallLeaf()->_name, "sha1_implCompress") == 0 ||
strcmp(call->as_CallLeaf()->_name, "sha1_implCompressMB") == 0 ||
strcmp(call->as_CallLeaf()->_name, "sha256_implCompress") == 0 ||
--- a/hotspot/src/share/vm/opto/library_call.cpp Tue Jun 16 16:10:36 2015 -0700
+++ b/hotspot/src/share/vm/opto/library_call.cpp Wed Jun 17 17:48:25 2015 -0700
@@ -278,6 +278,7 @@
Node* inline_cipherBlockChaining_AESCrypt_predicate(bool decrypting);
Node* get_key_start_from_aescrypt_object(Node* aescrypt_object);
Node* get_original_key_start_from_aescrypt_object(Node* aescrypt_object);
+ bool inline_ghash_processBlocks();
bool inline_sha_implCompress(vmIntrinsics::ID id);
bool inline_digestBase_implCompressMB(int predicate);
bool inline_sha_implCompressMB(Node* digestBaseObj, ciInstanceKlass* instklass_SHA,
@@ -528,6 +529,10 @@
predicates = 3;
break;
+ case vmIntrinsics::_ghash_processBlocks:
+ if (!UseGHASHIntrinsics) return NULL;
+ break;
+
case vmIntrinsics::_updateCRC32:
case vmIntrinsics::_updateBytesCRC32:
case vmIntrinsics::_updateByteBufferCRC32:
@@ -929,6 +934,9 @@
case vmIntrinsics::_mulAdd:
return inline_mulAdd();
+ case vmIntrinsics::_ghash_processBlocks:
+ return inline_ghash_processBlocks();
+
case vmIntrinsics::_encodeISOArray:
return inline_encodeISOArray();
@@ -5858,6 +5866,35 @@
return _gvn.transform(region);
}
+//------------------------------inline_ghash_processBlocks
+bool LibraryCallKit::inline_ghash_processBlocks() {
+ address stubAddr;
+ const char *stubName;
+ assert(UseGHASHIntrinsics, "need GHASH intrinsics support");
+
+ stubAddr = StubRoutines::ghash_processBlocks();
+ stubName = "ghash_processBlocks";
+
+ Node* data = argument(0);
+ Node* offset = argument(1);
+ Node* len = argument(2);
+ Node* state = argument(3);
+ Node* subkeyH = argument(4);
+
+ Node* state_start = array_element_address(state, intcon(0), T_LONG);
+ assert(state_start, "state is NULL");
+ Node* subkeyH_start = array_element_address(subkeyH, intcon(0), T_LONG);
+ assert(subkeyH_start, "subkeyH is NULL");
+ Node* data_start = array_element_address(data, offset, T_BYTE);
+ assert(data_start, "data is NULL");
+
+ Node* ghash = make_runtime_call(RC_LEAF|RC_NO_FP,
+ OptoRuntime::ghash_processBlocks_Type(),
+ stubAddr, stubName, TypePtr::BOTTOM,
+ state_start, subkeyH_start, data_start, len);
+ return true;
+}
+
//------------------------------inline_sha_implCompress-----------------------
//
// Calculate SHA (i.e., SHA-1) for single-block byte[] array.
--- a/hotspot/src/share/vm/opto/runtime.cpp Tue Jun 16 16:10:36 2015 -0700
+++ b/hotspot/src/share/vm/opto/runtime.cpp Wed Jun 17 17:48:25 2015 -0700
@@ -987,7 +987,25 @@
return TypeFunc::make(domain, range);
}
+// GHASH block processing
+const TypeFunc* OptoRuntime::ghash_processBlocks_Type() {
+ int argcnt = 4;
+ const Type** fields = TypeTuple::fields(argcnt);
+ int argp = TypeFunc::Parms;
+ fields[argp++] = TypePtr::NOTNULL; // state
+ fields[argp++] = TypePtr::NOTNULL; // subkeyH
+ fields[argp++] = TypePtr::NOTNULL; // data
+ fields[argp++] = TypeInt::INT; // blocks
+ assert(argp == TypeFunc::Parms+argcnt, "correct decoding");
+ const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields);
+
+ // result type needed
+ fields = TypeTuple::fields(1);
+ fields[TypeFunc::Parms+0] = NULL; // void
+ const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, fields);
+ return TypeFunc::make(domain, range);
+}
//------------- Interpreter state access for on stack replacement
const TypeFunc* OptoRuntime::osr_end_Type() {
--- a/hotspot/src/share/vm/opto/runtime.hpp Tue Jun 16 16:10:36 2015 -0700
+++ b/hotspot/src/share/vm/opto/runtime.hpp Wed Jun 17 17:48:25 2015 -0700
@@ -316,6 +316,8 @@
static const TypeFunc* mulAdd_Type();
+ static const TypeFunc* ghash_processBlocks_Type();
+
static const TypeFunc* updateBytesCRC32_Type();
// leaf on stack replacement interpreter accessor types
--- a/hotspot/src/share/vm/runtime/globals.hpp Tue Jun 16 16:10:36 2015 -0700
+++ b/hotspot/src/share/vm/runtime/globals.hpp Wed Jun 17 17:48:25 2015 -0700
@@ -641,6 +641,9 @@
product(bool, UseSHA, false, \
"Control whether SHA instructions can be used on SPARC") \
\
+ product(bool, UseGHASHIntrinsics, false, \
+ "Use intrinsics for GHASH versions of crypto") \
+ \
product(size_t, LargePageSizeInBytes, 0, \
"Large page size (0 to let VM choose the page size)") \
\
--- a/hotspot/src/share/vm/runtime/stubRoutines.cpp Tue Jun 16 16:10:36 2015 -0700
+++ b/hotspot/src/share/vm/runtime/stubRoutines.cpp Wed Jun 17 17:48:25 2015 -0700
@@ -125,6 +125,7 @@
address StubRoutines::_aescrypt_decryptBlock = NULL;
address StubRoutines::_cipherBlockChaining_encryptAESCrypt = NULL;
address StubRoutines::_cipherBlockChaining_decryptAESCrypt = NULL;
+address StubRoutines::_ghash_processBlocks = NULL;
address StubRoutines::_sha1_implCompress = NULL;
address StubRoutines::_sha1_implCompressMB = NULL;
--- a/hotspot/src/share/vm/runtime/stubRoutines.hpp Tue Jun 16 16:10:36 2015 -0700
+++ b/hotspot/src/share/vm/runtime/stubRoutines.hpp Wed Jun 17 17:48:25 2015 -0700
@@ -185,6 +185,7 @@
static address _aescrypt_decryptBlock;
static address _cipherBlockChaining_encryptAESCrypt;
static address _cipherBlockChaining_decryptAESCrypt;
+ static address _ghash_processBlocks;
static address _sha1_implCompress;
static address _sha1_implCompressMB;
@@ -346,6 +347,7 @@
static address aescrypt_decryptBlock() { return _aescrypt_decryptBlock; }
static address cipherBlockChaining_encryptAESCrypt() { return _cipherBlockChaining_encryptAESCrypt; }
static address cipherBlockChaining_decryptAESCrypt() { return _cipherBlockChaining_decryptAESCrypt; }
+ static address ghash_processBlocks() { return _ghash_processBlocks; }
static address sha1_implCompress() { return _sha1_implCompress; }
static address sha1_implCompressMB() { return _sha1_implCompressMB; }
--- a/hotspot/src/share/vm/runtime/vmStructs.cpp Tue Jun 16 16:10:36 2015 -0700
+++ b/hotspot/src/share/vm/runtime/vmStructs.cpp Wed Jun 17 17:48:25 2015 -0700
@@ -828,6 +828,7 @@
static_field(StubRoutines, _aescrypt_decryptBlock, address) \
static_field(StubRoutines, _cipherBlockChaining_encryptAESCrypt, address) \
static_field(StubRoutines, _cipherBlockChaining_decryptAESCrypt, address) \
+ static_field(StubRoutines, _ghash_processBlocks, address) \
static_field(StubRoutines, _updateBytesCRC32, address) \
static_field(StubRoutines, _crc_table_adr, address) \
static_field(StubRoutines, _multiplyToLen, address) \
--- a/hotspot/test/compiler/codegen/7184394/TestAESBase.java Tue Jun 16 16:10:36 2015 -0700
+++ b/hotspot/test/compiler/codegen/7184394/TestAESBase.java Wed Jun 17 17:48:25 2015 -0700
@@ -31,6 +31,7 @@
import java.util.Random;
import javax.crypto.Cipher;
import javax.crypto.SecretKey;
+import javax.crypto.spec.GCMParameterSpec;
import javax.crypto.spec.IvParameterSpec;
import javax.crypto.spec.SecretKeySpec;
@@ -62,6 +63,10 @@
Cipher dCipher;
AlgorithmParameters algParams;
SecretKey key;
+ GCMParameterSpec gcm_spec;
+ byte[] aad;
+ int tlen = 12;
+ byte[] iv;
static int numThreads = 0;
int threadId;
@@ -100,6 +105,12 @@
int ivLen = (algorithm.equals("AES") ? 16 : algorithm.equals("DES") ? 8 : 0);
IvParameterSpec initVector = new IvParameterSpec(new byte[ivLen]);
cipher.init(Cipher.ENCRYPT_MODE, key, initVector);
+ } else if (mode.equals("GCM")) {
+ iv = new byte[64];
+ random.nextBytes(iv);
+ aad = new byte[5];
+ random.nextBytes(aad);
+ gcm_init();
} else {
algParams = cipher.getParameters();
cipher.init(Cipher.ENCRYPT_MODE, key, algParams);
@@ -186,4 +197,12 @@
}
abstract void childShowCipher();
+
+ void gcm_init() throws Exception {
+ tlen = 12;
+ gcm_spec = new GCMParameterSpec(tlen * 8, iv);
+ cipher = Cipher.getInstance(algorithm + "/" + mode + "/" + paddingStr, "SunJCE");
+ cipher.init(Cipher.ENCRYPT_MODE, key, gcm_spec);
+ cipher.update(aad);
+ }
}
--- a/hotspot/test/compiler/codegen/7184394/TestAESEncode.java Tue Jun 16 16:10:36 2015 -0700
+++ b/hotspot/test/compiler/codegen/7184394/TestAESEncode.java Wed Jun 17 17:48:25 2015 -0700
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2012, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -32,7 +32,11 @@
@Override
public void run() {
try {
- if (!noReinit) cipher.init(Cipher.ENCRYPT_MODE, key, algParams);
+ if (mode.equals("GCM")) {
+ gcm_init();
+ } else if (!noReinit) {
+ cipher.init(Cipher.ENCRYPT_MODE, key, algParams);
+ }
encode = new byte[encodeLength];
if (testingMisalignment) {
int tempSize = cipher.update(input, encInputOffset, (msgSize - lastChunkSize), encode, encOutputOffset);
--- a/hotspot/test/compiler/codegen/7184394/TestAESMain.java Tue Jun 16 16:10:36 2015 -0700
+++ b/hotspot/test/compiler/codegen/7184394/TestAESMain.java Wed Jun 17 17:48:25 2015 -0700
@@ -44,6 +44,13 @@
* @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=ECB -DencInputOffset=1 -DencOutputOffset=1 TestAESMain
* @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=ECB -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 TestAESMain
* @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=ECB -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 -DpaddingStr=NoPadding -DmsgSize=640 TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencOutputOffset=1 TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DdecOutputOffset=1 TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 -DpaddingStr=NoPadding -DmsgSize=640 TestAESMain
*
* @author Tom Deneau
*/