# HG changeset patch # User kvn # Date 1451373061 28800 # Node ID a9b3c1984a01236db96bde7a27ac0bfef26d8d5a # Parent 0341260cd1f2015e248b22e4ba005363cad428bb 8143925: Enhancing CounterMode.crypt() for AES Summary: Add intrinsic for CounterMode.crypt() to leverage the parallel nature of AES in Counter(CTR) Mode. Reviewed-by: kvn, ascarpino Contributed-by: kishor.kharbas@intel.com diff -r 0341260cd1f2 -r a9b3c1984a01 hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp --- a/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp Mon Dec 28 10:10:37 2015 -1000 +++ b/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp Mon Dec 28 23:11:01 2015 -0800 @@ -202,6 +202,11 @@ } } + if (UseAESCTRIntrinsics) { + warning("AES/CTR intrinsics are not available on this CPU"); + FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false); + } + if (FLAG_IS_DEFAULT(UseCRC32Intrinsics)) { UseCRC32Intrinsics = true; } diff -r 0341260cd1f2 -r a9b3c1984a01 hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp --- a/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp Mon Dec 28 10:10:37 2015 -1000 +++ b/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp Mon Dec 28 23:11:01 2015 -0800 @@ -196,6 +196,11 @@ FLAG_SET_DEFAULT(UseAESIntrinsics, false); } + if (UseAESCTRIntrinsics) { + warning("AES/CTR intrinsics are not available on this CPU"); + FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false); + } + if (UseGHASHIntrinsics) { warning("GHASH intrinsics are not available on this CPU"); FLAG_SET_DEFAULT(UseGHASHIntrinsics, false); diff -r 0341260cd1f2 -r a9b3c1984a01 hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp --- a/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp Mon Dec 28 10:10:37 2015 -1000 +++ b/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp Mon Dec 28 23:11:01 2015 -0800 @@ -260,6 +260,11 @@ } } + if (UseAESCTRIntrinsics) { + warning("AES/CTR intrinsics are not available on this CPU"); + FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false); + } + // GHASH/GCM intrinsics if (has_vis3() && (UseVIS > 2)) { if (FLAG_IS_DEFAULT(UseGHASHIntrinsics)) { diff -r 0341260cd1f2 -r a9b3c1984a01 hotspot/src/cpu/x86/vm/assembler_x86.cpp --- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp Mon Dec 28 10:10:37 2015 -1000 +++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp Mon Dec 28 23:11:01 2015 -0800 @@ -3349,22 +3349,41 @@ void Assembler::pextrd(Register dst, XMMRegister src, int imm8) { assert(VM_Version::supports_sse4_1(), ""); InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false); - int encode = simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); + int encode = simd_prefix_and_encode(src, xnoreg, as_XMMRegister(dst->encoding()), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); emit_int8(0x16); emit_int8((unsigned char)(0xC0 | encode)); emit_int8(imm8); } +void Assembler::pextrd(Address dst, XMMRegister src, int imm8) { + assert(VM_Version::supports_sse4_1(), ""); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false); + attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit); + simd_prefix(src, xnoreg, dst, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); + emit_int8(0x16); + emit_operand(src, dst); + emit_int8(imm8); +} + void Assembler::pextrq(Register dst, XMMRegister src, int imm8) { assert(VM_Version::supports_sse4_1(), ""); InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* 
uses_vl */ false); - int encode = simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); + int encode = simd_prefix_and_encode(src, xnoreg, as_XMMRegister(dst->encoding()), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); emit_int8(0x16); emit_int8((unsigned char)(0xC0 | encode)); emit_int8(imm8); } -// The encoding for pextrw is SSE2 to support the LIBM implementation. +void Assembler::pextrq(Address dst, XMMRegister src, int imm8) { + assert(VM_Version::supports_sse4_1(), ""); + InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false); + attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit); + simd_prefix(src, xnoreg, dst, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); + emit_int8(0x16); + emit_operand(src, dst); + emit_int8(imm8); +} + void Assembler::pextrw(Register dst, XMMRegister src, int imm8) { assert(VM_Version::supports_sse2(), ""); InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false); @@ -3374,6 +3393,26 @@ emit_int8(imm8); } +void Assembler::pextrw(Address dst, XMMRegister src, int imm8) { + assert(VM_Version::supports_sse4_1(), ""); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false); + attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_16bit); + simd_prefix(src, xnoreg, dst, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); + emit_int8((unsigned char)0x15); + emit_operand(src, dst); + emit_int8(imm8); +} + +void Assembler::pextrb(Address dst, XMMRegister src, int imm8) { + assert(VM_Version::supports_sse4_1(), ""); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false); + attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_8bit); + simd_prefix(src, xnoreg, dst, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); + emit_int8(0x14); + emit_operand(src, dst); + emit_int8(imm8); +} + void Assembler::pinsrd(XMMRegister dst, Register src, int imm8) { assert(VM_Version::supports_sse4_1(), ""); InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false); @@ -3383,6 +3422,16 @@ emit_int8(imm8); } +void Assembler::pinsrd(XMMRegister dst, Address src, int imm8) { + assert(VM_Version::supports_sse4_1(), ""); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false); + attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit); + simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); + emit_int8(0x22); + emit_operand(dst,src); + emit_int8(imm8); +} + void Assembler::pinsrq(XMMRegister dst, Register src, int imm8) { assert(VM_Version::supports_sse4_1(), ""); InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false); @@ -3392,6 +3441,16 @@ emit_int8(imm8); } +void Assembler::pinsrq(XMMRegister dst, Address src, int imm8) { + assert(VM_Version::supports_sse4_1(), ""); + InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false); + 
attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit); + simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); + emit_int8(0x22); + emit_operand(dst, src); + emit_int8(imm8); +} + void Assembler::pinsrw(XMMRegister dst, Register src, int imm8) { assert(VM_Version::supports_sse2(), ""); InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false); @@ -3401,6 +3460,26 @@ emit_int8(imm8); } +void Assembler::pinsrw(XMMRegister dst, Address src, int imm8) { + assert(VM_Version::supports_sse2(), ""); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false); + attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_16bit); + simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int8((unsigned char)0xC4); + emit_operand(dst, src); + emit_int8(imm8); +} + +void Assembler::pinsrb(XMMRegister dst, Address src, int imm8) { + assert(VM_Version::supports_sse4_1(), ""); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false); + attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_8bit); + simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); + emit_int8(0x20); + emit_operand(dst, src); + emit_int8(imm8); +} + void Assembler::pmovzxbw(XMMRegister dst, Address src) { assert(VM_Version::supports_sse4_1(), ""); InstructionMark im(this); @@ -4188,6 +4267,12 @@ emit_arith(0x33, 0xC0, dst, src); } +void Assembler::xorb(Register dst, Address src) { + InstructionMark im(this); + prefix(src, dst); + emit_int8(0x32); + emit_operand(dst, src); +} // AVX 3-operands scalar float-point arithmetic instructions diff -r 0341260cd1f2 -r a9b3c1984a01 hotspot/src/cpu/x86/vm/assembler_x86.hpp --- a/hotspot/src/cpu/x86/vm/assembler_x86.hpp Mon Dec 28 10:10:37 2015 -1000 +++ b/hotspot/src/cpu/x86/vm/assembler_x86.hpp Mon Dec 28 23:11:01 2015 -0800 @@ -1543,14 +1543,22 @@ // SSE 4.1 extract void pextrd(Register dst, XMMRegister src, int imm8); void pextrq(Register dst, XMMRegister src, int imm8); + void pextrd(Address dst, XMMRegister src, int imm8); + void pextrq(Address dst, XMMRegister src, int imm8); + void pextrb(Address dst, XMMRegister src, int imm8); // SSE 2 extract void pextrw(Register dst, XMMRegister src, int imm8); + void pextrw(Address dst, XMMRegister src, int imm8); // SSE 4.1 insert void pinsrd(XMMRegister dst, Register src, int imm8); void pinsrq(XMMRegister dst, Register src, int imm8); + void pinsrd(XMMRegister dst, Address src, int imm8); + void pinsrq(XMMRegister dst, Address src, int imm8); + void pinsrb(XMMRegister dst, Address src, int imm8); // SSE 2 insert void pinsrw(XMMRegister dst, Register src, int imm8); + void pinsrw(XMMRegister dst, Address src, int imm8); // SSE4.1 packed move void pmovzxbw(XMMRegister dst, XMMRegister src); @@ -1762,6 +1770,8 @@ void xorl(Register dst, Address src); void xorl(Register dst, Register src); + void xorb(Register dst, Address src); + void xorq(Register dst, Address src); void xorq(Register dst, Register src); diff -r 0341260cd1f2 -r a9b3c1984a01 hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp --- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp Mon Dec 28 10:10:37 2015 -1000 +++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp Mon Dec 28 23:11:01 2015 
-0800
@@ -2142,6 +2142,17 @@
     return start;
   }

+  address generate_counter_shuffle_mask() {
+    __ align(16);
+    StubCodeMark mark(this, "StubRoutines", "counter_shuffle_mask");
+    address start = __ pc();
+    __ emit_data(0x0c0d0e0f, relocInfo::none, 0);
+    __ emit_data(0x08090a0b, relocInfo::none, 0);
+    __ emit_data(0x04050607, relocInfo::none, 0);
+    __ emit_data(0x00010203, relocInfo::none, 0);
+    return start;
+  }
+
   // Utility routine for loading a 128-bit key word in little endian format
   // can optionally specify that the shuffle mask is already in an XMM register
   void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
@@ -2167,6 +2178,31 @@
     __ aesdec(xmmdst, xmmtmp);
   }

+  // Utility routine for increasing the 128-bit counter (the IV in CTR mode)
+  // XMM_128bit, D3, D2, D1, D0
+  void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block) {
+    __ pextrd(reg, xmmdst, 0x0);
+    __ addl(reg, inc_delta);
+    __ pinsrd(xmmdst, reg, 0x0);
+    __ jcc(Assembler::carryClear, next_block); // jump if no carry
+
+    __ pextrd(reg, xmmdst, 0x01); // Carry -> D1
+    __ addl(reg, 0x01);
+    __ pinsrd(xmmdst, reg, 0x01);
+    __ jcc(Assembler::carryClear, next_block); // jump if no carry
+
+    __ pextrd(reg, xmmdst, 0x02); // Carry -> D2
+    __ addl(reg, 0x01);
+    __ pinsrd(xmmdst, reg, 0x02);
+    __ jcc(Assembler::carryClear, next_block); // jump if no carry
+
+    __ pextrd(reg, xmmdst, 0x03); // Carry -> D3
+    __ addl(reg, 0x01);
+    __ pinsrd(xmmdst, reg, 0x03);
+
+    __ BIND(next_block); // next instruction
+  }
+
   // Arguments:
   //
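Note: once pshufb has byte-reversed the counter (see counter_shuffle_mask above), D0 is the least significant 32-bit lane and a carry out of one lane must bump the next. A minimal Java model of what the pextrd/addl/pinsrd sequence with its carry-guarded jumps computes (method name hypothetical):

    // ctr[0] is lane D0 after the byte shuffle; delta is the block distance.
    static void incCounter(int[] ctr, int delta) {
        long sum = (ctr[0] & 0xffffffffL) + (delta & 0xffffffffL);
        ctr[0] = (int) sum;
        for (int i = 1; i < 4 && (sum >>> 32) != 0; i++) { // carry -> D1, D2, D3
            sum = (ctr[i] & 0xffffffffL) + 1;
            ctr[i] = (int) sum;
        }
    }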
@@ -2742,6 +2778,317 @@
     return start;
   }

+
+  // CTR AES crypt.
+  // In 32-bit stub, parallelize 4 blocks at a time
+  // Arguments:
+  //
+  // Inputs:
+  //   c_rarg0   - source byte array address
+  //   c_rarg1   - destination byte array address
+  //   c_rarg2   - K (key) in little endian int array
+  //   c_rarg3   - counter vector byte array address
+  //   c_rarg4   - input length
+  //
+  // Output:
+  //   rax       - input length
+  //
+  address generate_counterMode_AESCrypt_Parallel() {
+    assert(UseAES, "need AES instructions and misaligned SSE support");
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
+    address start = __ pc();
+    const Register from    = rsi; // source array address
+    const Register to      = rdx; // destination array address
+    const Register key     = rcx; // key array address
+    const Register counter = rdi; // counter byte array initialized from initvector array address
+                                  // and left with the results of the last encryption block
+    const Register len_reg = rbx;
+    const Register pos     = rax;
+
+    __ enter(); // required for proper stackwalking of RuntimeStub frame
+    handleSOERegisters(true /*saving*/); // save rbx, rsi, rdi
+
+    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
+    // context for the registers used, where all instructions below are using 128-bit mode
+    // On EVEX without VL and BW, these instructions will all be AVX.
+    if (VM_Version::supports_avx512vlbw()) {
+      __ movl(rdx, 0xffff);
+      __ kmovdl(k1, rdx);
+    }
+
+    // load registers from incoming parameters
+    const Address from_param(rbp, 8+0);
+    const Address to_param  (rbp, 8+4);
+    const Address key_param (rbp, 8+8);
+    const Address rvec_param (rbp, 8+12);
+    const Address len_param  (rbp, 8+16);
+    const Address saved_counter_param(rbp, 8 + 20);
+    const Address used_addr_param(rbp, 8 + 24);
+
+    __ movptr(from , from_param);
+    __ movptr(to   , to_param);
+    //__ movptr(key, key_param);
+    //__ movptr(counter, rvec_param);
+    __ movptr(len_reg , len_param);
+    //__ movptr(pos, 0);
+
+    // Use the partially used encrypted counter from last invocation
+    Label L_exit_preLoop, L_preLoop_start;
+
+    // Use the registers 'counter' and 'key' here in this preloop
+    // to hold the last 2 params 'used' and 'saved_encCounter_start'
+    Register used = counter;
+    Register saved_encCounter_start = key;
+    Register used_addr = saved_encCounter_start;
+
+    __ movptr(used_addr, used_addr_param);
+    __ movptr(used, Address(used_addr, 0));
+    __ movptr(saved_encCounter_start, saved_counter_param);
+
+    __ BIND(L_preLoop_start);
+    __ cmpptr(used, 16);
+    __ jcc(Assembler::aboveEqual, L_exit_preLoop);
+    __ cmpptr(len_reg, 0);
+    __ jcc(Assembler::lessEqual, L_exit_preLoop);
+    __ movb(rax, Address(saved_encCounter_start, used));
+    __ xorb(rax, Address(from, 0));
+    __ movb(Address(to, 0), rax);
+    __ addptr(from, 1);
+    __ addptr(to, 1);
+    __ addptr(used, 1);
+    __ subptr(len_reg, 1);
+
+    __ jmp(L_preLoop_start);
+
+    __ BIND(L_exit_preLoop);
+    __ movptr(used_addr, used_addr_param);
+    __ movl(Address(used_addr, 0), used);
+
+    // load the parameters 'key' and 'counter'
+    __ movptr(key, key_param);
+    __ movptr(counter, rvec_param);
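Note: the pre-loop above mirrors what CounterMode.crypt() does with leftover keystream. A Java sketch of its effect, assuming hypothetical array/offset arguments ('used' is modeled as a one-element array to stand in for the in-memory field the stub updates):

    static int preLoop(byte[] in, byte[] out, int len, byte[] encCounter, int[] used) {
        int pos = 0;
        while (used[0] < 16 && len > 0) {  // keystream bytes left from the last call
            out[pos] = (byte) (in[pos] ^ encCounter[used[0]]); // the movb/xorb/movb triple
            pos++; used[0]++; len--;
        }
        return pos; // full-block processing resumes here
    }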
+
+    // xmm register assignments for the loops below
+    const XMMRegister xmm_curr_counter      = xmm0;
+    const XMMRegister xmm_counter_shuf_mask = xmm1; // need to be reloaded
+    const XMMRegister xmm_key_shuf_mask     = xmm2; // need to be reloaded
+    const XMMRegister xmm_key               = xmm3;
+    const XMMRegister xmm_result0           = xmm4;
+    const XMMRegister xmm_result1           = xmm5;
+    const XMMRegister xmm_result2           = xmm6;
+    const XMMRegister xmm_result3           = xmm7;
+    const XMMRegister xmm_from0             = xmm1; // reuse XMM register
+    const XMMRegister xmm_from1             = xmm2;
+    const XMMRegister xmm_from2             = xmm3;
+    const XMMRegister xmm_from3             = xmm4;
+
+    // for key_128, key_192, key_256
+    const int rounds[3] = {10, 12, 14};
+    Label L_singleBlockLoopTop[3];
+    Label L_multiBlock_loopTop[3];
+    Label L_key192_top, L_key256_top;
+    Label L_incCounter[3][4]; // 3: different key length, 4: 4 blocks at a time
+    Label L_incCounter_single[3]; // for single block, key128, key192, key256
+    Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3];
+    Label L_processTail_extr[3], L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3];
+
+    Label L_exit;
+    const int PARALLEL_FACTOR = 4; // because of the limited register number
+
+    // initialize counter with initial counter
+    __ movdqu(xmm_curr_counter, Address(counter, 0x00));
+    __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()));
+    __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); // counter is shuffled for the increment
+
+    // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
+    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+    __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+    __ cmpl(rax, 52);
+    __ jcc(Assembler::equal, L_key192_top);
+    __ cmpl(rax, 60);
+    __ jcc(Assembler::equal, L_key256_top);
+
+    // key128 begins here
+    __ movptr(pos, 0); // init pos before L_multiBlock_loopTop
+
+#define CTR_DoFour(opc, src_reg)       \
+    __ opc(xmm_result0, src_reg);      \
+    __ opc(xmm_result1, src_reg);      \
+    __ opc(xmm_result2, src_reg);      \
+    __ opc(xmm_result3, src_reg);
+
+    // k == 0 :  generate code for key_128
+    // k == 1 :  generate code for key_192
+    // k == 2 :  generate code for key_256
+    for (int k = 0; k < 3; ++k) {
+      // multi-block loop starts here
+      __ align(OptoLoopAlignment);
+      __ BIND(L_multiBlock_loopTop[k]);
+      __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left
+      __ jcc(Assembler::less, L_singleBlockLoopTop[k]);
+
+      __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+      __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()));
+
+      // load, then increase counters
+      CTR_DoFour(movdqa, xmm_curr_counter);
+      __ push(rbx);
+      inc_counter(rbx, xmm_result1, 0x01, L_incCounter[k][0]);
+      inc_counter(rbx, xmm_result2, 0x02, L_incCounter[k][1]);
+      inc_counter(rbx, xmm_result3, 0x03, L_incCounter[k][2]);
+      inc_counter(rbx, xmm_curr_counter, 0x04, L_incCounter[k][3]);
+      __ pop(rbx);
+
+      load_key(xmm_key, key, 0x00, xmm_key_shuf_mask); // load Round 0 key; interleaved here for better performance
+
+      CTR_DoFour(pshufb, xmm_counter_shuf_mask); // after the increment, shuffle the counters back for PXOR
+      CTR_DoFour(pxor, xmm_key);                 // PXOR with Round 0 key
+
+      for (int i = 1; i < rounds[k]; ++i) {
+        load_key(xmm_key, key, (0x10 * i), xmm_key_shuf_mask);
+        CTR_DoFour(aesenc, xmm_key);
+      }
+      load_key(xmm_key, key, (0x10 * rounds[k]), xmm_key_shuf_mask);
+      CTR_DoFour(aesenclast, xmm_key);
+
+      // get next PARALLEL_FACTOR blocks into xmm_from registers
+      __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
+      __ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
+      __ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
+
+      // PXOR with input text
+      __ pxor(xmm_result0, xmm_from0); // result0 is xmm4
+      __ pxor(xmm_result1, xmm_from1);
+      __ pxor(xmm_result2, xmm_from2);
+
+      // store PARALLEL_FACTOR results into the next 64 bytes of output
+      __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
+      __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
+      __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
+
+      // do it here after xmm_result0 is saved, because xmm_from3 reuses the same register as xmm_result0.
+      __ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
+      __ pxor(xmm_result3, xmm_from3);
+      __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
+
+      __ addptr(pos, PARALLEL_FACTOR * AESBlockSize);     // increase the length of crypt text
+      __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // decrease the remaining length
+      __ jmp(L_multiBlock_loopTop[k]);
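Note: the loop above works because in CTR mode each keystream block is AES(counter + n), independent of every other block, so PARALLEL_FACTOR aesenc dependency chains can be interleaved to hide instruction latency. A Java sketch of the multi-block step (aesEncryptBlock is a hypothetical stand-in for the full aesenc/aesenclast round sequence):

    import java.util.function.UnaryOperator;

    class CtrSketch {
        static void ctrMultiBlock(byte[] in, byte[] out, int blocks, byte[] counter,
                                  UnaryOperator<byte[]> aesEncryptBlock) {
            for (int n = 0; n + 4 <= blocks; n += 4) {
                byte[][] ks = new byte[4][];
                for (int j = 0; j < 4; j++) {  // four independent counter blocks
                    ks[j] = aesEncryptBlock.apply(counter.clone());
                    for (int i = 15; i >= 0 && ++counter[i] == 0; i--) { } // big-endian +1
                }
                for (int j = 0; j < 4; j++)    // XOR keystream with the input text
                    for (int i = 0; i < 16; i++)
                        out[(n + j) * 16 + i] = (byte) (in[(n + j) * 16 + i] ^ ks[j][i]);
            }
        }
    }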
+
+      // singleBlock starts here
+      __ align(OptoLoopAlignment);
+      __ BIND(L_singleBlockLoopTop[k]);
+      __ cmpptr(len_reg, 0);
+      __ jcc(Assembler::equal, L_exit);
+      __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+      __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()));
+      __ movdqa(xmm_result0, xmm_curr_counter);
+      load_key(xmm_key, key, 0x00, xmm_key_shuf_mask);
+      __ push(rbx); // rbx is used for increasing the counter
+      inc_counter(rbx, xmm_curr_counter, 0x01, L_incCounter_single[k]);
+      __ pop(rbx);
+      __ pshufb(xmm_result0, xmm_counter_shuf_mask);
+      __ pxor(xmm_result0, xmm_key);
+      for (int i = 1; i < rounds[k]; i++) {
+        load_key(xmm_key, key, (0x10 * i), xmm_key_shuf_mask);
+        __ aesenc(xmm_result0, xmm_key);
+      }
+      load_key(xmm_key, key, (0x10 * rounds[k]), xmm_key_shuf_mask);
+      __ aesenclast(xmm_result0, xmm_key);
+      __ cmpptr(len_reg, AESBlockSize);
+      __ jcc(Assembler::less, L_processTail_insr[k]);
+      __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
+      __ pxor(xmm_result0, xmm_from0);
+      __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
+      __ addptr(pos, AESBlockSize);
+      __ subptr(len_reg, AESBlockSize);
+      __ jmp(L_singleBlockLoopTop[k]);
+
+      __ BIND(L_processTail_insr[k]);
+      __ addptr(pos, len_reg);
+      __ testptr(len_reg, 8);
+      __ jcc(Assembler::zero, L_processTail_4_insr[k]);
+      __ subptr(pos, 8);
+      __ pinsrd(xmm_from0, Address(from, pos), 0);
+      __ pinsrd(xmm_from0, Address(from, pos, Address::times_1, 4), 1);
+      __ BIND(L_processTail_4_insr[k]);
+      __ testptr(len_reg, 4);
+      __ jcc(Assembler::zero, L_processTail_2_insr[k]);
+      __ subptr(pos, 4);
+      __ pslldq(xmm_from0, 4);
+      __ pinsrd(xmm_from0, Address(from, pos), 0);
+      __ BIND(L_processTail_2_insr[k]);
+      __ testptr(len_reg, 2);
+      __ jcc(Assembler::zero, L_processTail_1_insr[k]);
+      __ subptr(pos, 2);
+      __ pslldq(xmm_from0, 2);
+      __ pinsrw(xmm_from0, Address(from, pos), 0);
+      __ BIND(L_processTail_1_insr[k]);
+      __ testptr(len_reg, 1);
+      __ jcc(Assembler::zero, L_processTail_exit_insr[k]);
+      __ subptr(pos, 1);
+      __ pslldq(xmm_from0, 1);
+      __ pinsrb(xmm_from0, Address(from, pos), 0);
+      __ BIND(L_processTail_exit_insr[k]);
+
+      // save the encrypted counter for the next invocation, then XOR with the gathered input
+      __ movptr(saved_encCounter_start, saved_counter_param);
+      __ movdqu(Address(saved_encCounter_start, 0), xmm_result0);
+      __ pxor(xmm_result0, xmm_from0);
+
+      __ testptr(len_reg, 8);
+      __ jcc(Assembler::zero, L_processTail_4_extr[k]);
+      __ pextrd(Address(to, pos), xmm_result0, 0);
+      __ pextrd(Address(to, pos, Address::times_1, 4), xmm_result0, 1);
+      __ psrldq(xmm_result0, 8);
+      __ addptr(pos, 8);
+      __ BIND(L_processTail_4_extr[k]);
+      __ testptr(len_reg, 4);
+      __ jcc(Assembler::zero, L_processTail_2_extr[k]);
+      __ pextrd(Address(to, pos), xmm_result0, 0);
+      __ psrldq(xmm_result0, 4);
+      __ addptr(pos, 4);
+      __ BIND(L_processTail_2_extr[k]);
+      __ testptr(len_reg, 2);
+      __ jcc(Assembler::zero, L_processTail_1_extr[k]);
+      __ pextrb(Address(to, pos), xmm_result0, 0);
+      __ pextrb(Address(to, pos, Address::times_1, 1), xmm_result0, 1);
+      __ psrldq(xmm_result0, 2);
+      __ addptr(pos, 2);
+      __ BIND(L_processTail_1_extr[k]);
+      __ testptr(len_reg, 1);
+      __ jcc(Assembler::zero, L_processTail_exit_extr[k]);
+      __ pextrb(Address(to, pos), xmm_result0, 0);
+
+      __ BIND(L_processTail_exit_extr[k]);
+      __ movptr(used_addr, used_addr_param);
+      __ movl(Address(used_addr, 0), len_reg);
+      __ jmp(L_exit);
+    }
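Note: the insr/extr ladders above handle the final partial block without touching memory outside the arrays: the remaining len % 16 input bytes are gathered into an XMM register in 8/4/2/1-byte steps, XORed against the freshly encrypted counter (which is also saved to encryptedCounter), scattered back the same way, and 'used' records how much of that keystream block was consumed. The net effect, sketched in Java with hypothetical arguments:

    static void tail(byte[] in, byte[] out, int pos, int tailLen,
                     byte[] encCounterBlock, int[] used) {
        for (int i = 0; i < tailLen; i++) {
            out[pos + i] = (byte) (in[pos + i] ^ encCounterBlock[i]);
        }
        used[0] = tailLen; // matches '__ movl(Address(used_addr, 0), len_reg)'
    }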
+
+    __ BIND(L_exit);
+    __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()));
+    __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); // counter is shuffled back.
+    __ movdqu(Address(counter, 0), xmm_curr_counter); // save counter back
+    handleSOERegisters(false /*restoring*/);
+    __ movptr(rax, len_param); // return length
+    __ leave(); // required for proper stackwalking of RuntimeStub frame
+    __ ret(0);
+
+    __ BIND (L_key192_top);
+    __ movptr(pos, 0); // init pos before L_multiBlock_loopTop
+    __ jmp(L_multiBlock_loopTop[1]); // key192
+
+    __ BIND (L_key256_top);
+    __ movptr(pos, 0); // init pos before L_multiBlock_loopTop
+    __ jmp(L_multiBlock_loopTop[2]); // key256
+
+    return start;
+  }
+
+
   // byte swap x86 long
   address generate_ghash_long_swap_mask() {
     __ align(CodeEntryAlignment);
@@ -3360,6 +3707,11 @@
       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
     }

+    if (UseAESCTRIntrinsics) {
+      StubRoutines::x86::_counter_shuffle_mask_addr = generate_counter_shuffle_mask();
+      StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel();
+    }
+
     // Generate GHASH intrinsics code
     if (UseGHASHIntrinsics) {
       StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
diff -r 0341260cd1f2 -r a9b3c1984a01 hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp
--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Mon Dec 28 10:10:37 2015 -1000
+++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Mon Dec 28 23:11:01 2015 -0800
@@ -3039,6 +3039,15 @@
     return start;
   }

+  address generate_counter_shuffle_mask() {
+    __ align(16);
+    StubCodeMark mark(this, "StubRoutines", "counter_shuffle_mask");
+    address start = __ pc();
+    __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
+    __ emit_data64(0x0001020304050607, relocInfo::none);
+    return start;
+  }
+
   // Utility routine for loading a 128-bit key word in little endian format
   // can optionally specify that the shuffle mask is already in an XMM register
   void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
@@ -3050,6 +3059,18 @@
     }
   }

+  // Utility routine for increasing the 128-bit counter (the IV in CTR mode)
+  void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block) {
+    __ pextrq(reg, xmmdst, 0x0);
+    __ addq(reg, inc_delta);
+    __ pinsrq(xmmdst, reg, 0x0);
+    __ jcc(Assembler::carryClear, next_block); // jump if no carry
+    __ pextrq(reg, xmmdst, 0x01); // Carry
+    __ addq(reg, 0x01);
+    __ pinsrq(xmmdst, reg, 0x01); // Carry end
+    __ BIND(next_block); // next instruction
+  }
+
   // Arguments:
   //
   // Inputs:
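Note: both counter_shuffle_mask generators emit the byte pattern 0x0f..0x00, which pshufb uses to reverse all 16 counter bytes, so the big-endian counter maintained by the JCE layer becomes little-endian lanes that plain add instructions can bump. Equivalent Java:

    static byte[] byteReverse128(byte[] in) {
        byte[] mask = {0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08,
                       0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00};
        byte[] out = new byte[16];
        for (int i = 0; i < 16; i++) {
            out[i] = in[mask[i]]; // pshufb semantics: out[i] = in[mask[i] & 0x0f]
        }
        return out;
    }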
@@ -3700,6 +3721,328 @@
     return start;
   }

+  // This is a version of CTR/AES crypt which does 6 blocks in a loop at a time
+  // to hide instruction latency
+  //
+  // Arguments:
+  //
+  // Inputs:
+  //   c_rarg0   - source byte array address
+  //   c_rarg1   - destination byte array address
+  //   c_rarg2   - K (key) in little endian int array
+  //   c_rarg3   - counter vector byte array address
+  //   Linux
+  //     c_rarg4            - input length
+  //     c_rarg5            - saved encryptedCounter start
+  //     rbp + 6 * wordSize - saved used length
+  //   Windows
+  //     rbp + 6 * wordSize - input length
+  //     rbp + 7 * wordSize - saved encryptedCounter start
+  //     rbp + 8 * wordSize - saved used length
+  //
+  // Output:
+  //   rax       - input length
+  //
+  address generate_counterMode_AESCrypt_Parallel() {
+    assert(UseAES, "need AES instructions and misaligned SSE support");
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
+    address start = __ pc();
+    const Register from = c_rarg0; // source array address
+    const Register to = c_rarg1; // destination array address
+    const Register key = c_rarg2; // key array address
+    const Register counter = c_rarg3; // counter byte array initialized from counter array address
+                                      // and left with the results of the last encryption block
+#ifndef _WIN64
+    const Register len_reg = c_rarg4;
+    const Register saved_encCounter_start = c_rarg5;
+    const Register used_addr = r10;
+    const Address  used_mem(rbp, 2 * wordSize);
+    const Register used = r11;
+#else
+    const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
+    const Address saved_encCounter_mem(rbp, 7 * wordSize); // saved encryptedCounter is on stack on Win64
+    const Address used_mem(rbp, 8 * wordSize); // saved 'used' length is on stack on Win64
+    const Register len_reg = r10; // pick the first volatile windows register
+    const Register saved_encCounter_start = r11;
+    const Register used_addr = r13;
+    const Register used = r14;
+#endif
+    const Register pos = rax;
+
+    const int PARALLEL_FACTOR = 6;
+    const XMMRegister xmm_counter_shuf_mask = xmm0;
+    const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front
+    const XMMRegister xmm_curr_counter = xmm2;
+
+    const XMMRegister xmm_key_tmp0 = xmm3;
+    const XMMRegister xmm_key_tmp1 = xmm4;
+
+    // registers holding the six results in the parallelized loop
+    const XMMRegister xmm_result0 = xmm5;
+    const XMMRegister xmm_result1 = xmm6;
+    const XMMRegister xmm_result2 = xmm7;
+    const XMMRegister xmm_result3 = xmm8;
+    const XMMRegister xmm_result4 = xmm9;
+    const XMMRegister xmm_result5 = xmm10;
+
+    const XMMRegister xmm_from0 = xmm11;
+    const XMMRegister xmm_from1 = xmm12;
+    const XMMRegister xmm_from2 = xmm13;
+    const XMMRegister xmm_from3 = xmm14; // the last one is xmm14; we have to preserve it on WIN64.
+    const XMMRegister xmm_from4 = xmm3;  // reuse xmm3~4; xmm_key_tmp0~1 are no longer needed once the input text is loaded
+    const XMMRegister xmm_from5 = xmm4;
+
+    // for key_128, key_192, key_256
+    const int rounds[3] = {10, 12, 14};
+    Label L_exit_preLoop, L_preLoop_start;
+    Label L_multiBlock_loopTop[3];
+    Label L_singleBlockLoopTop[3];
+    Label L__incCounter[3][6]; // for 6 blocks
+    Label L__incCounter_single[3]; // for single block, key128, key192, key256
+    Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3];
+    Label L_processTail_extr[3], L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3];
+
+    Label L_exit;
+
+    __ enter(); // required for proper stackwalking of RuntimeStub frame
+
+    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
+    // context for the registers used, where all instructions below are using 128-bit mode
+    // On EVEX without VL and BW, these instructions will all be AVX.
+    if (VM_Version::supports_avx512vlbw()) {
+      __ movl(rax, 0xffff);
+      __ kmovql(k1, rax);
+    }
+
+#ifdef _WIN64
+    // save the xmm registers which must be preserved 6-14
+    const int XMM_REG_NUM_KEY_LAST = 14;
+    __ subptr(rsp, -rsp_after_call_off * wordSize);
+    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
+      __ movdqu(xmm_save(i), as_XMMRegister(i));
+    }
+
+    const Address r13_save(rbp, rdi_off * wordSize);
+    const Address r14_save(rbp, rsi_off * wordSize);
+
+    __ movptr(r13_save, r13);
+    __ movptr(r14_save, r14);
+
+    // on win64, fill len_reg from stack position
+    __ movl(len_reg, len_mem);
+    __ movptr(saved_encCounter_start, saved_encCounter_mem);
+    __ movptr(used_addr, used_mem);
+    __ movl(used, Address(used_addr, 0));
+#else
+    __ push(len_reg); // Save
+    __ movptr(used_addr, used_mem);
+    __ movl(used, Address(used_addr, 0));
+#endif
+
+    __ push(rbx); // Save RBX
+    __ movdqu(xmm_curr_counter, Address(counter, 0x00)); // initialize counter with initial counter
+    __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()));
+    __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); // counter is shuffled
+    __ movptr(pos, 0);
+
+    // Use the partially used encrypted counter from last invocation
+    __ BIND(L_preLoop_start);
+    __ cmpptr(used, 16);
+    __ jcc(Assembler::aboveEqual, L_exit_preLoop);
+    __ cmpptr(len_reg, 0);
+    __ jcc(Assembler::lessEqual, L_exit_preLoop);
+    __ movb(rbx, Address(saved_encCounter_start, used));
+    __ xorb(rbx, Address(from, pos));
+    __ movb(Address(to, pos), rbx);
+    __ addptr(pos, 1);
+    __ addptr(used, 1);
+    __ subptr(len_reg, 1);
+
+    __ jmp(L_preLoop_start);
+
+    __ BIND(L_exit_preLoop);
+    __ movl(Address(used_addr, 0), used);
+
+    // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
+    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+    __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+    __ cmpl(rbx, 52);
+    __ jcc(Assembler::equal, L_multiBlock_loopTop[1]);
+    __ cmpl(rbx, 60);
+    __ jcc(Assembler::equal, L_multiBlock_loopTop[2]);
+
+#define CTR_DoSix(opc, src_reg)        \
+    __ opc(xmm_result0, src_reg);      \
+    __ opc(xmm_result1, src_reg);      \
+    __ opc(xmm_result2, src_reg);      \
+    __ opc(xmm_result3, src_reg);      \
+    __ opc(xmm_result4, src_reg);      \
+    __ opc(xmm_result5, src_reg);
+
+    // k == 0 :  generate code for key_128
+    // k == 1 :  generate code for key_192
+    // k == 2 :  generate code for key_256
+    for (int k = 0; k < 3; ++k) {
+      // multi-block loop starts here
+      __ align(OptoLoopAlignment);
+      __ BIND(L_multiBlock_loopTop[k]);
+      __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left
+      __ jcc(Assembler::less, L_singleBlockLoopTop[k]);
+      load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
+
+      // load, then increase counters
+      CTR_DoSix(movdqa, xmm_curr_counter);
+      inc_counter(rbx, xmm_result1, 0x01, L__incCounter[k][0]);
+      inc_counter(rbx, xmm_result2, 0x02, L__incCounter[k][1]);
+      inc_counter(rbx, xmm_result3, 0x03, L__incCounter[k][2]);
+      inc_counter(rbx, xmm_result4, 0x04, L__incCounter[k][3]);
+      inc_counter(rbx, xmm_result5, 0x05, L__incCounter[k][4]);
+      inc_counter(rbx, xmm_curr_counter, 0x06, L__incCounter[k][5]);
+      CTR_DoSix(pshufb, xmm_counter_shuf_mask); // after the increment, shuffle the counters back for PXOR
+      CTR_DoSix(pxor, xmm_key_tmp0);            // PXOR with Round 0 key
+
+      // load two ROUND_KEYs at a time
+      for (int i = 1; i < rounds[k]; ) {
+        load_key(xmm_key_tmp1, key, (0x10 * i), xmm_key_shuf_mask);
+        load_key(xmm_key_tmp0, key, (0x10 * (i+1)), xmm_key_shuf_mask);
+        CTR_DoSix(aesenc, xmm_key_tmp1);
+        i++;
+        if (i != rounds[k]) {
+          CTR_DoSix(aesenc, xmm_key_tmp0);
+        } else {
+          CTR_DoSix(aesenclast, xmm_key_tmp0);
+        }
+        i++;
+      }
+
+      // get next PARALLEL_FACTOR blocks into xmm_from registers
+      __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
+      __ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
+      __ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
+      __ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
+      __ movdqu(xmm_from4, Address(from, pos, Address::times_1, 4 * AESBlockSize));
+      __ movdqu(xmm_from5, Address(from, pos, Address::times_1, 5 * AESBlockSize));
+
+      __ pxor(xmm_result0, xmm_from0);
+      __ pxor(xmm_result1, xmm_from1);
+      __ pxor(xmm_result2, xmm_from2);
+      __ pxor(xmm_result3, xmm_from3);
+      __ pxor(xmm_result4, xmm_from4);
+      __ pxor(xmm_result5, xmm_from5);
+
+      // store 6 results into the next 96 bytes of output
+      __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
+      __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
+      __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
+      __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
+      __ movdqu(Address(to, pos, Address::times_1, 4 * AESBlockSize), xmm_result4);
+      __ movdqu(Address(to, pos, Address::times_1, 5 * AESBlockSize), xmm_result5);
+
+      __ addptr(pos, PARALLEL_FACTOR * AESBlockSize);     // increase the length of crypt text
+      __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // decrease the remaining length
+      __ jmp(L_multiBlock_loopTop[k]);
+
+      // singleBlock starts here
+      __ align(OptoLoopAlignment);
+      __ BIND(L_singleBlockLoopTop[k]);
+      __ cmpptr(len_reg, 0);
+      __ jcc(Assembler::lessEqual, L_exit);
+      load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
+      __ movdqa(xmm_result0, xmm_curr_counter);
+      inc_counter(rbx, xmm_curr_counter, 0x01, L__incCounter_single[k]);
+      __ pshufb(xmm_result0, xmm_counter_shuf_mask);
+      __ pxor(xmm_result0, xmm_key_tmp0);
+      for (int i = 1; i < rounds[k]; i++) {
+        load_key(xmm_key_tmp0, key, (0x10 * i), xmm_key_shuf_mask);
+        __ aesenc(xmm_result0, xmm_key_tmp0);
+      }
+      load_key(xmm_key_tmp0, key, (rounds[k] * 0x10), xmm_key_shuf_mask);
+      __ aesenclast(xmm_result0, xmm_key_tmp0);
+      __ cmpptr(len_reg, AESBlockSize);
+      __ jcc(Assembler::less, L_processTail_insr[k]);
+      __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
+      __ pxor(xmm_result0, xmm_from0);
+      __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
+      __ addptr(pos, AESBlockSize);
+      __ subptr(len_reg, AESBlockSize);
+      __ jmp(L_singleBlockLoopTop[k]);
+      __ BIND(L_processTail_insr[k]);
+      __ addptr(pos, len_reg);
+      __ testptr(len_reg, 8);
+      __ jcc(Assembler::zero, L_processTail_4_insr[k]);
+      __ subptr(pos, 8);
+      __ pinsrq(xmm_from0, Address(from, pos), 0);
+      __ BIND(L_processTail_4_insr[k]);
+      __ testptr(len_reg, 4);
+      __ jcc(Assembler::zero, L_processTail_2_insr[k]);
+      __ subptr(pos, 4);
+      __ pslldq(xmm_from0, 4);
+      __ pinsrd(xmm_from0, Address(from, pos), 0);
+      __ BIND(L_processTail_2_insr[k]);
+      __ testptr(len_reg, 2);
+      __ jcc(Assembler::zero, L_processTail_1_insr[k]);
+      __ subptr(pos, 2);
+      __ pslldq(xmm_from0, 2);
+      __ pinsrw(xmm_from0, Address(from, pos), 0);
+      __ BIND(L_processTail_1_insr[k]);
+      __ testptr(len_reg, 1);
+      __ jcc(Assembler::zero, L_processTail_exit_insr[k]);
+      __ subptr(pos, 
1); + __ pslldq(xmm_from0, 1); + __ pinsrb(xmm_from0, Address(from, pos), 0); + __ BIND(L_processTail_exit_insr[k]); + + __ movdqu(Address(saved_encCounter_start, 0), xmm_result0); + __ pxor(xmm_result0, xmm_from0); + + __ testptr(len_reg, 8); + __ jcc(Assembler::zero, L_processTail_4_extr[k]); + __ pextrq(Address(to, pos), xmm_result0, 0); + __ psrldq(xmm_result0, 8); + __ addptr(pos, 8); + __ BIND(L_processTail_4_extr[k]); + __ testptr(len_reg, 4); + __ jcc(Assembler::zero, L_processTail_2_extr[k]); + __ pextrd(Address(to, pos), xmm_result0, 0); + __ psrldq(xmm_result0, 4); + __ addptr(pos, 4); + __ BIND(L_processTail_2_extr[k]); + __ testptr(len_reg, 2); + __ jcc(Assembler::zero, L_processTail_1_extr[k]); + __ pextrw(Address(to, pos), xmm_result0, 0); + __ psrldq(xmm_result0, 2); + __ addptr(pos, 2); + __ BIND(L_processTail_1_extr[k]); + __ testptr(len_reg, 1); + __ jcc(Assembler::zero, L_processTail_exit_extr[k]); + __ pextrb(Address(to, pos), xmm_result0, 0); + + __ BIND(L_processTail_exit_extr[k]); + __ movl(Address(used_addr, 0), len_reg); + __ jmp(L_exit); + + } + + __ BIND(L_exit); + __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled back. + __ movdqu(Address(counter, 0), xmm_curr_counter); //save counter back + __ pop(rbx); // pop the saved RBX. +#ifdef _WIN64 + // restore regs belonging to calling function + for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) { + __ movdqu(as_XMMRegister(i), xmm_save(i)); + } + __ movl(rax, len_mem); + __ movptr(r13, r13_save); + __ movptr(r14, r14_save); +#else + __ pop(rax); // return 'len' +#endif + __ leave(); // required for proper stackwalking of RuntimeStub frame + __ ret(0); + return start; + } // byte swap x86 long address generate_ghash_long_swap_mask() { @@ -4555,12 +4898,15 @@ // don't bother generating these AES intrinsic stubs unless global flag is set if (UseAESIntrinsics) { StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask(); // needed by the others - StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel(); } + if (UseAESCTRIntrinsics){ + StubRoutines::x86::_counter_shuffle_mask_addr = generate_counter_shuffle_mask(); + StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel(); + } // Generate GHASH intrinsics code if (UseGHASHIntrinsics) { diff -r 0341260cd1f2 -r a9b3c1984a01 hotspot/src/cpu/x86/vm/stubRoutines_x86.cpp --- a/hotspot/src/cpu/x86/vm/stubRoutines_x86.cpp Mon Dec 28 10:10:37 2015 -1000 +++ b/hotspot/src/cpu/x86/vm/stubRoutines_x86.cpp Mon Dec 28 23:11:01 2015 -0800 @@ -34,6 +34,7 @@ address StubRoutines::x86::_verify_mxcsr_entry = NULL; address StubRoutines::x86::_key_shuffle_mask_addr = NULL; +address StubRoutines::x86::_counter_shuffle_mask_addr = NULL; address StubRoutines::x86::_ghash_long_swap_mask_addr = NULL; address StubRoutines::x86::_ghash_byte_swap_mask_addr = NULL; diff -r 0341260cd1f2 -r a9b3c1984a01 hotspot/src/cpu/x86/vm/stubRoutines_x86.hpp --- a/hotspot/src/cpu/x86/vm/stubRoutines_x86.hpp Mon Dec 28 10:10:37 2015 -1000 +++ b/hotspot/src/cpu/x86/vm/stubRoutines_x86.hpp Mon Dec 28 23:11:01 2015 -0800 @@ -33,6 +33,10 @@ static address _verify_mxcsr_entry; // shuffle mask for fixing up 128-bit words consisting of big-endian 32-bit integers static 
address _key_shuffle_mask_addr;
+
+  // shuffle mask for big-endian 128-bit integers
+  static address _counter_shuffle_mask_addr;
+
   // masks and table for CRC32
   static uint64_t _crc_by128_masks[];
   static juint    _crc_table[];
@@ -45,9 +49,9 @@
  public:
   static address verify_mxcsr_entry()    { return _verify_mxcsr_entry; }
   static address key_shuffle_mask_addr() { return _key_shuffle_mask_addr; }
+  static address counter_shuffle_mask_addr() { return _counter_shuffle_mask_addr; }
   static address crc_by128_masks_addr()  { return (address)_crc_by128_masks; }
   static address ghash_long_swap_mask_addr() { return _ghash_long_swap_mask_addr; }
   static address ghash_byte_swap_mask_addr() { return _ghash_byte_swap_mask_addr; }
   static void generate_CRC32C_table(bool is_pclmulqdq_supported);
-
 #endif // CPU_X86_VM_STUBROUTINES_X86_32_HPP
diff -r 0341260cd1f2 -r a9b3c1984a01 hotspot/src/cpu/x86/vm/stubRoutines_x86_32.hpp
--- a/hotspot/src/cpu/x86/vm/stubRoutines_x86_32.hpp	Mon Dec 28 10:10:37 2015 -1000
+++ b/hotspot/src/cpu/x86/vm/stubRoutines_x86_32.hpp	Mon Dec 28 23:11:01 2015 -0800
@@ -31,7 +31,7 @@
 enum platform_dependent_constants {
   code_size1 =  9000, // simply increase if too small (assembler will crash if too small)
-  code_size2 = 30000  // simply increase if too small (assembler will crash if too small)
+  code_size2 = 33800  // simply increase if too small (assembler will crash if too small)
 };

 class x86 {
diff -r 0341260cd1f2 -r a9b3c1984a01 hotspot/src/cpu/x86/vm/stubRoutines_x86_64.hpp
--- a/hotspot/src/cpu/x86/vm/stubRoutines_x86_64.hpp	Mon Dec 28 10:10:37 2015 -1000
+++ b/hotspot/src/cpu/x86/vm/stubRoutines_x86_64.hpp	Mon Dec 28 23:11:01 2015 -0800
@@ -33,7 +33,7 @@
 enum platform_dependent_constants {
   code_size1 = 19000, // simply increase if too small (assembler will crash if too small)
-  code_size2 = 32000  // simply increase if too small (assembler will crash if too small)
+  code_size2 = 35000  // simply increase if too small (assembler will crash if too small)
 };

 class x86 {
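Note on the flag gating in the next file: UseAESCTRIntrinsics defaults to true only when UseAESIntrinsics is enabled and the CPU supports SSE4.1; elsewhere an explicit request is reset with a warning. To force the new stub for testing, something like the following should work (flag defined in globals.hpp later in this patch):

    java -Xbatch -XX:+UseAES -XX:+UseAESIntrinsics -XX:+UseAESCTRIntrinsics <TestClass>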
diff -r 0341260cd1f2 -r a9b3c1984a01 hotspot/src/cpu/x86/vm/vm_version_x86.cpp
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.cpp	Mon Dec 28 10:10:37 2015 -1000
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.cpp	Mon Dec 28 23:11:01 2015 -0800
@@ -648,6 +648,28 @@
       }
       FLAG_SET_DEFAULT(UseAESIntrinsics, false);
     }
+
+    // --AES-CTR begins--
+    if (!UseAESIntrinsics) {
+      if (UseAESCTRIntrinsics && !FLAG_IS_DEFAULT(UseAESCTRIntrinsics)) {
+        warning("AES-CTR intrinsics require UseAESIntrinsics flag to be enabled. Intrinsics will be disabled.");
+        FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false);
+      }
+    } else {
+      if (supports_sse4_1() && UseSSE >= 4) {
+        if (FLAG_IS_DEFAULT(UseAESCTRIntrinsics)) {
+          FLAG_SET_DEFAULT(UseAESCTRIntrinsics, true);
+        }
+      } else {
+        // The AES-CTR intrinsic stubs require AES instruction support (of course)
+        // but also require SSE4.1 mode or higher for the instructions they use.
+        if (UseAESCTRIntrinsics && !FLAG_IS_DEFAULT(UseAESCTRIntrinsics)) {
+          warning("X86 AES-CTR intrinsics require SSE4.1 instructions or higher. Intrinsics will be disabled.");
+        }
+        FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false);
+      }
+    }
+    // --AES-CTR ends--
   }
 } else if (UseAES || UseAESIntrinsics) {
   if (UseAES && !FLAG_IS_DEFAULT(UseAES)) {
@@ -658,6 +680,10 @@
     warning("AES intrinsics are not available on this CPU");
     FLAG_SET_DEFAULT(UseAESIntrinsics, false);
   }
+  if (UseAESCTRIntrinsics && !FLAG_IS_DEFAULT(UseAESCTRIntrinsics)) {
+    warning("AES-CTR intrinsics are not available on this CPU");
+    FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false);
+  }
 }

 // Use CLMUL instructions if available.
@@ -681,6 +707,16 @@
   FLAG_SET_DEFAULT(UseCRC32Intrinsics, false);
 }

+if (UseAESIntrinsics) {
+  if (FLAG_IS_DEFAULT(UseAESCTRIntrinsics)) {
+    UseAESCTRIntrinsics = true;
+  }
+} else if (UseAESCTRIntrinsics) {
+  if (!FLAG_IS_DEFAULT(UseAESCTRIntrinsics))
+    warning("AES/CTR intrinsics are not available on this CPU");
+  FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false);
+}
+
 if (supports_sse4_2()) {
   if (FLAG_IS_DEFAULT(UseCRC32CIntrinsics)) {
     UseCRC32CIntrinsics = true;
diff -r 0341260cd1f2 -r a9b3c1984a01 hotspot/src/share/vm/classfile/vmSymbols.cpp
--- a/hotspot/src/share/vm/classfile/vmSymbols.cpp	Mon Dec 28 10:10:37 2015 -1000
+++ b/hotspot/src/share/vm/classfile/vmSymbols.cpp	Mon Dec 28 23:11:01 2015 -0800
@@ -409,6 +409,7 @@
   switch (id) {
   case vmIntrinsics::_cipherBlockChaining_encryptAESCrypt:
   case vmIntrinsics::_cipherBlockChaining_decryptAESCrypt:
+  case vmIntrinsics::_counterMode_AESCrypt:
     return 1;
   case vmIntrinsics::_digestBase_implCompressMB:
     return 3;
@@ -597,6 +598,9 @@
   case vmIntrinsics::_cipherBlockChaining_decryptAESCrypt:
     if (!UseAESIntrinsics) return true;
     break;
+  case vmIntrinsics::_counterMode_AESCrypt:
+    if (!UseAESCTRIntrinsics) return true;
+    break;
   case vmIntrinsics::_sha_implCompress:
     if (!UseSHA1Intrinsics) return true;
     break;
diff -r 0341260cd1f2 -r a9b3c1984a01 hotspot/src/share/vm/classfile/vmSymbols.hpp
--- a/hotspot/src/share/vm/classfile/vmSymbols.hpp	Mon Dec 28 10:10:37 2015 -1000
+++ b/hotspot/src/share/vm/classfile/vmSymbols.hpp	Mon Dec 28 23:11:01 2015 -0800
@@ -981,6 +981,10 @@
   do_name(     decrypt_name,                                     "implDecrypt")           \
   do_signature(byteArray_int_int_byteArray_int_signature,        "([BII[BI)I")            \
                                                                                           \
+  do_class(com_sun_crypto_provider_counterMode,      "com/sun/crypto/provider/CounterMode") \
+  do_intrinsic(_counterMode_AESCrypt, com_sun_crypto_provider_counterMode, crypt_name, byteArray_int_int_byteArray_int_signature, F_R) \
+  do_name(     crypt_name,                                       "implCrypt")             \
+                                                                                          \
   /* support for sun.security.provider.SHA */                                             \
   do_class(sun_security_provider_sha,                            "sun/security/provider/SHA") \
   do_intrinsic(_sha_implCompress, sun_security_provider_sha, implCompress_name, implCompress_signature, F_R) \
diff -r 0341260cd1f2 -r a9b3c1984a01 hotspot/src/share/vm/opto/c2compiler.cpp
--- a/hotspot/src/share/vm/opto/c2compiler.cpp	Mon Dec 28 10:10:37 2015 -1000
+++ b/hotspot/src/share/vm/opto/c2compiler.cpp	Mon Dec 28 23:11:01 2015 -0800
@@ -432,6 +432,7 @@
   case vmIntrinsics::_aescrypt_decryptBlock:
   case vmIntrinsics::_cipherBlockChaining_encryptAESCrypt:
   case vmIntrinsics::_cipherBlockChaining_decryptAESCrypt:
+  case vmIntrinsics::_counterMode_AESCrypt:
   case vmIntrinsics::_sha_implCompress:
   case vmIntrinsics::_sha2_implCompress:
   case vmIntrinsics::_sha5_implCompress:
diff -r 0341260cd1f2 -r a9b3c1984a01 hotspot/src/share/vm/opto/escape.cpp
--- a/hotspot/src/share/vm/opto/escape.cpp	Mon Dec 28 10:10:37 2015 -1000
+++ b/hotspot/src/share/vm/opto/escape.cpp	Mon Dec 28 23:11:01 2015 -0800
@@ -976,6 +976,7 @@
                   strcmp(call->as_CallLeaf()->_name, 
"aescrypt_decryptBlock") == 0 || strcmp(call->as_CallLeaf()->_name, "cipherBlockChaining_encryptAESCrypt") == 0 || strcmp(call->as_CallLeaf()->_name, "cipherBlockChaining_decryptAESCrypt") == 0 || + strcmp(call->as_CallLeaf()->_name, "counterMode_AESCrypt") == 0 || strcmp(call->as_CallLeaf()->_name, "ghash_processBlocks") == 0 || strcmp(call->as_CallLeaf()->_name, "sha1_implCompress") == 0 || strcmp(call->as_CallLeaf()->_name, "sha1_implCompressMB") == 0 || diff -r 0341260cd1f2 -r a9b3c1984a01 hotspot/src/share/vm/opto/library_call.cpp --- a/hotspot/src/share/vm/opto/library_call.cpp Mon Dec 28 10:10:37 2015 -1000 +++ b/hotspot/src/share/vm/opto/library_call.cpp Mon Dec 28 23:11:01 2015 -0800 @@ -201,6 +201,7 @@ return generate_method_call(method_id, true, false); } Node * load_field_from_object(Node * fromObj, const char * fieldName, const char * fieldTypeString, bool is_exact, bool is_static, ciInstanceKlass * fromKls); + Node * field_address_from_object(Node * fromObj, const char * fieldName, const char * fieldTypeString, bool is_exact, bool is_static, ciInstanceKlass * fromKls); Node* make_string_method_node(int opcode, Node* str1_start, Node* cnt1, Node* str2_start, Node* cnt2, StrIntrinsicNode::ArgEnc ae); bool inline_string_compareTo(StrIntrinsicNode::ArgEnc ae); @@ -283,7 +284,9 @@ bool inline_Class_cast(); bool inline_aescrypt_Block(vmIntrinsics::ID id); bool inline_cipherBlockChaining_AESCrypt(vmIntrinsics::ID id); + bool inline_counterMode_AESCrypt(vmIntrinsics::ID id); Node* inline_cipherBlockChaining_AESCrypt_predicate(bool decrypting); + Node* inline_counterMode_AESCrypt_predicate(); Node* get_key_start_from_aescrypt_object(Node* aescrypt_object); Node* get_original_key_start_from_aescrypt_object(Node* aescrypt_object); bool inline_ghash_processBlocks(); @@ -697,6 +700,9 @@ case vmIntrinsics::_cipherBlockChaining_decryptAESCrypt: return inline_cipherBlockChaining_AESCrypt(intrinsic_id()); + case vmIntrinsics::_counterMode_AESCrypt: + return inline_counterMode_AESCrypt(intrinsic_id()); + case vmIntrinsics::_sha_implCompress: case vmIntrinsics::_sha2_implCompress: case vmIntrinsics::_sha5_implCompress: @@ -784,6 +790,8 @@ return inline_cipherBlockChaining_AESCrypt_predicate(false); case vmIntrinsics::_cipherBlockChaining_decryptAESCrypt: return inline_cipherBlockChaining_AESCrypt_predicate(true); + case vmIntrinsics::_counterMode_AESCrypt: + return inline_counterMode_AESCrypt_predicate(); case vmIntrinsics::_digestBase_implCompressMB: return inline_digestBase_implCompressMB_predicate(predicate); @@ -5778,6 +5786,39 @@ return loadedField; } +Node * LibraryCallKit::field_address_from_object(Node * fromObj, const char * fieldName, const char * fieldTypeString, + bool is_exact = true, bool is_static = false, + ciInstanceKlass * fromKls = NULL) { + if (fromKls == NULL) { + const TypeInstPtr* tinst = _gvn.type(fromObj)->isa_instptr(); + assert(tinst != NULL, "obj is null"); + assert(tinst->klass()->is_loaded(), "obj is not loaded"); + assert(!is_exact || tinst->klass_is_exact(), "klass not exact"); + fromKls = tinst->klass()->as_instance_klass(); + } + else { + assert(is_static, "only for static field access"); + } + ciField* field = fromKls->get_field_by_name(ciSymbol::make(fieldName), + ciSymbol::make(fieldTypeString), + is_static); + + assert(field != NULL, "undefined field"); + assert(!field->is_volatile(), "not defined for volatile fields"); + + if (is_static) { + const TypeInstPtr* tip = TypeInstPtr::make(fromKls->java_mirror()); + fromObj = makecon(tip); + } + + // Next code 
copied from Parse::do_get_xxx(): + + // Compute address and memory type. + int offset = field->offset_in_bytes(); + Node *adr = basic_plus_adr(fromObj, fromObj, offset); + + return adr; +} //------------------------------inline_aescrypt_Block----------------------- bool LibraryCallKit::inline_aescrypt_Block(vmIntrinsics::ID id) { @@ -5944,6 +5985,90 @@ return true; } +//------------------------------inline_counterMode_AESCrypt----------------------- +bool LibraryCallKit::inline_counterMode_AESCrypt(vmIntrinsics::ID id) { + assert(UseAES, "need AES instruction support"); + if (!UseAESCTRIntrinsics) return false; + + address stubAddr = NULL; + const char *stubName = NULL; + if (id == vmIntrinsics::_counterMode_AESCrypt) { + stubAddr = StubRoutines::counterMode_AESCrypt(); + stubName = "counterMode_AESCrypt"; + } + if (stubAddr == NULL) return false; + + Node* counterMode_object = argument(0); + Node* src = argument(1); + Node* src_offset = argument(2); + Node* len = argument(3); + Node* dest = argument(4); + Node* dest_offset = argument(5); + + // (1) src and dest are arrays. + const Type* src_type = src->Value(&_gvn); + const Type* dest_type = dest->Value(&_gvn); + const TypeAryPtr* top_src = src_type->isa_aryptr(); + const TypeAryPtr* top_dest = dest_type->isa_aryptr(); + assert(top_src != NULL && top_src->klass() != NULL && + top_dest != NULL && top_dest->klass() != NULL, "args are strange"); + + // checks are the responsibility of the caller + Node* src_start = src; + Node* dest_start = dest; + if (src_offset != NULL || dest_offset != NULL) { + assert(src_offset != NULL && dest_offset != NULL, ""); + src_start = array_element_address(src, src_offset, T_BYTE); + dest_start = array_element_address(dest, dest_offset, T_BYTE); + } + + // if we are in this set of code, we "know" the embeddedCipher is an AESCrypt object + // (because of the predicated logic executed earlier). + // so we cast it here safely. 
+  // this requires a newer class file that has this array as littleEndian ints, otherwise we revert to java
+  Node* embeddedCipherObj = load_field_from_object(counterMode_object, "embeddedCipher", "Lcom/sun/crypto/provider/SymmetricCipher;", /*is_exact*/ false);
+  if (embeddedCipherObj == NULL) return false;
+  // cast it to what we know it will be at runtime
+  const TypeInstPtr* tinst = _gvn.type(counterMode_object)->isa_instptr();
+  assert(tinst != NULL, "CTR obj is null");
+  assert(tinst->klass()->is_loaded(), "CTR obj is not loaded");
+  ciKlass* klass_AESCrypt = tinst->klass()->as_instance_klass()->find_klass(ciSymbol::make("com/sun/crypto/provider/AESCrypt"));
+  assert(klass_AESCrypt->is_loaded(), "predicate checks that this class is loaded");
+  ciInstanceKlass* instklass_AESCrypt = klass_AESCrypt->as_instance_klass();
+  const TypeKlassPtr* aklass = TypeKlassPtr::make(instklass_AESCrypt);
+  const TypeOopPtr* xtype = aklass->as_instance_type();
+  Node* aescrypt_object = new CheckCastPPNode(control(), embeddedCipherObj, xtype);
+  aescrypt_object = _gvn.transform(aescrypt_object);
+  // we need to get the start of the aescrypt_object's expanded key array
+  Node* k_start = get_key_start_from_aescrypt_object(aescrypt_object);
+  if (k_start == NULL) return false;
+  // similarly, get the start address of the r vector
+  Node* obj_counter = load_field_from_object(counterMode_object, "counter", "[B", /*is_exact*/ false);
+  if (obj_counter == NULL) return false;
+  Node* cnt_start = array_element_address(obj_counter, intcon(0), T_BYTE);
+
+  Node* saved_encCounter = load_field_from_object(counterMode_object, "encryptedCounter", "[B", /*is_exact*/ false);
+  if (saved_encCounter == NULL) return false;
+  Node* saved_encCounter_start = array_element_address(saved_encCounter, intcon(0), T_BYTE);
+  Node* used = field_address_from_object(counterMode_object, "used", "I", /*is_exact*/ false);
+
+  Node* ctrCrypt;
+  if (Matcher::pass_original_key_for_aes()) {
+    // no SPARC version for AES/CTR intrinsics now.
+    return false;
+  }
+  // Call the stub, passing src_start, dest_start, k_start, cnt_start, len, saved_encCounter_start and used
+  ctrCrypt = make_runtime_call(RC_LEAF|RC_NO_FP,
+                               OptoRuntime::counterMode_aescrypt_Type(),
+                               stubAddr, stubName, TypePtr::BOTTOM,
+                               src_start, dest_start, k_start, cnt_start, len, saved_encCounter_start, used);
+
+  // return cipher length (int)
+  Node* retvalue = _gvn.transform(new ProjNode(ctrCrypt, TypeFunc::Parms));
+  set_result(retvalue);
+  return true;
+}
+
 //------------------------------get_key_start_from_aescrypt_object-----------------------
 Node * LibraryCallKit::get_key_start_from_aescrypt_object(Node *aescrypt_object) {
   Node* objAESCryptKey = load_field_from_object(aescrypt_object, "K", "[I", /*is_exact*/ false);
@@ -6025,6 +6150,48 @@
   return _gvn.transform(region);
 }

+//----------------------------inline_counterMode_AESCrypt_predicate----------------------------
+// Return node representing slow path of predicate check.
+// the pseudo code we want to emulate with this predicate is:
+//    if (embeddedCipherObj instanceof AESCrypt) do_intrinsic, else do_javapath
+//
+Node* LibraryCallKit::inline_counterMode_AESCrypt_predicate() {
+  // The receiver was checked for NULL already.
+  Node* objCTR = argument(0);
+
+  // Load embeddedCipher field of CounterMode object.
+  Node* embeddedCipherObj = load_field_from_object(objCTR, "embeddedCipher", "Lcom/sun/crypto/provider/SymmetricCipher;", /*is_exact*/ false);
+
+  // get AESCrypt klass for instanceOf check
+  // AESCrypt might not be loaded yet if some other SymmetricCipher got us to this compile point
+  // will have same classloader as CounterMode object
+  const TypeInstPtr* tinst = _gvn.type(objCTR)->isa_instptr();
+  assert(tinst != NULL, "CTRobj is null");
+  assert(tinst->klass()->is_loaded(), "CTRobj is not loaded");
+
+  // we want to do an instanceof comparison against the AESCrypt class
+  ciKlass* klass_AESCrypt = tinst->klass()->as_instance_klass()->find_klass(ciSymbol::make("com/sun/crypto/provider/AESCrypt"));
+  if (!klass_AESCrypt->is_loaded()) {
+    // if AESCrypt is not even loaded, we never take the intrinsic fast path
+    Node* ctrl = control();
+    set_control(top()); // no regular fast path
+    return ctrl;
+  }
+
+  ciInstanceKlass* instklass_AESCrypt = klass_AESCrypt->as_instance_klass();
+  Node* instof = gen_instanceof(embeddedCipherObj, makecon(TypeKlassPtr::make(instklass_AESCrypt)));
+  Node* cmp_instof = _gvn.transform(new CmpINode(instof, intcon(1)));
+  Node* bool_instof = _gvn.transform(new BoolNode(cmp_instof, BoolTest::ne));
+  Node* instof_false = generate_guard(bool_instof, NULL, PROB_MIN);
+
+  return instof_false; // even if it is NULL
+}
+
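Note: the predicate above is what lets compiled callers choose between the stub and the bytecode at runtime. In Java terms the generated shape is roughly the following (both impl* method names are hypothetical placeholders for the two compiled paths):

    int crypt(byte[] in, int inOfs, int len, byte[] out, int outOfs) {
        if (embeddedCipher instanceof AESCrypt) {                        // predicate fast path
            return implCryptIntrinsic(in, inOfs, len, out, outOfs);     // stub call
        }
        return implCryptJava(in, inOfs, len, out, outOfs);               // regular bytecode path
    }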
 //------------------------------inline_ghash_processBlocks
 bool LibraryCallKit::inline_ghash_processBlocks() {
   address stubAddr;
diff -r 0341260cd1f2 -r a9b3c1984a01 hotspot/src/share/vm/opto/runtime.cpp
--- a/hotspot/src/share/vm/opto/runtime.cpp	Mon Dec 28 10:10:37 2015 -1000
+++ b/hotspot/src/share/vm/opto/runtime.cpp	Mon Dec 28 23:11:01 2015 -0800
@@ -948,6 +948,35 @@
   return TypeFunc::make(domain, range);
 }

+// for counterMode calls of aescrypt encrypt/decrypt, six pointers and a length, returning int
+const TypeFunc* OptoRuntime::counterMode_aescrypt_Type() {
+  // create input type (domain)
+  int num_args = 7;
+  if (Matcher::pass_original_key_for_aes()) {
+    num_args = 8;
+  }
+  int argcnt = num_args;
+  const Type** fields = TypeTuple::fields(argcnt);
+  int argp = TypeFunc::Parms;
+  fields[argp++] = TypePtr::NOTNULL; // src
+  fields[argp++] = TypePtr::NOTNULL; // dest
+  fields[argp++] = TypePtr::NOTNULL; // k array
+  fields[argp++] = TypePtr::NOTNULL; // counter array
+  fields[argp++] = TypeInt::INT;     // src len
+  fields[argp++] = TypePtr::NOTNULL; // saved_encCounter
+  fields[argp++] = TypePtr::NOTNULL; // saved used addr
+  if (Matcher::pass_original_key_for_aes()) {
+    fields[argp++] = TypePtr::NOTNULL; // original k array
+  }
+  assert(argp == TypeFunc::Parms + argcnt, "correct decoding");
+  const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms + argcnt, fields);
+  // returning cipher len (int)
+  fields = TypeTuple::fields(1);
+  fields[TypeFunc::Parms + 0] = TypeInt::INT;
+  const TypeTuple* range = TypeTuple::make(TypeFunc::Parms + 1, fields);
+  return TypeFunc::make(domain, range);
+}
+
 /*
  * void implCompress(byte[] buf, int ofs)
  */
diff -r 0341260cd1f2 -r a9b3c1984a01 hotspot/src/share/vm/opto/runtime.hpp
--- a/hotspot/src/share/vm/opto/runtime.hpp	Mon Dec 28 10:10:37 2015 -1000
+++ b/hotspot/src/share/vm/opto/runtime.hpp	Mon Dec 28 23:11:01 2015 -0800
@@ -287,6 +287,7 @@

   static const TypeFunc* aescrypt_block_Type();
   static const TypeFunc* cipherBlockChaining_aescrypt_Type();
+  static const TypeFunc* counterMode_aescrypt_Type();
   static const TypeFunc* sha_implCompress_Type();
   static const TypeFunc* digestBase_implCompressMB_Type();
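Note: the seven domain fields above line up with the arguments the inline path passes: src, dest, key array, counter array, length, saved encrypted counter, and the address of CounterMode's 'used' field. On the Java side the intrinsified entry point keeps the shape registered in vmSymbols.hpp:

    int implCrypt(byte[] in, int inOfs, int len, byte[] out, int outOfs) // "([BII[BI)I"

The extra state (counter, encryptedCounter, used) is pulled from the CounterMode instance by the compiler rather than passed from Java.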
diff -r 0341260cd1f2 -r a9b3c1984a01 hotspot/src/share/vm/runtime/globals.hpp
--- a/hotspot/src/share/vm/runtime/globals.hpp Mon Dec 28 10:10:37 2015 -1000
+++ b/hotspot/src/share/vm/runtime/globals.hpp Mon Dec 28 23:11:01 2015 -0800
@@ -836,6 +836,9 @@
 product(bool, UseAESIntrinsics, false, \
 "Use intrinsics for AES versions of crypto") \
 \
+ product(bool, UseAESCTRIntrinsics, false, \
+ "Use intrinsics for the parallelized version of AES/CTR crypto") \
+ \
 product(bool, UseSHA1Intrinsics, false, \
 "Use intrinsics for SHA-1 crypto hash function. " \
 "Requires that UseSHA is enabled.") \
diff -r 0341260cd1f2 -r a9b3c1984a01 hotspot/src/share/vm/runtime/stubRoutines.cpp
--- a/hotspot/src/share/vm/runtime/stubRoutines.cpp Mon Dec 28 10:10:37 2015 -1000
+++ b/hotspot/src/share/vm/runtime/stubRoutines.cpp Mon Dec 28 23:11:01 2015 -0800
@@ -127,6 +127,7 @@
 address StubRoutines::_aescrypt_decryptBlock = NULL;
 address StubRoutines::_cipherBlockChaining_encryptAESCrypt = NULL;
 address StubRoutines::_cipherBlockChaining_decryptAESCrypt = NULL;
+address StubRoutines::_counterMode_AESCrypt = NULL;
 address StubRoutines::_ghash_processBlocks = NULL;

 address StubRoutines::_sha1_implCompress = NULL;
diff -r 0341260cd1f2 -r a9b3c1984a01 hotspot/src/share/vm/runtime/stubRoutines.hpp
--- a/hotspot/src/share/vm/runtime/stubRoutines.hpp Mon Dec 28 10:10:37 2015 -1000
+++ b/hotspot/src/share/vm/runtime/stubRoutines.hpp Mon Dec 28 23:11:01 2015 -0800
@@ -186,6 +186,7 @@
 static address _aescrypt_decryptBlock;
 static address _cipherBlockChaining_encryptAESCrypt;
 static address _cipherBlockChaining_decryptAESCrypt;
+ static address _counterMode_AESCrypt;
 static address _ghash_processBlocks;

 static address _sha1_implCompress;
@@ -359,6 +360,7 @@
 static address aescrypt_decryptBlock() { return _aescrypt_decryptBlock; }
 static address cipherBlockChaining_encryptAESCrypt() { return _cipherBlockChaining_encryptAESCrypt; }
 static address cipherBlockChaining_decryptAESCrypt() { return _cipherBlockChaining_decryptAESCrypt; }
+ static address counterMode_AESCrypt() { return _counterMode_AESCrypt; }
 static address ghash_processBlocks() { return _ghash_processBlocks; }

 static address sha1_implCompress() { return _sha1_implCompress; }
diff -r 0341260cd1f2 -r a9b3c1984a01 hotspot/src/share/vm/runtime/vmStructs.cpp
--- a/hotspot/src/share/vm/runtime/vmStructs.cpp Mon Dec 28 10:10:37 2015 -1000
+++ b/hotspot/src/share/vm/runtime/vmStructs.cpp Mon Dec 28 23:11:01 2015 -0800
@@ -850,6 +850,7 @@
 static_field(StubRoutines, _aescrypt_decryptBlock, address) \
 static_field(StubRoutines, _cipherBlockChaining_encryptAESCrypt, address) \
 static_field(StubRoutines, _cipherBlockChaining_decryptAESCrypt, address) \
+ static_field(StubRoutines, _counterMode_AESCrypt, address) \
 static_field(StubRoutines, _ghash_processBlocks, address) \
 static_field(StubRoutines, _updateBytesCRC32, address) \
 static_field(StubRoutines, _crc_table_adr, address) \
diff -r 0341260cd1f2 -r a9b3c1984a01 hotspot/test/compiler/codegen/7184394/TestAESBase.java
--- a/hotspot/test/compiler/codegen/7184394/TestAESBase.java Mon Dec 28 10:10:37 2015 -1000
+++ b/hotspot/test/compiler/codegen/7184394/TestAESBase.java Mon Dec 28 23:11:01 2015 -0800
@@ -104,8 +104,8 @@
 cipher = Cipher.getInstance(algorithm + "/" + mode + "/" + paddingStr, "SunJCE");
 dCipher = Cipher.getInstance(algorithm + "/" + mode + "/" + paddingStr, "SunJCE");
- // CBC init
- if (mode.equals("CBC")) {
+ // CBC or CTR init
+ if (mode.equals("CBC") || mode.equals("CTR")) {
 IvParameterSpec initVector = new IvParameterSpec(iv);
 cipher.init(Cipher.ENCRYPT_MODE, key, initVector);
 algParams = cipher.getParameters();
diff -r 0341260cd1f2 -r a9b3c1984a01 hotspot/test/compiler/codegen/7184394/TestAESMain.java
--- a/hotspot/test/compiler/codegen/7184394/TestAESMain.java Mon Dec 28 10:10:37 2015 -1000
+++ b/hotspot/test/compiler/codegen/7184394/TestAESMain.java Mon Dec 28 23:11:01 2015 -0800
@@ -51,6 +51,13 @@
 * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 TestAESMain
 * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 TestAESMain
 * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 -DpaddingStr=NoPadding -DmsgSize=640 TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CTR TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CTR -DencInputOffset=1 TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CTR -DencOutputOffset=1 TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CTR -DdecOutputOffset=1 TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CTR -DencInputOffset=1 -DencOutputOffset=1 TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CTR -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CTR -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 -DpaddingStr=NoPadding -DmsgSize=640 TestAESMain
 *
 * @author Tom Deneau
 */
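The CTR @run lines drive the intrinsic through the standard JCE API, which is also the simplest way to exercise it by hand. A minimal smoke test (the class name CtrSmoke and the iteration count are invented for the example) could look like the following; running it with java -Xbatch -XX:+UseAESCTRIntrinsics CtrSmoke on a CPU where the flag survives the vm_version checks should let C2 replace CounterMode.crypt() with the new stub after warmup:

    import javax.crypto.Cipher;
    import javax.crypto.spec.IvParameterSpec;
    import javax.crypto.spec.SecretKeySpec;

    // Hypothetical smoke test: warm up AES/CTR so C2 compiles
    // CounterMode.crypt() and can dispatch to the intrinsic stub.
    public class CtrSmoke {
        public static void main(String[] args) throws Exception {
            byte[] key = new byte[16];
            byte[] iv  = new byte[16];
            byte[] msg = new byte[1024];
            Cipher c = Cipher.getInstance("AES/CTR/NoPadding", "SunJCE");
            c.init(Cipher.ENCRYPT_MODE, new SecretKeySpec(key, "AES"), new IvParameterSpec(iv));
            byte[] ct = null;
            for (int i = 0; i < 20_000; i++) {  // enough iterations for -Xbatch compilation
                ct = c.doFinal(msg);            // doFinal resets the cipher to its IV
            }
            System.out.println("ct[0]=" + ct[0]);
        }
    }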