--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp Mon Dec 28 10:10:37 2015 -1000
+++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp Mon Dec 28 23:11:01 2015 -0800
@@ -3039,6 +3039,15 @@
return start;
}
+ address generate_counter_shuffle_mask() {
+ __ align(16);
+ StubCodeMark mark(this, "StubRoutines", "counter_shuffle_mask");
+ address start = __ pc();
+ __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
+ __ emit_data64(0x0001020304050607, relocInfo::none);
+ return start;
+ }
+
// Utility routine for loading a 128-bit key word in little endian format
// can optionally specify that the shuffle mask is already in an xmmregister
void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
@@ -3050,6 +3059,18 @@
}
}
+ // Utility routine for increase 128bit counter (iv in CTR mode)
+ void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block) {
+ __ pextrq(reg, xmmdst, 0x0);
+ __ addq(reg, inc_delta);
+ __ pinsrq(xmmdst, reg, 0x0);
+ __ jcc(Assembler::carryClear, next_block); // jump if no carry
+ __ pextrq(reg, xmmdst, 0x01); // Carry
+ __ addq(reg, 0x01);
+ __ pinsrq(xmmdst, reg, 0x01); //Carry end
+ __ BIND(next_block); // next instruction
+ }
+
// Arguments:
//
// Inputs:
@@ -3700,6 +3721,328 @@
return start;
}
+ // This is a version of CTR/AES crypt which does 6 blocks in a loop at a time
+ // to hide instruction latency
+ //
+ // Arguments:
+ //
+ // Inputs:
+ // c_rarg0 - source byte array address
+ // c_rarg1 - destination byte array address
+ // c_rarg2 - K (key) in little endian int array
+ // c_rarg3 - counter vector byte array address
+ // Linux
+ // c_rarg4 - input length
+ // c_rarg5 - saved encryptedCounter start
+ // rbp + 6 * wordSize - saved used length
+ // Windows
+ // rbp + 6 * wordSize - input length
+ // rbp + 7 * wordSize - saved encryptedCounter start
+ // rbp + 8 * wordSize - saved used length
+ //
+ // Output:
+ // rax - input length
+ //
+ address generate_counterMode_AESCrypt_Parallel() {
+ assert(UseAES, "need AES instructions and misaligned SSE support");
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
+ address start = __ pc();
+ const Register from = c_rarg0; // source array address
+ const Register to = c_rarg1; // destination array address
+ const Register key = c_rarg2; // key array address
+ const Register counter = c_rarg3; // counter byte array initialized from counter array address
+ // and left with the results of the last encryption block
+#ifndef _WIN64
+ const Register len_reg = c_rarg4;
+ const Register saved_encCounter_start = c_rarg5;
+ const Register used_addr = r10;
+ const Address used_mem(rbp, 2 * wordSize);
+ const Register used = r11;
+#else
+ const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
+ const Address saved_encCounter_mem(rbp, 7 * wordSize); // length is on stack on Win64
+ const Address used_mem(rbp, 8 * wordSize); // length is on stack on Win64
+ const Register len_reg = r10; // pick the first volatile windows register
+ const Register saved_encCounter_start = r11;
+ const Register used_addr = r13;
+ const Register used = r14;
+#endif
+ const Register pos = rax;
+
+ const int PARALLEL_FACTOR = 6;
+ const XMMRegister xmm_counter_shuf_mask = xmm0;
+ const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front
+ const XMMRegister xmm_curr_counter = xmm2;
+
+ const XMMRegister xmm_key_tmp0 = xmm3;
+ const XMMRegister xmm_key_tmp1 = xmm4;
+
+ // registers holding the four results in the parallelized loop
+ const XMMRegister xmm_result0 = xmm5;
+ const XMMRegister xmm_result1 = xmm6;
+ const XMMRegister xmm_result2 = xmm7;
+ const XMMRegister xmm_result3 = xmm8;
+ const XMMRegister xmm_result4 = xmm9;
+ const XMMRegister xmm_result5 = xmm10;
+
+ const XMMRegister xmm_from0 = xmm11;
+ const XMMRegister xmm_from1 = xmm12;
+ const XMMRegister xmm_from2 = xmm13;
+ const XMMRegister xmm_from3 = xmm14; //the last one is xmm14. we have to preserve it on WIN64.
+ const XMMRegister xmm_from4 = xmm3; //reuse xmm3~4. Because xmm_key_tmp0~1 are useless when loading input text
+ const XMMRegister xmm_from5 = xmm4;
+
+ //for key_128, key_192, key_256
+ const int rounds[3] = {10, 12, 14};
+ Label L_exit_preLoop, L_preLoop_start;
+ Label L_multiBlock_loopTop[3];
+ Label L_singleBlockLoopTop[3];
+ Label L__incCounter[3][6]; //for 6 blocks
+ Label L__incCounter_single[3]; //for single block, key128, key192, key256
+ Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3];
+ Label L_processTail_extr[3], L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3];
+
+ Label L_exit;
+
+ __ enter(); // required for proper stackwalking of RuntimeStub frame
+
+ // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
+ // context for the registers used, where all instructions below are using 128-bit mode
+ // On EVEX without VL and BW, these instructions will all be AVX.
+ if (VM_Version::supports_avx512vlbw()) {
+ __ movl(rax, 0xffff);
+ __ kmovql(k1, rax);
+ }
+
+#ifdef _WIN64
+ // save the xmm registers which must be preserved 6-14
+ const int XMM_REG_NUM_KEY_LAST = 14;
+ __ subptr(rsp, -rsp_after_call_off * wordSize);
+ for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
+ __ movdqu(xmm_save(i), as_XMMRegister(i));
+ }
+
+ const Address r13_save(rbp, rdi_off * wordSize);
+ const Address r14_save(rbp, rsi_off * wordSize);
+
+ __ movptr(r13_save, r13);
+ __ movptr(r14_save, r14);
+
+ // on win64, fill len_reg from stack position
+ __ movl(len_reg, len_mem);
+ __ movptr(saved_encCounter_start, saved_encCounter_mem);
+ __ movptr(used_addr, used_mem);
+ __ movl(used, Address(used_addr, 0));
+#else
+ __ push(len_reg); // Save
+ __ movptr(used_addr, used_mem);
+ __ movl(used, Address(used_addr, 0));
+#endif
+
+ __ push(rbx); // Save RBX
+ __ movdqu(xmm_curr_counter, Address(counter, 0x00)); // initialize counter with initial counter
+ __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()));
+ __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled
+ __ movptr(pos, 0);
+
+ // Use the partially used encrpyted counter from last invocation
+ __ BIND(L_preLoop_start);
+ __ cmpptr(used, 16);
+ __ jcc(Assembler::aboveEqual, L_exit_preLoop);
+ __ cmpptr(len_reg, 0);
+ __ jcc(Assembler::lessEqual, L_exit_preLoop);
+ __ movb(rbx, Address(saved_encCounter_start, used));
+ __ xorb(rbx, Address(from, pos));
+ __ movb(Address(to, pos), rbx);
+ __ addptr(pos, 1);
+ __ addptr(used, 1);
+ __ subptr(len_reg, 1);
+
+ __ jmp(L_preLoop_start);
+
+ __ BIND(L_exit_preLoop);
+ __ movl(Address(used_addr, 0), used);
+
+ // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
+ __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+ __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+ __ cmpl(rbx, 52);
+ __ jcc(Assembler::equal, L_multiBlock_loopTop[1]);
+ __ cmpl(rbx, 60);
+ __ jcc(Assembler::equal, L_multiBlock_loopTop[2]);
+
+#define CTR_DoSix(opc, src_reg) \
+ __ opc(xmm_result0, src_reg); \
+ __ opc(xmm_result1, src_reg); \
+ __ opc(xmm_result2, src_reg); \
+ __ opc(xmm_result3, src_reg); \
+ __ opc(xmm_result4, src_reg); \
+ __ opc(xmm_result5, src_reg);
+
+ // k == 0 : generate code for key_128
+ // k == 1 : generate code for key_192
+ // k == 2 : generate code for key_256
+ for (int k = 0; k < 3; ++k) {
+ //multi blocks starts here
+ __ align(OptoLoopAlignment);
+ __ BIND(L_multiBlock_loopTop[k]);
+ __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left
+ __ jcc(Assembler::less, L_singleBlockLoopTop[k]);
+ load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
+
+ //load, then increase counters
+ CTR_DoSix(movdqa, xmm_curr_counter);
+ inc_counter(rbx, xmm_result1, 0x01, L__incCounter[k][0]);
+ inc_counter(rbx, xmm_result2, 0x02, L__incCounter[k][1]);
+ inc_counter(rbx, xmm_result3, 0x03, L__incCounter[k][2]);
+ inc_counter(rbx, xmm_result4, 0x04, L__incCounter[k][3]);
+ inc_counter(rbx, xmm_result5, 0x05, L__incCounter[k][4]);
+ inc_counter(rbx, xmm_curr_counter, 0x06, L__incCounter[k][5]);
+ CTR_DoSix(pshufb, xmm_counter_shuf_mask); // after increased, shuffled counters back for PXOR
+ CTR_DoSix(pxor, xmm_key_tmp0); //PXOR with Round 0 key
+
+ //load two ROUND_KEYs at a time
+ for (int i = 1; i < rounds[k]; ) {
+ load_key(xmm_key_tmp1, key, (0x10 * i), xmm_key_shuf_mask);
+ load_key(xmm_key_tmp0, key, (0x10 * (i+1)), xmm_key_shuf_mask);
+ CTR_DoSix(aesenc, xmm_key_tmp1);
+ i++;
+ if (i != rounds[k]) {
+ CTR_DoSix(aesenc, xmm_key_tmp0);
+ } else {
+ CTR_DoSix(aesenclast, xmm_key_tmp0);
+ }
+ i++;
+ }
+
+ // get next PARALLEL_FACTOR blocks into xmm_result registers
+ __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
+ __ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
+ __ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
+ __ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
+ __ movdqu(xmm_from4, Address(from, pos, Address::times_1, 4 * AESBlockSize));
+ __ movdqu(xmm_from5, Address(from, pos, Address::times_1, 5 * AESBlockSize));
+
+ __ pxor(xmm_result0, xmm_from0);
+ __ pxor(xmm_result1, xmm_from1);
+ __ pxor(xmm_result2, xmm_from2);
+ __ pxor(xmm_result3, xmm_from3);
+ __ pxor(xmm_result4, xmm_from4);
+ __ pxor(xmm_result5, xmm_from5);
+
+ // store 6 results into the next 64 bytes of output
+ __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
+ __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
+ __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
+ __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
+ __ movdqu(Address(to, pos, Address::times_1, 4 * AESBlockSize), xmm_result4);
+ __ movdqu(Address(to, pos, Address::times_1, 5 * AESBlockSize), xmm_result5);
+
+ __ addptr(pos, PARALLEL_FACTOR * AESBlockSize); // increase the length of crypt text
+ __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // decrease the remaining length
+ __ jmp(L_multiBlock_loopTop[k]);
+
+ // singleBlock starts here
+ __ align(OptoLoopAlignment);
+ __ BIND(L_singleBlockLoopTop[k]);
+ __ cmpptr(len_reg, 0);
+ __ jcc(Assembler::lessEqual, L_exit);
+ load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
+ __ movdqa(xmm_result0, xmm_curr_counter);
+ inc_counter(rbx, xmm_curr_counter, 0x01, L__incCounter_single[k]);
+ __ pshufb(xmm_result0, xmm_counter_shuf_mask);
+ __ pxor(xmm_result0, xmm_key_tmp0);
+ for (int i = 1; i < rounds[k]; i++) {
+ load_key(xmm_key_tmp0, key, (0x10 * i), xmm_key_shuf_mask);
+ __ aesenc(xmm_result0, xmm_key_tmp0);
+ }
+ load_key(xmm_key_tmp0, key, (rounds[k] * 0x10), xmm_key_shuf_mask);
+ __ aesenclast(xmm_result0, xmm_key_tmp0);
+ __ cmpptr(len_reg, AESBlockSize);
+ __ jcc(Assembler::less, L_processTail_insr[k]);
+ __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
+ __ pxor(xmm_result0, xmm_from0);
+ __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
+ __ addptr(pos, AESBlockSize);
+ __ subptr(len_reg, AESBlockSize);
+ __ jmp(L_singleBlockLoopTop[k]);
+ __ BIND(L_processTail_insr[k]);
+ __ addptr(pos, len_reg);
+ __ testptr(len_reg, 8);
+ __ jcc(Assembler::zero, L_processTail_4_insr[k]);
+ __ subptr(pos,8);
+ __ pinsrq(xmm_from0, Address(from, pos), 0);
+ __ BIND(L_processTail_4_insr[k]);
+ __ testptr(len_reg, 4);
+ __ jcc(Assembler::zero, L_processTail_2_insr[k]);
+ __ subptr(pos,4);
+ __ pslldq(xmm_from0, 4);
+ __ pinsrd(xmm_from0, Address(from, pos), 0);
+ __ BIND(L_processTail_2_insr[k]);
+ __ testptr(len_reg, 2);
+ __ jcc(Assembler::zero, L_processTail_1_insr[k]);
+ __ subptr(pos, 2);
+ __ pslldq(xmm_from0, 2);
+ __ pinsrw(xmm_from0, Address(from, pos), 0);
+ __ BIND(L_processTail_1_insr[k]);
+ __ testptr(len_reg, 1);
+ __ jcc(Assembler::zero, L_processTail_exit_insr[k]);
+ __ subptr(pos, 1);
+ __ pslldq(xmm_from0, 1);
+ __ pinsrb(xmm_from0, Address(from, pos), 0);
+ __ BIND(L_processTail_exit_insr[k]);
+
+ __ movdqu(Address(saved_encCounter_start, 0), xmm_result0);
+ __ pxor(xmm_result0, xmm_from0);
+
+ __ testptr(len_reg, 8);
+ __ jcc(Assembler::zero, L_processTail_4_extr[k]);
+ __ pextrq(Address(to, pos), xmm_result0, 0);
+ __ psrldq(xmm_result0, 8);
+ __ addptr(pos, 8);
+ __ BIND(L_processTail_4_extr[k]);
+ __ testptr(len_reg, 4);
+ __ jcc(Assembler::zero, L_processTail_2_extr[k]);
+ __ pextrd(Address(to, pos), xmm_result0, 0);
+ __ psrldq(xmm_result0, 4);
+ __ addptr(pos, 4);
+ __ BIND(L_processTail_2_extr[k]);
+ __ testptr(len_reg, 2);
+ __ jcc(Assembler::zero, L_processTail_1_extr[k]);
+ __ pextrw(Address(to, pos), xmm_result0, 0);
+ __ psrldq(xmm_result0, 2);
+ __ addptr(pos, 2);
+ __ BIND(L_processTail_1_extr[k]);
+ __ testptr(len_reg, 1);
+ __ jcc(Assembler::zero, L_processTail_exit_extr[k]);
+ __ pextrb(Address(to, pos), xmm_result0, 0);
+
+ __ BIND(L_processTail_exit_extr[k]);
+ __ movl(Address(used_addr, 0), len_reg);
+ __ jmp(L_exit);
+
+ }
+
+ __ BIND(L_exit);
+ __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled back.
+ __ movdqu(Address(counter, 0), xmm_curr_counter); //save counter back
+ __ pop(rbx); // pop the saved RBX.
+#ifdef _WIN64
+ // restore regs belonging to calling function
+ for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
+ __ movdqu(as_XMMRegister(i), xmm_save(i));
+ }
+ __ movl(rax, len_mem);
+ __ movptr(r13, r13_save);
+ __ movptr(r14, r14_save);
+#else
+ __ pop(rax); // return 'len'
+#endif
+ __ leave(); // required for proper stackwalking of RuntimeStub frame
+ __ ret(0);
+ return start;
+ }
// byte swap x86 long
address generate_ghash_long_swap_mask() {
@@ -4555,12 +4898,15 @@
// don't bother generating these AES intrinsic stubs unless global flag is set
if (UseAESIntrinsics) {
StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask(); // needed by the others
-
StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
}
+ if (UseAESCTRIntrinsics){
+ StubRoutines::x86::_counter_shuffle_mask_addr = generate_counter_shuffle_mask();
+ StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel();
+ }
// Generate GHASH intrinsics code
if (UseGHASHIntrinsics) {