2598 // |
2598 // |
2599 // Output: |
2599 // Output: |
2600 // rax - input length |
2600 // rax - input length |
2601 // |
2601 // |
2602 |
2602 |
// Generates the AES CBC-decrypt stub registered as "cipherBlockChaining_decryptAESCrypt"
// and returns the address of its first emitted instruction (start = __ pc()).
// The emitted stub loads from/to/key/rvec/len from the frame, decrypts len bytes
// (len must be a multiple of the 16-byte block size), writes the last cipher block
// back to rvec, and returns the input length in rax.
//
// NOTE(review): this listing interleaves two revisions of the generator line by line:
// an earlier revision that decrypts one block per loop iteration for every key size,
// and the _Parallel revision that decrypts four blocks per iteration and finishes
// with a single-block tail loop. The leading numbers and trailing '|' appear to be
// listing artefacts rather than program text - confirm before applying to a real file.
2603 address generate_cipherBlockChaining_decryptAESCrypt() { |
2603 address generate_cipherBlockChaining_decryptAESCrypt_Parallel() { |
2604 assert(UseAES, "need AES instructions and misaligned SSE support"); |
2604 assert(UseAES, "need AES instructions and misaligned SSE support"); |
2605 __ align(CodeEntryAlignment); |
2605 __ align(CodeEntryAlignment); |
2606 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); |
2606 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); |
2607 address start = __ pc(); |
2607 address start = __ pc(); |
2608 |
2608 |
2609 Label L_exit, L_key_192_256, L_key_256; |

2610 Label L_singleBlock_loopTop_128; |

2611 Label L_singleBlock_loopTop_192, L_singleBlock_loopTop_256; |

2612 const Register from = rsi; // source array address |
2609 const Register from = rsi; // source array address |
2613 const Register to = rdx; // destination array address |
2610 const Register to = rdx; // destination array address |
2614 const Register key = rcx; // key array address |
2611 const Register key = rcx; // key array address |
2615 const Register rvec = rdi; // r byte array initialized from initvector array address |
2612 const Register rvec = rdi; // r byte array initialized from initvector array address |
2616 // and left with the results of the last encryption block |
2613 // and left with the results of the last encryption block |
2617 const Register len_reg = rbx; // src len (must be multiple of blocksize 16) |
2614 const Register len_reg = rbx; // src len (must be multiple of blocksize 16) |
2618 const Register pos = rax; |
2615 const Register pos = rax; |
2619 |
2616 |
2620 // xmm register assignments for the loops below |
2617 const int PARALLEL_FACTOR = 4; |
2621 const XMMRegister xmm_result = xmm0; |
2618 const int ROUNDS[3] = { 10, 12, 14 }; //aes rounds for key128, key192, key256 |
2622 const XMMRegister xmm_temp = xmm1; |
2619 |
2623 // first 6 keys preloaded into xmm2-xmm7 |
2620 Label L_exit; |
2624 const int XMM_REG_NUM_KEY_FIRST = 2; |
2621 Label L_singleBlock_loopTop[3]; //128, 192, 256 |
2625 const int XMM_REG_NUM_KEY_LAST = 7; |
2622 Label L_multiBlock_loopTop[3]; //128, 192, 256 |
2626 const int FIRST_NON_REG_KEY_offset = 0x70; |
2623 |
2627 const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST); |
2624 const XMMRegister xmm_prev_block_cipher = xmm0; // holds cipher of previous block |

2625 const XMMRegister xmm_key_shuf_mask = xmm1; |

2626 |

2627 const XMMRegister xmm_key_tmp0 = xmm2; |

2628 const XMMRegister xmm_key_tmp1 = xmm3; |

2629 |

2630 // registers holding the four results in the parallelized loop |

2631 const XMMRegister xmm_result0 = xmm4; |

2632 const XMMRegister xmm_result1 = xmm5; |

2633 const XMMRegister xmm_result2 = xmm6; |

2634 const XMMRegister xmm_result3 = xmm7; |
2628 |
2635 |
2629 __ enter(); // required for proper stackwalking of RuntimeStub frame |
2636 __ enter(); // required for proper stackwalking of RuntimeStub frame |
2630 handleSOERegisters(true /*saving*/); |
2637 handleSOERegisters(true /*saving*/); |
2631 |
2638 |
2632 // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge |
2639 // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge |
// The five arguments are read from the frame at rbp+8 .. rbp+24, four bytes apart.
2641 const Address from_param(rbp, 8+0); |
2648 const Address from_param(rbp, 8+0); |
2642 const Address to_param (rbp, 8+4); |
2649 const Address to_param (rbp, 8+4); |
2643 const Address key_param (rbp, 8+8); |
2650 const Address key_param (rbp, 8+8); |
2644 const Address rvec_param (rbp, 8+12); |
2651 const Address rvec_param (rbp, 8+12); |
2645 const Address len_param (rbp, 8+16); |
2652 const Address len_param (rbp, 8+16); |

2653 |
2646 __ movptr(from , from_param); |
2654 __ movptr(from , from_param); |
2647 __ movptr(to , to_param); |
2655 __ movptr(to , to_param); |
2648 __ movptr(key , key_param); |
2656 __ movptr(key , key_param); |
2649 __ movptr(rvec , rvec_param); |
2657 __ movptr(rvec , rvec_param); |
2650 __ movptr(len_reg , len_param); |
2658 __ movptr(len_reg , len_param); |
2651 |
2659 |
2652 // the java expanded key ordering is rotated one position from what we want |

2653 // so we start from 0x10 here and hit 0x00 last |

2654 const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front |

2655 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); |
2660 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); |
2656 // load up xmm regs 2 thru 7 with first 6 keys |
2661 __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00)); // initialize with initial rvec |
2657 for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { |
2662 |
2658 load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask); |
2663 __ xorptr(pos, pos); |
2659 offset += 0x10; |

2660 } |

2661 |

2662 // inside here, use the rvec register to point to previous block cipher |

2663 // with which we xor at the end of each newly decrypted block |

2664 const Register prev_block_cipher_ptr = rvec; |

2665 |
2664 |
2666 // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256)) |
2665 // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256)) |
// Earlier revision: the length is read into rax; 44 falls through to the 128-bit loop,
// anything else branches to L_key_192_256.
// _Parallel revision: the length is read into rvec; 52 and 60 jump to the k=1 and k=2
// multi-block loops, any other value falls through into the k=0 (10-round) loop.
2667 __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); |
2666 // rvec is reused |
2668 __ cmpl(rax, 44); |
2667 __ movl(rvec, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); |
2669 __ jcc(Assembler::notEqual, L_key_192_256); |
2668 __ cmpl(rvec, 52); |
2670 |
2669 __ jcc(Assembler::equal, L_multiBlock_loopTop[1]); |
2671 |
2670 __ cmpl(rvec, 60); |
2672 // 128-bit code follows here, parallelized |
2671 __ jcc(Assembler::equal, L_multiBlock_loopTop[2]); |
2673 __ movl(pos, 0); |
2672 |
2674 __ align(OptoLoopAlignment); |
// DoFour emits the same instruction `opc` once for each of xmm_result0..xmm_result3,
// all using the same source operand `src_reg`.
2673 #define DoFour(opc, src_reg) \ |
2675 __ BIND(L_singleBlock_loopTop_128); |
2674 __ opc(xmm_result0, src_reg); \ |
2676 __ cmpptr(len_reg, 0); // any blocks left?? |
2675 __ opc(xmm_result1, src_reg); \ |
2677 __ jcc(Assembler::equal, L_exit); |
2676 __ opc(xmm_result2, src_reg); \ |
2678 __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input |
2677 __ opc(xmm_result3, src_reg); \ |
2679 __ pxor (xmm_result, xmm_key_first); // do the aes dec rounds |
2678 |
2680 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { |
2679 for (int k = 0; k < 3; ++k) { |
2681 __ aesdec(xmm_result, as_XMMRegister(rnum)); |
2680 __ align(OptoLoopAlignment); |
2682 } |
2681 __ BIND(L_multiBlock_loopTop[k]); |
2683 for (int key_offset = FIRST_NON_REG_KEY_offset; key_offset <= 0xa0; key_offset += 0x10) { // 128-bit runs up to key offset a0 |
2682 __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left |
2684 aes_dec_key(xmm_result, xmm_temp, key, key_offset); |
2683 __ jcc(Assembler::less, L_singleBlock_loopTop[k]); |
2685 } |
2684 |
2686 load_key(xmm_temp, key, 0x00); // final key is stored in java expanded array at offset 0 |
2685 __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); // get next 4 blocks into xmmresult registers |
2687 __ aesdeclast(xmm_result, xmm_temp); |
2686 __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1 * AESBlockSize)); |
2688 __ movdqu(xmm_temp, Address(prev_block_cipher_ptr, 0x00)); |
2687 __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2 * AESBlockSize)); |
2689 __ pxor (xmm_result, xmm_temp); // xor with the current r vector |
2688 __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3 * AESBlockSize)); |
2690 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output |
2689 |
2691 // no need to store r to memory until we exit |
2690 // the java expanded key ordering is rotated one position from what we want |
2692 __ lea(prev_block_cipher_ptr, Address(from, pos, Address::times_1, 0)); // set up new ptr |
2691 // so we start from 0x10 here and hit 0x00 last |
2693 __ addptr(pos, AESBlockSize); |
2692 load_key(xmm_key_tmp0, key, 0x10, xmm_key_shuf_mask); |
2694 __ subptr(len_reg, AESBlockSize); |
2693 DoFour(pxor, xmm_key_tmp0); //xor with first key |
2695 __ jmp(L_singleBlock_loopTop_128); |
2694 // do the aes dec rounds |
2696 |
// Each pass of this generator loop emits two rounds: rnum is incremented twice in the
// body, once after each DoFour. The second key offset wraps through the modulo so that
// the final pass loads offset 0x00, and that pass uses aesdeclast instead of aesdec.
2695 for (int rnum = 1; rnum <= ROUNDS[k];) { |

2696 //load two keys at a time |

2697 //k1->0x20, ..., k9->0xa0, k10->0x00 |

2698 load_key(xmm_key_tmp1, key, (rnum + 1) * 0x10, xmm_key_shuf_mask); |

2699 load_key(xmm_key_tmp0, key, ((rnum + 2) % (ROUNDS[k] + 1)) * 0x10, xmm_key_shuf_mask); // hit 0x00 last! |

2700 DoFour(aesdec, xmm_key_tmp1); |

2701 rnum++; |

2702 if (rnum != ROUNDS[k]) { |

2703 DoFour(aesdec, xmm_key_tmp0); |

2704 } |

2705 else { |

2706 DoFour(aesdeclast, xmm_key_tmp0); |

2707 } |

2708 rnum++; |

2709 } |

2710 |

2711 // for each result, xor with the r vector of previous cipher block |

2712 __ pxor(xmm_result0, xmm_prev_block_cipher); |

2713 __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0 * AESBlockSize)); |

2714 __ pxor(xmm_result1, xmm_prev_block_cipher); |

2715 __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1 * AESBlockSize)); |

2716 __ pxor(xmm_result2, xmm_prev_block_cipher); |

2717 __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2 * AESBlockSize)); |

2718 __ pxor(xmm_result3, xmm_prev_block_cipher); |

2719 __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3 * AESBlockSize)); // this will carry over to next set of blocks |

2720 |

2721 // store 4 results into the next 64 bytes of output |

2722 __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0); |

2723 __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1); |

2724 __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2); |

2725 __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3); |

2726 |

2727 __ addptr(pos, 4 * AESBlockSize); |

2728 __ subptr(len_reg, 4 * AESBlockSize); |

2729 __ jmp(L_multiBlock_loopTop[k]); |

2730 |

2731 //singleBlock starts here |

2732 __ align(OptoLoopAlignment); |

2733 __ BIND(L_singleBlock_loopTop[k]); |

2734 __ cmpptr(len_reg, 0); // any blocks left? |

2735 __ jcc(Assembler::equal, L_exit); |

2736 __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input |

// keep an untouched copy of the cipher input in xmm_result1; it is moved into
// xmm_prev_block_cipher after this block has been decrypted and stored
2737 __ movdqa(xmm_result1, xmm_result0); |

2738 |

2739 load_key(xmm_key_tmp0, key, 0x10, xmm_key_shuf_mask); |

2740 __ pxor(xmm_result0, xmm_key_tmp0); |

2741 // do the aes dec rounds |

2742 for (int rnum = 1; rnum < ROUNDS[k]; rnum++) { |

2743 // the java expanded key ordering is rotated one position from what we want |

2744 load_key(xmm_key_tmp0, key, (rnum + 1) * 0x10, xmm_key_shuf_mask); |

2745 __ aesdec(xmm_result0, xmm_key_tmp0); |

2746 } |

2747 load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask); |

2748 __ aesdeclast(xmm_result0, xmm_key_tmp0); |

2749 __ pxor(xmm_result0, xmm_prev_block_cipher); // xor with the current r vector |

2750 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result0); // store into the next 16 bytes of output |

2751 // no need to store r to memory until we exit |

2752 __ movdqa(xmm_prev_block_cipher, xmm_result1); // set up next r vector with cipher input from this block |

2753 |

2754 __ addptr(pos, AESBlockSize); |

2755 __ subptr(len_reg, AESBlockSize); |

2756 __ jmp(L_singleBlock_loopTop[k]); |

2757 }//for 128/192/256 |
2697 |
2758 |
// Common exit: rvec is reloaded from its frame slot because the register was reused
// above, then the last cipher block is written back through it. The earlier revision
// reads that block via prev_block_cipher_ptr; the _Parallel revision already holds it
// in xmm_prev_block_cipher.
2698 __ BIND(L_exit); |
2759 __ BIND(L_exit); |
2699 __ movdqu(xmm_temp, Address(prev_block_cipher_ptr, 0x00)); |
2760 __ movptr(rvec, rvec_param); // restore this since reused earlier |
2700 __ movptr(rvec , rvec_param); // restore this since used in loop |
2761 __ movdqu(Address(rvec, 0), xmm_prev_block_cipher); // final value of r stored in rvec of CipherBlockChaining object |
2701 __ movdqu(Address(rvec, 0), xmm_temp); // final value of r stored in rvec of CipherBlockChaining object |

2702 handleSOERegisters(false /*restoring*/); |
2762 handleSOERegisters(false /*restoring*/); |
2703 __ movptr(rax, len_param); // return length |
2763 __ movptr(rax, len_param); // return length |
2704 __ leave(); // required for proper stackwalking of RuntimeStub frame |
2764 __ leave(); // required for proper stackwalking of RuntimeStub frame |
2705 __ ret(0); |
2765 __ ret(0); |
2706 |
2766 |
2707 |

2708 __ BIND(L_key_192_256); |

2709 // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256) |

2710 __ cmpl(rax, 52); |

2711 __ jcc(Assembler::notEqual, L_key_256); |

2712 |

2713 // 192-bit code follows here (could be optimized to use parallelism) |

2714 __ movl(pos, 0); |

2715 __ align(OptoLoopAlignment); |

2716 __ BIND(L_singleBlock_loopTop_192); |

2717 __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input |

2718 __ pxor (xmm_result, xmm_key_first); // do the aes dec rounds |

2719 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { |

2720 __ aesdec(xmm_result, as_XMMRegister(rnum)); |

2721 } |

2722 for (int key_offset = FIRST_NON_REG_KEY_offset; key_offset <= 0xc0; key_offset += 0x10) { // 192-bit runs up to key offset c0 |

2723 aes_dec_key(xmm_result, xmm_temp, key, key_offset); |

2724 } |

2725 load_key(xmm_temp, key, 0x00); // final key is stored in java expanded array at offset 0 |

2726 __ aesdeclast(xmm_result, xmm_temp); |

2727 __ movdqu(xmm_temp, Address(prev_block_cipher_ptr, 0x00)); |

2728 __ pxor (xmm_result, xmm_temp); // xor with the current r vector |

2729 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output |

2730 // no need to store r to memory until we exit |

2731 __ lea(prev_block_cipher_ptr, Address(from, pos, Address::times_1, 0)); // set up new ptr |

2732 __ addptr(pos, AESBlockSize); |

2733 __ subptr(len_reg, AESBlockSize); |

2734 __ jcc(Assembler::notEqual,L_singleBlock_loopTop_192); |

2735 __ jmp(L_exit); |

2736 |

2737 __ BIND(L_key_256); |

2738 // 256-bit code follows here (could be optimized to use parallelism) |

2739 __ movl(pos, 0); |

2740 __ align(OptoLoopAlignment); |

2741 __ BIND(L_singleBlock_loopTop_256); |

2742 __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input |

2743 __ pxor (xmm_result, xmm_key_first); // do the aes dec rounds |

2744 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { |

2745 __ aesdec(xmm_result, as_XMMRegister(rnum)); |

2746 } |

2747 for (int key_offset = FIRST_NON_REG_KEY_offset; key_offset <= 0xe0; key_offset += 0x10) { // 256-bit runs up to key offset e0 |

2748 aes_dec_key(xmm_result, xmm_temp, key, key_offset); |

2749 } |

2750 load_key(xmm_temp, key, 0x00); // final key is stored in java expanded array at offset 0 |

2751 __ aesdeclast(xmm_result, xmm_temp); |

2752 __ movdqu(xmm_temp, Address(prev_block_cipher_ptr, 0x00)); |

2753 __ pxor (xmm_result, xmm_temp); // xor with the current r vector |

2754 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output |

2755 // no need to store r to memory until we exit |

2756 __ lea(prev_block_cipher_ptr, Address(from, pos, Address::times_1, 0)); // set up new ptr |

2757 __ addptr(pos, AESBlockSize); |

2758 __ subptr(len_reg, AESBlockSize); |

2759 __ jcc(Assembler::notEqual,L_singleBlock_loopTop_256); |

2760 __ jmp(L_exit); |

2761 |

2762 return start; |
2767 return start; |
2763 } |
2768 } |
2764 |
|
2765 |
2769 |
2766 // CTR AES crypt. |
2770 // CTR AES crypt. |
2767 // In 32-bit stub, parallelize 4 blocks at a time |
2771 // In 32-bit stub, parallelize 4 blocks at a time |
2768 // Arguments: |
2772 // Arguments: |
2769 // |
2773 // |