3698 __ jmp(L_exit); |
3719 __ jmp(L_exit); |
3699 |
3720 |
3700 return start; |
3721 return start; |
3701 } |
3722 } |
3702 |
3723 |
|
// This is a version of CTR/AES crypt which does 6 blocks in a loop at a time
// to hide instruction latency.
//
// Arguments:
//
// Inputs:
//   c_rarg0   - source byte array address
//   c_rarg1   - destination byte array address
//   c_rarg2   - K (key) in little endian int array
//   c_rarg3   - counter vector byte array address
//   Linux
//     c_rarg4            - input length
//     c_rarg5            - saved encryptedCounter start
//     rbp + 6 * wordSize - saved used length
//   Windows
//     rbp + 6 * wordSize - input length
//     rbp + 7 * wordSize - saved encryptedCounter start
//     rbp + 8 * wordSize - saved used length
//
// Output:
//   rax - input length
//
address generate_counterMode_AESCrypt_Parallel() {
  assert(UseAES, "need AES instructions and misaligned SSE support");
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
  address start = __ pc();
  const Register from = c_rarg0; // source array address
  const Register to = c_rarg1; // destination array address
  const Register key = c_rarg2; // key array address
  const Register counter = c_rarg3; // counter byte array initialized from counter array address
                                    // and left with the results of the last encryption block
#ifndef _WIN64
  const Register len_reg = c_rarg4;
  const Register saved_encCounter_start = c_rarg5;
  const Register used_addr = r10;
  const Address used_mem(rbp, 2 * wordSize); // 7th argument comes in on the stack
  const Register used = r11;
#else
  const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
  const Address saved_encCounter_mem(rbp, 7 * wordSize); // saved encryptedCounter is on stack on Win64
  const Address used_mem(rbp, 8 * wordSize); // saved used length is on stack on Win64
  const Register len_reg = r10; // pick the first volatile windows register
  const Register saved_encCounter_start = r11;
  const Register used_addr = r13;
  const Register used = r14;
#endif
  const Register pos = rax;

  const int PARALLEL_FACTOR = 6;
  const XMMRegister xmm_counter_shuf_mask = xmm0;
  const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front
  const XMMRegister xmm_curr_counter = xmm2;

  const XMMRegister xmm_key_tmp0 = xmm3;
  const XMMRegister xmm_key_tmp1 = xmm4;

  // registers holding the six results in the parallelized loop
  const XMMRegister xmm_result0 = xmm5;
  const XMMRegister xmm_result1 = xmm6;
  const XMMRegister xmm_result2 = xmm7;
  const XMMRegister xmm_result3 = xmm8;
  const XMMRegister xmm_result4 = xmm9;
  const XMMRegister xmm_result5 = xmm10;

  const XMMRegister xmm_from0 = xmm11;
  const XMMRegister xmm_from1 = xmm12;
  const XMMRegister xmm_from2 = xmm13;
  const XMMRegister xmm_from3 = xmm14; //the last one is xmm14. we have to preserve it on WIN64.
  const XMMRegister xmm_from4 = xmm3; //reuse xmm3~4. Because xmm_key_tmp0~1 are useless when loading input text
  const XMMRegister xmm_from5 = xmm4;

  //for key_128, key_192, key_256
  const int rounds[3] = {10, 12, 14};
  Label L_exit_preLoop, L_preLoop_start;
  Label L_multiBlock_loopTop[3];
  Label L_singleBlockLoopTop[3];
  Label L__incCounter[3][6]; //for 6 blocks
  Label L__incCounter_single[3]; //for single block, key128, key192, key256
  Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3];
  Label L_processTail_extr[3], L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3];

  Label L_exit;

  __ enter(); // required for proper stackwalking of RuntimeStub frame

  // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
  // context for the registers used, where all instructions below are using 128-bit mode
  // On EVEX without VL and BW, these instructions will all be AVX.
  if (VM_Version::supports_avx512vlbw()) {
    __ movl(rax, 0xffff);
    __ kmovql(k1, rax);
  }

#ifdef _WIN64
  // save the xmm registers which must be preserved 6-14
  const int XMM_REG_NUM_KEY_LAST = 14;
  __ subptr(rsp, -rsp_after_call_off * wordSize);
  for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
    __ movdqu(xmm_save(i), as_XMMRegister(i));
  }

  // r13/r14 are callee-saved on Win64; park them in the (otherwise unused
  // here) rdi/rsi slots of the frame laid out above.
  const Address r13_save(rbp, rdi_off * wordSize);
  const Address r14_save(rbp, rsi_off * wordSize);

  __ movptr(r13_save, r13);
  __ movptr(r14_save, r14);

  // on win64, fill len_reg from stack position
  __ movl(len_reg, len_mem);
  __ movptr(saved_encCounter_start, saved_encCounter_mem);
  __ movptr(used_addr, used_mem);
  __ movl(used, Address(used_addr, 0));
#else
  __ push(len_reg); // Save original length; popped into rax as the return value at L_exit
  __ movptr(used_addr, used_mem);
  __ movl(used, Address(used_addr, 0));
#endif

  __ push(rbx); // Save RBX; used as a scratch register below
  __ movdqu(xmm_curr_counter, Address(counter, 0x00)); // initialize counter with initial counter
  __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()));
  __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled
  __ movptr(pos, 0);

  // Use the partially used encrypted counter from the last invocation:
  // XOR input bytes with the leftover keystream in saved_encCounter_start,
  // one byte at a time, until either all 16 keystream bytes are consumed
  // (used reaches 16) or the input is exhausted.
  __ BIND(L_preLoop_start);
  __ cmpptr(used, 16);
  __ jcc(Assembler::aboveEqual, L_exit_preLoop);
  __ cmpptr(len_reg, 0);
  __ jcc(Assembler::lessEqual, L_exit_preLoop);
  __ movb(rbx, Address(saved_encCounter_start, used));
  __ xorb(rbx, Address(from, pos));
  __ movb(Address(to, pos), rbx);
  __ addptr(pos, 1);
  __ addptr(used, 1);
  __ subptr(len_reg, 1);

  __ jmp(L_preLoop_start);

  __ BIND(L_exit_preLoop);
  __ movl(Address(used_addr, 0), used); // persist consumed-keystream count for the caller

  // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
  __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
  __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
  __ cmpl(rbx, 52);
  __ jcc(Assembler::equal, L_multiBlock_loopTop[1]);
  __ cmpl(rbx, 60);
  __ jcc(Assembler::equal, L_multiBlock_loopTop[2]);
  // otherwise fall through to the key_128 (k == 0) code below

// Apply one MacroAssembler op to all six result registers.
#define CTR_DoSix(opc, src_reg)                 \
  __ opc(xmm_result0, src_reg);                 \
  __ opc(xmm_result1, src_reg);                 \
  __ opc(xmm_result2, src_reg);                 \
  __ opc(xmm_result3, src_reg);                 \
  __ opc(xmm_result4, src_reg);                 \
  __ opc(xmm_result5, src_reg);

  // k == 0 : generate code for key_128
  // k == 1 : generate code for key_192
  // k == 2 : generate code for key_256
  for (int k = 0; k < 3; ++k) {
    //multi blocks starts here
    __ align(OptoLoopAlignment);
    __ BIND(L_multiBlock_loopTop[k]);
    __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left
    __ jcc(Assembler::less, L_singleBlockLoopTop[k]);
    load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask); // round-0 key (load_key is a helper elsewhere in this file)

    //load, then increase counters
    // xmm_result0 keeps the current counter; result1..result5 get counter+1..+5,
    // and xmm_curr_counter is advanced by 6 for the next iteration.
    // inc_counter (helper elsewhere in this file) presumably handles byte-carry
    // via the per-block L__incCounter labels -- confirm against its definition.
    CTR_DoSix(movdqa, xmm_curr_counter);
    inc_counter(rbx, xmm_result1, 0x01, L__incCounter[k][0]);
    inc_counter(rbx, xmm_result2, 0x02, L__incCounter[k][1]);
    inc_counter(rbx, xmm_result3, 0x03, L__incCounter[k][2]);
    inc_counter(rbx, xmm_result4, 0x04, L__incCounter[k][3]);
    inc_counter(rbx, xmm_result5, 0x05, L__incCounter[k][4]);
    inc_counter(rbx, xmm_curr_counter, 0x06, L__incCounter[k][5]);
    CTR_DoSix(pshufb, xmm_counter_shuf_mask); // after increased, shuffled counters back for PXOR
    CTR_DoSix(pxor, xmm_key_tmp0);   //PXOR with Round 0 key

    //load two ROUND_KEYs at a time
    for (int i = 1; i < rounds[k]; ) {
      load_key(xmm_key_tmp1, key, (0x10 * i), xmm_key_shuf_mask);
      load_key(xmm_key_tmp0, key, (0x10 * (i+1)), xmm_key_shuf_mask);
      CTR_DoSix(aesenc, xmm_key_tmp1);
      i++;
      if (i != rounds[k]) {
        CTR_DoSix(aesenc, xmm_key_tmp0);
      } else {
        CTR_DoSix(aesenclast, xmm_key_tmp0); // final round uses AESENCLAST
      }
      i++;
    }

    // get next PARALLEL_FACTOR blocks into xmm_result registers
    __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
    __ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
    __ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
    __ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
    __ movdqu(xmm_from4, Address(from, pos, Address::times_1, 4 * AESBlockSize));
    __ movdqu(xmm_from5, Address(from, pos, Address::times_1, 5 * AESBlockSize));

    // CTR mode: ciphertext = plaintext XOR encrypted-counter (and vice versa)
    __ pxor(xmm_result0, xmm_from0);
    __ pxor(xmm_result1, xmm_from1);
    __ pxor(xmm_result2, xmm_from2);
    __ pxor(xmm_result3, xmm_from3);
    __ pxor(xmm_result4, xmm_from4);
    __ pxor(xmm_result5, xmm_from5);

    // store 6 results into the next 96 bytes of output
    __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
    __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
    __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
    __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
    __ movdqu(Address(to, pos, Address::times_1, 4 * AESBlockSize), xmm_result4);
    __ movdqu(Address(to, pos, Address::times_1, 5 * AESBlockSize), xmm_result5);

    __ addptr(pos, PARALLEL_FACTOR * AESBlockSize); // increase the length of crypt text
    __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // decrease the remaining length
    __ jmp(L_multiBlock_loopTop[k]);

    // singleBlock starts here: handles the 0..5 whole blocks (plus a possible
    // sub-block tail) that remain after the parallel loop.
    __ align(OptoLoopAlignment);
    __ BIND(L_singleBlockLoopTop[k]);
    __ cmpptr(len_reg, 0);
    __ jcc(Assembler::lessEqual, L_exit);
    load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
    __ movdqa(xmm_result0, xmm_curr_counter);
    inc_counter(rbx, xmm_curr_counter, 0x01, L__incCounter_single[k]);
    __ pshufb(xmm_result0, xmm_counter_shuf_mask);
    __ pxor(xmm_result0, xmm_key_tmp0);
    for (int i = 1; i < rounds[k]; i++) {
      load_key(xmm_key_tmp0, key, (0x10 * i), xmm_key_shuf_mask);
      __ aesenc(xmm_result0, xmm_key_tmp0);
    }
    load_key(xmm_key_tmp0, key, (rounds[k] * 0x10), xmm_key_shuf_mask);
    __ aesenclast(xmm_result0, xmm_key_tmp0);
    __ cmpptr(len_reg, AESBlockSize);
    __ jcc(Assembler::less, L_processTail_insr[k]); // less than one full block left
    __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
    __ pxor(xmm_result0, xmm_from0);
    __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jmp(L_singleBlockLoopTop[k]);

    // Tail (< 16 bytes): assemble the remaining input bytes into xmm_from0,
    // widest piece first. pos is advanced to the end of the input, then
    // stepped back for each 8/4/2/1-byte piece that len_reg's bits select.
    __ BIND(L_processTail_insr[k]);
    __ addptr(pos, len_reg);
    __ testptr(len_reg, 8);
    __ jcc(Assembler::zero, L_processTail_4_insr[k]);
    __ subptr(pos,8);
    __ pinsrq(xmm_from0, Address(from, pos), 0);
    __ BIND(L_processTail_4_insr[k]);
    __ testptr(len_reg, 4);
    __ jcc(Assembler::zero, L_processTail_2_insr[k]);
    __ subptr(pos,4);
    __ pslldq(xmm_from0, 4); // make room for the next 4 bytes at lane 0
    __ pinsrd(xmm_from0, Address(from, pos), 0);
    __ BIND(L_processTail_2_insr[k]);
    __ testptr(len_reg, 2);
    __ jcc(Assembler::zero, L_processTail_1_insr[k]);
    __ subptr(pos, 2);
    __ pslldq(xmm_from0, 2);
    __ pinsrw(xmm_from0, Address(from, pos), 0);
    __ BIND(L_processTail_1_insr[k]);
    __ testptr(len_reg, 1);
    __ jcc(Assembler::zero, L_processTail_exit_insr[k]);
    __ subptr(pos, 1);
    __ pslldq(xmm_from0, 1);
    __ pinsrb(xmm_from0, Address(from, pos), 0);
    __ BIND(L_processTail_exit_insr[k]);

    // Save the full encrypted counter so the next stub invocation can
    // consume its unused keystream bytes (see L_preLoop_start), then XOR.
    __ movdqu(Address(saved_encCounter_start, 0), xmm_result0);
    __ pxor(xmm_result0, xmm_from0);

    // Extract and store only the len_reg tail bytes of the XORed result,
    // mirroring the insert sequence above (widest piece first).
    __ testptr(len_reg, 8);
    __ jcc(Assembler::zero, L_processTail_4_extr[k]);
    __ pextrq(Address(to, pos), xmm_result0, 0);
    __ psrldq(xmm_result0, 8);
    __ addptr(pos, 8);
    __ BIND(L_processTail_4_extr[k]);
    __ testptr(len_reg, 4);
    __ jcc(Assembler::zero, L_processTail_2_extr[k]);
    __ pextrd(Address(to, pos), xmm_result0, 0);
    __ psrldq(xmm_result0, 4);
    __ addptr(pos, 4);
    __ BIND(L_processTail_2_extr[k]);
    __ testptr(len_reg, 2);
    __ jcc(Assembler::zero, L_processTail_1_extr[k]);
    __ pextrw(Address(to, pos), xmm_result0, 0);
    __ psrldq(xmm_result0, 2);
    __ addptr(pos, 2);
    __ BIND(L_processTail_1_extr[k]);
    __ testptr(len_reg, 1);
    __ jcc(Assembler::zero, L_processTail_exit_extr[k]);
    __ pextrb(Address(to, pos), xmm_result0, 0);

    __ BIND(L_processTail_exit_extr[k]);
    // Record how many keystream bytes of the saved counter were consumed
    // (the tail length), so the pre-loop can resume from there next time.
    __ movl(Address(used_addr, 0), len_reg);
    __ jmp(L_exit);

  }

  __ BIND(L_exit);
  __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled back.
  __ movdqu(Address(counter, 0), xmm_curr_counter); //save counter back
  __ pop(rbx); // pop the saved RBX.
#ifdef _WIN64
  // restore regs belonging to calling function
  for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
    __ movdqu(as_XMMRegister(i), xmm_save(i));
  }
  __ movl(rax, len_mem); // return the original input length
  __ movptr(r13, r13_save);
  __ movptr(r14, r14_save);
#else
  __ pop(rax); // return 'len' (the original input length pushed on entry)
#endif
  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(0);
  return start;
}
3703 |
4046 |
3704 // byte swap x86 long |
4047 // byte swap x86 long |
3705 address generate_ghash_long_swap_mask() { |
4048 address generate_ghash_long_swap_mask() { |
3706 __ align(CodeEntryAlignment); |
4049 __ align(CodeEntryAlignment); |
3707 StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask"); |
4050 StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask"); |