776 vpxor(xmm0, xmm0, xmm0, Assembler::AVX_128bit); |
776 vpxor(xmm0, xmm0, xmm0, Assembler::AVX_128bit); |
777 vpxor(xmm1, xmm1, xmm1, Assembler::AVX_128bit); |
777 vpxor(xmm1, xmm1, xmm1, Assembler::AVX_128bit); |
778 vpxor(xmm3, xmm3, xmm3, Assembler::AVX_128bit); |
778 vpxor(xmm3, xmm3, xmm3, Assembler::AVX_128bit); |
779 vpxor(xmm15, xmm15, xmm15, Assembler::AVX_128bit); |
779 vpxor(xmm15, xmm15, xmm15, Assembler::AVX_128bit); |
780 } |
780 } |
|
781 |
|
// AES Counter Mode using VAES instructions
//
// Emits AES-CTR encryption (CTR decryption is identical: the generated
// keystream is simply XORed with the input bytes).
//
// Register arguments:
//   src_addr  - pointer to the input byte array (plaintext)
//   dest_addr - pointer to the output byte array (ciphertext)
//   key       - pointer to the expanded AES session key (array of ints)
//   counter   - pointer to the 16-byte CTR counter block
//   len_reg   - number of bytes to process; decremented as work proceeds
//   used      - count of bytes already consumed from the previously saved
//               encrypted counter block (a value < 16 means a partial
//               keystream block is carried over from the last invocation)
//   used_addr - address where the updated 'used' count is stored back
//   saved_encCounter_start - 16-byte buffer holding the most recent
//               encrypted counter block, so tail bytes can be reused on
//               the next call
//
// Clobbers rbx, r12 (pos), r15, the rounds register, and the xmm/zmm
// registers used below.
// NOTE(review): clobber set inferred from the emitted code - confirm
// against the stub's calling convention before relying on it.
void MacroAssembler::aesctr_encrypt(Register src_addr, Register dest_addr, Register key, Register counter,
                                    Register len_reg, Register used, Register used_addr, Register saved_encCounter_start) {

  // 'rounds' holds the session-key length in ints (44/52/60 for
  // AES-128/192/256); the cmpl(rounds, 44/52/60) tests below key off it.
  // NOTE(review): Register(0) - presumably rax under this register
  // numbering; confirm against the Register encoding for this platform.
  const Register rounds = 0;
  const Register pos = r12;

  Label PRELOOP_START, EXIT_PRELOOP, REMAINDER, REMAINDER_16, LOOP, END, EXIT, END_LOOP,
        AES192, AES256, AES192_REMAINDER16, REMAINDER16_END_LOOP, AES256_REMAINDER16,
        REMAINDER_8, REMAINDER_4, AES192_REMAINDER8, REMAINDER_LOOP, AES256_REMINDER,
        AES192_REMAINDER, END_REMAINDER_LOOP, AES256_REMAINDER8, REMAINDER8_END_LOOP,
        AES192_REMAINDER4, AES256_REMAINDER4, AES256_REMAINDER, END_REMAINDER4, EXTRACT_TAILBYTES,
        EXTRACT_TAIL_4BYTES, EXTRACT_TAIL_2BYTES, EXTRACT_TAIL_1BYTE, STORE_CTR;
  // NOTE(review): AES256_REMINDER appears to be an unused misspelling of
  // AES256_REMAINDER (which is also declared and is the label actually
  // bound below); it is never bound or jumped to.

  // Nothing to do for a zero/negative length.
  cmpl(len_reg, 0);
  jcc(Assembler::belowEqual, EXIT);

  movl(pos, 0);
  // if the number of used encrypted counter bytes < 16,
  // XOR PT with saved encrypted counter to obtain CT
  // (byte-at-a-time consumption of the leftover keystream block)
  bind(PRELOOP_START);
  cmpl(used, 16);
  jcc(Assembler::aboveEqual, EXIT_PRELOOP);
  movb(rbx, Address(saved_encCounter_start, used));
  xorb(rbx, Address(src_addr, pos));
  movb(Address(dest_addr, pos), rbx);
  addptr(pos, 1);
  addptr(used, 1);
  decrement(len_reg);
  jmp(PRELOOP_START);

  bind(EXIT_PRELOOP);
  // Persist the updated 'used' count for the caller.
  movl(Address(used_addr, 0), used);

  // Calculate number of rounds i.e. 10, 12, 14, based on key length(128, 192, 256).
  // The loaded value is the int[] session-key length: 44, 52 or 60.
  movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

  vpxor(xmm0, xmm0, xmm0, Assembler::AVX_128bit);
  // Move initial counter value in xmm0
  movdqu(xmm0, Address(counter, 0));
  // broadcast counter value to zmm8 (4 copies, one per 128-bit lane)
  evshufi64x2(xmm8, xmm0, xmm0, 0, Assembler::AVX_512bit);

  // load lbswap mask (byte-swap mask at counter_mask_addr() + 0)
  evmovdquq(xmm16, ExternalAddress(StubRoutines::x86::counter_mask_addr()), Assembler::AVX_512bit, r15);

  //shuffle counter using lbswap_mask (big-endian -> little-endian for arithmetic)
  vpshufb(xmm8, xmm8, xmm16, Assembler::AVX_512bit);

  // pre-increment and propagate counter values to zmm9-zmm15 registers.
  // Linc0 increments the zmm8 by 1 (initial value being 0), Linc4 increments the counters zmm9-zmm15 by 4
  // The counter is incremented after each block i.e. 16 bytes is processed;
  // each zmm register has 4 counter values as its MSB
  // the counters are incremented in parallel
  vpaddd(xmm8, xmm8, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 64), Assembler::AVX_512bit, r15);//linc0
  vpaddd(xmm9, xmm8, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//linc4(rip)
  vpaddd(xmm10, xmm9, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//Linc4(rip)
  vpaddd(xmm11, xmm10, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//Linc4(rip)
  vpaddd(xmm12, xmm11, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//Linc4(rip)
  vpaddd(xmm13, xmm12, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//Linc4(rip)
  vpaddd(xmm14, xmm13, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//Linc4(rip)
  vpaddd(xmm15, xmm14, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//Linc4(rip)

  // load linc32 mask in zmm register.linc32 increments counter by 32
  evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 256), Assembler::AVX_512bit, r15);//Linc32

  // xmm31 contains the key shuffle mask.
  movdqu(xmm31, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
  // Load key function loads 128 bit key and shuffles it. Then we broadcast the shuffled key to convert it into a 512 bit value.
  // For broadcasting the values to ZMM, vshufi64 is used instead of evbroadcasti64x2 as the source in this case is ZMM register
  // that holds shuffled key value.
  // Round keys 0..10 live in zmm20..zmm30 for the whole stub; keys for
  // rounds 11..14 (AES-192/256) are loaded on demand into zmm18/5/6/7.
  ev_load_key(xmm20, key, 0, xmm31);
  ev_load_key(xmm21, key, 1 * 16, xmm31);
  ev_load_key(xmm22, key, 2 * 16, xmm31);
  ev_load_key(xmm23, key, 3 * 16, xmm31);
  ev_load_key(xmm24, key, 4 * 16, xmm31);
  ev_load_key(xmm25, key, 5 * 16, xmm31);
  ev_load_key(xmm26, key, 6 * 16, xmm31);
  ev_load_key(xmm27, key, 7 * 16, xmm31);
  ev_load_key(xmm28, key, 8 * 16, xmm31);
  ev_load_key(xmm29, key, 9 * 16, xmm31);
  ev_load_key(xmm30, key, 10 * 16, xmm31);

  // Process 32 blocks or 512 bytes of data
  bind(LOOP);
  cmpl(len_reg, 512);
  jcc(Assembler::less, REMAINDER);
  subq(len_reg, 512);
  //Shuffle counter and Exor it with roundkey1. Result is stored in zmm0-7
  vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_512bit);
  evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_512bit);
  vpshufb(xmm1, xmm9, xmm16, Assembler::AVX_512bit);
  evpxorq(xmm1, xmm1, xmm20, Assembler::AVX_512bit);
  vpshufb(xmm2, xmm10, xmm16, Assembler::AVX_512bit);
  evpxorq(xmm2, xmm2, xmm20, Assembler::AVX_512bit);
  vpshufb(xmm3, xmm11, xmm16, Assembler::AVX_512bit);
  evpxorq(xmm3, xmm3, xmm20, Assembler::AVX_512bit);
  vpshufb(xmm4, xmm12, xmm16, Assembler::AVX_512bit);
  evpxorq(xmm4, xmm4, xmm20, Assembler::AVX_512bit);
  vpshufb(xmm5, xmm13, xmm16, Assembler::AVX_512bit);
  evpxorq(xmm5, xmm5, xmm20, Assembler::AVX_512bit);
  vpshufb(xmm6, xmm14, xmm16, Assembler::AVX_512bit);
  evpxorq(xmm6, xmm6, xmm20, Assembler::AVX_512bit);
  vpshufb(xmm7, xmm15, xmm16, Assembler::AVX_512bit);
  evpxorq(xmm7, xmm7, xmm20, Assembler::AVX_512bit);
  // Perform AES encode operations and put results in zmm0-zmm7.
  // This is followed by incrementing counter values in zmm8-zmm15.
  // Since we will be processing 32 blocks at a time, the counter is incremented by 32.
  // The vpaddq increments are interleaved with the rounds to hide latency.
  roundEnc(xmm21, 7);
  vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_512bit);
  roundEnc(xmm22, 7);
  vpaddq(xmm9, xmm9, xmm19, Assembler::AVX_512bit);
  roundEnc(xmm23, 7);
  vpaddq(xmm10, xmm10, xmm19, Assembler::AVX_512bit);
  roundEnc(xmm24, 7);
  vpaddq(xmm11, xmm11, xmm19, Assembler::AVX_512bit);
  roundEnc(xmm25, 7);
  vpaddq(xmm12, xmm12, xmm19, Assembler::AVX_512bit);
  roundEnc(xmm26, 7);
  vpaddq(xmm13, xmm13, xmm19, Assembler::AVX_512bit);
  roundEnc(xmm27, 7);
  vpaddq(xmm14, xmm14, xmm19, Assembler::AVX_512bit);
  roundEnc(xmm28, 7);
  vpaddq(xmm15, xmm15, xmm19, Assembler::AVX_512bit);
  roundEnc(xmm29, 7);

  // Select the final rounds by key size: rounds < 52 => AES-128 (10 rounds),
  // 52 <= rounds < 60 => AES-192 (12 rounds), rounds >= 60 => AES-256 (14 rounds).
  cmpl(rounds, 52);
  jcc(Assembler::aboveEqual, AES192);
  lastroundEnc(xmm30, 7);
  jmp(END_LOOP);

  bind(AES192);
  roundEnc(xmm30, 7);
  ev_load_key(xmm18, key, 11 * 16, xmm31);
  roundEnc(xmm18, 7);
  cmpl(rounds, 60);
  jcc(Assembler::aboveEqual, AES256);
  ev_load_key(xmm18, key, 12 * 16, xmm31);
  lastroundEnc(xmm18, 7);
  jmp(END_LOOP);

  bind(AES256);
  ev_load_key(xmm18, key, 12 * 16, xmm31);
  roundEnc(xmm18, 7);
  ev_load_key(xmm18, key, 13 * 16, xmm31);
  roundEnc(xmm18, 7);
  ev_load_key(xmm18, key, 14 * 16, xmm31);
  lastroundEnc(xmm18, 7);

  // After AES encode rounds, the encrypted block cipher lies in zmm0-zmm7
  // xor encrypted block cipher and input plaintext and store resultant ciphertext
  bind(END_LOOP);
  evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
  evmovdquq(Address(dest_addr, pos, Address::times_1, 0), xmm0, Assembler::AVX_512bit);
  evpxorq(xmm1, xmm1, Address(src_addr, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
  evmovdquq(Address(dest_addr, pos, Address::times_1, 64), xmm1, Assembler::AVX_512bit);
  evpxorq(xmm2, xmm2, Address(src_addr, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
  evmovdquq(Address(dest_addr, pos, Address::times_1, 2 * 64), xmm2, Assembler::AVX_512bit);
  evpxorq(xmm3, xmm3, Address(src_addr, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
  evmovdquq(Address(dest_addr, pos, Address::times_1, 3 * 64), xmm3, Assembler::AVX_512bit);
  evpxorq(xmm4, xmm4, Address(src_addr, pos, Address::times_1, 4 * 64), Assembler::AVX_512bit);
  evmovdquq(Address(dest_addr, pos, Address::times_1, 4 * 64), xmm4, Assembler::AVX_512bit);
  evpxorq(xmm5, xmm5, Address(src_addr, pos, Address::times_1, 5 * 64), Assembler::AVX_512bit);
  evmovdquq(Address(dest_addr, pos, Address::times_1, 5 * 64), xmm5, Assembler::AVX_512bit);
  evpxorq(xmm6, xmm6, Address(src_addr, pos, Address::times_1, 6 * 64), Assembler::AVX_512bit);
  evmovdquq(Address(dest_addr, pos, Address::times_1, 6 * 64), xmm6, Assembler::AVX_512bit);
  evpxorq(xmm7, xmm7, Address(src_addr, pos, Address::times_1, 7 * 64), Assembler::AVX_512bit);
  evmovdquq(Address(dest_addr, pos, Address::times_1, 7 * 64), xmm7, Assembler::AVX_512bit);
  addq(pos, 512);
  jmp(LOOP);

  // Encode 256, 128, 64 or 16 bytes at a time if length is less than 512 bytes
  bind(REMAINDER);
  cmpl(len_reg, 0);
  jcc(Assembler::equal, END);
  cmpl(len_reg, 256);
  jcc(Assembler::aboveEqual, REMAINDER_16);
  cmpl(len_reg, 128);
  jcc(Assembler::aboveEqual, REMAINDER_8);
  cmpl(len_reg, 64);
  jcc(Assembler::aboveEqual, REMAINDER_4);
  // At this point, we will process 16 bytes of data at a time.
  // So load xmm19 with counter increment value as 1 (Linc0 + 16 in the mask table)
  evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 80), Assembler::AVX_128bit, r15);
  jmp(REMAINDER_LOOP);

  // Each ZMM register can be used to encode 64 bytes of data, so we have 4 ZMM registers to encode 256 bytes of data
  bind(REMAINDER_16);
  subq(len_reg, 256);
  // As we process 16 blocks at a time, load mask for incrementing the counter value by 16
  evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 320), Assembler::AVX_512bit, r15);//Linc16(rip)
  // shuffle counter and XOR counter with roundkey1
  vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_512bit);
  evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_512bit);
  vpshufb(xmm1, xmm9, xmm16, Assembler::AVX_512bit);
  evpxorq(xmm1, xmm1, xmm20, Assembler::AVX_512bit);
  vpshufb(xmm2, xmm10, xmm16, Assembler::AVX_512bit);
  evpxorq(xmm2, xmm2, xmm20, Assembler::AVX_512bit);
  vpshufb(xmm3, xmm11, xmm16, Assembler::AVX_512bit);
  evpxorq(xmm3, xmm3, xmm20, Assembler::AVX_512bit);
  // Increment counter values by 16
  vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_512bit);
  vpaddq(xmm9, xmm9, xmm19, Assembler::AVX_512bit);
  // AES encode rounds (width argument 3 => operate on zmm0-zmm3)
  roundEnc(xmm21, 3);
  roundEnc(xmm22, 3);
  roundEnc(xmm23, 3);
  roundEnc(xmm24, 3);
  roundEnc(xmm25, 3);
  roundEnc(xmm26, 3);
  roundEnc(xmm27, 3);
  roundEnc(xmm28, 3);
  roundEnc(xmm29, 3);

  cmpl(rounds, 52);
  jcc(Assembler::aboveEqual, AES192_REMAINDER16);
  lastroundEnc(xmm30, 3);
  jmp(REMAINDER16_END_LOOP);

  bind(AES192_REMAINDER16);
  roundEnc(xmm30, 3);
  ev_load_key(xmm18, key, 11 * 16, xmm31);
  roundEnc(xmm18, 3);
  ev_load_key(xmm5, key, 12 * 16, xmm31);

  cmpl(rounds, 60);
  jcc(Assembler::aboveEqual, AES256_REMAINDER16);
  lastroundEnc(xmm5, 3);
  jmp(REMAINDER16_END_LOOP);
  bind(AES256_REMAINDER16);
  roundEnc(xmm5, 3);
  ev_load_key(xmm6, key, 13 * 16, xmm31);
  roundEnc(xmm6, 3);
  ev_load_key(xmm7, key, 14 * 16, xmm31);
  lastroundEnc(xmm7, 3);

  // After AES encode rounds, the encrypted block cipher lies in zmm0-zmm3
  // xor 256 bytes of PT with the encrypted counters to produce CT.
  bind(REMAINDER16_END_LOOP);
  evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0), Assembler::AVX_512bit);
  evmovdquq(Address(dest_addr, pos, Address::times_1, 0), xmm0, Assembler::AVX_512bit);
  evpxorq(xmm1, xmm1, Address(src_addr, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
  evmovdquq(Address(dest_addr, pos, Address::times_1, 1 * 64), xmm1, Assembler::AVX_512bit);
  evpxorq(xmm2, xmm2, Address(src_addr, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
  evmovdquq(Address(dest_addr, pos, Address::times_1, 2 * 64), xmm2, Assembler::AVX_512bit);
  evpxorq(xmm3, xmm3, Address(src_addr, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
  evmovdquq(Address(dest_addr, pos, Address::times_1, 3 * 64), xmm3, Assembler::AVX_512bit);
  addq(pos, 256);

  // Fall through to smaller remainder sizes for any bytes still left.
  cmpl(len_reg, 128);
  jcc(Assembler::aboveEqual, REMAINDER_8);

  cmpl(len_reg, 64);
  jcc(Assembler::aboveEqual, REMAINDER_4);
  //load mask for incrementing the counter value by 1
  evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 80), Assembler::AVX_128bit, r15);//Linc0 + 16(rip)
  jmp(REMAINDER_LOOP);

  // Each ZMM register can be used to encode 64 bytes of data, so we have 2 ZMM registers to encode 128 bytes of data
  bind(REMAINDER_8);
  subq(len_reg, 128);
  // As we process 8 blocks at a time, load mask for incrementing the counter value by 8
  evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 192), Assembler::AVX_512bit, r15);//Linc8(rip)
  // shuffle counters and xor with roundkey1
  vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_512bit);
  evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_512bit);
  vpshufb(xmm1, xmm9, xmm16, Assembler::AVX_512bit);
  evpxorq(xmm1, xmm1, xmm20, Assembler::AVX_512bit);
  // increment counter by 8
  vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_512bit);
  // AES encode (width argument 1 => operate on zmm0-zmm1)
  roundEnc(xmm21, 1);
  roundEnc(xmm22, 1);
  roundEnc(xmm23, 1);
  roundEnc(xmm24, 1);
  roundEnc(xmm25, 1);
  roundEnc(xmm26, 1);
  roundEnc(xmm27, 1);
  roundEnc(xmm28, 1);
  roundEnc(xmm29, 1);

  cmpl(rounds, 52);
  jcc(Assembler::aboveEqual, AES192_REMAINDER8);
  lastroundEnc(xmm30, 1);
  jmp(REMAINDER8_END_LOOP);

  bind(AES192_REMAINDER8);
  roundEnc(xmm30, 1);
  ev_load_key(xmm18, key, 11 * 16, xmm31);
  roundEnc(xmm18, 1);
  ev_load_key(xmm5, key, 12 * 16, xmm31);
  cmpl(rounds, 60);
  jcc(Assembler::aboveEqual, AES256_REMAINDER8);
  lastroundEnc(xmm5, 1);
  jmp(REMAINDER8_END_LOOP);

  bind(AES256_REMAINDER8);
  roundEnc(xmm5, 1);
  ev_load_key(xmm6, key, 13 * 16, xmm31);
  roundEnc(xmm6, 1);
  ev_load_key(xmm7, key, 14 * 16, xmm31);
  lastroundEnc(xmm7, 1);

  bind(REMAINDER8_END_LOOP);
  // After AES encode rounds, the encrypted block cipher lies in zmm0-zmm1
  // XOR PT with the encrypted counter and store as CT
  evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
  evmovdquq(Address(dest_addr, pos, Address::times_1, 0 * 64), xmm0, Assembler::AVX_512bit);
  evpxorq(xmm1, xmm1, Address(src_addr, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
  evmovdquq(Address(dest_addr, pos, Address::times_1, 1 * 64), xmm1, Assembler::AVX_512bit);
  addq(pos, 128);

  cmpl(len_reg, 64);
  jcc(Assembler::aboveEqual, REMAINDER_4);
  // load mask for incrementing the counter value by 1
  evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 80), Assembler::AVX_128bit, r15);//Linc0 + 16(rip)
  jmp(REMAINDER_LOOP);

  // Each ZMM register can be used to encode 64 bytes of data, so we have 1 ZMM register used in this block of code
  bind(REMAINDER_4);
  subq(len_reg, 64);
  // As we process 4 blocks at a time, load mask for incrementing the counter value by 4
  evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);//Linc4(rip)
  // XOR counter with first roundkey
  vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_512bit);
  evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_512bit);
  // Increment counter
  vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_512bit);
  vaesenc(xmm0, xmm0, xmm21, Assembler::AVX_512bit);
  vaesenc(xmm0, xmm0, xmm22, Assembler::AVX_512bit);
  vaesenc(xmm0, xmm0, xmm23, Assembler::AVX_512bit);
  vaesenc(xmm0, xmm0, xmm24, Assembler::AVX_512bit);
  vaesenc(xmm0, xmm0, xmm25, Assembler::AVX_512bit);
  vaesenc(xmm0, xmm0, xmm26, Assembler::AVX_512bit);
  vaesenc(xmm0, xmm0, xmm27, Assembler::AVX_512bit);
  vaesenc(xmm0, xmm0, xmm28, Assembler::AVX_512bit);
  vaesenc(xmm0, xmm0, xmm29, Assembler::AVX_512bit);
  cmpl(rounds, 52);
  jcc(Assembler::aboveEqual, AES192_REMAINDER4);
  vaesenclast(xmm0, xmm0, xmm30, Assembler::AVX_512bit);
  jmp(END_REMAINDER4);

  bind(AES192_REMAINDER4);
  vaesenc(xmm0, xmm0, xmm30, Assembler::AVX_512bit);
  ev_load_key(xmm18, key, 11 * 16, xmm31);
  vaesenc(xmm0, xmm0, xmm18, Assembler::AVX_512bit);
  ev_load_key(xmm5, key, 12 * 16, xmm31);

  cmpl(rounds, 60);
  jcc(Assembler::aboveEqual, AES256_REMAINDER4);
  vaesenclast(xmm0, xmm0, xmm5, Assembler::AVX_512bit);
  jmp(END_REMAINDER4);

  bind(AES256_REMAINDER4);
  vaesenc(xmm0, xmm0, xmm5, Assembler::AVX_512bit);
  ev_load_key(xmm6, key, 13 * 16, xmm31);
  vaesenc(xmm0, xmm0, xmm6, Assembler::AVX_512bit);
  ev_load_key(xmm7, key, 14 * 16, xmm31);
  vaesenclast(xmm0, xmm0, xmm7, Assembler::AVX_512bit);
  // After AES encode rounds, the encrypted block cipher lies in zmm0.
  // XOR encrypted block cipher with PT and store 64 bytes of ciphertext
  bind(END_REMAINDER4);
  evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
  evmovdquq(Address(dest_addr, pos, Address::times_1, 0), xmm0, Assembler::AVX_512bit);
  addq(pos, 64);
  // load mask for incrementing the counter value by 1
  evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 80), Assembler::AVX_128bit, r15);//Linc0 + 16(rip)

  // For a single block, the AES rounds start here.
  bind(REMAINDER_LOOP);
  cmpl(len_reg, 0);
  jcc(Assembler::belowEqual, END);
  // XOR counter with first roundkey
  vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_128bit);
  evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_128bit);
  vaesenc(xmm0, xmm0, xmm21, Assembler::AVX_128bit);
  // Increment counter by 1
  vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_128bit);
  vaesenc(xmm0, xmm0, xmm22, Assembler::AVX_128bit);
  vaesenc(xmm0, xmm0, xmm23, Assembler::AVX_128bit);
  vaesenc(xmm0, xmm0, xmm24, Assembler::AVX_128bit);
  vaesenc(xmm0, xmm0, xmm25, Assembler::AVX_128bit);
  vaesenc(xmm0, xmm0, xmm26, Assembler::AVX_128bit);
  vaesenc(xmm0, xmm0, xmm27, Assembler::AVX_128bit);
  vaesenc(xmm0, xmm0, xmm28, Assembler::AVX_128bit);
  vaesenc(xmm0, xmm0, xmm29, Assembler::AVX_128bit);

  cmpl(rounds, 52);
  jcc(Assembler::aboveEqual, AES192_REMAINDER);
  vaesenclast(xmm0, xmm0, xmm30, Assembler::AVX_128bit);
  jmp(END_REMAINDER_LOOP);

  bind(AES192_REMAINDER);
  vaesenc(xmm0, xmm0, xmm30, Assembler::AVX_128bit);
  ev_load_key(xmm18, key, 11 * 16, xmm31);
  vaesenc(xmm0, xmm0, xmm18, Assembler::AVX_128bit);
  ev_load_key(xmm5, key, 12 * 16, xmm31);
  cmpl(rounds, 60);
  jcc(Assembler::aboveEqual, AES256_REMAINDER);
  vaesenclast(xmm0, xmm0, xmm5, Assembler::AVX_128bit);
  jmp(END_REMAINDER_LOOP);

  bind(AES256_REMAINDER);
  vaesenc(xmm0, xmm0, xmm5, Assembler::AVX_128bit);
  ev_load_key(xmm6, key, 13 * 16, xmm31);
  vaesenc(xmm0, xmm0, xmm6, Assembler::AVX_128bit);
  ev_load_key(xmm7, key, 14 * 16, xmm31);
  vaesenclast(xmm0, xmm0, xmm7, Assembler::AVX_128bit);

  bind(END_REMAINDER_LOOP);
  // If the length register is less than the blockSize i.e. 16
  // then we store only those bytes of the CT to the destination
  // corresponding to the length register value
  // extracting the exact number of bytes is handled by EXTRACT_TAILBYTES
  cmpl(len_reg, 16);
  jcc(Assembler::less, EXTRACT_TAILBYTES);
  subl(len_reg, 16);
  // After AES encode rounds, the encrypted block cipher lies in xmm0.
  // If the length register is equal to 16 bytes, store CT in dest after XOR operation.
  evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0), Assembler::AVX_128bit);
  evmovdquq(Address(dest_addr, pos, Address::times_1, 0), xmm0, Assembler::AVX_128bit);
  addl(pos, 16);

  jmp(REMAINDER_LOOP);

  bind(EXTRACT_TAILBYTES);
  // Save encrypted counter value in xmm0 for next invocation, before XOR operation
  // (the unused keystream bytes are consumed by the PRELOOP on the next call).
  movdqu(Address(saved_encCounter_start, 0), xmm0);
  // XOR encryted block cipher in xmm0 with PT to produce CT
  evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0), Assembler::AVX_128bit);
  // extract upto 15 bytes of CT from xmm0 as specified by length register
  // (binary decomposition of len_reg: 8-, 4-, 2-, then 1-byte stores,
  // shifting xmm0 right after each store)
  testptr(len_reg, 8);
  jcc(Assembler::zero, EXTRACT_TAIL_4BYTES);
  pextrq(Address(dest_addr, pos), xmm0, 0);
  psrldq(xmm0, 8);
  addl(pos, 8);
  bind(EXTRACT_TAIL_4BYTES);
  testptr(len_reg, 4);
  jcc(Assembler::zero, EXTRACT_TAIL_2BYTES);
  pextrd(Address(dest_addr, pos), xmm0, 0);
  psrldq(xmm0, 4);
  addq(pos, 4);
  bind(EXTRACT_TAIL_2BYTES);
  testptr(len_reg, 2);
  jcc(Assembler::zero, EXTRACT_TAIL_1BYTE);
  pextrw(Address(dest_addr, pos), xmm0, 0);
  psrldq(xmm0, 2);
  addl(pos, 2);
  bind(EXTRACT_TAIL_1BYTE);
  testptr(len_reg, 1);
  jcc(Assembler::zero, END);
  pextrb(Address(dest_addr, pos), xmm0, 0);
  addl(pos, 1);

  bind(END);
  // If there are no tail bytes, store counter value and exit
  // (otherwise record how many keystream bytes the tail consumed as 'used').
  cmpl(len_reg, 0);
  jcc(Assembler::equal, STORE_CTR);
  movl(Address(used_addr, 0), len_reg);

  bind(STORE_CTR);
  //shuffle updated counter and store it (back to big-endian byte order)
  vpshufb(xmm8, xmm8, xmm16, Assembler::AVX_128bit);
  movdqu(Address(counter, 0), xmm8);
  // Zero out counter and key registers so no key material is left behind
  evpxorq(xmm8, xmm8, xmm8, Assembler::AVX_512bit);
  evpxorq(xmm20, xmm20, xmm20, Assembler::AVX_512bit);
  evpxorq(xmm21, xmm21, xmm21, Assembler::AVX_512bit);
  evpxorq(xmm22, xmm22, xmm22, Assembler::AVX_512bit);
  evpxorq(xmm23, xmm23, xmm23, Assembler::AVX_512bit);
  evpxorq(xmm24, xmm24, xmm24, Assembler::AVX_512bit);
  evpxorq(xmm25, xmm25, xmm25, Assembler::AVX_512bit);
  evpxorq(xmm26, xmm26, xmm26, Assembler::AVX_512bit);
  evpxorq(xmm27, xmm27, xmm27, Assembler::AVX_512bit);
  evpxorq(xmm28, xmm28, xmm28, Assembler::AVX_512bit);
  evpxorq(xmm29, xmm29, xmm29, Assembler::AVX_512bit);
  evpxorq(xmm30, xmm30, xmm30, Assembler::AVX_512bit);
  // For AES-192/256 the extra round-key registers were used; clear them too.
  cmpl(rounds, 44);
  jcc(Assembler::belowEqual, EXIT);
  evpxorq(xmm18, xmm18, xmm18, Assembler::AVX_512bit);
  evpxorq(xmm5, xmm5, xmm5, Assembler::AVX_512bit);
  cmpl(rounds, 52);
  jcc(Assembler::belowEqual, EXIT);
  evpxorq(xmm6, xmm6, xmm6, Assembler::AVX_512bit);
  evpxorq(xmm7, xmm7, xmm7, Assembler::AVX_512bit);
  bind(EXIT);
}
|
1269 |
781 #endif // _LP64 |
1270 #endif // _LP64 |