src/hotspot/cpu/x86/macroAssembler_x86_aes.cpp
changeset 58977 c6a789f495fe
parent 57786 948ac3112da8
/*
 * Copyright (c) 2019, Intel Corporation.
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as

...

    vpxor(xmm0, xmm0, xmm0, Assembler::AVX_128bit);
    vpxor(xmm1, xmm1, xmm1, Assembler::AVX_128bit);
    vpxor(xmm3, xmm3, xmm3, Assembler::AVX_128bit);
    vpxor(xmm15, xmm15, xmm15, Assembler::AVX_128bit);
}
       
// AES Counter Mode using VAES instructions
void MacroAssembler::aesctr_encrypt(Register src_addr, Register dest_addr, Register key, Register counter,
    Register len_reg, Register used, Register used_addr, Register saved_encCounter_start) {

    const Register rounds = 0; // (Register)0 is rax: the round count lives in rax
    const Register pos = r12;

    Label PRELOOP_START, EXIT_PRELOOP, REMAINDER, REMAINDER_16, LOOP, END, EXIT, END_LOOP,
    AES192, AES256, AES192_REMAINDER16, REMAINDER16_END_LOOP, AES256_REMAINDER16,
    REMAINDER_8, REMAINDER_4, AES192_REMAINDER8, REMAINDER_LOOP,
    AES192_REMAINDER, END_REMAINDER_LOOP, AES256_REMAINDER8, REMAINDER8_END_LOOP,
    AES192_REMAINDER4, AES256_REMAINDER4, AES256_REMAINDER, END_REMAINDER4, EXTRACT_TAILBYTES,
    EXTRACT_TAIL_4BYTES, EXTRACT_TAIL_2BYTES, EXTRACT_TAIL_1BYTE, STORE_CTR;
       
    cmpl(len_reg, 0);
    jcc(Assembler::belowEqual, EXIT);

    movl(pos, 0);
    // If fewer than 16 bytes of the previously encrypted counter have been
    // used, XOR the PT with the saved encrypted counter, byte by byte, to
    // obtain the CT.
    bind(PRELOOP_START);
    cmpl(used, 16);
    jcc(Assembler::aboveEqual, EXIT_PRELOOP);
    movb(rbx, Address(saved_encCounter_start, used));
    xorb(rbx, Address(src_addr, pos));
    movb(Address(dest_addr, pos), rbx);
    addptr(pos, 1);
    addptr(used, 1);
    decrement(len_reg);
    jmp(PRELOOP_START);

    bind(EXIT_PRELOOP);
    movl(Address(used_addr, 0), used);
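
    // A scalar sketch of the pre-loop above (illustrative only, not part of
    // the generated stub; the array and variable names here are hypothetical):
    //
    //   while (used < 16 && len > 0) {
    //     dest[pos] = src[pos] ^ saved_encCounter[used]; // reuse leftover keystream
    //     pos++; used++; len--;
    //   }
    //   *used_addr = used; // persist the keystream position for the next call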
       
    // Load the session key array length in ints (44, 52 or 60 for 128-, 192-
    // and 256-bit keys); it encodes the number of AES rounds (10, 12 or 14).
    movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    vpxor(xmm0, xmm0, xmm0, Assembler::AVX_128bit);
    // Move the initial counter value into xmm0
    movdqu(xmm0, Address(counter, 0));
    // Broadcast the counter value to zmm8
    evshufi64x2(xmm8, xmm0, xmm0, 0, Assembler::AVX_512bit);

    // Load the lbswap mask
    evmovdquq(xmm16, ExternalAddress(StubRoutines::x86::counter_mask_addr()), Assembler::AVX_512bit, r15);

    // Shuffle the counter using the lbswap mask
    vpshufb(xmm8, xmm8, xmm16, Assembler::AVX_512bit);
       
    // Pre-increment and propagate counter values to the zmm8-zmm15 registers.
    // Each ZMM register holds four 128-bit counter blocks, one per lane, and
    // the counter advances once per 16-byte block. Linc0 adds {0, 1, 2, 3}
    // to the lanes of zmm8, and Linc4 then adds 4 per lane to derive each of
    // zmm9-zmm15 from its predecessor, so the eight registers hold 32
    // consecutive counter values, all incremented in parallel.
    vpaddd(xmm8, xmm8, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 64), Assembler::AVX_512bit, r15);   // Linc0
    vpaddd(xmm9, xmm8, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);  // Linc4(rip)
    vpaddd(xmm10, xmm9, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15); // Linc4(rip)
    vpaddd(xmm11, xmm10, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);// Linc4(rip)
    vpaddd(xmm12, xmm11, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);// Linc4(rip)
    vpaddd(xmm13, xmm12, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);// Linc4(rip)
    vpaddd(xmm14, xmm13, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);// Linc4(rip)
    vpaddd(xmm15, xmm14, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15);// Linc4(rip)

    // Load the Linc32 mask into a ZMM register; Linc32 increments each counter by 32.
    evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 256), Assembler::AVX_512bit, r15);    // Linc32
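
    // Resulting lane layout, assuming the incoming counter value is n (shown
    // as values, not code; the byte swap above lets plain vector adds stand
    // in for big-endian counter increments):
    //
    //   zmm8  = { n+0,  n+1,  n+2,  n+3  }   // Linc0 = {0,1,2,3}
    //   zmm9  = { n+4,  n+5,  n+6,  n+7  }   // zmm8  + Linc4
    //   ...
    //   zmm15 = { n+28, n+29, n+30, n+31 }   // zmm14 + Linc4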
       
    // xmm31 contains the key shuffle mask.
    movdqu(xmm31, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    // ev_load_key loads a 128-bit round key and shuffles it, then broadcasts
    // the shuffled key into a 512-bit value. vshufi64x2 is used for the
    // broadcast instead of evbroadcasti64x2 because the source here is a ZMM
    // register that already holds the shuffled key value.
    ev_load_key(xmm20, key, 0, xmm31);
    ev_load_key(xmm21, key, 1 * 16, xmm31);
    ev_load_key(xmm22, key, 2 * 16, xmm31);
    ev_load_key(xmm23, key, 3 * 16, xmm31);
    ev_load_key(xmm24, key, 4 * 16, xmm31);
    ev_load_key(xmm25, key, 5 * 16, xmm31);
    ev_load_key(xmm26, key, 6 * 16, xmm31);
    ev_load_key(xmm27, key, 7 * 16, xmm31);
    ev_load_key(xmm28, key, 8 * 16, xmm31);
    ev_load_key(xmm29, key, 9 * 16, xmm31);
    ev_load_key(xmm30, key, 10 * 16, xmm31);
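
    // Round-key layout assumed above: xmm20 holds round key 0, xmm21-xmm29
    // round keys 1-9 and xmm30 round key 10; the extra AES-192/256 round keys
    // are loaded on demand in the branches below. For reference (standard AES):
    //
    //   key bits | rounds | key array length in ints ("rounds" register)
    //   ---------+--------+---------------------------------------------
    //      128   |   10   |  44
    //      192   |   12   |  52
    //      256   |   14   |  60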
       
    // Process 32 blocks, i.e. 512 bytes of data, per iteration.
    bind(LOOP);
    cmpl(len_reg, 512);
    jcc(Assembler::less, REMAINDER);
    subq(len_reg, 512);
    // Shuffle the counters and XOR them with roundkey1; the results land in zmm0-zmm7.
    vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_512bit);
    evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_512bit);
    vpshufb(xmm1, xmm9, xmm16, Assembler::AVX_512bit);
    evpxorq(xmm1, xmm1, xmm20, Assembler::AVX_512bit);
    vpshufb(xmm2, xmm10, xmm16, Assembler::AVX_512bit);
    evpxorq(xmm2, xmm2, xmm20, Assembler::AVX_512bit);
    vpshufb(xmm3, xmm11, xmm16, Assembler::AVX_512bit);
    evpxorq(xmm3, xmm3, xmm20, Assembler::AVX_512bit);
    vpshufb(xmm4, xmm12, xmm16, Assembler::AVX_512bit);
    evpxorq(xmm4, xmm4, xmm20, Assembler::AVX_512bit);
    vpshufb(xmm5, xmm13, xmm16, Assembler::AVX_512bit);
    evpxorq(xmm5, xmm5, xmm20, Assembler::AVX_512bit);
    vpshufb(xmm6, xmm14, xmm16, Assembler::AVX_512bit);
    evpxorq(xmm6, xmm6, xmm20, Assembler::AVX_512bit);
    vpshufb(xmm7, xmm15, xmm16, Assembler::AVX_512bit);
    evpxorq(xmm7, xmm7, xmm20, Assembler::AVX_512bit);
    // Perform the AES encode rounds, leaving the results in zmm0-zmm7, and
    // interleave the increments of the counter values in zmm8-zmm15.
    // Since 32 blocks are processed per iteration, each counter is incremented by 32 (Linc32 in xmm19).
    roundEnc(xmm21, 7);
    vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_512bit);
    roundEnc(xmm22, 7);
    vpaddq(xmm9, xmm9, xmm19, Assembler::AVX_512bit);
    roundEnc(xmm23, 7);
    vpaddq(xmm10, xmm10, xmm19, Assembler::AVX_512bit);
    roundEnc(xmm24, 7);
    vpaddq(xmm11, xmm11, xmm19, Assembler::AVX_512bit);
    roundEnc(xmm25, 7);
    vpaddq(xmm12, xmm12, xmm19, Assembler::AVX_512bit);
    roundEnc(xmm26, 7);
    vpaddq(xmm13, xmm13, xmm19, Assembler::AVX_512bit);
    roundEnc(xmm27, 7);
    vpaddq(xmm14, xmm14, xmm19, Assembler::AVX_512bit);
    roundEnc(xmm28, 7);
    vpaddq(xmm15, xmm15, xmm19, Assembler::AVX_512bit);
    roundEnc(xmm29, 7);

    cmpl(rounds, 52);
    jcc(Assembler::aboveEqual, AES192);
    lastroundEnc(xmm30, 7);
    jmp(END_LOOP);

    bind(AES192);
    roundEnc(xmm30, 7);
    ev_load_key(xmm18, key, 11 * 16, xmm31);
    roundEnc(xmm18, 7);
    cmpl(rounds, 60);
    jcc(Assembler::aboveEqual, AES256);
    ev_load_key(xmm18, key, 12 * 16, xmm31);
    lastroundEnc(xmm18, 7);
    jmp(END_LOOP);

    bind(AES256);
    ev_load_key(xmm18, key, 12 * 16, xmm31);
    roundEnc(xmm18, 7);
    ev_load_key(xmm18, key, 13 * 16, xmm31);
    roundEnc(xmm18, 7);
    ev_load_key(xmm18, key, 14 * 16, xmm31);
    lastroundEnc(xmm18, 7);
       
    // After the AES encode rounds, the encrypted counter blocks are in zmm0-zmm7;
    // XOR them with the input plaintext and store the resulting ciphertext.
    bind(END_LOOP);
    evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
    evmovdquq(Address(dest_addr, pos, Address::times_1, 0), xmm0, Assembler::AVX_512bit);
    evpxorq(xmm1, xmm1, Address(src_addr, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
    evmovdquq(Address(dest_addr, pos, Address::times_1, 64), xmm1, Assembler::AVX_512bit);
    evpxorq(xmm2, xmm2, Address(src_addr, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
    evmovdquq(Address(dest_addr, pos, Address::times_1, 2 * 64), xmm2, Assembler::AVX_512bit);
    evpxorq(xmm3, xmm3, Address(src_addr, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
    evmovdquq(Address(dest_addr, pos, Address::times_1, 3 * 64), xmm3, Assembler::AVX_512bit);
    evpxorq(xmm4, xmm4, Address(src_addr, pos, Address::times_1, 4 * 64), Assembler::AVX_512bit);
    evmovdquq(Address(dest_addr, pos, Address::times_1, 4 * 64), xmm4, Assembler::AVX_512bit);
    evpxorq(xmm5, xmm5, Address(src_addr, pos, Address::times_1, 5 * 64), Assembler::AVX_512bit);
    evmovdquq(Address(dest_addr, pos, Address::times_1, 5 * 64), xmm5, Assembler::AVX_512bit);
    evpxorq(xmm6, xmm6, Address(src_addr, pos, Address::times_1, 6 * 64), Assembler::AVX_512bit);
    evmovdquq(Address(dest_addr, pos, Address::times_1, 6 * 64), xmm6, Assembler::AVX_512bit);
    evpxorq(xmm7, xmm7, Address(src_addr, pos, Address::times_1, 7 * 64), Assembler::AVX_512bit);
    evmovdquq(Address(dest_addr, pos, Address::times_1, 7 * 64), xmm7, Assembler::AVX_512bit);
    addq(pos, 512);
    jmp(LOOP);
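
    // The loop above is a 32-block-wide version of plain CTR mode. A scalar
    // sketch of the same transform (illustrative only; AES_encrypt and the
    // big-endian increment are assumed helpers, not calls that exist here):
    //
    //   for (each 16-byte block i) {
    //     keystream = AES_encrypt(round_keys, counter); // 10/12/14 rounds
    //     CT[i] = PT[i] ^ keystream;
    //     counter += 1;                                 // 128-bit big-endian add
    //   }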
       
    // Encode 256, 128, 64 or 16 bytes at a time if fewer than 512 bytes remain.
    bind(REMAINDER);
    cmpl(len_reg, 0);
    jcc(Assembler::equal, END);
    cmpl(len_reg, 256);
    jcc(Assembler::aboveEqual, REMAINDER_16);
    cmpl(len_reg, 128);
    jcc(Assembler::aboveEqual, REMAINDER_8);
    cmpl(len_reg, 64);
    jcc(Assembler::aboveEqual, REMAINDER_4);
    // From here on, 16 bytes (one block) are processed at a time, so load
    // xmm19 with a counter increment value of 1.
    evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 80), Assembler::AVX_128bit, r15);
    jmp(REMAINDER_LOOP);
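
    // Remainder dispatch strategy (sketch): peel the largest chunk that still
    // fits, falling through to progressively narrower paths:
    //
    //   if (len >= 256) encrypt 16 blocks;   // REMAINDER_16
    //   if (len >= 128) encrypt  8 blocks;   // REMAINDER_8
    //   if (len >= 64)  encrypt  4 blocks;   // REMAINDER_4
    //   while (len > 0) encrypt  1 block;    // REMAINDER_LOOP; a final partial
    //                                        // block is handled under EXTRACT_TAILBYTES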
       
    // Each ZMM register encodes 64 bytes of data, so four ZMM registers cover the 256 bytes handled here.
    bind(REMAINDER_16);
    subq(len_reg, 256);
    // As 16 blocks are processed at a time, load the mask that increments the counter value by 16.
    evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 320), Assembler::AVX_512bit, r15); // Linc16(rip)
    // Shuffle the counters and XOR them with roundkey1.
    vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_512bit);
    evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_512bit);
    vpshufb(xmm1, xmm9, xmm16, Assembler::AVX_512bit);
    evpxorq(xmm1, xmm1, xmm20, Assembler::AVX_512bit);
    vpshufb(xmm2, xmm10, xmm16, Assembler::AVX_512bit);
    evpxorq(xmm2, xmm2, xmm20, Assembler::AVX_512bit);
    vpshufb(xmm3, xmm11, xmm16, Assembler::AVX_512bit);
    evpxorq(xmm3, xmm3, xmm20, Assembler::AVX_512bit);
    // Increment counter values by 16
    vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_512bit);
    vpaddq(xmm9, xmm9, xmm19, Assembler::AVX_512bit);
    // AES encode rounds
    roundEnc(xmm21, 3);
    roundEnc(xmm22, 3);
    roundEnc(xmm23, 3);
    roundEnc(xmm24, 3);
    roundEnc(xmm25, 3);
    roundEnc(xmm26, 3);
    roundEnc(xmm27, 3);
    roundEnc(xmm28, 3);
    roundEnc(xmm29, 3);

    cmpl(rounds, 52);
    jcc(Assembler::aboveEqual, AES192_REMAINDER16);
    lastroundEnc(xmm30, 3);
    jmp(REMAINDER16_END_LOOP);

    bind(AES192_REMAINDER16);
    roundEnc(xmm30, 3);
    ev_load_key(xmm18, key, 11 * 16, xmm31);
    roundEnc(xmm18, 3);
    ev_load_key(xmm5, key, 12 * 16, xmm31);

    cmpl(rounds, 60);
    jcc(Assembler::aboveEqual, AES256_REMAINDER16);
    lastroundEnc(xmm5, 3);
    jmp(REMAINDER16_END_LOOP);

    bind(AES256_REMAINDER16);
    roundEnc(xmm5, 3);
    ev_load_key(xmm6, key, 13 * 16, xmm31);
    roundEnc(xmm6, 3);
    ev_load_key(xmm7, key, 14 * 16, xmm31);
    lastroundEnc(xmm7, 3);
       
    // After the AES encode rounds, the encrypted counter blocks are in zmm0-zmm3;
    // XOR 256 bytes of PT with the encrypted counters to produce CT.
    bind(REMAINDER16_END_LOOP);
    evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0), Assembler::AVX_512bit);
    evmovdquq(Address(dest_addr, pos, Address::times_1, 0), xmm0, Assembler::AVX_512bit);
    evpxorq(xmm1, xmm1, Address(src_addr, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
    evmovdquq(Address(dest_addr, pos, Address::times_1, 1 * 64), xmm1, Assembler::AVX_512bit);
    evpxorq(xmm2, xmm2, Address(src_addr, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
    evmovdquq(Address(dest_addr, pos, Address::times_1, 2 * 64), xmm2, Assembler::AVX_512bit);
    evpxorq(xmm3, xmm3, Address(src_addr, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
    evmovdquq(Address(dest_addr, pos, Address::times_1, 3 * 64), xmm3, Assembler::AVX_512bit);
    addq(pos, 256);

    cmpl(len_reg, 128);
    jcc(Assembler::aboveEqual, REMAINDER_8);

    cmpl(len_reg, 64);
    jcc(Assembler::aboveEqual, REMAINDER_4);
    // Load the mask for incrementing the counter value by 1
    evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 80), Assembler::AVX_128bit, r15); // Linc0 + 16(rip)
    jmp(REMAINDER_LOOP);
       
    // Each ZMM register encodes 64 bytes of data, so two ZMM registers cover the 128 bytes handled here.
    bind(REMAINDER_8);
    subq(len_reg, 128);
    // As 8 blocks are processed at a time, load the mask that increments the counter value by 8.
    evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 192), Assembler::AVX_512bit, r15); // Linc8(rip)
    // Shuffle the counters and XOR them with roundkey1.
    vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_512bit);
    evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_512bit);
    vpshufb(xmm1, xmm9, xmm16, Assembler::AVX_512bit);
    evpxorq(xmm1, xmm1, xmm20, Assembler::AVX_512bit);
    // Increment counter by 8
    vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_512bit);
    // AES encode
    roundEnc(xmm21, 1);
    roundEnc(xmm22, 1);
    roundEnc(xmm23, 1);
    roundEnc(xmm24, 1);
    roundEnc(xmm25, 1);
    roundEnc(xmm26, 1);
    roundEnc(xmm27, 1);
    roundEnc(xmm28, 1);
    roundEnc(xmm29, 1);

    cmpl(rounds, 52);
    jcc(Assembler::aboveEqual, AES192_REMAINDER8);
    lastroundEnc(xmm30, 1);
    jmp(REMAINDER8_END_LOOP);

    bind(AES192_REMAINDER8);
    roundEnc(xmm30, 1);
    ev_load_key(xmm18, key, 11 * 16, xmm31);
    roundEnc(xmm18, 1);
    ev_load_key(xmm5, key, 12 * 16, xmm31);
    cmpl(rounds, 60);
    jcc(Assembler::aboveEqual, AES256_REMAINDER8);
    lastroundEnc(xmm5, 1);
    jmp(REMAINDER8_END_LOOP);

    bind(AES256_REMAINDER8);
    roundEnc(xmm5, 1);
    ev_load_key(xmm6, key, 13 * 16, xmm31);
    roundEnc(xmm6, 1);
    ev_load_key(xmm7, key, 14 * 16, xmm31);
    lastroundEnc(xmm7, 1);

    bind(REMAINDER8_END_LOOP);
    // After the AES encode rounds, the encrypted counter blocks are in zmm0-zmm1;
    // XOR the PT with the encrypted counters and store the result as CT.
    evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
    evmovdquq(Address(dest_addr, pos, Address::times_1, 0 * 64), xmm0, Assembler::AVX_512bit);
    evpxorq(xmm1, xmm1, Address(src_addr, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
    evmovdquq(Address(dest_addr, pos, Address::times_1, 1 * 64), xmm1, Assembler::AVX_512bit);
    addq(pos, 128);

    cmpl(len_reg, 64);
    jcc(Assembler::aboveEqual, REMAINDER_4);
    // Load the mask for incrementing the counter value by 1
    evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 80), Assembler::AVX_128bit, r15); // Linc0 + 16(rip)
    jmp(REMAINDER_LOOP);
       
    // Each ZMM register encodes 64 bytes of data, so a single ZMM register covers the 64 bytes handled here.
    bind(REMAINDER_4);
    subq(len_reg, 64);
    // As 4 blocks are processed at a time, load the mask that increments the counter value by 4.
    evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 128), Assembler::AVX_512bit, r15); // Linc4(rip)
    // Shuffle the counters and XOR with the first round key
    vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_512bit);
    evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_512bit);
    // Increment counter
    vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_512bit);
    vaesenc(xmm0, xmm0, xmm21, Assembler::AVX_512bit);
    vaesenc(xmm0, xmm0, xmm22, Assembler::AVX_512bit);
    vaesenc(xmm0, xmm0, xmm23, Assembler::AVX_512bit);
    vaesenc(xmm0, xmm0, xmm24, Assembler::AVX_512bit);
    vaesenc(xmm0, xmm0, xmm25, Assembler::AVX_512bit);
    vaesenc(xmm0, xmm0, xmm26, Assembler::AVX_512bit);
    vaesenc(xmm0, xmm0, xmm27, Assembler::AVX_512bit);
    vaesenc(xmm0, xmm0, xmm28, Assembler::AVX_512bit);
    vaesenc(xmm0, xmm0, xmm29, Assembler::AVX_512bit);
    cmpl(rounds, 52);
    jcc(Assembler::aboveEqual, AES192_REMAINDER4);
    vaesenclast(xmm0, xmm0, xmm30, Assembler::AVX_512bit);
    jmp(END_REMAINDER4);

    bind(AES192_REMAINDER4);
    vaesenc(xmm0, xmm0, xmm30, Assembler::AVX_512bit);
    ev_load_key(xmm18, key, 11 * 16, xmm31);
    vaesenc(xmm0, xmm0, xmm18, Assembler::AVX_512bit);
    ev_load_key(xmm5, key, 12 * 16, xmm31);

    cmpl(rounds, 60);
    jcc(Assembler::aboveEqual, AES256_REMAINDER4);
    vaesenclast(xmm0, xmm0, xmm5, Assembler::AVX_512bit);
    jmp(END_REMAINDER4);

    bind(AES256_REMAINDER4);
    vaesenc(xmm0, xmm0, xmm5, Assembler::AVX_512bit);
    ev_load_key(xmm6, key, 13 * 16, xmm31);
    vaesenc(xmm0, xmm0, xmm6, Assembler::AVX_512bit);
    ev_load_key(xmm7, key, 14 * 16, xmm31);
    vaesenclast(xmm0, xmm0, xmm7, Assembler::AVX_512bit);
    // After the AES encode rounds, the encrypted counter blocks are in zmm0.
    // XOR them with the PT and store 64 bytes of ciphertext.
    bind(END_REMAINDER4);
    evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
    evmovdquq(Address(dest_addr, pos, Address::times_1, 0), xmm0, Assembler::AVX_512bit);
    addq(pos, 64);
    // Load the mask for incrementing the counter value by 1
    evmovdquq(xmm19, ExternalAddress(StubRoutines::x86::counter_mask_addr() + 80), Assembler::AVX_128bit, r15); // Linc0 + 16(rip)
       
    // For a single block, the AES rounds start here.
    bind(REMAINDER_LOOP);
    cmpl(len_reg, 0);
    jcc(Assembler::belowEqual, END);
    // Shuffle the counter and XOR it with the first round key
    vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_128bit);
    evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_128bit);
    vaesenc(xmm0, xmm0, xmm21, Assembler::AVX_128bit);
    // Increment counter by 1
    vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_128bit);
    vaesenc(xmm0, xmm0, xmm22, Assembler::AVX_128bit);
    vaesenc(xmm0, xmm0, xmm23, Assembler::AVX_128bit);
    vaesenc(xmm0, xmm0, xmm24, Assembler::AVX_128bit);
    vaesenc(xmm0, xmm0, xmm25, Assembler::AVX_128bit);
    vaesenc(xmm0, xmm0, xmm26, Assembler::AVX_128bit);
    vaesenc(xmm0, xmm0, xmm27, Assembler::AVX_128bit);
    vaesenc(xmm0, xmm0, xmm28, Assembler::AVX_128bit);
    vaesenc(xmm0, xmm0, xmm29, Assembler::AVX_128bit);

    cmpl(rounds, 52);
    jcc(Assembler::aboveEqual, AES192_REMAINDER);
    vaesenclast(xmm0, xmm0, xmm30, Assembler::AVX_128bit);
    jmp(END_REMAINDER_LOOP);

    bind(AES192_REMAINDER);
    vaesenc(xmm0, xmm0, xmm30, Assembler::AVX_128bit);
    ev_load_key(xmm18, key, 11 * 16, xmm31);
    vaesenc(xmm0, xmm0, xmm18, Assembler::AVX_128bit);
    ev_load_key(xmm5, key, 12 * 16, xmm31);
    cmpl(rounds, 60);
    jcc(Assembler::aboveEqual, AES256_REMAINDER);
    vaesenclast(xmm0, xmm0, xmm5, Assembler::AVX_128bit);
    jmp(END_REMAINDER_LOOP);

    bind(AES256_REMAINDER);
    vaesenc(xmm0, xmm0, xmm5, Assembler::AVX_128bit);
    ev_load_key(xmm6, key, 13 * 16, xmm31);
    vaesenc(xmm0, xmm0, xmm6, Assembler::AVX_128bit);
    ev_load_key(xmm7, key, 14 * 16, xmm31);
    vaesenclast(xmm0, xmm0, xmm7, Assembler::AVX_128bit);
       
    bind(END_REMAINDER_LOOP);
    // If fewer than blockSize (16) bytes remain, store only that many bytes
    // of CT to the destination; extracting the exact byte count is handled
    // under EXTRACT_TAILBYTES.
    cmpl(len_reg, 16);
    jcc(Assembler::less, EXTRACT_TAILBYTES);
    subl(len_reg, 16);
    // After the AES encode rounds, the encrypted counter block is in xmm0.
    // A full block remains, so XOR it with the PT and store 16 bytes of CT in dest.
    evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0), Assembler::AVX_128bit);
    evmovdquq(Address(dest_addr, pos, Address::times_1, 0), xmm0, Assembler::AVX_128bit);
    addl(pos, 16);

    jmp(REMAINDER_LOOP);
       
    bind(EXTRACT_TAILBYTES);
    // Save the encrypted counter value in xmm0 for the next invocation, before
    // the XOR operation.
    movdqu(Address(saved_encCounter_start, 0), xmm0);
    // XOR the encrypted counter in xmm0 with the PT to produce the CT.
    evpxorq(xmm0, xmm0, Address(src_addr, pos, Address::times_1, 0), Assembler::AVX_128bit);
    // Extract up to 15 bytes of CT from xmm0, as specified by the length register.
    testptr(len_reg, 8);
    jcc(Assembler::zero, EXTRACT_TAIL_4BYTES);
    pextrq(Address(dest_addr, pos), xmm0, 0);
    psrldq(xmm0, 8);
    addl(pos, 8);
    bind(EXTRACT_TAIL_4BYTES);
    testptr(len_reg, 4);
    jcc(Assembler::zero, EXTRACT_TAIL_2BYTES);
    pextrd(Address(dest_addr, pos), xmm0, 0);
    psrldq(xmm0, 4);
    addq(pos, 4);
    bind(EXTRACT_TAIL_2BYTES);
    testptr(len_reg, 2);
    jcc(Assembler::zero, EXTRACT_TAIL_1BYTE);
    pextrw(Address(dest_addr, pos), xmm0, 0);
    psrldq(xmm0, 2);
    addl(pos, 2);
    bind(EXTRACT_TAIL_1BYTE);
    testptr(len_reg, 1);
    jcc(Assembler::zero, END);
    pextrb(Address(dest_addr, pos), xmm0, 0);
    addl(pos, 1);
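
    // The tail store above decomposes the remaining length (0-15) into its
    // set bits, shifting consumed bytes out of xmm0 as it goes. A scalar
    // sketch, with ct a byte pointer into the block just produced
    // (illustrative only; memcpy stands in for the pextr* stores):
    //
    //   if (len & 8) { memcpy(dest + pos, ct, 8); ct += 8; pos += 8; }
    //   if (len & 4) { memcpy(dest + pos, ct, 4); ct += 4; pos += 4; }
    //   if (len & 2) { memcpy(dest + pos, ct, 2); ct += 2; pos += 2; }
    //   if (len & 1) { dest[pos] = *ct; pos += 1; }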
       
    bind(END);
    // If tail bytes were produced, len_reg holds how many bytes of the saved
    // encrypted counter were consumed; record that for the next invocation.
    // Otherwise just store the counter value and exit.
    cmpl(len_reg, 0);
    jcc(Assembler::equal, STORE_CTR);
    movl(Address(used_addr, 0), len_reg);

    bind(STORE_CTR);
    // Shuffle the updated counter back to big-endian byte order and store it.
    vpshufb(xmm8, xmm8, xmm16, Assembler::AVX_128bit);
    movdqu(Address(counter, 0), xmm8);
    // Zero out counter and key registers.
    evpxorq(xmm8, xmm8, xmm8, Assembler::AVX_512bit);
    evpxorq(xmm20, xmm20, xmm20, Assembler::AVX_512bit);
    evpxorq(xmm21, xmm21, xmm21, Assembler::AVX_512bit);
    evpxorq(xmm22, xmm22, xmm22, Assembler::AVX_512bit);
    evpxorq(xmm23, xmm23, xmm23, Assembler::AVX_512bit);
    evpxorq(xmm24, xmm24, xmm24, Assembler::AVX_512bit);
    evpxorq(xmm25, xmm25, xmm25, Assembler::AVX_512bit);
    evpxorq(xmm26, xmm26, xmm26, Assembler::AVX_512bit);
    evpxorq(xmm27, xmm27, xmm27, Assembler::AVX_512bit);
    evpxorq(xmm28, xmm28, xmm28, Assembler::AVX_512bit);
    evpxorq(xmm29, xmm29, xmm29, Assembler::AVX_512bit);
    evpxorq(xmm30, xmm30, xmm30, Assembler::AVX_512bit);
    // The AES-192/256 paths used extra key registers; clear those as well.
    cmpl(rounds, 44);
    jcc(Assembler::belowEqual, EXIT);
    evpxorq(xmm18, xmm18, xmm18, Assembler::AVX_512bit);
    evpxorq(xmm5, xmm5, xmm5, Assembler::AVX_512bit);
    cmpl(rounds, 52);
    jcc(Assembler::belowEqual, EXIT);
    evpxorq(xmm6, xmm6, xmm6, Assembler::AVX_512bit);
    evpxorq(xmm7, xmm7, xmm7, Assembler::AVX_512bit);
    bind(EXIT);
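
    // Note: this stub is expected to be reached through
    // StubRoutines::counterMode_AESCrypt() as the intrinsic for
    // com.sun.crypto.provider.CounterMode (an assumption based on how the
    // other AES stubs here are wired up; the wiring itself is outside this
    // changeset).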
       
}

#endif // _LP64