hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp
changeset 36825 6ebe5519b753
parent 36555 4f37fd7a5a09
child 38018 1dc6c6f21231
child 37466 287c4ebd11b0
equal deleted inserted replaced
36824:593d798b0954 36825:6ebe5519b753
  2598   //
  2598   //
  2599   // Output:
  2599   // Output:
  2600   //   rax       - input length
  2600   //   rax       - input length
  2601   //
  2601   //
  2602 
  2602 
  2603   address generate_cipherBlockChaining_decryptAESCrypt() {
  2603   address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
  2604     assert(UseAES, "need AES instructions and misaligned SSE support");
  2604     assert(UseAES, "need AES instructions and misaligned SSE support");
  2605     __ align(CodeEntryAlignment);
  2605     __ align(CodeEntryAlignment);
  2606     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
  2606     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
  2607     address start = __ pc();
  2607     address start = __ pc();
  2608 
  2608 
  2609     Label L_exit, L_key_192_256, L_key_256;
       
  2610     Label L_singleBlock_loopTop_128;
       
  2611     Label L_singleBlock_loopTop_192, L_singleBlock_loopTop_256;
       
  2612     const Register from        = rsi;      // source array address
  2609     const Register from        = rsi;      // source array address
  2613     const Register to          = rdx;      // destination array address
  2610     const Register to          = rdx;      // destination array address
  2614     const Register key         = rcx;      // key array address
  2611     const Register key         = rcx;      // key array address
  2615     const Register rvec        = rdi;      // r byte array initialized from initvector array address
  2612     const Register rvec        = rdi;      // r byte array initialized from initvector array address
  2616                                            // and left with the results of the last encryption block
  2613                                            // and left with the results of the last encryption block
  2617     const Register len_reg     = rbx;      // src len (must be multiple of blocksize 16)
  2614     const Register len_reg     = rbx;      // src len (must be multiple of blocksize 16)
  2618     const Register pos         = rax;
  2615     const Register pos         = rax;
  2619 
  2616 
  2620     // xmm register assignments for the loops below
  2617     const int PARALLEL_FACTOR = 4;
  2621     const XMMRegister xmm_result = xmm0;
  2618     const int ROUNDS[3] = { 10, 12, 14 }; //aes rounds for key128, key192, key256
  2622     const XMMRegister xmm_temp   = xmm1;
  2619 
  2623     // first 6 keys preloaded into xmm2-xmm7
  2620     Label L_exit;
  2624     const int XMM_REG_NUM_KEY_FIRST = 2;
  2621     Label L_singleBlock_loopTop[3]; //128, 192, 256
  2625     const int XMM_REG_NUM_KEY_LAST  = 7;
  2622     Label L_multiBlock_loopTop[3]; //128, 192, 256
  2626     const int FIRST_NON_REG_KEY_offset = 0x70;
  2623 
  2627     const XMMRegister xmm_key_first   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
  2624     const XMMRegister xmm_prev_block_cipher = xmm0; // holds cipher of previous block
       
  2625     const XMMRegister xmm_key_shuf_mask = xmm1;
       
  2626 
       
  2627     const XMMRegister xmm_key_tmp0 = xmm2;
       
  2628     const XMMRegister xmm_key_tmp1 = xmm3;
       
  2629 
       
  2630     // registers holding the four results in the parallelized loop
       
  2631     const XMMRegister xmm_result0 = xmm4;
       
  2632     const XMMRegister xmm_result1 = xmm5;
       
  2633     const XMMRegister xmm_result2 = xmm6;
       
  2634     const XMMRegister xmm_result3 = xmm7;
  2628 
  2635 
  2629     __ enter(); // required for proper stackwalking of RuntimeStub frame
  2636     __ enter(); // required for proper stackwalking of RuntimeStub frame
  2630     handleSOERegisters(true /*saving*/);
  2637     handleSOERegisters(true /*saving*/);
  2631 
  2638 
  2632     // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
  2639     // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
  2641     const Address  from_param(rbp, 8+0);
  2648     const Address  from_param(rbp, 8+0);
  2642     const Address  to_param  (rbp, 8+4);
  2649     const Address  to_param  (rbp, 8+4);
  2643     const Address  key_param (rbp, 8+8);
  2650     const Address  key_param (rbp, 8+8);
  2644     const Address  rvec_param (rbp, 8+12);
  2651     const Address  rvec_param (rbp, 8+12);
  2645     const Address  len_param  (rbp, 8+16);
  2652     const Address  len_param  (rbp, 8+16);
       
  2653 
  2646     __ movptr(from , from_param);
  2654     __ movptr(from , from_param);
  2647     __ movptr(to   , to_param);
  2655     __ movptr(to   , to_param);
  2648     __ movptr(key  , key_param);
  2656     __ movptr(key  , key_param);
  2649     __ movptr(rvec , rvec_param);
  2657     __ movptr(rvec , rvec_param);
  2650     __ movptr(len_reg , len_param);
  2658     __ movptr(len_reg , len_param);
  2651 
  2659 
  2652     // the java expanded key ordering is rotated one position from what we want
       
  2653     // so we start from 0x10 here and hit 0x00 last
       
  2654     const XMMRegister xmm_key_shuf_mask = xmm1;  // used temporarily to swap key bytes up front
       
  2655     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
  2660     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
  2656     // load up xmm regs 2 thru 6 with first 5 keys
  2661     __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00)); // initialize with initial rvec
  2657     for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
  2662 
  2658       load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
  2663     __ xorptr(pos, pos);
  2659       offset += 0x10;
       
  2660     }
       
  2661 
       
  2662     // inside here, use the rvec register to point to previous block cipher
       
  2663     // with which we xor at the end of each newly decrypted block
       
  2664     const Register  prev_block_cipher_ptr = rvec;
       
  2665 
  2664 
  2666     // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
  2665     // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
  2667     __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
  2666     // rvec is reused here as a scratch register for the key length; it is reloaded from rvec_param at L_exit
  2668     __ cmpl(rax, 44);
  2667     __ movl(rvec, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
  2669     __ jcc(Assembler::notEqual, L_key_192_256);
  2668     __ cmpl(rvec, 52);
  2670 
  2669     __ jcc(Assembler::equal, L_multiBlock_loopTop[1]);
  2671 
  2670     __ cmpl(rvec, 60);
  2672     // 128-bit code follows here, parallelized
  2671     __ jcc(Assembler::equal, L_multiBlock_loopTop[2]);
  2673     __ movl(pos, 0);
  2672 
  2674     __ align(OptoLoopAlignment);
  2673 #define DoFour(opc, src_reg)           \
  2675     __ BIND(L_singleBlock_loopTop_128);
  2674   __ opc(xmm_result0, src_reg);         \
  2676     __ cmpptr(len_reg, 0);           // any blocks left??
  2675   __ opc(xmm_result1, src_reg);         \
  2677     __ jcc(Assembler::equal, L_exit);
  2676   __ opc(xmm_result2, src_reg);         \
  2678     __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
  2677   __ opc(xmm_result3, src_reg);         \
  2679     __ pxor  (xmm_result, xmm_key_first);                             // do the aes dec rounds
  2678 
  2680     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
  2679     for (int k = 0; k < 3; ++k) {
  2681       __ aesdec(xmm_result, as_XMMRegister(rnum));
  2680       __ align(OptoLoopAlignment);
  2682     }
  2681       __ BIND(L_multiBlock_loopTop[k]);
  2683     for (int key_offset = FIRST_NON_REG_KEY_offset; key_offset <= 0xa0; key_offset += 0x10) {   // 128-bit runs up to key offset a0
  2682       __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
  2684       aes_dec_key(xmm_result, xmm_temp, key, key_offset);
  2683       __ jcc(Assembler::less, L_singleBlock_loopTop[k]);
  2685     }
  2684 
  2686     load_key(xmm_temp, key, 0x00);                                     // final key is stored in java expanded array at offset 0
  2685       __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); // get next 4 blocks into xmmresult registers
  2687     __ aesdeclast(xmm_result, xmm_temp);
  2686       __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
  2688     __ movdqu(xmm_temp, Address(prev_block_cipher_ptr, 0x00));
  2687       __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
  2689     __ pxor  (xmm_result, xmm_temp);                                  // xor with the current r vector
  2688       __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
  2690     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
  2689 
  2691     // no need to store r to memory until we exit
  2690       // the java expanded key ordering is rotated one position from what we want
  2692     __ lea(prev_block_cipher_ptr, Address(from, pos, Address::times_1, 0));     // set up new ptr
  2691       // so we start from 0x10 here and hit 0x00 last
  2693     __ addptr(pos, AESBlockSize);
  2692       load_key(xmm_key_tmp0, key, 0x10, xmm_key_shuf_mask);
  2694     __ subptr(len_reg, AESBlockSize);
  2693       DoFour(pxor, xmm_key_tmp0); //xor with first key
  2695     __ jmp(L_singleBlock_loopTop_128);
  2694       // do the aes dec rounds
  2696 
  2695       for (int rnum = 1; rnum <= ROUNDS[k];) {
       
  2696         //load two keys at a time
       
  2697         // e.g. for a 128-bit key (10 rounds): k1->0x20, ..., k9->0xa0, k10->0x00 (the final key is always at offset 0x00)
       
  2698         load_key(xmm_key_tmp1, key, (rnum + 1) * 0x10, xmm_key_shuf_mask);
       
  2699         load_key(xmm_key_tmp0, key, ((rnum + 2) % (ROUNDS[k] + 1)) * 0x10, xmm_key_shuf_mask); // hit 0x00 last!
       
  2700         DoFour(aesdec, xmm_key_tmp1);
       
  2701         rnum++;
       
  2702         if (rnum != ROUNDS[k]) {
       
  2703           DoFour(aesdec, xmm_key_tmp0);
       
  2704         }
       
  2705         else {
       
  2706           DoFour(aesdeclast, xmm_key_tmp0);
       
  2707         }
       
  2708         rnum++;
       
  2709       }
       
  2710 
       
  2711       // for each result, xor with the r vector of previous cipher block
       
  2712       __ pxor(xmm_result0, xmm_prev_block_cipher);
       
  2713       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0 * AESBlockSize));
       
  2714       __ pxor(xmm_result1, xmm_prev_block_cipher);
       
  2715       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1 * AESBlockSize));
       
  2716       __ pxor(xmm_result2, xmm_prev_block_cipher);
       
  2717       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2 * AESBlockSize));
       
  2718       __ pxor(xmm_result3, xmm_prev_block_cipher);
       
  2719       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3 * AESBlockSize)); // this will carry over to next set of blocks
       
  2720 
       
  2721             // store 4 results into the next 64 bytes of output
       
  2722        __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
       
  2723        __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
       
  2724        __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
       
  2725        __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
       
  2726 
       
  2727        __ addptr(pos, 4 * AESBlockSize);
       
  2728        __ subptr(len_reg, 4 * AESBlockSize);
       
  2729        __ jmp(L_multiBlock_loopTop[k]);
       
  2730 
       
  2731        //singleBlock starts here
       
  2732        __ align(OptoLoopAlignment);
       
  2733        __ BIND(L_singleBlock_loopTop[k]);
       
  2734        __ cmpptr(len_reg, 0); // any blocks left?
       
  2735        __ jcc(Assembler::equal, L_exit);
       
  2736        __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
       
  2737        __ movdqa(xmm_result1, xmm_result0);
       
  2738 
       
  2739        load_key(xmm_key_tmp0, key, 0x10, xmm_key_shuf_mask);
       
  2740        __ pxor(xmm_result0, xmm_key_tmp0);
       
  2741        // do the aes dec rounds
       
  2742        for (int rnum = 1; rnum < ROUNDS[k]; rnum++) {
       
  2743          // the java expanded key ordering is rotated one position from what we want
       
  2744          load_key(xmm_key_tmp0, key, (rnum + 1) * 0x10, xmm_key_shuf_mask);
       
  2745          __ aesdec(xmm_result0, xmm_key_tmp0);
       
  2746        }
       
  2747        load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
       
  2748        __ aesdeclast(xmm_result0, xmm_key_tmp0);
       
  2749        __ pxor(xmm_result0, xmm_prev_block_cipher); // xor with the current r vector
       
  2750        __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result0); // store into the next 16 bytes of output
       
  2751        // no need to store r to memory until we exit
       
  2752        __ movdqa(xmm_prev_block_cipher, xmm_result1); // set up next r vector with cipher input from this block
       
  2753 
       
  2754        __ addptr(pos, AESBlockSize);
       
  2755        __ subptr(len_reg, AESBlockSize);
       
  2756        __ jmp(L_singleBlock_loopTop[k]);
       
  2757     }//for 128/192/256
  2697 
  2758 
  2698     __ BIND(L_exit);
  2759     __ BIND(L_exit);
  2699     __ movdqu(xmm_temp, Address(prev_block_cipher_ptr, 0x00));
  2760     __ movptr(rvec, rvec_param);                        // restore this since reused earlier
  2700     __ movptr(rvec , rvec_param);                                     // restore this since used in loop
  2761     __ movdqu(Address(rvec, 0), xmm_prev_block_cipher); // final value of r stored in rvec of CipherBlockChaining object
  2701     __ movdqu(Address(rvec, 0), xmm_temp);                            // final value of r stored in rvec of CipherBlockChaining object
       
  2702     handleSOERegisters(false /*restoring*/);
  2762     handleSOERegisters(false /*restoring*/);
  2703     __ movptr(rax, len_param); // return length
  2763     __ movptr(rax, len_param);                          // return length
  2704     __ leave();                                                       // required for proper stackwalking of RuntimeStub frame
  2764     __ leave();                                         // required for proper stackwalking of RuntimeStub frame
  2705     __ ret(0);
  2765     __ ret(0);
  2706 
  2766 
  2707 
       
  2708     __ BIND(L_key_192_256);
       
  2709     // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
       
  2710     __ cmpl(rax, 52);
       
  2711     __ jcc(Assembler::notEqual, L_key_256);
       
  2712 
       
  2713     // 192-bit code follows here (could be optimized to use parallelism)
       
  2714     __ movl(pos, 0);
       
  2715     __ align(OptoLoopAlignment);
       
  2716     __ BIND(L_singleBlock_loopTop_192);
       
  2717     __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
       
  2718     __ pxor  (xmm_result, xmm_key_first);                             // do the aes dec rounds
       
  2719     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
       
  2720       __ aesdec(xmm_result, as_XMMRegister(rnum));
       
  2721     }
       
  2722     for (int key_offset = FIRST_NON_REG_KEY_offset; key_offset <= 0xc0; key_offset += 0x10) {   // 192-bit runs up to key offset c0
       
  2723       aes_dec_key(xmm_result, xmm_temp, key, key_offset);
       
  2724     }
       
  2725     load_key(xmm_temp, key, 0x00);                                     // final key is stored in java expanded array at offset 0
       
  2726     __ aesdeclast(xmm_result, xmm_temp);
       
  2727     __ movdqu(xmm_temp, Address(prev_block_cipher_ptr, 0x00));
       
  2728     __ pxor  (xmm_result, xmm_temp);                                  // xor with the current r vector
       
  2729     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
       
  2730     // no need to store r to memory until we exit
       
  2731     __ lea(prev_block_cipher_ptr, Address(from, pos, Address::times_1, 0));     // set up new ptr
       
  2732     __ addptr(pos, AESBlockSize);
       
  2733     __ subptr(len_reg, AESBlockSize);
       
  2734     __ jcc(Assembler::notEqual,L_singleBlock_loopTop_192);
       
  2735     __ jmp(L_exit);
       
  2736 
       
  2737     __ BIND(L_key_256);
       
  2738     // 256-bit code follows here (could be optimized to use parallelism)
       
  2739     __ movl(pos, 0);
       
  2740     __ align(OptoLoopAlignment);
       
  2741     __ BIND(L_singleBlock_loopTop_256);
       
  2742     __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
       
  2743     __ pxor  (xmm_result, xmm_key_first);                             // do the aes dec rounds
       
  2744     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
       
  2745       __ aesdec(xmm_result, as_XMMRegister(rnum));
       
  2746     }
       
  2747     for (int key_offset = FIRST_NON_REG_KEY_offset; key_offset <= 0xe0; key_offset += 0x10) {   // 256-bit runs up to key offset e0
       
  2748       aes_dec_key(xmm_result, xmm_temp, key, key_offset);
       
  2749     }
       
  2750     load_key(xmm_temp, key, 0x00);                                     // final key is stored in java expanded array at offset 0
       
  2751     __ aesdeclast(xmm_result, xmm_temp);
       
  2752     __ movdqu(xmm_temp, Address(prev_block_cipher_ptr, 0x00));
       
  2753     __ pxor  (xmm_result, xmm_temp);                                  // xor with the current r vector
       
  2754     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
       
  2755     // no need to store r to memory until we exit
       
  2756     __ lea(prev_block_cipher_ptr, Address(from, pos, Address::times_1, 0));     // set up new ptr
       
  2757     __ addptr(pos, AESBlockSize);
       
  2758     __ subptr(len_reg, AESBlockSize);
       
  2759     __ jcc(Assembler::notEqual,L_singleBlock_loopTop_256);
       
  2760     __ jmp(L_exit);
       
  2761 
       
  2762     return start;
  2767     return start;
  2763   }
  2768   }
  2764 
       
  2765 
  2769 
  2766   // CTR AES crypt.
  2770   // CTR AES crypt.
  2767   // In 32-bit stub, parallelize 4 blocks at a time
  2771   // In 32-bit stub, parallelize 4 blocks at a time
  2768   // Arguments:
  2772   // Arguments:
  2769   //
  2773   //
  3892       StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask();  // might be needed by the others
  3896       StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask();  // might be needed by the others
  3893 
  3897 
  3894       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
  3898       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
  3895       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
  3899       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
  3896       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
  3900       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
  3897       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
  3901       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
  3898     }
  3902     }
  3899 
  3903 
  3900     if (UseAESCTRIntrinsics) {
  3904     if (UseAESCTRIntrinsics) {
  3901       StubRoutines::x86::_counter_shuffle_mask_addr = generate_counter_shuffle_mask();
  3905       StubRoutines::x86::_counter_shuffle_mask_addr = generate_counter_shuffle_mask();
  3902       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel();
  3906       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel();