hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp
changeset 35154 a9b3c1984a01
parent 35146 9ebfec283f56
child 35537 bed5e2dc57a1
equal deleted inserted replaced
35153:0341260cd1f2 35154:a9b3c1984a01
  3037     __ emit_data64( 0x0405060700010203, relocInfo::none );
  3037     __ emit_data64( 0x0405060700010203, relocInfo::none );
  3038     __ emit_data64( 0x0c0d0e0f08090a0b, relocInfo::none );
  3038     __ emit_data64( 0x0c0d0e0f08090a0b, relocInfo::none );
  3039     return start;
  3039     return start;
  3040   }
  3040   }
  3041 
  3041 
       
  3042   address generate_counter_shuffle_mask() {
       
  3043     __ align(16);
       
  3044     StubCodeMark mark(this, "StubRoutines", "counter_shuffle_mask");
       
  3045     address start = __ pc();
       
  3046     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
       
  3047     __ emit_data64(0x0001020304050607, relocInfo::none);
       
  3048     return start;
       
  3049   }
       
  3050 
  // Utility routine for loading a 128-bit key word in little endian format
  // can optionally specify that the shuffle mask is already in an xmmregister
  //
  // xmmdst        - destination XMM register for the shuffled key word
  // key           - base address of the key array
  // offset        - byte offset of the 16-byte key word within the array
  // xmm_shuf_mask - optional register already holding the key shuffle mask;
  //                 when NULL the mask is read from memory instead
  void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
    __ movdqu(xmmdst, Address(key, offset));
    if (xmm_shuf_mask != NULL) {
      // Caller pre-loaded the mask into a register; avoids a memory operand.
      __ pshufb(xmmdst, xmm_shuf_mask);
    } else {
      // Fall back to the in-memory mask emitted by generate_key_shuffle_mask().
      __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    }
  }
       
  3061 
       
  // Utility routine for increase 128bit counter (iv in CTR mode)
  //
  // Adds inc_delta to the 128-bit value in xmmdst (treated as two little-endian
  // quadwords), propagating a carry from the low quadword into the high one.
  // reg is clobbered as a scratch register; next_block is bound here and marks
  // the point reached once any carry has been handled.
  void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block) {
    __ pextrq(reg, xmmdst, 0x0);
    __ addq(reg, inc_delta);      // sets CF on unsigned overflow of the low qword
    __ pinsrq(xmmdst, reg, 0x0);  // pinsrq does not modify EFLAGS, so CF survives to the jcc
    __ jcc(Assembler::carryClear, next_block); // jump if no carry
    __ pextrq(reg, xmmdst, 0x01); // Carry
    __ addq(reg, 0x01);
    __ pinsrq(xmmdst, reg, 0x01); //Carry end
    __ BIND(next_block);          // next instruction
  }
  3052 
  3073 
  3053   // Arguments:
  3074   // Arguments:
  3054   //
  3075   //
  3055   // Inputs:
  3076   // Inputs:
  3698     __ jmp(L_exit);
  3719     __ jmp(L_exit);
  3699 
  3720 
  3700     return start;
  3721     return start;
  3701   }
  3722   }
  3702 
  3723 
       
  // This is a version of CTR/AES crypt which does 6 blocks in a loop at a time
  // to hide instruction latency
  //
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //   c_rarg3   - counter vector byte array address
  //   Linux
  //     c_rarg4   -          input length
  //     c_rarg5   -          saved encryptedCounter start
  //     rbp + 6 * wordSize - saved used length
  //   Windows
  //     rbp + 6 * wordSize - input length
  //     rbp + 7 * wordSize - saved encryptedCounter start
  //     rbp + 8 * wordSize - saved used length
  //
  // Output:
  //   rax       - input length
  //
  address generate_counterMode_AESCrypt_Parallel() {
    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
    address start = __ pc();
    const Register from = c_rarg0; // source array address
    const Register to = c_rarg1; // destination array address
    const Register key = c_rarg2; // key array address
    const Register counter = c_rarg3; // counter byte array initialized from counter array address
    // and left with the results of the last encryption block
#ifndef _WIN64
    const Register len_reg = c_rarg4;
    const Register saved_encCounter_start = c_rarg5;
    const Register used_addr = r10;
    // NOTE(review): first stack argument (7th C argument) after enter() lives at
    // rbp + 2 * wordSize; the header comment above says rbp + 6 * wordSize --
    // confirm which is right (the code below is what actually executes).
    const Address  used_mem(rbp, 2 * wordSize);
    const Register used = r11;
#else
    const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
    const Address saved_encCounter_mem(rbp, 7 * wordSize); // saved encryptedCounter is on stack on Win64
    const Address used_mem(rbp, 8 * wordSize); // saved used length is on stack on Win64
    const Register len_reg = r10; // pick the first volatile windows register
    const Register saved_encCounter_start = r11;
    const Register used_addr = r13;
    const Register used = r14;
#endif
    const Register pos = rax;

    const int PARALLEL_FACTOR = 6;
    const XMMRegister xmm_counter_shuf_mask = xmm0;
    const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front
    const XMMRegister xmm_curr_counter = xmm2;

    const XMMRegister xmm_key_tmp0 = xmm3;
    const XMMRegister xmm_key_tmp1 = xmm4;

    // registers holding the six results in the parallelized loop
    const XMMRegister xmm_result0 = xmm5;
    const XMMRegister xmm_result1 = xmm6;
    const XMMRegister xmm_result2 = xmm7;
    const XMMRegister xmm_result3 = xmm8;
    const XMMRegister xmm_result4 = xmm9;
    const XMMRegister xmm_result5 = xmm10;

    const XMMRegister xmm_from0 = xmm11;
    const XMMRegister xmm_from1 = xmm12;
    const XMMRegister xmm_from2 = xmm13;
    const XMMRegister xmm_from3 = xmm14; //the last one is xmm14. we have to preserve it on WIN64.
    const XMMRegister xmm_from4 = xmm3; //reuse xmm3~4. Because xmm_key_tmp0~1 are useless when loading input text
    const XMMRegister xmm_from5 = xmm4;

    //for key_128, key_192, key_256
    const int rounds[3] = {10, 12, 14};
    Label L_exit_preLoop, L_preLoop_start;
    Label L_multiBlock_loopTop[3];
    Label L_singleBlockLoopTop[3];
    Label L__incCounter[3][6]; //for 6 blocks
    Label L__incCounter_single[3]; //for single block, key128, key192, key256
    Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3];
    Label L_processTail_extr[3], L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3];

    Label L_exit;

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
    // context for the registers used, where all instructions below are using 128-bit mode
    // On EVEX without VL and BW, these instructions will all be AVX.
    if (VM_Version::supports_avx512vlbw()) {
        __ movl(rax, 0xffff);
        __ kmovql(k1, rax);
    }

#ifdef _WIN64
    // save the xmm registers which must be preserved 6-14
    const int XMM_REG_NUM_KEY_LAST = 14;
    __ subptr(rsp, -rsp_after_call_off * wordSize);
    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
      __ movdqu(xmm_save(i), as_XMMRegister(i));
    }

    // r13/r14 are callee-saved on Win64; spill them into the frame slots
    // normally used for rdi/rsi so they can serve as used_addr/used below.
    const Address r13_save(rbp, rdi_off * wordSize);
    const Address r14_save(rbp, rsi_off * wordSize);

    __ movptr(r13_save, r13);
    __ movptr(r14_save, r14);

    // on win64, fill len_reg from stack position
    __ movl(len_reg, len_mem);
    __ movptr(saved_encCounter_start, saved_encCounter_mem);
    __ movptr(used_addr, used_mem);
    __ movl(used, Address(used_addr, 0));
#else
    __ push(len_reg); // Save
    __ movptr(used_addr, used_mem);
    __ movl(used, Address(used_addr, 0));
#endif

    __ push(rbx); // Save RBX
    __ movdqu(xmm_curr_counter, Address(counter, 0x00)); // initialize counter with initial counter
    __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()));
    __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled
    __ movptr(pos, 0);

    // Use the partially used encrypted counter from last invocation:
    // XOR input bytes against leftover keystream (bytes used..15 of the saved
    // encrypted counter) one at a time until it is exhausted or len runs out.
    __ BIND(L_preLoop_start);
    __ cmpptr(used, 16);
    __ jcc(Assembler::aboveEqual, L_exit_preLoop);
      __ cmpptr(len_reg, 0);
      __ jcc(Assembler::lessEqual, L_exit_preLoop);
      __ movb(rbx, Address(saved_encCounter_start, used));
      __ xorb(rbx, Address(from, pos));
      __ movb(Address(to, pos), rbx);
      __ addptr(pos, 1);
      __ addptr(used, 1);
      __ subptr(len_reg, 1);

    __ jmp(L_preLoop_start);

    __ BIND(L_exit_preLoop);
    __ movl(Address(used_addr, 0), used);

    // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
    __ cmpl(rbx, 52);
    __ jcc(Assembler::equal, L_multiBlock_loopTop[1]);
    __ cmpl(rbx, 60);
    __ jcc(Assembler::equal, L_multiBlock_loopTop[2]);
    // fall through to the key-128 code (L_multiBlock_loopTop[0] is bound first)

// Emit the same instruction for all six result registers.
#define CTR_DoSix(opc, src_reg)                \
    __ opc(xmm_result0, src_reg);              \
    __ opc(xmm_result1, src_reg);              \
    __ opc(xmm_result2, src_reg);              \
    __ opc(xmm_result3, src_reg);              \
    __ opc(xmm_result4, src_reg);              \
    __ opc(xmm_result5, src_reg);

    // k == 0 :  generate code for key_128
    // k == 1 :  generate code for key_192
    // k == 2 :  generate code for key_256
    for (int k = 0; k < 3; ++k) {
      //multi blocks starts here
      __ align(OptoLoopAlignment);
      __ BIND(L_multiBlock_loopTop[k]);
      __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left
      __ jcc(Assembler::less, L_singleBlockLoopTop[k]);
      load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);

      //load, then increase counters
      // xmm_result0 keeps the current counter value; results 1..5 get
      // counter+1..counter+5, and xmm_curr_counter advances by 6 for the
      // next iteration.
      CTR_DoSix(movdqa, xmm_curr_counter);
      inc_counter(rbx, xmm_result1, 0x01, L__incCounter[k][0]);
      inc_counter(rbx, xmm_result2, 0x02, L__incCounter[k][1]);
      inc_counter(rbx, xmm_result3, 0x03, L__incCounter[k][2]);
      inc_counter(rbx, xmm_result4, 0x04, L__incCounter[k][3]);
      inc_counter(rbx, xmm_result5,  0x05, L__incCounter[k][4]);
      inc_counter(rbx, xmm_curr_counter, 0x06, L__incCounter[k][5]);
      CTR_DoSix(pshufb, xmm_counter_shuf_mask); // after increased, shuffled counters back for PXOR
      CTR_DoSix(pxor, xmm_key_tmp0);   //PXOR with Round 0 key

      //load two ROUND_KEYs at a time
      for (int i = 1; i < rounds[k]; ) {
        load_key(xmm_key_tmp1, key, (0x10 * i), xmm_key_shuf_mask);
        load_key(xmm_key_tmp0, key, (0x10 * (i+1)), xmm_key_shuf_mask);
        CTR_DoSix(aesenc, xmm_key_tmp1);
        i++;
        if (i != rounds[k]) {
          CTR_DoSix(aesenc, xmm_key_tmp0);
        } else {
          // final round uses AESENCLAST
          CTR_DoSix(aesenclast, xmm_key_tmp0);
        }
        i++;
      }

      // get next PARALLEL_FACTOR blocks into xmm_result registers
      __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
      __ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
      __ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
      __ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
      __ movdqu(xmm_from4, Address(from, pos, Address::times_1, 4 * AESBlockSize));
      __ movdqu(xmm_from5, Address(from, pos, Address::times_1, 5 * AESBlockSize));

      __ pxor(xmm_result0, xmm_from0);
      __ pxor(xmm_result1, xmm_from1);
      __ pxor(xmm_result2, xmm_from2);
      __ pxor(xmm_result3, xmm_from3);
      __ pxor(xmm_result4, xmm_from4);
      __ pxor(xmm_result5, xmm_from5);

      // store 6 results into the next 96 bytes of output
      __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
      __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
      __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
      __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
      __ movdqu(Address(to, pos, Address::times_1, 4 * AESBlockSize), xmm_result4);
      __ movdqu(Address(to, pos, Address::times_1, 5 * AESBlockSize), xmm_result5);

      __ addptr(pos, PARALLEL_FACTOR * AESBlockSize); // increase the length of crypt text
      __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // decrease the remaining length
      __ jmp(L_multiBlock_loopTop[k]);

      // singleBlock starts here
      __ align(OptoLoopAlignment);
      __ BIND(L_singleBlockLoopTop[k]);
      __ cmpptr(len_reg, 0);
      __ jcc(Assembler::lessEqual, L_exit);
      load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
      __ movdqa(xmm_result0, xmm_curr_counter);
      inc_counter(rbx, xmm_curr_counter, 0x01, L__incCounter_single[k]);
      __ pshufb(xmm_result0, xmm_counter_shuf_mask);
      __ pxor(xmm_result0, xmm_key_tmp0);
      for (int i = 1; i < rounds[k]; i++) {
        load_key(xmm_key_tmp0, key, (0x10 * i), xmm_key_shuf_mask);
        __ aesenc(xmm_result0, xmm_key_tmp0);
      }
      load_key(xmm_key_tmp0, key, (rounds[k] * 0x10), xmm_key_shuf_mask);
      __ aesenclast(xmm_result0, xmm_key_tmp0);
      __ cmpptr(len_reg, AESBlockSize);
      __ jcc(Assembler::less, L_processTail_insr[k]);
        __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
        __ pxor(xmm_result0, xmm_from0);
        __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
        __ addptr(pos, AESBlockSize);
        __ subptr(len_reg, AESBlockSize);
        __ jmp(L_singleBlockLoopTop[k]);
      // Tail (< 16 bytes): gather the remaining input bytes into xmm_from0
      // back-to-front, using the bits of len_reg (8/4/2/1) to pick insert sizes.
      __ BIND(L_processTail_insr[k]);
        __ addptr(pos, len_reg);
        __ testptr(len_reg, 8);
        __ jcc(Assembler::zero, L_processTail_4_insr[k]);
          __ subptr(pos,8);
          __ pinsrq(xmm_from0, Address(from, pos), 0);
        __ BIND(L_processTail_4_insr[k]);
        __ testptr(len_reg, 4);
        __ jcc(Assembler::zero, L_processTail_2_insr[k]);
          __ subptr(pos,4);
          __ pslldq(xmm_from0, 4);
          __ pinsrd(xmm_from0, Address(from, pos), 0);
        __ BIND(L_processTail_2_insr[k]);
        __ testptr(len_reg, 2);
        __ jcc(Assembler::zero, L_processTail_1_insr[k]);
          __ subptr(pos, 2);
          __ pslldq(xmm_from0, 2);
          __ pinsrw(xmm_from0, Address(from, pos), 0);
        __ BIND(L_processTail_1_insr[k]);
        __ testptr(len_reg, 1);
        __ jcc(Assembler::zero, L_processTail_exit_insr[k]);
          __ subptr(pos, 1);
          __ pslldq(xmm_from0, 1);
          __ pinsrb(xmm_from0, Address(from, pos), 0);
        __ BIND(L_processTail_exit_insr[k]);

        // Save the full encrypted counter so a later call can consume the
        // unused keystream bytes, then XOR to produce the tail ciphertext.
        __ movdqu(Address(saved_encCounter_start, 0), xmm_result0);
        __ pxor(xmm_result0, xmm_from0);

        // Scatter the tail result to the output front-to-back, mirroring the
        // insert sequence above.
        __ testptr(len_reg, 8);
        __ jcc(Assembler::zero, L_processTail_4_extr[k]);
          __ pextrq(Address(to, pos), xmm_result0, 0);
          __ psrldq(xmm_result0, 8);
          __ addptr(pos, 8);
        __ BIND(L_processTail_4_extr[k]);
        __ testptr(len_reg, 4);
        __ jcc(Assembler::zero, L_processTail_2_extr[k]);
          __ pextrd(Address(to, pos), xmm_result0, 0);
          __ psrldq(xmm_result0, 4);
          __ addptr(pos, 4);
        __ BIND(L_processTail_2_extr[k]);
        __ testptr(len_reg, 2);
        __ jcc(Assembler::zero, L_processTail_1_extr[k]);
          __ pextrw(Address(to, pos), xmm_result0, 0);
          __ psrldq(xmm_result0, 2);
          __ addptr(pos, 2);
        __ BIND(L_processTail_1_extr[k]);
        __ testptr(len_reg, 1);
        __ jcc(Assembler::zero, L_processTail_exit_extr[k]);
          __ pextrb(Address(to, pos), xmm_result0, 0);

        __ BIND(L_processTail_exit_extr[k]);
        // record how many keystream bytes of the saved counter were consumed
        __ movl(Address(used_addr, 0), len_reg);
        __ jmp(L_exit);

    }

    __ BIND(L_exit);
    __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled back.
    __ movdqu(Address(counter, 0), xmm_curr_counter); //save counter back
    __ pop(rbx); // pop the saved RBX.
#ifdef _WIN64
    // restore regs belonging to calling function
    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
      __ movdqu(as_XMMRegister(i), xmm_save(i));
    }
    __ movl(rax, len_mem);
    __ movptr(r13, r13_save);
    __ movptr(r14, r14_save);
#else
    __ pop(rax); // return 'len'
#endif
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
    return start;
  }
  3703 
  4046 
  3704   // byte swap x86 long
  4047   // byte swap x86 long
  3705   address generate_ghash_long_swap_mask() {
  4048   address generate_ghash_long_swap_mask() {
  3706     __ align(CodeEntryAlignment);
  4049     __ align(CodeEntryAlignment);
  3707     StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
  4050     StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
  4553     generate_math_stubs();
  4896     generate_math_stubs();
  4554 
  4897 
  4555     // don't bother generating these AES intrinsic stubs unless global flag is set
  4898     // don't bother generating these AES intrinsic stubs unless global flag is set
  4556     if (UseAESIntrinsics) {
  4899     if (UseAESIntrinsics) {
  4557       StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask();  // needed by the others
  4900       StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask();  // needed by the others
  4558 
       
  4559       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
  4901       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
  4560       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
  4902       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
  4561       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
  4903       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
  4562       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
  4904       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
       
  4905     }
       
  4906     if (UseAESCTRIntrinsics){
       
  4907       StubRoutines::x86::_counter_shuffle_mask_addr = generate_counter_shuffle_mask();
       
  4908       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel();
  4563     }
  4909     }
  4564 
  4910 
  4565     // Generate GHASH intrinsics code
  4911     // Generate GHASH intrinsics code
  4566     if (UseGHASHIntrinsics) {
  4912     if (UseGHASHIntrinsics) {
  4567       StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
  4913       StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();