hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp
changeset 34162 16b54851eaf6
parent 33628 09241459a8b8
child 34185 ee71c590a456
34159:f401f5b4327e 34162:16b54851eaf6
  3649 
  3649 
  3650 void MacroAssembler::movptr(Address dst, Register src) {
  3650 void MacroAssembler::movptr(Address dst, Register src) {
  3651   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
  3651   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
  3652 }
  3652 }
  3653 
  3653 
       
  3654 void MacroAssembler::movdqu(Address dst, XMMRegister src) {
       
  3655   if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (src->encoding() > 15)) {
       
  3656     Assembler::vextractf32x4h(dst, src, 0);
       
  3657   } else {
       
  3658     Assembler::movdqu(dst, src);
       
  3659   }
       
  3660 }
       
  3661 
       
  3662 void MacroAssembler::movdqu(XMMRegister dst, Address src) {
       
  3663   if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (dst->encoding() > 15)) {
       
  3664     Assembler::vinsertf32x4h(dst, src, 0);
       
  3665   } else {
       
  3666     Assembler::movdqu(dst, src);
       
  3667   }
       
  3668 }
       
  3669 
       
  3670 void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) {
       
  3671   if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
       
  3672     Assembler::evmovdqul(dst, src, Assembler::AVX_512bit);
       
  3673   } else {
       
  3674     Assembler::movdqu(dst, src);
       
  3675   }
       
  3676 }
       
  3677 
  3654 void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src) {
  3678 void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src) {
  3655   if (reachable(src)) {
  3679   if (reachable(src)) {
  3656     Assembler::movdqu(dst, as_Address(src));
  3680     movdqu(dst, as_Address(src));
  3657   } else {
  3681   } else {
  3658     lea(rscratch1, src);
  3682     lea(rscratch1, src);
  3659     Assembler::movdqu(dst, Address(rscratch1, 0));
  3683     movdqu(dst, Address(rscratch1, 0));
       
  3684   }
       
  3685 }
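
The AddressLiteral overload follows the usual MacroAssembler pattern: if the literal is reachable with a RIP-relative displacement it is used directly via as_Address(src), otherwise its address is first materialized in rscratch1. A simplified standalone sketch of that reachability test (hypothetical names and a plain signed 32-bit window; the real reachable() also accounts for the code-cache layout):

#include <cstdint>
#include <cstdio>

// Simplified model of reachable(AddressLiteral): on x86_64 a RIP-relative
// operand must fit in a signed 32-bit displacement from the code position.
bool rip_reachable(uint64_t code_pos, uint64_t target) {
  int64_t disp = (int64_t)target - (int64_t)code_pos;
  return disp >= INT32_MIN && disp <= INT32_MAX;
}

int main() {
  std::printf("%d\n", rip_reachable(0x7f0000000000ULL, 0x7f0000001000ULL)); // 1: movdqu(dst, as_Address(src))
  std::printf("%d\n", rip_reachable(0x7f0000000000ULL, 0x100000000ULL));    // 0: lea(rscratch1, src) first
  return 0;
}
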
       
  3686 
       
  3687 void MacroAssembler::vmovdqu(Address dst, XMMRegister src) {
       
  3688   if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (src->encoding() > 15)) {
       
  3689     Assembler::vextractf64x4h(dst, src, 0);
       
  3690   } else {
       
  3691     Assembler::vmovdqu(dst, src);
       
  3692   }
       
  3693 }
       
  3694 
       
  3695 void MacroAssembler::vmovdqu(XMMRegister dst, Address src) {
       
  3696   if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (dst->encoding() > 15)) {
       
  3697     Assembler::vinsertf64x4h(dst, src, 0);
       
  3698   } else {
       
  3699     Assembler::vmovdqu(dst, src);
       
  3700   }
       
  3701 }
       
  3702 
       
  3703 void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) {
       
  3704   if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
       
  3705     Assembler::evmovdqul(dst, src, Assembler::AVX_512bit);
       
  3706   }
       
  3707   else {
       
  3708     Assembler::vmovdqu(dst, src);
       
  3709   }
       
  3710 }
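
The movdqu and vmovdqu register and memory forms above share one dispatch rule: with AVX-512 enabled but AVX512VL unavailable, the upper-bank registers xmm16-xmm31 cannot be encoded by the legacy/VEX forms, so memory moves fall back to an EVEX extract/insert of the low lanes and register-register moves to a full 512-bit evmovdqul. A minimal standalone sketch of that decision, using hypothetical names (use_avx, supports_avx512vl, enc) rather than the real VM_Version API:

#include <cstdio>

// Toy model of the dispatch in MacroAssembler::movdqu/vmovdqu above:
// pick an encoding strategy for an unaligned XMM/YMM move.
enum class MoveKind { Legacy, ExtractInsertLow, Evex512 };

MoveKind pick_move(int use_avx, bool supports_avx512vl, int enc, bool reg_reg) {
  // xmm16..xmm31 exist only with AVX-512; without AVX512VL they can only
  // be reached by 512-bit EVEX instructions.
  if (use_avx > 2 && !supports_avx512vl) {
    if (reg_reg)  return MoveKind::Evex512;          // evmovdqul, Assembler::AVX_512bit
    if (enc > 15) return MoveKind::ExtractInsertLow; // vextractf32x4h/vinsertf32x4h (128-bit) or vextractf64x4h/vinsertf64x4h (256-bit), lane 0
  }
  return MoveKind::Legacy;                           // plain (v)movdqu
}

int main() {
  std::printf("%d\n", (int)pick_move(3, false, 17, false)); // 1: extract/insert the low lanes
  std::printf("%d\n", (int)pick_move(2, false, 5, false));  // 0: legacy movdqu
  return 0;
}
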
       
  3711 
       
  3712 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src) {
       
  3713   if (reachable(src)) {
       
  3714     vmovdqu(dst, as_Address(src));
       
  3715   }
       
  3716   else {
       
  3717     lea(rscratch1, src);
       
  3718     vmovdqu(dst, Address(rscratch1, 0));
  3660   }
  3719   }
  3661 }
  3720 }
  3662 
  3721 
  3663 void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src) {
  3722 void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src) {
  3664   if (reachable(src)) {
  3723   if (reachable(src)) {
  3724   // instead of directly emitting a breakpoint, call os::breakpoint for better debuggability
  3783   // instead of directly emitting a breakpoint, call os::breakpoint for better debuggability
  3725   // (e.g., MSVC can't call ps() otherwise)
  3784   // (e.g., MSVC can't call ps() otherwise)
  3726   call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
  3785   call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
  3727 }
  3786 }
  3728 
  3787 
       
  3788 #ifdef _LP64
       
  3789 #define XSTATE_BV 0x200
       
  3790 #endif
       
  3791 
  3729 void MacroAssembler::pop_CPU_state() {
  3792 void MacroAssembler::pop_CPU_state() {
  3730   pop_FPU_state();
  3793   pop_FPU_state();
  3731   pop_IU_state();
  3794   pop_IU_state();
  3732 }
  3795 }
  3733 
  3796 
  3734 void MacroAssembler::pop_FPU_state() {
  3797 void MacroAssembler::pop_FPU_state() {
  3735 #ifndef _LP64
  3798 #ifndef _LP64
  3736   frstor(Address(rsp, 0));
  3799   frstor(Address(rsp, 0));
  3737 #else
  3800 #else
  3738   // AVX will continue to use the fxsave area.
  3801   fxrstor(Address(rsp, 0));
  3739   // EVEX needs to utilize the xsave area, which is under different
       
  3740   // management.
       
  3741   if(VM_Version::supports_evex()) {
       
  3742     // EDX:EAX describe the XSAVE header and
       
  3743     // are obtained while fetching info for XCR0 via cpuid.
       
  3744     // These two registers make up 64-bits in the header for which bits
       
  3745     // 62:10 are currently reserved for future implementations and unused.  Bit 63
       
  3746     // is unused for our implementation as we do not utilize
       
  3747     // compressed XSAVE areas.  Bits 9..8 are currently ignored as we do not use
       
  3748     // the functionality for PKRU state and MSR tracing.
       
  3749     // Ergo we are primarily concerned with bits 7..0, which define
       
  3750     // which ISA extensions and features are enabled for a given machine and are
       
  3751     // defined in XemXcr0Eax and is used to map the XSAVE area
       
  3752     // for restoring registers as described via XCR0.
       
  3753     movl(rdx,VM_Version::get_xsave_header_upper_segment());
       
  3754     movl(rax,VM_Version::get_xsave_header_lower_segment());
       
  3755     xrstor(Address(rsp, 0));
       
  3756   } else {
       
  3757     fxrstor(Address(rsp, 0));
       
  3758   }
       
  3759 #endif
  3802 #endif
  3760   addptr(rsp, FPUStateSizeInWords * wordSize);
  3803   addptr(rsp, FPUStateSizeInWords * wordSize);
  3761 }
  3804 }
  3762 
  3805 
  3763 void MacroAssembler::pop_IU_state() {
  3806 void MacroAssembler::pop_IU_state() {
  3770 // Warning: Stack must be 16 byte aligned (64bit)
  3813 // Warning: Stack must be 16 byte aligned (64bit)
  3771 void MacroAssembler::push_CPU_state() {
  3814 void MacroAssembler::push_CPU_state() {
  3772   push_IU_state();
  3815   push_IU_state();
  3773   push_FPU_state();
  3816   push_FPU_state();
  3774 }
  3817 }
  3775 
       
  3776 #ifdef _LP64
       
  3777 #define XSTATE_BV 0x200
       
  3778 #endif
       
  3779 
  3818 
  3780 void MacroAssembler::push_FPU_state() {
  3819 void MacroAssembler::push_FPU_state() {
  3781   subptr(rsp, FPUStateSizeInWords * wordSize);
  3820   subptr(rsp, FPUStateSizeInWords * wordSize);
  3782 #ifndef _LP64
  3821 #ifndef _LP64
  3783   fnsave(Address(rsp, 0));
  3822   fnsave(Address(rsp, 0));
  3784   fwait();
  3823   fwait();
  3785 #else
  3824 #else
  3786   // AVX will continue to use the fxsave area.
  3825   fxsave(Address(rsp, 0));
  3787   // EVEX needs to utilize the xsave area, which is under different
       
  3788   // management.
       
  3789   if(VM_Version::supports_evex()) {
       
  3790     // Save a copy of EAX and EDX
       
  3791     push(rax);
       
  3792     push(rdx);
       
  3793     // EDX:EAX describe the XSAVE header and
       
  3794     // are obtained while fetching info for XCR0 via cpuid.
       
  3795     // These two registers make up 64-bits in the header for which bits
       
  3796     // 62:10 are currently reserved for future implementations and unused.  Bit 63
       
  3797     // is unused for our implementation as we do not utilize
       
  3798     // compressed XSAVE areas.  Bits 9..8 are currently ignored as we do not use
       
  3799     // the functionality for PKRU state and MSR tracing.
       
  3800     // Ergo we are primarily concerned with bits 7..0, which define
       
  3801     // which ISA extensions and features are enabled for a given machine and are
       
  3802     // defined in XemXcr0Eax and is used to program XSAVE area
       
  3803     // for saving the required registers as defined in XCR0.
       
  3804     int xcr0_edx = VM_Version::get_xsave_header_upper_segment();
       
  3805     int xcr0_eax = VM_Version::get_xsave_header_lower_segment();
       
  3806     movl(rdx,xcr0_edx);
       
  3807     movl(rax,xcr0_eax);
       
  3808     xsave(Address(rsp, wordSize*2));
       
  3809     // now Apply control bits and clear bytes 8..23 in the header
       
  3810     pop(rdx);
       
  3811     pop(rax);
       
  3812     movl(Address(rsp, XSTATE_BV), xcr0_eax);
       
  3813     movl(Address(rsp, XSTATE_BV+4), xcr0_edx);
       
  3814     andq(Address(rsp, XSTATE_BV+8), 0);
       
  3815     andq(Address(rsp, XSTATE_BV+16), 0);
       
  3816   } else {
       
  3817     fxsave(Address(rsp, 0));
       
  3818   }
       
  3819 #endif // LP64
  3826 #endif // LP64
  3820 }
  3827 }
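
With this change push_FPU_state and pop_FPU_state use the plain fxsave/fxrstor pair on 64-bit; the xsave-based EVEX path and its XSTATE_BV header fixup are gone. The two remain strictly symmetric: push reserves FPUStateSizeInWords * wordSize below rsp and stores at Address(rsp, 0), pop restores from the same spot and releases the area. A toy sketch of that stack discipline (the 512-byte size models an fxsave image, not the real FPUStateSizeInWords):

#include <cassert>

// Toy model of the push_FPU_state / pop_FPU_state pairing: the save
// reserves a fixed-size area below rsp, the restore releases exactly
// the same amount, so rsp is unchanged across a balanced pair.
struct ToyStack {
  long rsp = 0x10000;
  void push_fpu(int area_bytes) { rsp -= area_bytes; /* fxsave  [rsp] */ }
  void pop_fpu(int area_bytes)  { /* fxrstor [rsp] */ rsp += area_bytes; }
};

int main() {
  const int kArea = 512;         // an fxsave image is 512 bytes on x86_64
  ToyStack s;
  long before = s.rsp;
  s.push_fpu(kArea);
  s.pop_fpu(kArea);
  assert(s.rsp == before);       // balanced save/restore
  return 0;
}
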
  3821 
  3828 
  3822 void MacroAssembler::push_IU_state() {
  3829 void MacroAssembler::push_IU_state() {
  3823   // Push flags first because pusha kills them
  3830   // Push flags first because pusha kills them
  4005     lea(rscratch1, src);
  4012     lea(rscratch1, src);
  4006     Assembler::xorpd(dst, Address(rscratch1, 0));
  4013     Assembler::xorpd(dst, Address(rscratch1, 0));
  4007   }
  4014   }
  4008 }
  4015 }
  4009 
  4016 
       
  4017 void MacroAssembler::xorpd(XMMRegister dst, XMMRegister src) {
       
  4018   if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
       
  4019     Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
       
  4020   }
       
  4021   else {
       
  4022     Assembler::xorpd(dst, src);
       
  4023   }
       
  4024 }
       
  4025 
       
  4026 void MacroAssembler::xorps(XMMRegister dst, XMMRegister src) {
       
  4027   if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
       
  4028     Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
       
  4029   } else {
       
  4030     Assembler::xorps(dst, src);
       
  4031   }
       
  4032 }
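
The register-register xorpd/xorps wrappers above special-case dst == src, i.e. the usual self-XOR zeroing idiom: on AVX-512 hardware without AVX512DQ the wrapper substitutes the integer vpxor at 512-bit width (presumably because the EVEX forms of the floating-point XORs need AVX512DQ while vpxor does not), which clears the entire register. A standalone sketch of why the substitution is harmless for this idiom:

#include <cassert>
#include <cstdint>

// x ^ x == 0 for every bit pattern, so for the zeroing idiom it does not
// matter whether the XOR is issued as a float op (xorps/xorpd) or as an
// integer op (vpxor), nor at which lane width.
int main() {
  uint64_t lanes[8] = {0, 1, 0x7ff0000000000000ULL, 0xdeadbeefULL, 42, ~0ULL, 7, 9};
  for (uint64_t &x : lanes) x ^= x;   // models vpxor zmm, zmm, zmm
  for (uint64_t x : lanes) assert(x == 0);
  return 0;
}
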
       
  4033 
  4010 void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src) {
  4034 void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src) {
  4011   // Used in sign-bit flipping with aligned address.
  4035   // Used in sign-bit flipping with aligned address.
  4012   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
  4036   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
  4013   if (reachable(src)) {
  4037   if (reachable(src)) {
  4014     Assembler::xorps(dst, as_Address(src));
  4038     Assembler::xorps(dst, as_Address(src));
  4045   if (reachable(src)) {
  4069   if (reachable(src)) {
  4046     vaddss(dst, nds, as_Address(src));
  4070     vaddss(dst, nds, as_Address(src));
  4047   } else {
  4071   } else {
  4048     lea(rscratch1, src);
  4072     lea(rscratch1, src);
  4049     vaddss(dst, nds, Address(rscratch1, 0));
  4073     vaddss(dst, nds, Address(rscratch1, 0));
       
  4074   }
       
  4075 }
       
  4076 
       
  4077 void MacroAssembler::vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
       
  4078   int dst_enc = dst->encoding();
       
  4079   int nds_enc = nds->encoding();
       
  4080   int src_enc = src->encoding();
       
  4081   if ((dst_enc < 16) && (nds_enc < 16)) {
       
  4082     vandps(dst, nds, negate_field, vector_len);
       
  4083   } else if ((src_enc < 16) && (dst_enc < 16)) {
       
  4084     movss(src, nds);
       
  4085     vandps(dst, src, negate_field, vector_len);
       
  4086   } else if (src_enc < 16) {
       
  4087     movss(src, nds);
       
  4088     vandps(src, src, negate_field, vector_len);
       
  4089     movss(dst, src);
       
  4090   } else if (dst_enc < 16) {
       
  4091     movdqu(src, xmm0);
       
  4092     movss(xmm0, nds);
       
  4093     vandps(dst, xmm0, negate_field, vector_len);
       
  4094     movdqu(xmm0, src);
       
  4095   } else if (nds_enc < 16) {
       
  4096     movdqu(src, xmm0);
       
  4097     vandps(xmm0, nds, negate_field, vector_len);
       
  4098     movss(dst, xmm0);
       
  4099     movdqu(xmm0, src);
       
  4100   } else {
       
  4101     movdqu(src, xmm0);
       
  4102     movss(xmm0, nds);
       
  4103     vandps(xmm0, xmm0, negate_field, vector_len);
       
  4104     movss(dst, xmm0);
       
  4105     movdqu(xmm0, src);
       
  4106   }
       
  4107 }
       
  4108 
       
  4109 void MacroAssembler::vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
       
  4110   int dst_enc = dst->encoding();
       
  4111   int nds_enc = nds->encoding();
       
  4112   int src_enc = src->encoding();
       
  4113   if ((dst_enc < 16) && (nds_enc < 16)) {
       
  4114     vandpd(dst, nds, negate_field, vector_len);
       
  4115   } else if ((src_enc < 16) && (dst_enc < 16)) {
       
  4116     movsd(src, nds);
       
  4117     vandpd(dst, src, negate_field, vector_len);
       
  4118   } else if (src_enc < 16) {
       
  4119     movsd(src, nds);
       
  4120     vandpd(src, src, negate_field, vector_len);
       
  4121     movsd(dst, src);
       
  4122   } else if (dst_enc < 16) {
       
  4123     movdqu(src, xmm0);
       
  4124     movsd(xmm0, nds);
       
  4125     vandpd(dst, xmm0, negate_field, vector_len);
       
  4126     movdqu(xmm0, src);
       
  4127   } else if (nds_enc < 16) {
       
  4128     movdqu(src, xmm0);
       
  4129     vandpd(xmm0, nds, negate_field, vector_len);
       
  4130     movsd(dst, xmm0);
       
  4131     movdqu(xmm0, src);
       
  4132   } else {
       
  4133     movdqu(src, xmm0);
       
  4134     movsd(xmm0, nds);
       
  4135     vandpd(xmm0, xmm0, negate_field, vector_len);
       
  4136     movsd(dst, xmm0);
       
  4137     movdqu(xmm0, src);
       
  4138   }
       
  4139 }
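
Despite the parameter name negate_field, vabsss and vabssd AND their input with the supplied mask (vandps/vandpd); assuming the mask clears the sign bit, that is the standard bit-trick for absolute value, and the surrounding movss/movsd/movdqu shuffling only exists to get the operands into registers the VEX encoding can reach. A standalone sketch of the underlying bit manipulation, under that sign-mask assumption:

#include <cassert>
#include <cstdint>
#include <cstring>

// Absolute value by masking out the sign bit, the scalar analogue of
// the vandps(dst, src, sign_mask) step used by vabsss above.
float abs_via_mask(float x) {
  uint32_t bits;
  std::memcpy(&bits, &x, sizeof bits);
  bits &= 0x7fffffffu;                 // clear the IEEE-754 sign bit
  std::memcpy(&x, &bits, sizeof x);
  return x;
}

int main() {
  assert(abs_via_mask(-3.5f) == 3.5f);
  assert(abs_via_mask( 2.25f) == 2.25f);
  return 0;
}
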
       
  4140 
       
  4141 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
       
  4142   int dst_enc = dst->encoding();
       
  4143   int nds_enc = nds->encoding();
       
  4144   int src_enc = src->encoding();
       
  4145   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
       
  4146     Assembler::vpaddb(dst, nds, src, vector_len);
       
  4147   } else if ((dst_enc < 16) && (src_enc < 16)) {
       
  4148     Assembler::vpaddb(dst, dst, src, vector_len);
       
  4149   } else if ((dst_enc < 16) && (nds_enc < 16)) {
       
  4150     // use nds as scratch for src
       
  4151     evmovdqul(nds, src, Assembler::AVX_512bit);
       
  4152     Assembler::vpaddb(dst, dst, nds, vector_len);
       
  4153   } else if ((src_enc < 16) && (nds_enc < 16)) {
       
  4154     // use nds as scratch for dst
       
  4155     evmovdqul(nds, dst, Assembler::AVX_512bit);
       
  4156     Assembler::vpaddb(nds, nds, src, vector_len);
       
  4157     evmovdqul(dst, nds, Assembler::AVX_512bit);
       
  4158   } else if (dst_enc < 16) {
       
  4159     // use nds as scratch for xmm0 to hold src
       
  4160     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
       
  4161     evmovdqul(xmm0, src, Assembler::AVX_512bit);
       
  4162     Assembler::vpaddb(dst, dst, xmm0, vector_len);
       
  4163     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
       
  4164   } else {
       
  4165     // worst case scenario, all regs are in the upper bank
       
  4166     subptr(rsp, 64);
       
  4167     evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
       
  4168     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
       
  4169     evmovdqul(xmm1, src, Assembler::AVX_512bit);
       
  4170     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
       
  4171     Assembler::vpaddb(xmm0, xmm0, xmm1, vector_len);
       
  4172     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
       
  4173     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
       
  4174     evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
       
  4175     addptr(rsp, 64);
       
  4176   }
       
  4177 }
       
  4178 
       
  4179 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
       
  4180   int dst_enc = dst->encoding();
       
  4181   int nds_enc = nds->encoding();
       
  4182   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
       
  4183     Assembler::vpaddb(dst, nds, src, vector_len);
       
  4184   } else if (dst_enc < 16) {
       
  4185     Assembler::vpaddb(dst, dst, src, vector_len);
       
  4186   } else if (nds_enc < 16) {
       
  4187     // implies dst_enc in upper bank with src as scratch
       
  4188     evmovdqul(nds, dst, Assembler::AVX_512bit);
       
  4189     Assembler::vpaddb(nds, nds, src, vector_len);
       
  4190     evmovdqul(dst, nds, Assembler::AVX_512bit);
       
  4191   } else {
       
  4192     // worst case scenario, all regs in upper bank
       
  4193     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
       
  4194     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
       
  4195     Assembler::vpaddb(xmm0, xmm0, src, vector_len);
       
  4196     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
       
  4197   }
       
  4198 }
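
vpaddb above and the vpaddw/vpsubb/vpsubw/vpmullw/vpsraw/vpsrlw/vpsllw wrappers that follow all apply the same operand-placement strategy: without AVX512BW the byte/word instructions have no EVEX form, so they can only encode xmm0-xmm15, and operands living in the upper bank are routed through nds, xmm0, or, in the worst case, a 64-byte stack slot. A hypothetical standalone summary of the branch order (illustrative names, not HotSpot API):

#include <cstdio>

// Summary of the operand-placement choice made by these wrappers when
// AVX-512 is enabled but AVX512BW is not. Encodings >= 16 are the
// "upper bank" registers that VEX-encoded byte/word ops cannot reach.
enum class Plan { Direct3Op, Fold2Op, NdsAsScratch, BorrowXmm0, SpillToStack };

Plan pick_plan(bool avx_only_or_bw, int dst, int nds, int src) {
  if (avx_only_or_bw)                     return Plan::Direct3Op;    // real 3-operand AVX/AVX512BW form
  if (dst < 16 && src < 16)               return Plan::Fold2Op;      // dst = dst op src
  if ((dst < 16 || src < 16) && nds < 16) return Plan::NdsAsScratch; // route the upper-bank operand via nds
  if (dst < 16)                           return Plan::BorrowXmm0;   // park xmm0 in nds, use xmm0 for src
  return Plan::SpillToStack;                                         // everything upper-bank: spill xmm1 too
}

int main() {
  std::printf("%d\n", (int)pick_plan(false, 20, 21, 22)); // 4: worst case, stack spill
  std::printf("%d\n", (int)pick_plan(false,  3, 21,  5)); // 1: fold into a 2-operand form
  return 0;
}
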
       
  4199 
       
  4200 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
       
  4201   int dst_enc = dst->encoding();
       
  4202   int nds_enc = nds->encoding();
       
  4203   int src_enc = src->encoding();
       
  4204   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
       
  4205     Assembler::vpaddw(dst, nds, src, vector_len);
       
  4206   } else if ((dst_enc < 16) && (src_enc < 16)) {
       
  4207     Assembler::vpaddw(dst, dst, src, vector_len);
       
  4208   } else if ((dst_enc < 16) && (nds_enc < 16)) {
       
  4209     // use nds as scratch for src
       
  4210     evmovdqul(nds, src, Assembler::AVX_512bit);
       
  4211     Assembler::vpaddw(dst, dst, nds, vector_len);
       
  4212   } else if ((src_enc < 16) && (nds_enc < 16)) {
       
  4213     // use nds as scratch for dst
       
  4214     evmovdqul(nds, dst, Assembler::AVX_512bit);
       
  4215     Assembler::vpaddw(nds, nds, src, vector_len);
       
  4216     evmovdqul(dst, nds, Assembler::AVX_512bit);
       
  4217   } else if (dst_enc < 16) {
       
  4218     // use nds as scatch for xmm0 to hold src
       
  4219     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
       
  4220     evmovdqul(xmm0, src, Assembler::AVX_512bit);
       
  4221     Assembler::vpaddw(dst, dst, xmm0, vector_len);
       
  4222     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
       
  4223   } else {
       
  4224     // worst case scenario, all regs are in the upper bank
       
  4225     subptr(rsp, 64);
       
  4226     evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
       
  4227     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
       
  4228     evmovdqul(xmm1, src, Assembler::AVX_512bit);
       
  4229     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
       
  4230     Assembler::vpaddw(xmm0, xmm0, xmm1, vector_len);
       
  4231     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
       
  4232     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
       
  4233     evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
       
  4234     addptr(rsp, 64);
       
  4235   }
       
  4236 }
       
  4237 
       
  4238 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
       
  4239   int dst_enc = dst->encoding();
       
  4240   int nds_enc = nds->encoding();
       
  4241   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
       
  4242     Assembler::vpaddw(dst, nds, src, vector_len);
       
  4243   } else if (dst_enc < 16) {
       
  4244     Assembler::vpaddw(dst, dst, src, vector_len);
       
  4245   } else if (nds_enc < 16) {
       
  4246     // implies dst_enc in upper bank with src as scratch
       
  4247     evmovdqul(nds, dst, Assembler::AVX_512bit);
       
  4248     Assembler::vpaddw(nds, nds, src, vector_len);
       
  4249     evmovdqul(dst, nds, Assembler::AVX_512bit);
       
  4250   } else {
       
  4251     // worst case scenario, all regs in upper bank
       
  4252     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
       
  4253     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
       
  4254     Assembler::vpaddw(xmm0, xmm0, src, vector_len);
       
  4255     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
       
  4256   }
       
  4257 }
       
  4258 
       
  4259 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
       
  4260   int dst_enc = dst->encoding();
       
  4261   int nds_enc = nds->encoding();
       
  4262   int src_enc = src->encoding();
       
  4263   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
       
  4264     Assembler::vpsubb(dst, nds, src, vector_len);
       
  4265   } else if ((dst_enc < 16) && (src_enc < 16)) {
       
  4266     Assembler::vpsubb(dst, dst, src, vector_len);
       
  4267   } else if ((dst_enc < 16) && (nds_enc < 16)) {
       
  4268     // use nds as scratch for src
       
  4269     evmovdqul(nds, src, Assembler::AVX_512bit);
       
  4270     Assembler::vpsubb(dst, dst, nds, vector_len);
       
  4271   } else if ((src_enc < 16) && (nds_enc < 16)) {
       
  4272     // use nds as scratch for dst
       
  4273     evmovdqul(nds, dst, Assembler::AVX_512bit);
       
  4274     Assembler::vpsubb(nds, nds, src, vector_len);
       
  4275     evmovdqul(dst, nds, Assembler::AVX_512bit);
       
  4276   } else if (dst_enc < 16) {
       
  4277     // use nds as scratch for xmm0 to hold src
       
  4278     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
       
  4279     evmovdqul(xmm0, src, Assembler::AVX_512bit);
       
  4280     Assembler::vpsubb(dst, dst, xmm0, vector_len);
       
  4281     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
       
  4282   } else {
       
  4283     // worst case scenario, all regs are in the upper bank
       
  4284     subptr(rsp, 64);
       
  4285     evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
       
  4286     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
       
  4287     evmovdqul(xmm1, src, Assembler::AVX_512bit);
       
  4288     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
       
  4289     Assembler::vpsubb(xmm0, xmm0, xmm1, vector_len);
       
  4290     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
       
  4291     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
       
  4292     evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
       
  4293     addptr(rsp, 64);
       
  4294   }
       
  4295 }
       
  4296 
       
  4297 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
       
  4298   int dst_enc = dst->encoding();
       
  4299   int nds_enc = nds->encoding();
       
  4300   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
       
  4301     Assembler::vpsubb(dst, nds, src, vector_len);
       
  4302   } else if (dst_enc < 16) {
       
  4303     Assembler::vpsubb(dst, dst, src, vector_len);
       
  4304   } else if (nds_enc < 16) {
       
  4305     // implies dst_enc in upper bank with src as scratch
       
  4306     evmovdqul(nds, dst, Assembler::AVX_512bit);
       
  4307     Assembler::vpsubb(nds, nds, src, vector_len);
       
  4308     evmovdqul(dst, nds, Assembler::AVX_512bit);
       
  4309   } else {
       
  4310     // worst case scenario, all regs in upper bank
       
  4311     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
       
  4312     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
       
  4313     Assembler::vpsubb(xmm0, xmm0, src, vector_len);
       
  4314     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
       
  4315   }
       
  4316 }
       
  4317 
       
  4318 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
       
  4319   int dst_enc = dst->encoding();
       
  4320   int nds_enc = nds->encoding();
       
  4321   int src_enc = src->encoding();
       
  4322   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
       
  4323     Assembler::vpsubw(dst, nds, src, vector_len);
       
  4324   } else if ((dst_enc < 16) && (src_enc < 16)) {
       
  4325     Assembler::vpsubw(dst, dst, src, vector_len);
       
  4326   } else if ((dst_enc < 16) && (nds_enc < 16)) {
       
  4327     // use nds as scratch for src
       
  4328     evmovdqul(nds, src, Assembler::AVX_512bit);
       
  4329     Assembler::vpsubw(dst, dst, nds, vector_len);
       
  4330   } else if ((src_enc < 16) && (nds_enc < 16)) {
       
  4331     // use nds as scratch for dst
       
  4332     evmovdqul(nds, dst, Assembler::AVX_512bit);
       
  4333     Assembler::vpsubw(nds, nds, src, vector_len);
       
  4334     evmovdqul(dst, nds, Assembler::AVX_512bit);
       
  4335   } else if (dst_enc < 16) {
       
  4336     // use nds as scatch for xmm0 to hold src
       
  4337     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
       
  4338     evmovdqul(xmm0, src, Assembler::AVX_512bit);
       
  4339     Assembler::vpsubw(dst, dst, xmm0, vector_len);
       
  4340     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
       
  4341   } else {
       
  4342     // worst case scenario, all regs are in the upper bank
       
  4343     subptr(rsp, 64);
       
  4344     evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
       
  4345     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
       
  4346     evmovdqul(xmm1, src, Assembler::AVX_512bit);
       
  4347     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
       
  4348     Assembler::vpsubw(xmm0, xmm0, xmm1, vector_len);
       
  4349     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
       
  4350     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
       
  4351     evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
       
  4352     addptr(rsp, 64);
       
  4353   }
       
  4354 }
       
  4355 
       
  4356 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
       
  4357   int dst_enc = dst->encoding();
       
  4358   int nds_enc = nds->encoding();
       
  4359   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
       
  4360     Assembler::vpsubw(dst, nds, src, vector_len);
       
  4361   } else if (dst_enc < 16) {
       
  4362     Assembler::vpsubw(dst, dst, src, vector_len);
       
  4363   } else if (nds_enc < 16) {
       
  4364     // implies dst_enc in upper bank with src as scratch
       
  4365     evmovdqul(nds, dst, Assembler::AVX_512bit);
       
  4366     Assembler::vpsubw(nds, nds, src, vector_len);
       
  4367     evmovdqul(dst, nds, Assembler::AVX_512bit);
       
  4368   } else {
       
  4369     // worst case scenario, all regs in upper bank
       
  4370     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
       
  4371     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
       
  4372     Assembler::vpsubw(xmm0, xmm0, src, vector_len);
       
  4373     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
       
  4374   }
       
  4375 }
       
  4376 
       
  4377 
       
  4378 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
       
  4379   int dst_enc = dst->encoding();
       
  4380   int nds_enc = nds->encoding();
       
  4381   int src_enc = src->encoding();
       
  4382   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
       
  4383     Assembler::vpmullw(dst, nds, src, vector_len);
       
  4384   } else if ((dst_enc < 16) && (src_enc < 16)) {
       
  4385     Assembler::vpmullw(dst, dst, src, vector_len);
       
  4386   } else if ((dst_enc < 16) && (nds_enc < 16)) {
       
  4387     // use nds as scratch for src
       
  4388     evmovdqul(nds, src, Assembler::AVX_512bit);
       
  4389     Assembler::vpmullw(dst, dst, nds, vector_len);
       
  4390   } else if ((src_enc < 16) && (nds_enc < 16)) {
       
  4391     // use nds as scratch for dst
       
  4392     evmovdqul(nds, dst, Assembler::AVX_512bit);
       
  4393     Assembler::vpmullw(nds, nds, src, vector_len);
       
  4394     evmovdqul(dst, nds, Assembler::AVX_512bit);
       
  4395   } else if (dst_enc < 16) {
       
  4396     // use nds as scratch for xmm0 to hold src
       
  4397     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
       
  4398     evmovdqul(xmm0, src, Assembler::AVX_512bit);
       
  4399     Assembler::vpmullw(dst, dst, xmm0, vector_len);
       
  4400     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
       
  4401   } else {
       
  4402     // worst case scenario, all regs are in the upper bank
       
  4403     subptr(rsp, 64);
       
  4404     evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
       
  4405     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
       
  4406     evmovdqul(xmm1, src, Assembler::AVX_512bit);
       
  4407     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
       
  4408     Assembler::vpmullw(xmm0, xmm0, xmm1, vector_len);
       
  4409     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
       
  4410     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
       
  4411     evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
       
  4412     addptr(rsp, 64);
       
  4413   }
       
  4414 }
       
  4415 
       
  4416 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
       
  4417   int dst_enc = dst->encoding();
       
  4418   int nds_enc = nds->encoding();
       
  4419   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
       
  4420     Assembler::vpmullw(dst, nds, src, vector_len);
       
  4421   } else if (dst_enc < 16) {
       
  4422     Assembler::vpmullw(dst, dst, src, vector_len);
       
  4423   } else if (nds_enc < 16) {
       
  4424     // implies dst_enc in upper bank with src as scratch
       
  4425     evmovdqul(nds, dst, Assembler::AVX_512bit);
       
  4426     Assembler::vpmullw(nds, nds, src, vector_len);
       
  4427     evmovdqul(dst, nds, Assembler::AVX_512bit);
       
  4428   } else {
       
  4429     // worst case scenario, all regs in upper bank
       
  4430     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
       
  4431     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
       
  4432     Assembler::vpmullw(xmm0, xmm0, src, vector_len);
       
  4433     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
       
  4434   }
       
  4435 }
       
  4436 
       
  4437 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
       
  4438   int dst_enc = dst->encoding();
       
  4439   int nds_enc = nds->encoding();
       
  4440   int shift_enc = shift->encoding();
       
  4441   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
       
  4442     Assembler::vpsraw(dst, nds, shift, vector_len);
       
  4443   } else if ((dst_enc < 16) && (shift_enc < 16)) {
       
  4444     Assembler::vpsraw(dst, dst, shift, vector_len);
       
  4445   } else if ((dst_enc < 16) && (nds_enc < 16)) {
       
  4446     // use nds_enc as scratch with shift
       
  4447     evmovdqul(nds, shift, Assembler::AVX_512bit);
       
  4448     Assembler::vpsraw(dst, dst, nds, vector_len);
       
  4449   } else if ((shift_enc < 16) && (nds_enc < 16)) {
       
  4450     // use nds as scratch with dst
       
  4451     evmovdqul(nds, dst, Assembler::AVX_512bit);
       
  4452     Assembler::vpsraw(nds, nds, shift, vector_len);
       
  4453     evmovdqul(dst, nds, Assembler::AVX_512bit);
       
  4454   } else if (dst_enc < 16) {
       
  4455     // use nds to save a copy of xmm0 and hold shift
       
  4456     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
       
  4457     evmovdqul(xmm0, shift, Assembler::AVX_512bit);
       
  4458     Assembler::vpsraw(dst, dst, xmm0, vector_len);
       
  4459     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
       
  4460   } else if (nds_enc < 16) {
       
  4461     // use nds and dst as temps
       
  4462     evmovdqul(nds, dst, Assembler::AVX_512bit);
       
  4463     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
       
  4464     evmovdqul(xmm0, shift, Assembler::AVX_512bit);
       
  4465     Assembler::vpsraw(nds, nds, xmm0, vector_len);
       
  4466     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
       
  4467     evmovdqul(dst, nds, Assembler::AVX_512bit);
       
  4468   } else {
       
  4469     // worst case scenario, all regs are in the upper bank
       
  4470     subptr(rsp, 64);
       
  4471     evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
       
  4472     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
       
  4473     evmovdqul(xmm1, shift, Assembler::AVX_512bit);
       
  4474     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
       
  4475     Assembler::vpsraw(xmm0, xmm0, xmm1, vector_len);
       
  4476     evmovdqul(xmm1, dst, Assembler::AVX_512bit);
       
  4477     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
       
  4478     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
       
  4479     evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
       
  4480     addptr(rsp, 64);
       
  4481   }
       
  4482 }
       
  4483 
       
  4484 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
       
  4485   int dst_enc = dst->encoding();
       
  4486   int nds_enc = nds->encoding();
       
  4487   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
       
  4488     Assembler::vpsraw(dst, nds, shift, vector_len);
       
  4489   } else if (dst_enc < 16) {
       
  4490     Assembler::vpsraw(dst, dst, shift, vector_len);
       
  4491   } else if (nds_enc < 16) {
       
  4492     // use nds as scratch
       
  4493     evmovdqul(nds, dst, Assembler::AVX_512bit);
       
  4494     Assembler::vpsraw(nds, nds, shift, vector_len);
       
  4495     evmovdqul(dst, nds, Assembler::AVX_512bit);
       
  4496   } else {
       
  4497     // use nds as scratch for xmm0
       
  4498     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
       
  4499     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
       
  4500     Assembler::vpsraw(xmm0, xmm0, shift, vector_len);
       
  4501     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
       
  4502   }
       
  4503 }
       
  4504 
       
  4505 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
       
  4506   int dst_enc = dst->encoding();
       
  4507   int nds_enc = nds->encoding();
       
  4508   int shift_enc = shift->encoding();
       
  4509   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
       
  4510     Assembler::vpsrlw(dst, nds, shift, vector_len);
       
  4511   } else if ((dst_enc < 16) && (shift_enc < 16)) {
       
  4512     Assembler::vpsrlw(dst, dst, shift, vector_len);
       
  4513   } else if ((dst_enc < 16) && (nds_enc < 16)) {
       
  4514     // use nds_enc as scratch with shift
       
  4515     evmovdqul(nds, shift, Assembler::AVX_512bit);
       
  4516     Assembler::vpsrlw(dst, dst, nds, vector_len);
       
  4517   } else if ((shift_enc < 16) && (nds_enc < 16)) {
       
  4518     // use nds as scratch with dst
       
  4519     evmovdqul(nds, dst, Assembler::AVX_512bit);
       
  4520     Assembler::vpsrlw(nds, nds, shift, vector_len);
       
  4521     evmovdqul(dst, nds, Assembler::AVX_512bit);
       
  4522   } else if (dst_enc < 16) {
       
  4523     // use nds to save a copy of xmm0 and hold shift
       
  4524     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
       
  4525     evmovdqul(xmm0, shift, Assembler::AVX_512bit);
       
  4526     Assembler::vpsrlw(dst, dst, xmm0, vector_len);
       
  4527     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
       
  4528   } else if (nds_enc < 16) {
       
  4529     // use nds and dst as temps
       
  4530     evmovdqul(nds, dst, Assembler::AVX_512bit);
       
  4531     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
       
  4532     evmovdqul(xmm0, shift, Assembler::AVX_512bit);
       
  4533     Assembler::vpsrlw(nds, nds, xmm0, vector_len);
       
  4534     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
       
  4535     evmovdqul(dst, nds, Assembler::AVX_512bit);
       
  4536   } else {
       
  4537     // worst case scenario, all regs are in the upper bank
       
  4538     subptr(rsp, 64);
       
  4539     evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
       
  4540     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
       
  4541     evmovdqul(xmm1, shift, Assembler::AVX_512bit);
       
  4542     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
       
  4543     Assembler::vpsrlw(xmm0, xmm0, xmm1, vector_len);
       
  4544     evmovdqul(xmm1, dst, Assembler::AVX_512bit);
       
  4545     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
       
  4546     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
       
  4547     evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
       
  4548     addptr(rsp, 64);
       
  4549   }
       
  4550 }
       
  4551 
       
  4552 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
       
  4553   int dst_enc = dst->encoding();
       
  4554   int nds_enc = nds->encoding();
       
  4555   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
       
  4556     Assembler::vpsrlw(dst, nds, shift, vector_len);
       
  4557   } else if (dst_enc < 16) {
       
  4558     Assembler::vpsrlw(dst, dst, shift, vector_len);
       
  4559   } else if (nds_enc < 16) {
       
  4560     // use nds as scratch
       
  4561     evmovdqul(nds, dst, Assembler::AVX_512bit);
       
  4562     Assembler::vpsrlw(nds, nds, shift, vector_len);
       
  4563     evmovdqul(dst, nds, Assembler::AVX_512bit);
       
  4564   } else {
       
  4565     // use nds as scratch for xmm0
       
  4566     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
       
  4567     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
       
  4568     Assembler::vpsrlw(xmm0, xmm0, shift, vector_len);
       
  4569     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
       
  4570   }
       
  4571 }
       
  4572 
       
  4573 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
       
  4574   int dst_enc = dst->encoding();
       
  4575   int nds_enc = nds->encoding();
       
  4576   int shift_enc = shift->encoding();
       
  4577   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
       
  4578     Assembler::vpsllw(dst, nds, shift, vector_len);
       
  4579   } else if ((dst_enc < 16) && (shift_enc < 16)) {
       
  4580     Assembler::vpsllw(dst, dst, shift, vector_len);
       
  4581   } else if ((dst_enc < 16) && (nds_enc < 16)) {
       
  4582     // use nds_enc as scratch with shift
       
  4583     evmovdqul(nds, shift, Assembler::AVX_512bit);
       
  4584     Assembler::vpsllw(dst, dst, nds, vector_len);
       
  4585   } else if ((shift_enc < 16) && (nds_enc < 16)) {
       
  4586     // use nds as scratch with dst
       
  4587     evmovdqul(nds, dst, Assembler::AVX_512bit);
       
  4588     Assembler::vpsllw(nds, nds, shift, vector_len);
       
  4589     evmovdqul(dst, nds, Assembler::AVX_512bit);
       
  4590   } else if (dst_enc < 16) {
       
  4591     // use nds to save a copy of xmm0 and hold shift
       
  4592     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
       
  4593     evmovdqul(xmm0, shift, Assembler::AVX_512bit);
       
  4594     Assembler::vpsllw(dst, dst, xmm0, vector_len);
       
  4595     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
       
  4596   } else if (nds_enc < 16) {
       
  4597     // use nds and dst as temps
       
  4598     evmovdqul(nds, dst, Assembler::AVX_512bit);
       
  4599     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
       
  4600     evmovdqul(xmm0, shift, Assembler::AVX_512bit);
       
  4601     Assembler::vpsllw(nds, nds, xmm0, vector_len);
       
  4602     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
       
  4603     evmovdqul(dst, nds, Assembler::AVX_512bit);
       
  4604   } else {
       
  4605     // worst case scenario, all regs are in the upper bank
       
  4606     subptr(rsp, 64);
       
  4607     evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
       
  4608     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
       
  4609     evmovdqul(xmm1, shift, Assembler::AVX_512bit);
       
  4610     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
       
  4611     Assembler::vpsllw(xmm0, xmm0, xmm1, vector_len);
       
  4612     evmovdqul(xmm1, dst, Assembler::AVX_512bit);
       
  4613     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
       
  4614     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
       
  4615     evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
       
  4616     addptr(rsp, 64);
       
  4617   }
       
  4618 }
       
  4619 
       
  4620 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
       
  4621   int dst_enc = dst->encoding();
       
  4622   int nds_enc = nds->encoding();
       
  4623   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
       
  4624     Assembler::vpsllw(dst, nds, shift, vector_len);
       
  4625   } else if (dst_enc < 16) {
       
  4626     Assembler::vpsllw(dst, dst, shift, vector_len);
       
  4627   } else if (nds_enc < 16) {
       
  4628     // use nds as scratch
       
  4629     evmovdqul(nds, dst, Assembler::AVX_512bit);
       
  4630     Assembler::vpsllw(nds, nds, shift, vector_len);
       
  4631     evmovdqul(dst, nds, Assembler::AVX_512bit);
       
  4632   } else {
       
  4633     // use nds as scratch for xmm0
       
  4634     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
       
  4635     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
       
  4636     Assembler::vpsllw(xmm0, xmm0, shift, vector_len);
       
  4637     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
       
  4638   }
       
  4639 }
       
  4640 
       
  4641 // This instruction exists within macros, ergo we cannot control its input
       
  4642 // when emitted through those patterns.
       
  4643 void MacroAssembler::punpcklbw(XMMRegister dst, XMMRegister src) {
       
  4644   if (VM_Version::supports_avx512nobw()) {
       
  4645     int dst_enc = dst->encoding();
       
  4646     int src_enc = src->encoding();
       
  4647     if (dst_enc == src_enc) {
       
  4648       if (dst_enc < 16) {
       
  4649         Assembler::punpcklbw(dst, src);
       
  4650       } else {
       
  4651         subptr(rsp, 64);
       
  4652         evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
       
  4653         evmovdqul(xmm0, dst, Assembler::AVX_512bit);
       
  4654         Assembler::punpcklbw(xmm0, xmm0);
       
  4655         evmovdqul(dst, xmm0, Assembler::AVX_512bit);
       
  4656         evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
       
  4657         addptr(rsp, 64);
       
  4658       }
       
  4659     } else {
       
  4660       if ((src_enc < 16) && (dst_enc < 16)) {
       
  4661         Assembler::punpcklbw(dst, src);
       
  4662       } else if (src_enc < 16) {
       
  4663         subptr(rsp, 64);
       
  4664         evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
       
  4665         evmovdqul(xmm0, dst, Assembler::AVX_512bit);
       
  4666         Assembler::punpcklbw(xmm0, src);
       
  4667         evmovdqul(dst, xmm0, Assembler::AVX_512bit);
       
  4668         evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
       
  4669         addptr(rsp, 64);
       
  4670       } else if (dst_enc < 16) {
       
  4671         subptr(rsp, 64);
       
  4672         evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
       
  4673         evmovdqul(xmm0, src, Assembler::AVX_512bit);
       
  4674         Assembler::punpcklbw(dst, xmm0);
       
  4675         evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
       
  4676         addptr(rsp, 64);
       
  4677       } else {
       
  4678         subptr(rsp, 64);
       
  4679         evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
       
  4680         subptr(rsp, 64);
       
  4681         evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
       
  4682         evmovdqul(xmm0, dst, Assembler::AVX_512bit);
       
  4683         evmovdqul(xmm1, src, Assembler::AVX_512bit);
       
  4684         Assembler::punpcklbw(xmm0, xmm1);
       
  4685         evmovdqul(dst, xmm0, Assembler::AVX_512bit);
       
  4686         evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
       
  4687         addptr(rsp, 64);
       
  4688         evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
       
  4689         addptr(rsp, 64);
       
  4690       }
       
  4691     }
       
  4692   } else {
       
  4693     Assembler::punpcklbw(dst, src);
       
  4694   }
       
  4695 }
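
punpcklbw here (and pshuflw below) are legacy SSE encodings, so on AVX-512 parts without AVX512BW they cannot name xmm16-xmm31 at all; the fallback borrows xmm0, and xmm1 when both operands live in the upper bank, by spilling each to its own 64-byte stack slot and restoring them in reverse order. A toy sketch of that spill discipline:

#include <cassert>
#include <stack>

// Toy model of the spill bookkeeping used by the punpcklbw/pshuflw
// fallbacks: each borrowed register gets a 64-byte slot below rsp, and
// slots are released innermost-first.
int main() {
  long rsp = 0x20000;
  std::stack<long> slots;
  auto reserve = [&] { rsp -= 64; slots.push(rsp); };                           // subptr(rsp, 64); evmovdqul(Address(rsp, 0), xmmN, ...)
  auto release = [&] { assert(slots.top() == rsp); slots.pop(); rsp += 64; };   // evmovdqul(xmmN, Address(rsp, 0), ...); addptr(rsp, 64)
  reserve();   // save xmm0
  reserve();   // save xmm1
  release();   // restore xmm1 first
  release();   // then restore xmm0
  assert(rsp == 0x20000);
  return 0;
}
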
       
  4696 
       
  4697 // This instruction exists within macros, ergo we cannot control its input
       
  4698 // when emitted through those patterns.
       
  4699 void MacroAssembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
       
  4700   if (VM_Version::supports_avx512nobw()) {
       
  4701     int dst_enc = dst->encoding();
       
  4702     int src_enc = src->encoding();
       
  4703     if (dst_enc == src_enc) {
       
  4704       if (dst_enc < 16) {
       
  4705         Assembler::pshuflw(dst, src, mode);
       
  4706       } else {
       
  4707         subptr(rsp, 64);
       
  4708         evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
       
  4709         evmovdqul(xmm0, dst, Assembler::AVX_512bit);
       
  4710         Assembler::pshuflw(xmm0, xmm0, mode);
       
  4711         evmovdqul(dst, xmm0, Assembler::AVX_512bit);
       
  4712         evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
       
  4713         addptr(rsp, 64);
       
  4714       }
       
  4715     } else {
       
  4716       if ((src_enc < 16) && (dst_enc < 16)) {
       
  4717         Assembler::pshuflw(dst, src, mode);
       
  4718       } else if (src_enc < 16) {
       
  4719         subptr(rsp, 64);
       
  4720         evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
       
  4721         evmovdqul(xmm0, dst, Assembler::AVX_512bit);
       
  4722         Assembler::pshuflw(xmm0, src, mode);
       
  4723         evmovdqul(dst, xmm0, Assembler::AVX_512bit);
       
  4724         evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
       
  4725         addptr(rsp, 64);
       
  4726       } else if (dst_enc < 16) {
       
  4727         subptr(rsp, 64);
       
  4728         evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
       
  4729         evmovdqul(xmm0, src, Assembler::AVX_512bit);
       
  4730         Assembler::pshuflw(dst, xmm0, mode);
       
  4731         evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
       
  4732         addptr(rsp, 64);
       
  4733       } else {
       
  4734         subptr(rsp, 64);
       
  4735         evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
       
  4736         subptr(rsp, 64);
       
  4737         evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
       
  4738         evmovdqul(xmm0, dst, Assembler::AVX_512bit);
       
  4739         evmovdqul(xmm1, src, Assembler::AVX_512bit);
       
  4740         Assembler::pshuflw(xmm0, xmm1, mode);
       
  4741         evmovdqul(dst, xmm0, Assembler::AVX_512bit);
       
  4742         evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
       
  4743         addptr(rsp, 64);
       
  4744         evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
       
  4745         addptr(rsp, 64);
       
  4746       }
       
  4747     }
       
  4748   } else {
       
  4749     Assembler::pshuflw(dst, src, mode);
  4050   }
  4750   }
  4051 }
  4751 }
  4052 
  4752 
  4053 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
  4753 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
  4054   if (reachable(src)) {
  4754   if (reachable(src)) {
  4131       (nds_upper_bank || dst_upper_bank)) {
  4831       (nds_upper_bank || dst_upper_bank)) {
  4132     if (dst_upper_bank) {
  4832     if (dst_upper_bank) {
  4133       subptr(rsp, 64);
  4833       subptr(rsp, 64);
  4134       evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
  4834       evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
  4135       movflt(xmm0, nds);
  4835       movflt(xmm0, nds);
  4136       if (reachable(src)) {
  4836       vxorps(xmm0, xmm0, src, Assembler::AVX_128bit);
  4137         vxorps(xmm0, xmm0, as_Address(src), Assembler::AVX_128bit);
       
  4138       } else {
       
  4139         lea(rscratch1, src);
       
  4140         vxorps(xmm0, xmm0, Address(rscratch1, 0), Assembler::AVX_128bit);
       
  4141       }
       
  4142       movflt(dst, xmm0);
  4837       movflt(dst, xmm0);
  4143       evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
  4838       evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
  4144       addptr(rsp, 64);
  4839       addptr(rsp, 64);
  4145     } else {
  4840     } else {
  4146       movflt(dst, nds);
  4841       movflt(dst, nds);
  4147       if (reachable(src)) {
  4842       vxorps(dst, dst, src, Assembler::AVX_128bit);
  4148         vxorps(dst, dst, as_Address(src), Assembler::AVX_128bit);
  4843     }
  4149       } else {
  4844   } else {
  4150         lea(rscratch1, src);
  4845     vxorps(dst, nds, src, Assembler::AVX_128bit);
  4151         vxorps(dst, dst, Address(rscratch1, 0), Assembler::AVX_128bit);
       
  4152       }
       
  4153     }
       
  4154   } else {
       
  4155     if (reachable(src)) {
       
  4156       vxorps(dst, nds, as_Address(src), Assembler::AVX_128bit);
       
  4157     } else {
       
  4158       lea(rscratch1, src);
       
  4159       vxorps(dst, nds, Address(rscratch1, 0), Assembler::AVX_128bit);
       
  4160     }
       
  4161   }
  4846   }
  4162 }
  4847 }
  4163 
  4848 
  4164 void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
  4849 void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
  4165   int nds_enc = nds->encoding();
  4850   int nds_enc = nds->encoding();
  4170       (nds_upper_bank || dst_upper_bank)) {
  4855       (nds_upper_bank || dst_upper_bank)) {
  4171     if (dst_upper_bank) {
  4856     if (dst_upper_bank) {
  4172       subptr(rsp, 64);
  4857       subptr(rsp, 64);
  4173       evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
  4858       evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
  4174       movdbl(xmm0, nds);
  4859       movdbl(xmm0, nds);
  4175       if (reachable(src)) {
  4860       vxorpd(xmm0, xmm0, src, Assembler::AVX_128bit);
  4176         vxorps(xmm0, xmm0, as_Address(src), Assembler::AVX_128bit);
       
  4177       } else {
       
  4178         lea(rscratch1, src);
       
  4179         vxorps(xmm0, xmm0, Address(rscratch1, 0), Assembler::AVX_128bit);
       
  4180       }
       
  4181       movdbl(dst, xmm0);
  4861       movdbl(dst, xmm0);
  4182       evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
  4862       evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
  4183       addptr(rsp, 64);
  4863       addptr(rsp, 64);
  4184     } else {
  4864     } else {
  4185       movdbl(dst, nds);
  4865       movdbl(dst, nds);
  4186       if (reachable(src)) {
  4866       vxorpd(dst, dst, src, Assembler::AVX_128bit);
  4187         vxorps(dst, dst, as_Address(src), Assembler::AVX_128bit);
  4867     }
  4188       } else {
  4868   } else {
  4189         lea(rscratch1, src);
  4869     vxorpd(dst, nds, src, Assembler::AVX_128bit);
  4190         vxorps(dst, dst, Address(rscratch1, 0), Assembler::AVX_128bit);
       
  4191       }
       
  4192     }
       
  4193   } else {
       
  4194     if (reachable(src)) {
       
  4195       vxorpd(dst, nds, as_Address(src), Assembler::AVX_128bit);
       
  4196     } else {
       
  4197       lea(rscratch1, src);
       
  4198       vxorpd(dst, nds, Address(rscratch1, 0), Assembler::AVX_128bit);
       
  4199     }
       
  4200   }
  4870   }
  4201 }
  4871 }
  4202 
  4872 
  4203 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
  4873 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
  4204   if (reachable(src)) {
  4874   if (reachable(src)) {
  4686 
  5356 
  4687 void MacroAssembler::fp_runtime_fallback(address runtime_entry, int nb_args, int num_fpu_regs_in_use) {
  5357 void MacroAssembler::fp_runtime_fallback(address runtime_entry, int nb_args, int num_fpu_regs_in_use) {
  4688   pusha();
  5358   pusha();
  4689 
  5359 
  4690   // if we are coming from c1, xmm registers may be live
  5360   // if we are coming from c1, xmm registers may be live
  4691   int off = 0;
       
  4692   int num_xmm_regs = LP64_ONLY(16) NOT_LP64(8);
  5361   int num_xmm_regs = LP64_ONLY(16) NOT_LP64(8);
  4693   if (UseAVX > 2) {
  5362   if (UseAVX > 2) {
  4694     num_xmm_regs = LP64_ONLY(32) NOT_LP64(8);
  5363     num_xmm_regs = LP64_ONLY(32) NOT_LP64(8);
  4695   }
  5364   }
  4696 
  5365 
  4697   if (UseSSE == 1)  {
  5366   if (UseSSE == 1)  {
  4698     subptr(rsp, sizeof(jdouble)*8);
  5367     subptr(rsp, sizeof(jdouble)*8);
  4699     for (int n = 0; n < 8; n++) {
  5368     for (int n = 0; n < 8; n++) {
  4700       movflt(Address(rsp, off++*sizeof(jdouble)), as_XMMRegister(n));
  5369       movflt(Address(rsp, n*sizeof(jdouble)), as_XMMRegister(n));
  4701     }
  5370     }
  4702   } else if (UseSSE >= 2)  {
  5371   } else if (UseSSE >= 2)  {
  4703     if (UseAVX > 2) {
  5372     if (UseAVX > 2) {
  4704       push(rbx);
  5373       push(rbx);
  4705       movl(rbx, 0xffff);
  5374       movl(rbx, 0xffff);
  4707       pop(rbx);
  5376       pop(rbx);
  4708     }
  5377     }
  4709 #ifdef COMPILER2
  5378 #ifdef COMPILER2
  4710     if (MaxVectorSize > 16) {
  5379     if (MaxVectorSize > 16) {
  4711       if(UseAVX > 2) {
  5380       if(UseAVX > 2) {
  4712         // Save upper half of ZMM registes
  5381         // Save upper half of ZMM registers
  4713         subptr(rsp, 32*num_xmm_regs);
  5382         subptr(rsp, 32*num_xmm_regs);
  4714         for (int n = 0; n < num_xmm_regs; n++) {
  5383         for (int n = 0; n < num_xmm_regs; n++) {
  4715           vextractf64x4h(Address(rsp, off++*32), as_XMMRegister(n));
  5384           vextractf64x4h(Address(rsp, n*32), as_XMMRegister(n), 1);
  4716         }
  5385         }
  4717         off = 0;
       
  4718       }
  5386       }
  4719       assert(UseAVX > 0, "256 bit vectors are supported only with AVX");
  5387       assert(UseAVX > 0, "256 bit vectors are supported only with AVX");
  4720       // Save upper half of YMM registes
  5388       // Save upper half of YMM registers
  4721       subptr(rsp, 16*num_xmm_regs);
  5389       subptr(rsp, 16*num_xmm_regs);
  4722       for (int n = 0; n < num_xmm_regs; n++) {
  5390       for (int n = 0; n < num_xmm_regs; n++) {
  4723         vextractf128h(Address(rsp, off++*16), as_XMMRegister(n));
  5391         vextractf128h(Address(rsp, n*16), as_XMMRegister(n));
  4724       }
  5392       }
  4725     }
  5393     }
  4726 #endif
  5394 #endif
  4727     // Save whole 128bit (16 bytes) XMM registers
  5395     // Save whole 128bit (16 bytes) XMM registers
  4728     subptr(rsp, 16*num_xmm_regs);
  5396     subptr(rsp, 16*num_xmm_regs);
  4729     off = 0;
       
  4730 #ifdef _LP64
  5397 #ifdef _LP64
  4731     if (VM_Version::supports_avx512novl()) {
  5398     if (VM_Version::supports_evex()) {
  4732       for (int n = 0; n < num_xmm_regs; n++) {
  5399       for (int n = 0; n < num_xmm_regs; n++) {
  4733         vextractf32x4h(Address(rsp, off++*16), as_XMMRegister(n), 0);
  5400         vextractf32x4h(Address(rsp, n*16), as_XMMRegister(n), 0);
  4734       }
  5401       }
  4735     } else {
  5402     } else {
  4736       for (int n = 0; n < num_xmm_regs; n++) {
  5403       for (int n = 0; n < num_xmm_regs; n++) {
  4737         movdqu(Address(rsp, off++*16), as_XMMRegister(n));
  5404         movdqu(Address(rsp, n*16), as_XMMRegister(n));
  4738       }
  5405       }
  4739     }
  5406     }
  4740 #else
  5407 #else
  4741     for (int n = 0; n < num_xmm_regs; n++) {
  5408     for (int n = 0; n < num_xmm_regs; n++) {
  4742       movdqu(Address(rsp, off++*16), as_XMMRegister(n));
  5409       movdqu(Address(rsp, n*16), as_XMMRegister(n));
  4743     }
  5410     }
  4744 #endif
  5411 #endif
  4745   }
  5412   }
  4746 
  5413 
  4747   // Preserve registers across runtime call
  5414   // Preserve registers across runtime call
  4806     }
  5473     }
  4807     fld_d(Address(rsp, (nb_args-1)*sizeof(jdouble)));
  5474     fld_d(Address(rsp, (nb_args-1)*sizeof(jdouble)));
  4808     addptr(rsp, sizeof(jdouble)*nb_args);
  5475     addptr(rsp, sizeof(jdouble)*nb_args);
  4809   }
  5476   }
  4810 
  5477 
  4811   off = 0;
       
  4812   if (UseSSE == 1)  {
  5478   if (UseSSE == 1)  {
  4813     for (int n = 0; n < 8; n++) {
  5479     for (int n = 0; n < 8; n++) {
  4814       movflt(as_XMMRegister(n), Address(rsp, off++*sizeof(jdouble)));
  5480       movflt(as_XMMRegister(n), Address(rsp, n*sizeof(jdouble)));
  4815     }
  5481     }
  4816     addptr(rsp, sizeof(jdouble)*8);
  5482     addptr(rsp, sizeof(jdouble)*8);
  4817   } else if (UseSSE >= 2)  {
  5483   } else if (UseSSE >= 2)  {
  4818     // Restore whole 128bit (16 bytes) XMM regiters
  5484     // Restore whole 128bit (16 bytes) XMM registers
  4819 #ifdef _LP64
  5485 #ifdef _LP64
  4820     if (VM_Version::supports_avx512novl()) {
  5486   if (VM_Version::supports_evex()) {
  4821       for (int n = 0; n < num_xmm_regs; n++) {
  5487     for (int n = 0; n < num_xmm_regs; n++) {
  4822         vinsertf32x4h(as_XMMRegister(n), Address(rsp, off++*16), 0);
  5488       vinsertf32x4h(as_XMMRegister(n), Address(rsp, n*16), 0);
  4823       }
  5489     }
  4824     }
  5490   } else {
  4825     else {
  5491     for (int n = 0; n < num_xmm_regs; n++) {
  4826       for (int n = 0; n < num_xmm_regs; n++) {
  5492       movdqu(as_XMMRegister(n), Address(rsp, n*16));
  4827         movdqu(as_XMMRegister(n), Address(rsp, off++*16));
  5493     }
  4828       }
  5494   }
  4829     }
       
  4830 #else
  5495 #else
  4831     for (int n = 0; n < num_xmm_regs; n++) {
  5496   for (int n = 0; n < num_xmm_regs; n++) {
  4832       movdqu(as_XMMRegister(n), Address(rsp, off++ * 16));
  5497     movdqu(as_XMMRegister(n), Address(rsp, n*16));
  4833     }
  5498   }
  4834 #endif
  5499 #endif
  4835     addptr(rsp, 16*num_xmm_regs);
  5500     addptr(rsp, 16*num_xmm_regs);
  4836 
  5501 
  4837 #ifdef COMPILER2
  5502 #ifdef COMPILER2
  4838     if (MaxVectorSize > 16) {
  5503     if (MaxVectorSize > 16) {
  4839       // Restore upper half of YMM registes.
  5504       // Restore upper half of YMM registers.
  4840       off = 0;
       
  4841       for (int n = 0; n < num_xmm_regs; n++) {
  5505       for (int n = 0; n < num_xmm_regs; n++) {
  4842         vinsertf128h(as_XMMRegister(n), Address(rsp, off++*16));
  5506         vinsertf128h(as_XMMRegister(n), Address(rsp, n*16));
  4843       }
  5507       }
  4844       addptr(rsp, 16*num_xmm_regs);
  5508       addptr(rsp, 16*num_xmm_regs);
  4845       if(UseAVX > 2) {
  5509       if(UseAVX > 2) {
  4846         off = 0;
       
  4847         for (int n = 0; n < num_xmm_regs; n++) {
  5510         for (int n = 0; n < num_xmm_regs; n++) {
  4848           vinsertf64x4h(as_XMMRegister(n), Address(rsp, off++*32));
  5511           vinsertf64x4h(as_XMMRegister(n), Address(rsp, n*32), 1);
  4849         }
  5512         }
  4850         addptr(rsp, 32*num_xmm_regs);
  5513         addptr(rsp, 32*num_xmm_regs);
  4851       }
  5514       }
  4852     }
  5515     }
  4853 #endif
  5516 #endif
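
The save/restore loops in fp_runtime_fallback now address each spill slot directly as n*16 (whole XMM registers) or n*32 (upper YMM/ZMM halves) instead of threading a running off++ counter through every loop, and the 256-bit extract/insert helpers take an explicit lane argument (1 selects the upper 256 bits being saved or restored). The two indexing schemes hit the same slots, as this trivial standalone check illustrates:

#include <cassert>

// Direct indexing (n*stride) matches the old running-counter scheme
// (off++ * stride) as long as off starts at 0 for each loop, which is
// exactly what the removed "off = 0;" resets ensured.
int main() {
  const int num_xmm_regs = 32;          // LP64 with UseAVX > 2, as above
  const int strides[2] = {16, 32};      // 16-byte XMM slots, 32-byte upper-half slots
  for (int stride : strides) {
    int off = 0;
    for (int n = 0; n < num_xmm_regs; n++) {
      assert(off++ * stride == n * stride);
    }
  }
  return 0;
}
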
  6829     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
  7492     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
  6830     andl(cnt1,0x0000000F);  //tail count (in chars)
  7493     andl(cnt1,0x0000000F);  //tail count (in chars)
  6831 
  7494 
  6832     bind(SCAN_TO_16_CHAR_LOOP);
  7495     bind(SCAN_TO_16_CHAR_LOOP);
  6833     vmovdqu(vec3, Address(result, 0));
  7496     vmovdqu(vec3, Address(result, 0));
  6834     vpcmpeqw(vec3, vec3, vec1, true);
  7497     vpcmpeqw(vec3, vec3, vec1, 1);
  6835     vptest(vec2, vec3);
  7498     vptest(vec2, vec3);
  6836     jcc(Assembler::carryClear, FOUND_CHAR);
  7499     jcc(Assembler::carryClear, FOUND_CHAR);
  6837     addptr(result, 32);
  7500     addptr(result, 32);
  6838     subl(tmp, 2*stride);
  7501     subl(tmp, 2*stride);
  6839     jccb(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
  7502     jccb(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
  7669         jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
  8332         jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
  7670 
  8333 
  7671         BIND(L_check_fill_32_bytes);
  8334         BIND(L_check_fill_32_bytes);
  7672         addl(count, 8 << shift);
  8335         addl(count, 8 << shift);
  7673         jccb(Assembler::less, L_check_fill_8_bytes);
  8336         jccb(Assembler::less, L_check_fill_8_bytes);
  7674         evmovdqul(Address(to, 0), xtmp, Assembler::AVX_256bit);
  8337         vmovdqu(Address(to, 0), xtmp);
  7675         addptr(to, 32);
  8338         addptr(to, 32);
  7676         subl(count, 8 << shift);
  8339         subl(count, 8 << shift);
  7677 
  8340 
  7678         BIND(L_check_fill_8_bytes);
  8341         BIND(L_check_fill_8_bytes);
  7679       } else if (UseAVX == 2 && UseUnalignedLoadStores) {
  8342       } else if (UseAVX == 2 && UseUnalignedLoadStores) {