# HG changeset patch # User vdeshpande # Date 1494037734 25200 # Node ID 61025eecb743343e87beedd4b02417da0726035b # Parent 6de560f6c1ad6ef6f5c6be4bef3b065568b90688 8178811: Minimize the AVX <-> SSE transition penalty through generation of vzeroupper instruction on x86 Reviewed-by: kvn diff -r 6de560f6c1ad -r 61025eecb743 hotspot/src/cpu/x86/vm/assembler_x86.cpp --- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp Sat May 06 00:05:32 2017 +0000 +++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp Fri May 05 19:28:54 2017 -0700 @@ -2103,12 +2103,20 @@ } void Assembler::ldmxcsr( Address src) { - NOT_LP64(assert(VM_Version::supports_sse(), "")); - InstructionMark im(this); - prefix(src); - emit_int8(0x0F); - emit_int8((unsigned char)0xAE); - emit_operand(as_Register(2), src); + if (UseAVX > 0 ) { + InstructionMark im(this); + InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false); + vex_prefix(src, 0, 0, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes); + emit_int8((unsigned char)0xAE); + emit_operand(as_Register(2), src); + } else { + NOT_LP64(assert(VM_Version::supports_sse(), "")); + InstructionMark im(this); + prefix(src); + emit_int8(0x0F); + emit_int8((unsigned char)0xAE); + emit_operand(as_Register(2), src); + } } void Assembler::leal(Register dst, Address src) { @@ -4416,12 +4424,21 @@ } void Assembler::stmxcsr( Address dst) { - NOT_LP64(assert(VM_Version::supports_sse(), "")); - InstructionMark im(this); - prefix(dst); - emit_int8(0x0F); - emit_int8((unsigned char)0xAE); - emit_operand(as_Register(3), dst); + if (UseAVX > 0 ) { + assert(VM_Version::supports_avx(), ""); + InstructionMark im(this); + InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false); + vex_prefix(dst, 0, 0, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes); + emit_int8((unsigned char)0xAE); + emit_operand(as_Register(3), dst); + } else { + NOT_LP64(assert(VM_Version::supports_sse(), "")); + InstructionMark im(this); + prefix(dst); + emit_int8(0x0F); + emit_int8((unsigned char)0xAE); + emit_operand(as_Register(3), dst); + } } void Assembler::subl(Address dst, int32_t imm32) { @@ -6620,10 +6637,11 @@ } void Assembler::vzeroupper() { - assert(VM_Version::supports_avx(), ""); - InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false); - (void)vex_prefix_and_encode(0, 0, 0, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes); - emit_int8(0x77); + if (VM_Version::supports_vzeroupper()) { + InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false); + (void)vex_prefix_and_encode(0, 0, 0, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes); + emit_int8(0x77); + } } #ifndef _LP64 diff -r 6de560f6c1ad -r 61025eecb743 hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp --- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp Sat May 06 00:05:32 2017 +0000 +++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp Fri May 05 19:28:54 2017 -0700 @@ -763,11 +763,13 @@ // Always clear the pc because it could have been set by make_walkable() movptr(Address(r15_thread, JavaThread::last_Java_pc_offset()), NULL_WORD); + vzeroupper(); } void MacroAssembler::set_last_Java_frame(Register last_java_sp, Register last_java_fp, address last_java_pc) { + vzeroupper(); // determine last_java_sp register if (!last_java_sp->is_valid()) { last_java_sp = rsp; @@ -3672,6 +3674,7 @@ // Always clear the pc because it could have been set by make_walkable() movptr(Address(java_thread, JavaThread::last_Java_pc_offset()), NULL_WORD); + vzeroupper(); } void MacroAssembler::restore_rax(Register tmp) { @@ -3714,6 +3717,7 @@ Register last_java_sp, Register last_java_fp, address last_java_pc) { + vzeroupper(); // determine java_thread register if (!java_thread->is_valid()) { java_thread = rdi; @@ -6524,10 +6528,8 @@ call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry())); } } - if (VM_Version::supports_avx()) { - // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty. - vzeroupper(); - } + // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty. + vzeroupper(); #ifndef _LP64 // Either restore the x87 floating pointer control word after returning diff -r 6de560f6c1ad -r 61025eecb743 hotspot/src/cpu/x86/vm/sharedRuntime_x86_32.cpp --- a/hotspot/src/cpu/x86/vm/sharedRuntime_x86_32.cpp Sat May 06 00:05:32 2017 +0000 +++ b/hotspot/src/cpu/x86/vm/sharedRuntime_x86_32.cpp Fri May 05 19:28:54 2017 -0700 @@ -41,6 +41,7 @@ #ifdef COMPILER2 #include "opto/runtime.hpp" #endif +#include "vm_version_x86.hpp" #define __ masm-> @@ -120,8 +121,8 @@ int zmm_bytes = num_xmm_regs * 32; #ifdef COMPILER2 if (save_vectors) { - assert(UseAVX > 0, "up to 512bit vectors are supported with EVEX"); - assert(MaxVectorSize <= 64, "up to 512bit vectors are supported now"); + assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX"); + assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported"); // Save upper half of YMM registers int vect_bytes = ymm_bytes; if (UseAVX > 2) { @@ -219,6 +220,7 @@ } } } + __ vzeroupper(); // Set an oopmap for the call site. This oopmap will map all // oop-registers and debug-info registers as callee-saved. This @@ -269,8 +271,8 @@ int additional_frame_bytes = 0; #ifdef COMPILER2 if (restore_vectors) { - assert(UseAVX > 0, "up to 512bit vectors are supported with EVEX"); - assert(MaxVectorSize <= 64, "up to 512bit vectors are supported now"); + assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX"); + assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported"); // Save upper half of YMM registers additional_frame_bytes = ymm_bytes; if (UseAVX > 2) { @@ -285,6 +287,8 @@ int off = xmm0_off; int delta = xmm1_off - off; + __ vzeroupper(); + if (UseSSE == 1) { // Restore XMM registers assert(additional_frame_bytes == 0, ""); @@ -2123,6 +2127,8 @@ // preserved and correspond to the bcp/locals pointers. So we do a runtime call // by hand. // + __ vzeroupper(); + save_native_result(masm, ret_type, stack_slots); __ push(thread); if (!is_critical_native) { @@ -2304,7 +2310,7 @@ // BEGIN Slow path unlock __ bind(slow_path_unlock); - + __ vzeroupper(); // Slow path unlock if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) { @@ -2349,6 +2355,7 @@ // SLOW PATH Reguard the stack if needed __ bind(reguard); + __ vzeroupper(); save_native_result(masm, ret_type, stack_slots); { __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages))); diff -r 6de560f6c1ad -r 61025eecb743 hotspot/src/cpu/x86/vm/sharedRuntime_x86_64.cpp --- a/hotspot/src/cpu/x86/vm/sharedRuntime_x86_64.cpp Sat May 06 00:05:32 2017 +0000 +++ b/hotspot/src/cpu/x86/vm/sharedRuntime_x86_64.cpp Fri May 05 19:28:54 2017 -0700 @@ -47,6 +47,7 @@ #if INCLUDE_JVMCI #include "jvmci/jvmciJavaClasses.hpp" #endif +#include "vm_version_x86.hpp" #define __ masm-> @@ -151,8 +152,8 @@ } #if defined(COMPILER2) || INCLUDE_JVMCI if (save_vectors) { - assert(UseAVX > 0, "up to 512bit vectors are supported with EVEX"); - assert(MaxVectorSize <= 64, "up to 512bit vectors are supported now"); + assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX"); + assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported"); } #else assert(!save_vectors, "vectors are generated only by C2 and JVMCI"); @@ -206,6 +207,7 @@ } } } + __ vzeroupper(); if (frame::arg_reg_save_area_bytes != 0) { // Allocate argument register save area __ subptr(rsp, frame::arg_reg_save_area_bytes); @@ -322,13 +324,15 @@ #if defined(COMPILER2) || INCLUDE_JVMCI if (restore_vectors) { - assert(UseAVX > 0, "up to 512bit vectors are supported with EVEX"); - assert(MaxVectorSize <= 64, "up to 512bit vectors are supported now"); + assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX"); + assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported"); } #else assert(!restore_vectors, "vectors are generated only by C2"); #endif + __ vzeroupper(); + // On EVEX enabled targets everything is handled in pop fpu state if (restore_vectors) { // Restore upper half of YMM registers (0..15) @@ -528,7 +532,7 @@ // align stack so push_CPU_state doesn't fault __ andptr(rsp, -(StackAlignmentInBytes)); __ push_CPU_state(); - + __ vzeroupper(); // VM needs caller's callsite // VM needs target method // This needs to be a long call since we will relocate this adapter to @@ -547,6 +551,7 @@ __ addptr(rsp, frame::arg_reg_save_area_bytes); } + __ vzeroupper(); __ pop_CPU_state(); // restore sp __ mov(rsp, r13); @@ -1465,7 +1470,6 @@ save_or_restore_arguments(masm, stack_slots, total_in_args, arg_save_area, NULL, in_regs, in_sig_bt); - __ bind(cont); #ifdef ASSERT if (StressCriticalJNINatives) { @@ -2485,6 +2489,7 @@ // preserved and correspond to the bcp/locals pointers. So we do a runtime call // by hand. // + __ vzeroupper(); save_native_result(masm, ret_type, stack_slots); __ mov(c_rarg0, r15_thread); __ mov(r12, rsp); // remember sp @@ -2658,7 +2663,7 @@ // If we haven't already saved the native result we must save it now as xmm registers // are still exposed. - + __ vzeroupper(); if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) { save_native_result(masm, ret_type, stack_slots); } @@ -2704,6 +2709,7 @@ // SLOW PATH Reguard the stack if needed __ bind(reguard); + __ vzeroupper(); save_native_result(masm, ret_type, stack_slots); __ mov(r12, rsp); // remember sp __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows diff -r 6de560f6c1ad -r 61025eecb743 hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp --- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp Sat May 06 00:05:32 2017 +0000 +++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp Fri May 05 19:28:54 2017 -0700 @@ -1012,6 +1012,7 @@ __ pop(rdi); __ pop(rsi); __ leave(); // required for proper stackwalking of RuntimeStub frame + __ vzeroupper(); __ xorptr(rax, rax); // return 0 __ ret(0); return start; @@ -1247,6 +1248,7 @@ } inc_copy_counter_np(T_LONG); __ leave(); // required for proper stackwalking of RuntimeStub frame + __ vzeroupper(); __ xorptr(rax, rax); // return 0 __ ret(0); return start; @@ -3365,6 +3367,7 @@ __ pop(rbx); __ pop(rdi); __ pop(rsi); + __ vzeroupper(); __ leave(); // required for proper stackwalking of RuntimeStub frame __ ret(0); @@ -3422,6 +3425,7 @@ __ pop(h); __ pop(g); __ pop(d); + __ vzeroupper(); __ leave(); // required for proper stackwalking of RuntimeStub frame __ ret(0); diff -r 6de560f6c1ad -r 61025eecb743 hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp --- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp Sat May 06 00:05:32 2017 +0000 +++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp Fri May 05 19:28:54 2017 -0700 @@ -402,6 +402,7 @@ __ addptr(rsp, -rsp_after_call_off * wordSize); // return + __ vzeroupper(); __ pop(rbp); __ ret(0); @@ -1554,6 +1555,7 @@ restore_arg_regs(); inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free __ xorptr(rax, rax); // return 0 + __ vzeroupper(); __ leave(); // required for proper stackwalking of RuntimeStub frame __ ret(0); @@ -1643,6 +1645,7 @@ restore_arg_regs(); inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free __ xorptr(rax, rax); // return 0 + __ vzeroupper(); __ leave(); // required for proper stackwalking of RuntimeStub frame __ ret(0); @@ -1652,6 +1655,7 @@ restore_arg_regs(); inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free __ xorptr(rax, rax); // return 0 + __ vzeroupper(); __ leave(); // required for proper stackwalking of RuntimeStub frame __ ret(0); @@ -1746,6 +1750,7 @@ restore_arg_regs(); inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free __ xorptr(rax, rax); // return 0 + __ vzeroupper(); __ leave(); // required for proper stackwalking of RuntimeStub frame __ ret(0); @@ -1771,6 +1776,7 @@ __ generate_fill(t, aligned, to, value, count, rax, xmm0); + __ vzeroupper(); __ leave(); // required for proper stackwalking of RuntimeStub frame __ ret(0); return start; @@ -1847,6 +1853,7 @@ restore_arg_regs(); inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free __ xorptr(rax, rax); // return 0 + __ vzeroupper(); __ leave(); // required for proper stackwalking of RuntimeStub frame __ ret(0); @@ -1856,6 +1863,7 @@ restore_arg_regs(); inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free __ xorptr(rax, rax); // return 0 + __ vzeroupper(); __ leave(); // required for proper stackwalking of RuntimeStub frame __ ret(0); @@ -1945,6 +1953,7 @@ } restore_arg_regs(); inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free + __ vzeroupper(); __ xorptr(rax, rax); // return 0 __ leave(); // required for proper stackwalking of RuntimeStub frame __ ret(0); @@ -2030,6 +2039,7 @@ restore_arg_regs(); inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free __ xorptr(rax, rax); // return 0 + __ vzeroupper(); __ leave(); // required for proper stackwalking of RuntimeStub frame __ ret(0); @@ -2043,6 +2053,7 @@ restore_arg_regs(); inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free __ xorptr(rax, rax); // return 0 + __ vzeroupper(); __ leave(); // required for proper stackwalking of RuntimeStub frame __ ret(0); @@ -2120,6 +2131,7 @@ restore_arg_regs(); inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free __ xorptr(rax, rax); // return 0 + __ vzeroupper(); __ leave(); // required for proper stackwalking of RuntimeStub frame __ ret(0); } @@ -2137,6 +2149,7 @@ } else { inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free } + __ vzeroupper(); __ xorptr(rax, rax); // return 0 __ leave(); // required for proper stackwalking of RuntimeStub frame __ ret(0); @@ -2203,6 +2216,7 @@ restore_arg_regs(); inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free __ xorptr(rax, rax); // return 0 + __ vzeroupper(); __ leave(); // required for proper stackwalking of RuntimeStub frame __ ret(0); } @@ -2220,6 +2234,7 @@ } else { inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free } + __ vzeroupper(); __ xorptr(rax, rax); // return 0 __ leave(); // required for proper stackwalking of RuntimeStub frame __ ret(0); @@ -3774,7 +3789,7 @@ buf, state, ofs, limit, rsp, multi_block, shuf_mask); } __ addptr(rsp, 4 * wordSize); - + __ vzeroupper(); __ leave(); __ ret(0); return start; @@ -3808,6 +3823,7 @@ __ sha512_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4, buf, state, ofs, limit, rsp, multi_block, shuf_mask); + __ vzeroupper(); __ leave(); __ ret(0); return start; @@ -4281,7 +4297,6 @@ __ BIND(L_exit); __ pshufb(xmm_temp6, xmm_temp10); // Byte swap 16-byte result __ movdqu(Address(state, 0), xmm_temp6); // store the result - __ leave(); __ ret(0); return start; @@ -4321,6 +4336,7 @@ __ kernel_crc32(crc, buf, len, table, tmp); __ movl(rax, crc); + __ vzeroupper(); __ leave(); // required for proper stackwalking of RuntimeStub frame __ ret(0); @@ -4380,6 +4396,7 @@ __ pop(z); __ pop(y); #endif + __ vzeroupper(); __ leave(); // required for proper stackwalking of RuntimeStub frame __ ret(0); @@ -4494,6 +4511,7 @@ __ vectorized_mismatch(obja, objb, length, scale, result, tmp1, tmp2, vec0, vec1, vec2); + __ vzeroupper(); __ leave(); __ ret(0); @@ -4618,7 +4636,7 @@ BLOCK_COMMENT("Entry:"); __ enter(); // required for proper stackwalking of RuntimeStub frame - __ fast_exp(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp); + __ fast_exp(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp); __ leave(); // required for proper stackwalking of RuntimeStub frame __ ret(0); diff -r 6de560f6c1ad -r 61025eecb743 hotspot/src/cpu/x86/vm/vmStructs_x86.hpp --- a/hotspot/src/cpu/x86/vm/vmStructs_x86.hpp Sat May 06 00:05:32 2017 +0000 +++ b/hotspot/src/cpu/x86/vm/vmStructs_x86.hpp Fri May 05 19:28:54 2017 -0700 @@ -74,6 +74,7 @@ declare_preprocessor_constant("VM_Version::CPU_AVX512BW", CPU_AVX512BW) \ declare_preprocessor_constant("VM_Version::CPU_AVX512VL", CPU_AVX512VL) \ declare_preprocessor_constant("VM_Version::CPU_SHA", CPU_SHA) \ - declare_preprocessor_constant("VM_Version::CPU_FMA", CPU_FMA) + declare_preprocessor_constant("VM_Version::CPU_FMA", CPU_FMA) \ + declare_preprocessor_constant("VM_Version::CPU_VZEROUPPER", CPU_VZEROUPPER) #endif // CPU_X86_VM_VMSTRUCTS_X86_HPP diff -r 6de560f6c1ad -r 61025eecb743 hotspot/src/cpu/x86/vm/vm_version_x86.cpp --- a/hotspot/src/cpu/x86/vm/vm_version_x86.cpp Sat May 06 00:05:32 2017 +0000 +++ b/hotspot/src/cpu/x86/vm/vm_version_x86.cpp Fri May 05 19:28:54 2017 -0700 @@ -436,14 +436,14 @@ __ movl(rax, 0x10000); __ andl(rax, Address(rsi, 4)); __ cmpl(rax, 0x10000); - __ jccb(Assembler::notEqual, legacy_save_restore); + __ jcc(Assembler::notEqual, legacy_save_restore); // check _cpuid_info.xem_xcr0_eax.bits.opmask // check _cpuid_info.xem_xcr0_eax.bits.zmm512 // check _cpuid_info.xem_xcr0_eax.bits.zmm32 __ movl(rax, 0xE0); __ andl(rax, Address(rbp, in_bytes(VM_Version::xem_xcr0_offset()))); // xcr0 bits sse | ymm __ cmpl(rax, 0xE0); - __ jccb(Assembler::notEqual, legacy_save_restore); + __ jcc(Assembler::notEqual, legacy_save_restore); // If UseAVX is unitialized or is set by the user to include EVEX if (use_evex) { @@ -469,11 +469,12 @@ __ evmovdqul(xmm7, Address(rsp, 0), Assembler::AVX_512bit); __ addptr(rsp, 64); #endif // _WINDOWS + generate_vzeroupper(wrapup); VM_Version::clean_cpuFeatures(); UseAVX = saved_useavx; UseSSE = saved_usesse; __ jmp(wrapup); - } + } __ bind(legacy_save_restore); // AVX check @@ -498,6 +499,7 @@ __ vmovdqu(xmm7, Address(rsp, 0)); __ addptr(rsp, 32); #endif // _WINDOWS + generate_vzeroupper(wrapup); VM_Version::clean_cpuFeatures(); UseAVX = saved_useavx; UseSSE = saved_usesse; @@ -513,6 +515,21 @@ return start; }; + void generate_vzeroupper(Label& L_wrapup) { +# define __ _masm-> + __ lea(rsi, Address(rbp, in_bytes(VM_Version::std_cpuid0_offset()))); + __ cmpl(Address(rsi, 4), 0x756e6547); // 'uneG' + __ jcc(Assembler::notEqual, L_wrapup); + __ movl(rcx, 0x0FFF0FF0); + __ lea(rsi, Address(rbp, in_bytes(VM_Version::std_cpuid1_offset()))); + __ andl(rcx, Address(rsi, 0)); + __ cmpl(rcx, 0x00050670); // If it is Xeon Phi 3200/5200/7200 + __ jcc(Assembler::equal, L_wrapup); + __ cmpl(rcx, 0x00080650); // If it is Future Xeon Phi + __ jcc(Assembler::equal, L_wrapup); + __ vzeroupper(); +# undef __ + } }; void VM_Version::get_processor_features() { @@ -619,8 +636,10 @@ if (UseAVX < 2) _features &= ~CPU_AVX2; - if (UseAVX < 1) + if (UseAVX < 1) { _features &= ~CPU_AVX; + _features &= ~CPU_VZEROUPPER; + } if (!UseAES && !FLAG_IS_DEFAULT(UseAES)) _features &= ~CPU_AES; @@ -630,6 +649,14 @@ _features &= ~CPU_HT; } + if( is_intel() ) { // Intel cpus specific settings + if ((cpu_family() == 0x06) && + ((extended_cpu_model() == 0x57) || // Xeon Phi 3200/5200/7200 + (extended_cpu_model() == 0x85))) { // Future Xeon Phi + _features &= ~CPU_VZEROUPPER; + } + } + char buf[256]; jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", cores_per_cpu(), threads_per_core(), @@ -918,16 +945,36 @@ warning("MaxVectorSize must be a power of 2"); FLAG_SET_DEFAULT(MaxVectorSize, 64); } - if (MaxVectorSize > 64) { - FLAG_SET_DEFAULT(MaxVectorSize, 64); - } - if (MaxVectorSize > 16 && (UseAVX == 0 || !os_supports_avx_vectors())) { - // 32 bytes vectors (in YMM) are only supported with AVX+ - FLAG_SET_DEFAULT(MaxVectorSize, 16); - } if (UseSSE < 2) { // Vectors (in XMM) are only supported with SSE2+ - FLAG_SET_DEFAULT(MaxVectorSize, 0); + if (MaxVectorSize > 0) { + if (!FLAG_IS_DEFAULT(MaxVectorSize)) + warning("MaxVectorSize must be 0"); + FLAG_SET_DEFAULT(MaxVectorSize, 0); + } + } + else if (UseAVX == 0 || !os_supports_avx_vectors()) { + // 32 bytes vectors (in YMM) are only supported with AVX+ + if (MaxVectorSize > 16) { + if (!FLAG_IS_DEFAULT(MaxVectorSize)) + warning("MaxVectorSize must be <= 16"); + FLAG_SET_DEFAULT(MaxVectorSize, 16); + } + } + else if (UseAVX == 1 || UseAVX == 2) { + // 64 bytes vectors (in ZMM) are only supported with AVX 3 + if (MaxVectorSize > 32) { + if (!FLAG_IS_DEFAULT(MaxVectorSize)) + warning("MaxVectorSize must be <= 32"); + FLAG_SET_DEFAULT(MaxVectorSize, 32); + } + } + else if (UseAVX > 2 ) { + if (MaxVectorSize > 64) { + if (!FLAG_IS_DEFAULT(MaxVectorSize)) + warning("MaxVectorSize must be <= 64"); + FLAG_SET_DEFAULT(MaxVectorSize, 64); + } } #if defined(COMPILER2) && defined(ASSERT) if (supports_avx() && PrintMiscellaneous && Verbose && TraceNewVectors) { diff -r 6de560f6c1ad -r 61025eecb743 hotspot/src/cpu/x86/vm/vm_version_x86.hpp --- a/hotspot/src/cpu/x86/vm/vm_version_x86.hpp Sat May 06 00:05:32 2017 +0000 +++ b/hotspot/src/cpu/x86/vm/vm_version_x86.hpp Fri May 05 19:28:54 2017 -0700 @@ -291,6 +291,7 @@ #define CPU_AVX512VL ((uint64_t)UCONST64(0x200000000)) // EVEX instructions with smaller vector length #define CPU_SHA ((uint64_t)UCONST64(0x400000000)) // SHA instructions #define CPU_FMA ((uint64_t)UCONST64(0x800000000)) // FMA instructions +#define CPU_VZEROUPPER ((uint64_t)UCONST64(0x1000000000)) // Vzeroupper instruction enum Extended_Family { // AMD @@ -468,6 +469,7 @@ _cpuid_info.xem_xcr0_eax.bits.sse != 0 && _cpuid_info.xem_xcr0_eax.bits.ymm != 0) { result |= CPU_AVX; + result |= CPU_VZEROUPPER; if (_cpuid_info.sef_cpuid7_ebx.bits.avx2 != 0) result |= CPU_AVX2; if (_cpuid_info.sef_cpuid7_ebx.bits.avx512f != 0 && @@ -605,8 +607,8 @@ static address cpuinfo_cont_addr() { return _cpuinfo_cont_addr; } static void clean_cpuFeatures() { _features = 0; } - static void set_avx_cpuFeatures() { _features = (CPU_SSE | CPU_SSE2 | CPU_AVX); } - static void set_evex_cpuFeatures() { _features = (CPU_AVX512F | CPU_SSE | CPU_SSE2 ); } + static void set_avx_cpuFeatures() { _features = (CPU_SSE | CPU_SSE2 | CPU_AVX | CPU_VZEROUPPER ); } + static void set_evex_cpuFeatures() { _features = (CPU_AVX512F | CPU_SSE | CPU_SSE2 | CPU_VZEROUPPER ); } // Initialization @@ -731,6 +733,8 @@ static bool supports_avxonly() { return ((supports_avx2() || supports_avx()) && !supports_evex()); } static bool supports_sha() { return (_features & CPU_SHA) != 0; } static bool supports_fma() { return (_features & CPU_FMA) != 0; } + static bool supports_vzeroupper() { return (_features & CPU_VZEROUPPER) != 0; } + // Intel features static bool is_intel_family_core() { return is_intel() && extended_cpu_family() == CPU_FAMILY_INTEL_CORE; } diff -r 6de560f6c1ad -r 61025eecb743 hotspot/src/cpu/x86/vm/x86_32.ad --- a/hotspot/src/cpu/x86/vm/x86_32.ad Sat May 06 00:05:32 2017 +0000 +++ b/hotspot/src/cpu/x86/vm/x86_32.ad Fri May 05 19:28:54 2017 -0700 @@ -290,7 +290,7 @@ if (C->in_24_bit_fp_mode()) { size += 6; // fldcw } - if (C->max_vector_size() > 16) { + if (VM_Version::supports_vzeroupper()) { size += 3; // vzeroupper } return size; @@ -1884,7 +1884,6 @@ } %} - enc_class pre_call_resets %{ // If method sets FPU control word restore it here debug_only(int off0 = cbuf.insts_size()); @@ -1892,12 +1891,10 @@ MacroAssembler _masm(&cbuf); __ fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std())); } - if (ra_->C->max_vector_size() > 16) { - // Clear upper bits of YMM registers when current compiled code uses - // wide vectors to avoid AVX <-> SSE transition penalty during call. - MacroAssembler _masm(&cbuf); - __ vzeroupper(); - } + // Clear upper bits of YMM registers when current compiled code uses + // wide vectors to avoid AVX <-> SSE transition penalty during call. + MacroAssembler _masm(&cbuf); + __ vzeroupper(); debug_only(int off1 = cbuf.insts_size()); assert(off1 - off0 == pre_call_resets_size(), "correct size prediction"); %} @@ -13072,7 +13069,7 @@ ins_cost(300); format %{ "CALL_LEAF_NOFP,runtime " %} opcode(0xE8); /* E8 cd */ - ins_encode(Java_To_Runtime(meth)); + ins_encode(pre_call_resets, Java_To_Runtime(meth)); ins_pipe( pipe_slow ); %} diff -r 6de560f6c1ad -r 61025eecb743 hotspot/src/cpu/x86/vm/x86_64.ad --- a/hotspot/src/cpu/x86/vm/x86_64.ad Sat May 06 00:05:32 2017 +0000 +++ b/hotspot/src/cpu/x86/vm/x86_64.ad Fri May 05 19:28:54 2017 -0700 @@ -536,7 +536,7 @@ #define __ _masm. static int clear_avx_size() { - return (Compile::current()->max_vector_size() > 16) ? 3 : 0; // vzeroupper + return (VM_Version::supports_vzeroupper()) ? 3: 0; // vzeroupper } // !!!!! Special hack to get all types of calls to specify the byte offset @@ -919,7 +919,7 @@ void MachEpilogNode::format(PhaseRegAlloc* ra_, outputStream* st) const { Compile* C = ra_->C; - if (C->max_vector_size() > 16) { + if (VM_Version::supports_vzeroupper()) { st->print("vzeroupper"); st->cr(); st->print("\t"); } @@ -955,11 +955,9 @@ Compile* C = ra_->C; MacroAssembler _masm(&cbuf); - if (C->max_vector_size() > 16) { - // Clear upper bits of YMM registers when current compiled code uses - // wide vectors to avoid AVX <-> SSE transition penalty during call. - __ vzeroupper(); - } + // Clear upper bits of YMM registers when current compiled code uses + // wide vectors to avoid AVX <-> SSE transition penalty during call. + __ vzeroupper(); int framesize = C->frame_size_in_bytes(); assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); @@ -2092,12 +2090,11 @@ enc_class clear_avx %{ debug_only(int off0 = cbuf.insts_size()); - if (ra_->C->max_vector_size() > 16) { - // Clear upper bits of YMM registers when current compiled code uses - // wide vectors to avoid AVX <-> SSE transition penalty during call. - MacroAssembler _masm(&cbuf); - __ vzeroupper(); - } + // Clear upper bits of YMM registers to avoid AVX <-> SSE transition penalty + // Clear upper bits of YMM registers when current compiled code uses + // wide vectors to avoid AVX <-> SSE transition penalty during call. + MacroAssembler _masm(&cbuf); + __ vzeroupper(); debug_only(int off1 = cbuf.insts_size()); assert(off1 - off0 == clear_avx_size(), "correct size prediction"); %} @@ -12116,7 +12113,7 @@ ins_cost(300); format %{ "call_leaf_nofp,runtime " %} - ins_encode(Java_To_Runtime(meth)); + ins_encode(clear_avx, Java_To_Runtime(meth)); ins_pipe(pipe_slow); %} diff -r 6de560f6c1ad -r 61025eecb743 hotspot/src/share/vm/jvmci/vmStructs_jvmci.cpp --- a/hotspot/src/share/vm/jvmci/vmStructs_jvmci.cpp Sat May 06 00:05:32 2017 +0000 +++ b/hotspot/src/share/vm/jvmci/vmStructs_jvmci.cpp Fri May 05 19:28:54 2017 -0700 @@ -719,7 +719,8 @@ declare_preprocessor_constant("VM_Version::CPU_AVX512BW", CPU_AVX512BW) \ declare_preprocessor_constant("VM_Version::CPU_AVX512VL", CPU_AVX512VL) \ declare_preprocessor_constant("VM_Version::CPU_SHA", CPU_SHA) \ - declare_preprocessor_constant("VM_Version::CPU_FMA", CPU_FMA) + declare_preprocessor_constant("VM_Version::CPU_FMA", CPU_FMA) \ + declare_preprocessor_constant("VM_Version::CPU_VZEROUPPER", CPU_VZEROUPPER) #endif