8178811: Minimize the AVX <-> SSE transition penalty through generation of vzeroupper instruction on x86
Reviewed-by: kvn
--- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp Sat May 06 00:05:32 2017 +0000
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp Fri May 05 19:28:54 2017 -0700
@@ -2103,12 +2103,20 @@
}
void Assembler::ldmxcsr( Address src) {
- NOT_LP64(assert(VM_Version::supports_sse(), ""));
- InstructionMark im(this);
- prefix(src);
- emit_int8(0x0F);
- emit_int8((unsigned char)0xAE);
- emit_operand(as_Register(2), src);
+ if (UseAVX > 0 ) { // UseAVX > 0: emit the VEX-prefixed form. NOTE(review): stmxcsr asserts VM_Version::supports_avx() in this branch, ldmxcsr does not
+ InstructionMark im(this);
+ InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
+ vex_prefix(src, 0, 0, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes); // VEX prefix takes the place of the legacy prefix(src) + 0x0F bytes
+ emit_int8((unsigned char)0xAE); // opcode byte, same as in the legacy branch
+ emit_operand(as_Register(2), src); // register number 2 is passed with the memory operand, same as in the legacy branch
+ } else { // UseAVX == 0: the encoding this function emitted before the patch
+ NOT_LP64(assert(VM_Version::supports_sse(), ""));
+ InstructionMark im(this);
+ prefix(src);
+ emit_int8(0x0F);
+ emit_int8((unsigned char)0xAE);
+ emit_operand(as_Register(2), src);
+ }
}
void Assembler::leal(Register dst, Address src) {
@@ -4416,12 +4424,21 @@
}
void Assembler::stmxcsr( Address dst) {
- NOT_LP64(assert(VM_Version::supports_sse(), ""));
- InstructionMark im(this);
- prefix(dst);
- emit_int8(0x0F);
- emit_int8((unsigned char)0xAE);
- emit_operand(as_Register(3), dst);
+ if (UseAVX > 0 ) { // UseAVX > 0: emit the VEX-prefixed form
+ assert(VM_Version::supports_avx(), "");
+ InstructionMark im(this);
+ InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
+ vex_prefix(dst, 0, 0, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes); // VEX prefix takes the place of the legacy prefix(dst) + 0x0F bytes
+ emit_int8((unsigned char)0xAE); // opcode byte, same as in the legacy branch
+ emit_operand(as_Register(3), dst); // register number 3 is passed with the memory operand, same as in the legacy branch
+ } else { // UseAVX == 0: the encoding this function emitted before the patch
+ NOT_LP64(assert(VM_Version::supports_sse(), ""));
+ InstructionMark im(this);
+ prefix(dst);
+ emit_int8(0x0F);
+ emit_int8((unsigned char)0xAE);
+ emit_operand(as_Register(3), dst);
+ }
}
void Assembler::subl(Address dst, int32_t imm32) {
@@ -6620,10 +6637,11 @@
}
void Assembler::vzeroupper() {
- assert(VM_Version::supports_avx(), "");
- InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
- (void)vex_prefix_and_encode(0, 0, 0, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
- emit_int8(0x77);
+ if (VM_Version::supports_vzeroupper()) { // replaces the supports_avx() assert: emits nothing when CPU_VZEROUPPER is not set, so call sites need no guard
+ InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
+ (void)vex_prefix_and_encode(0, 0, 0, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes); // return value is deliberately discarded
+ emit_int8(0x77); // opcode byte following the VEX prefix
+ }
}
#ifndef _LP64
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp Sat May 06 00:05:32 2017 +0000
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp Fri May 05 19:28:54 2017 -0700
@@ -763,11 +763,13 @@
// Always clear the pc because it could have been set by make_walkable()
movptr(Address(r15_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
+ vzeroupper();
}
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
Register last_java_fp,
address last_java_pc) {
+ vzeroupper();
// determine last_java_sp register
if (!last_java_sp->is_valid()) {
last_java_sp = rsp;
@@ -3672,6 +3674,7 @@
// Always clear the pc because it could have been set by make_walkable()
movptr(Address(java_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
+ vzeroupper();
}
void MacroAssembler::restore_rax(Register tmp) {
@@ -3714,6 +3717,7 @@
Register last_java_sp,
Register last_java_fp,
address last_java_pc) {
+ vzeroupper();
// determine java_thread register
if (!java_thread->is_valid()) {
java_thread = rdi;
@@ -6524,10 +6528,8 @@
call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
}
}
- if (VM_Version::supports_avx()) {
- // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
- vzeroupper();
- }
+ // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
+ vzeroupper();
#ifndef _LP64
// Either restore the x87 floating pointer control word after returning
--- a/hotspot/src/cpu/x86/vm/sharedRuntime_x86_32.cpp Sat May 06 00:05:32 2017 +0000
+++ b/hotspot/src/cpu/x86/vm/sharedRuntime_x86_32.cpp Fri May 05 19:28:54 2017 -0700
@@ -41,6 +41,7 @@
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
+#include "vm_version_x86.hpp"
#define __ masm->
@@ -120,8 +121,8 @@
int zmm_bytes = num_xmm_regs * 32;
#ifdef COMPILER2
if (save_vectors) {
- assert(UseAVX > 0, "up to 512bit vectors are supported with EVEX");
- assert(MaxVectorSize <= 64, "up to 512bit vectors are supported now");
+ assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
+ assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
// Save upper half of YMM registers
int vect_bytes = ymm_bytes;
if (UseAVX > 2) {
@@ -219,6 +220,7 @@
}
}
}
+ __ vzeroupper();
// Set an oopmap for the call site. This oopmap will map all
// oop-registers and debug-info registers as callee-saved. This
@@ -269,8 +271,8 @@
int additional_frame_bytes = 0;
#ifdef COMPILER2
if (restore_vectors) {
- assert(UseAVX > 0, "up to 512bit vectors are supported with EVEX");
- assert(MaxVectorSize <= 64, "up to 512bit vectors are supported now");
+ assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
+ assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
// Save upper half of YMM registers
additional_frame_bytes = ymm_bytes;
if (UseAVX > 2) {
@@ -285,6 +287,8 @@
int off = xmm0_off;
int delta = xmm1_off - off;
+ __ vzeroupper();
+
if (UseSSE == 1) {
// Restore XMM registers
assert(additional_frame_bytes == 0, "");
@@ -2123,6 +2127,8 @@
// preserved and correspond to the bcp/locals pointers. So we do a runtime call
// by hand.
//
+ __ vzeroupper();
+
save_native_result(masm, ret_type, stack_slots);
__ push(thread);
if (!is_critical_native) {
@@ -2304,7 +2310,7 @@
// BEGIN Slow path unlock
__ bind(slow_path_unlock);
-
+ __ vzeroupper();
// Slow path unlock
if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
@@ -2349,6 +2355,7 @@
// SLOW PATH Reguard the stack if needed
__ bind(reguard);
+ __ vzeroupper();
save_native_result(masm, ret_type, stack_slots);
{
__ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
--- a/hotspot/src/cpu/x86/vm/sharedRuntime_x86_64.cpp Sat May 06 00:05:32 2017 +0000
+++ b/hotspot/src/cpu/x86/vm/sharedRuntime_x86_64.cpp Fri May 05 19:28:54 2017 -0700
@@ -47,6 +47,7 @@
#if INCLUDE_JVMCI
#include "jvmci/jvmciJavaClasses.hpp"
#endif
+#include "vm_version_x86.hpp"
#define __ masm->
@@ -151,8 +152,8 @@
}
#if defined(COMPILER2) || INCLUDE_JVMCI
if (save_vectors) {
- assert(UseAVX > 0, "up to 512bit vectors are supported with EVEX");
- assert(MaxVectorSize <= 64, "up to 512bit vectors are supported now");
+ assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
+ assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
}
#else
assert(!save_vectors, "vectors are generated only by C2 and JVMCI");
@@ -206,6 +207,7 @@
}
}
}
+ __ vzeroupper();
if (frame::arg_reg_save_area_bytes != 0) {
// Allocate argument register save area
__ subptr(rsp, frame::arg_reg_save_area_bytes);
@@ -322,13 +324,15 @@
#if defined(COMPILER2) || INCLUDE_JVMCI
if (restore_vectors) {
- assert(UseAVX > 0, "up to 512bit vectors are supported with EVEX");
- assert(MaxVectorSize <= 64, "up to 512bit vectors are supported now");
+ assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
+ assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
}
#else
assert(!restore_vectors, "vectors are generated only by C2");
#endif
+ __ vzeroupper();
+
// On EVEX enabled targets everything is handled in pop fpu state
if (restore_vectors) {
// Restore upper half of YMM registers (0..15)
@@ -528,7 +532,7 @@
// align stack so push_CPU_state doesn't fault
__ andptr(rsp, -(StackAlignmentInBytes));
__ push_CPU_state();
-
+ __ vzeroupper();
// VM needs caller's callsite
// VM needs target method
// This needs to be a long call since we will relocate this adapter to
@@ -547,6 +551,7 @@
__ addptr(rsp, frame::arg_reg_save_area_bytes);
}
+ __ vzeroupper();
__ pop_CPU_state();
// restore sp
__ mov(rsp, r13);
@@ -1465,7 +1470,6 @@
save_or_restore_arguments(masm, stack_slots, total_in_args,
arg_save_area, NULL, in_regs, in_sig_bt);
-
__ bind(cont);
#ifdef ASSERT
if (StressCriticalJNINatives) {
@@ -2485,6 +2489,7 @@
// preserved and correspond to the bcp/locals pointers. So we do a runtime call
// by hand.
//
+ __ vzeroupper();
save_native_result(masm, ret_type, stack_slots);
__ mov(c_rarg0, r15_thread);
__ mov(r12, rsp); // remember sp
@@ -2658,7 +2663,7 @@
// If we haven't already saved the native result we must save it now as xmm registers
// are still exposed.
-
+ __ vzeroupper();
if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
save_native_result(masm, ret_type, stack_slots);
}
@@ -2704,6 +2709,7 @@
// SLOW PATH Reguard the stack if needed
__ bind(reguard);
+ __ vzeroupper();
save_native_result(masm, ret_type, stack_slots);
__ mov(r12, rsp); // remember sp
__ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp Sat May 06 00:05:32 2017 +0000
+++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp Fri May 05 19:28:54 2017 -0700
@@ -1012,6 +1012,7 @@
__ pop(rdi);
__ pop(rsi);
__ leave(); // required for proper stackwalking of RuntimeStub frame
+ __ vzeroupper();
__ xorptr(rax, rax); // return 0
__ ret(0);
return start;
@@ -1247,6 +1248,7 @@
}
inc_copy_counter_np(T_LONG);
__ leave(); // required for proper stackwalking of RuntimeStub frame
+ __ vzeroupper();
__ xorptr(rax, rax); // return 0
__ ret(0);
return start;
@@ -3365,6 +3367,7 @@
__ pop(rbx);
__ pop(rdi);
__ pop(rsi);
+ __ vzeroupper();
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
@@ -3422,6 +3425,7 @@
__ pop(h);
__ pop(g);
__ pop(d);
+ __ vzeroupper();
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp Sat May 06 00:05:32 2017 +0000
+++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp Fri May 05 19:28:54 2017 -0700
@@ -402,6 +402,7 @@
__ addptr(rsp, -rsp_after_call_off * wordSize);
// return
+ __ vzeroupper();
__ pop(rbp);
__ ret(0);
@@ -1554,6 +1555,7 @@
restore_arg_regs();
inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
__ xorptr(rax, rax); // return 0
+ __ vzeroupper();
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
@@ -1643,6 +1645,7 @@
restore_arg_regs();
inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
__ xorptr(rax, rax); // return 0
+ __ vzeroupper();
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
@@ -1652,6 +1655,7 @@
restore_arg_regs();
inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
__ xorptr(rax, rax); // return 0
+ __ vzeroupper();
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
@@ -1746,6 +1750,7 @@
restore_arg_regs();
inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
__ xorptr(rax, rax); // return 0
+ __ vzeroupper();
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
@@ -1771,6 +1776,7 @@
__ generate_fill(t, aligned, to, value, count, rax, xmm0);
+ __ vzeroupper();
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
return start;
@@ -1847,6 +1853,7 @@
restore_arg_regs();
inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
__ xorptr(rax, rax); // return 0
+ __ vzeroupper();
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
@@ -1856,6 +1863,7 @@
restore_arg_regs();
inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
__ xorptr(rax, rax); // return 0
+ __ vzeroupper();
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
@@ -1945,6 +1953,7 @@
}
restore_arg_regs();
inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
+ __ vzeroupper();
__ xorptr(rax, rax); // return 0
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
@@ -2030,6 +2039,7 @@
restore_arg_regs();
inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
__ xorptr(rax, rax); // return 0
+ __ vzeroupper();
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
@@ -2043,6 +2053,7 @@
restore_arg_regs();
inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
__ xorptr(rax, rax); // return 0
+ __ vzeroupper();
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
@@ -2120,6 +2131,7 @@
restore_arg_regs();
inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
__ xorptr(rax, rax); // return 0
+ __ vzeroupper();
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
}
@@ -2137,6 +2149,7 @@
} else {
inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
}
+ __ vzeroupper();
__ xorptr(rax, rax); // return 0
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
@@ -2203,6 +2216,7 @@
restore_arg_regs();
inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
__ xorptr(rax, rax); // return 0
+ __ vzeroupper();
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
}
@@ -2220,6 +2234,7 @@
} else {
inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
}
+ __ vzeroupper();
__ xorptr(rax, rax); // return 0
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
@@ -3774,7 +3789,7 @@
buf, state, ofs, limit, rsp, multi_block, shuf_mask);
}
__ addptr(rsp, 4 * wordSize);
-
+ __ vzeroupper();
__ leave();
__ ret(0);
return start;
@@ -3808,6 +3823,7 @@
__ sha512_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
buf, state, ofs, limit, rsp, multi_block, shuf_mask);
+ __ vzeroupper();
__ leave();
__ ret(0);
return start;
@@ -4281,7 +4297,6 @@
__ BIND(L_exit);
__ pshufb(xmm_temp6, xmm_temp10); // Byte swap 16-byte result
__ movdqu(Address(state, 0), xmm_temp6); // store the result
-
__ leave();
__ ret(0);
return start;
@@ -4321,6 +4336,7 @@
__ kernel_crc32(crc, buf, len, table, tmp);
__ movl(rax, crc);
+ __ vzeroupper();
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
@@ -4380,6 +4396,7 @@
__ pop(z);
__ pop(y);
#endif
+ __ vzeroupper();
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
@@ -4494,6 +4511,7 @@
__ vectorized_mismatch(obja, objb, length, scale, result, tmp1, tmp2, vec0, vec1, vec2);
+ __ vzeroupper();
__ leave();
__ ret(0);
@@ -4618,7 +4636,7 @@
BLOCK_COMMENT("Entry:");
__ enter(); // required for proper stackwalking of RuntimeStub frame
- __ fast_exp(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
+ __ fast_exp(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
--- a/hotspot/src/cpu/x86/vm/vmStructs_x86.hpp Sat May 06 00:05:32 2017 +0000
+++ b/hotspot/src/cpu/x86/vm/vmStructs_x86.hpp Fri May 05 19:28:54 2017 -0700
@@ -74,6 +74,7 @@
declare_preprocessor_constant("VM_Version::CPU_AVX512BW", CPU_AVX512BW) \
declare_preprocessor_constant("VM_Version::CPU_AVX512VL", CPU_AVX512VL) \
declare_preprocessor_constant("VM_Version::CPU_SHA", CPU_SHA) \
- declare_preprocessor_constant("VM_Version::CPU_FMA", CPU_FMA)
+ declare_preprocessor_constant("VM_Version::CPU_FMA", CPU_FMA) \
+ declare_preprocessor_constant("VM_Version::CPU_VZEROUPPER", CPU_VZEROUPPER)
#endif // CPU_X86_VM_VMSTRUCTS_X86_HPP
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.cpp Sat May 06 00:05:32 2017 +0000
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.cpp Fri May 05 19:28:54 2017 -0700
@@ -436,14 +436,14 @@
__ movl(rax, 0x10000);
__ andl(rax, Address(rsi, 4));
__ cmpl(rax, 0x10000);
- __ jccb(Assembler::notEqual, legacy_save_restore);
+ __ jcc(Assembler::notEqual, legacy_save_restore);
// check _cpuid_info.xem_xcr0_eax.bits.opmask
// check _cpuid_info.xem_xcr0_eax.bits.zmm512
// check _cpuid_info.xem_xcr0_eax.bits.zmm32
__ movl(rax, 0xE0);
__ andl(rax, Address(rbp, in_bytes(VM_Version::xem_xcr0_offset()))); // xcr0 bits sse | ymm
__ cmpl(rax, 0xE0);
- __ jccb(Assembler::notEqual, legacy_save_restore);
+ __ jcc(Assembler::notEqual, legacy_save_restore);
// If UseAVX is unitialized or is set by the user to include EVEX
if (use_evex) {
@@ -469,11 +469,12 @@
__ evmovdqul(xmm7, Address(rsp, 0), Assembler::AVX_512bit);
__ addptr(rsp, 64);
#endif // _WINDOWS
+ generate_vzeroupper(wrapup);
VM_Version::clean_cpuFeatures();
UseAVX = saved_useavx;
UseSSE = saved_usesse;
__ jmp(wrapup);
- }
+ }
__ bind(legacy_save_restore);
// AVX check
@@ -498,6 +499,7 @@
__ vmovdqu(xmm7, Address(rsp, 0));
__ addptr(rsp, 32);
#endif // _WINDOWS
+ generate_vzeroupper(wrapup);
VM_Version::clean_cpuFeatures();
UseAVX = saved_useavx;
UseSSE = saved_usesse;
@@ -513,6 +515,21 @@
return start;
};
+ void generate_vzeroupper(Label& L_wrapup) { // emits a vzeroupper guarded by run-time CPUID checks that branch to L_wrapup instead
+# define __ _masm->
+ __ lea(rsi, Address(rbp, in_bytes(VM_Version::std_cpuid0_offset()))); // rsi = rbp-relative address of the std_cpuid0 data
+ __ cmpl(Address(rsi, 4), 0x756e6547); // 'uneG', i.e. the bytes "Genu" read as a little-endian dword
+ __ jcc(Assembler::notEqual, L_wrapup); // dword at offset 4 differs: go to L_wrapup without executing vzeroupper
+ __ movl(rcx, 0x0FFF0FF0); // mask applied to the first std_cpuid1 dword before the compares below
+ __ lea(rsi, Address(rbp, in_bytes(VM_Version::std_cpuid1_offset()))); // rsi = rbp-relative address of the std_cpuid1 data
+ __ andl(rcx, Address(rsi, 0)); // rcx = masked first dword of std_cpuid1
+ __ cmpl(rcx, 0x00050670); // If it is Xeon Phi 3200/5200/7200
+ __ jcc(Assembler::equal, L_wrapup); // match: go to L_wrapup without executing vzeroupper
+ __ cmpl(rcx, 0x00080650); // If it is Future Xeon Phi
+ __ jcc(Assembler::equal, L_wrapup); // match: go to L_wrapup without executing vzeroupper
+ __ vzeroupper(); // reached only when neither Xeon Phi value matched
+# undef __
+ }
};
void VM_Version::get_processor_features() {
@@ -619,8 +636,10 @@
if (UseAVX < 2)
_features &= ~CPU_AVX2;
- if (UseAVX < 1)
+ if (UseAVX < 1) {
_features &= ~CPU_AVX;
+ _features &= ~CPU_VZEROUPPER;
+ }
if (!UseAES && !FLAG_IS_DEFAULT(UseAES))
_features &= ~CPU_AES;
@@ -630,6 +649,14 @@
_features &= ~CPU_HT;
}
+ if( is_intel() ) { // Intel cpus specific settings
+ if ((cpu_family() == 0x06) &&
+ ((extended_cpu_model() == 0x57) || // Xeon Phi 3200/5200/7200
+ (extended_cpu_model() == 0x85))) { // Future Xeon Phi
+ _features &= ~CPU_VZEROUPPER;
+ }
+ }
+
char buf[256];
jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
cores_per_cpu(), threads_per_core(),
@@ -918,16 +945,36 @@
warning("MaxVectorSize must be a power of 2");
FLAG_SET_DEFAULT(MaxVectorSize, 64);
}
- if (MaxVectorSize > 64) {
- FLAG_SET_DEFAULT(MaxVectorSize, 64);
- }
- if (MaxVectorSize > 16 && (UseAVX == 0 || !os_supports_avx_vectors())) {
- // 32 bytes vectors (in YMM) are only supported with AVX+
- FLAG_SET_DEFAULT(MaxVectorSize, 16);
- }
if (UseSSE < 2) {
// Vectors (in XMM) are only supported with SSE2+
- FLAG_SET_DEFAULT(MaxVectorSize, 0);
+ if (MaxVectorSize > 0) {
+ if (!FLAG_IS_DEFAULT(MaxVectorSize))
+ warning("MaxVectorSize must be 0");
+ FLAG_SET_DEFAULT(MaxVectorSize, 0);
+ }
+ }
+ else if (UseAVX == 0 || !os_supports_avx_vectors()) {
+ // 32 bytes vectors (in YMM) are only supported with AVX+
+ if (MaxVectorSize > 16) {
+ if (!FLAG_IS_DEFAULT(MaxVectorSize))
+ warning("MaxVectorSize must be <= 16");
+ FLAG_SET_DEFAULT(MaxVectorSize, 16);
+ }
+ }
+ else if (UseAVX == 1 || UseAVX == 2) {
+ // 64 bytes vectors (in ZMM) are only supported with AVX 3
+ if (MaxVectorSize > 32) {
+ if (!FLAG_IS_DEFAULT(MaxVectorSize))
+ warning("MaxVectorSize must be <= 32");
+ FLAG_SET_DEFAULT(MaxVectorSize, 32);
+ }
+ }
+ else if (UseAVX > 2 ) {
+ if (MaxVectorSize > 64) {
+ if (!FLAG_IS_DEFAULT(MaxVectorSize))
+ warning("MaxVectorSize must be <= 64");
+ FLAG_SET_DEFAULT(MaxVectorSize, 64);
+ }
}
#if defined(COMPILER2) && defined(ASSERT)
if (supports_avx() && PrintMiscellaneous && Verbose && TraceNewVectors) {
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.hpp Sat May 06 00:05:32 2017 +0000
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.hpp Fri May 05 19:28:54 2017 -0700
@@ -291,6 +291,7 @@
#define CPU_AVX512VL ((uint64_t)UCONST64(0x200000000)) // EVEX instructions with smaller vector length
#define CPU_SHA ((uint64_t)UCONST64(0x400000000)) // SHA instructions
#define CPU_FMA ((uint64_t)UCONST64(0x800000000)) // FMA instructions
+#define CPU_VZEROUPPER ((uint64_t)UCONST64(0x1000000000)) // Vzeroupper instruction
enum Extended_Family {
// AMD
@@ -468,6 +469,7 @@
_cpuid_info.xem_xcr0_eax.bits.sse != 0 &&
_cpuid_info.xem_xcr0_eax.bits.ymm != 0) {
result |= CPU_AVX;
+ result |= CPU_VZEROUPPER;
if (_cpuid_info.sef_cpuid7_ebx.bits.avx2 != 0)
result |= CPU_AVX2;
if (_cpuid_info.sef_cpuid7_ebx.bits.avx512f != 0 &&
@@ -605,8 +607,8 @@
static address cpuinfo_cont_addr() { return _cpuinfo_cont_addr; }
static void clean_cpuFeatures() { _features = 0; }
- static void set_avx_cpuFeatures() { _features = (CPU_SSE | CPU_SSE2 | CPU_AVX); }
- static void set_evex_cpuFeatures() { _features = (CPU_AVX512F | CPU_SSE | CPU_SSE2 ); }
+ static void set_avx_cpuFeatures() { _features = (CPU_SSE | CPU_SSE2 | CPU_AVX | CPU_VZEROUPPER ); } // CPU_VZEROUPPER added: Assembler::vzeroupper() emits code only when this bit is set
+ static void set_evex_cpuFeatures() { _features = (CPU_AVX512F | CPU_SSE | CPU_SSE2 | CPU_VZEROUPPER ); } // CPU_VZEROUPPER added: Assembler::vzeroupper() emits code only when this bit is set
// Initialization
@@ -731,6 +733,8 @@
static bool supports_avxonly() { return ((supports_avx2() || supports_avx()) && !supports_evex()); }
static bool supports_sha() { return (_features & CPU_SHA) != 0; }
static bool supports_fma() { return (_features & CPU_FMA) != 0; }
+ static bool supports_vzeroupper() { return (_features & CPU_VZEROUPPER) != 0; } // bit is set together with CPU_AVX; get_processor_features() clears it when UseAVX < 1 and for the listed Xeon Phi models
+
// Intel features
static bool is_intel_family_core() { return is_intel() &&
extended_cpu_family() == CPU_FAMILY_INTEL_CORE; }
--- a/hotspot/src/cpu/x86/vm/x86_32.ad Sat May 06 00:05:32 2017 +0000
+++ b/hotspot/src/cpu/x86/vm/x86_32.ad Fri May 05 19:28:54 2017 -0700
@@ -290,7 +290,7 @@
if (C->in_24_bit_fp_mode()) {
size += 6; // fldcw
}
- if (C->max_vector_size() > 16) {
+ if (VM_Version::supports_vzeroupper()) {
size += 3; // vzeroupper
}
return size;
@@ -1884,7 +1884,6 @@
}
%}
-
enc_class pre_call_resets %{
// If method sets FPU control word restore it here
debug_only(int off0 = cbuf.insts_size());
@@ -1892,12 +1891,10 @@
MacroAssembler _masm(&cbuf);
__ fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std()));
}
- if (ra_->C->max_vector_size() > 16) {
- // Clear upper bits of YMM registers when current compiled code uses
- // wide vectors to avoid AVX <-> SSE transition penalty during call.
- MacroAssembler _masm(&cbuf);
- __ vzeroupper();
- }
+ // Clear upper bits of YMM registers to avoid AVX <-> SSE transition
+ // penalty during call (no longer limited to code using wide vectors).
+ MacroAssembler _masm(&cbuf);
+ __ vzeroupper();
debug_only(int off1 = cbuf.insts_size());
assert(off1 - off0 == pre_call_resets_size(), "correct size prediction");
%}
@@ -13072,7 +13069,7 @@
ins_cost(300);
format %{ "CALL_LEAF_NOFP,runtime " %}
opcode(0xE8); /* E8 cd */
- ins_encode(Java_To_Runtime(meth));
+ ins_encode(pre_call_resets, Java_To_Runtime(meth));
ins_pipe( pipe_slow );
%}
--- a/hotspot/src/cpu/x86/vm/x86_64.ad Sat May 06 00:05:32 2017 +0000
+++ b/hotspot/src/cpu/x86/vm/x86_64.ad Fri May 05 19:28:54 2017 -0700
@@ -536,7 +536,7 @@
#define __ _masm.
static int clear_avx_size() {
- return (Compile::current()->max_vector_size() > 16) ? 3 : 0; // vzeroupper
+ return (VM_Version::supports_vzeroupper()) ? 3: 0; // predicted size of vzeroupper; 0 when Assembler::vzeroupper() emits nothing
}
// !!!!! Special hack to get all types of calls to specify the byte offset
@@ -919,7 +919,7 @@
void MachEpilogNode::format(PhaseRegAlloc* ra_, outputStream* st) const
{
Compile* C = ra_->C;
- if (C->max_vector_size() > 16) {
+ if (VM_Version::supports_vzeroupper()) {
st->print("vzeroupper");
st->cr(); st->print("\t");
}
@@ -955,11 +955,9 @@
Compile* C = ra_->C;
MacroAssembler _masm(&cbuf);
- if (C->max_vector_size() > 16) {
- // Clear upper bits of YMM registers when current compiled code uses
- // wide vectors to avoid AVX <-> SSE transition penalty during call.
- __ vzeroupper();
- }
+ // Clear upper bits of YMM registers to avoid AVX <-> SSE transition
+ // penalty (no longer limited to compiled code that uses wide vectors).
+ __ vzeroupper();
int framesize = C->frame_size_in_bytes();
assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
@@ -2092,12 +2090,11 @@
enc_class clear_avx %{
debug_only(int off0 = cbuf.insts_size());
- if (ra_->C->max_vector_size() > 16) {
- // Clear upper bits of YMM registers when current compiled code uses
- // wide vectors to avoid AVX <-> SSE transition penalty during call.
- MacroAssembler _masm(&cbuf);
- __ vzeroupper();
- }
+ // Clear upper bits of YMM registers to avoid AVX <-> SSE transition penalty
+ // during call. This is no longer limited to code that uses wide vectors;
+ // the number of bytes emitted must equal clear_avx_size() (asserted below).
+ MacroAssembler _masm(&cbuf);
+ __ vzeroupper();
debug_only(int off1 = cbuf.insts_size());
assert(off1 - off0 == clear_avx_size(), "correct size prediction");
%}
@@ -12116,7 +12113,7 @@
ins_cost(300);
format %{ "call_leaf_nofp,runtime " %}
- ins_encode(Java_To_Runtime(meth));
+ ins_encode(clear_avx, Java_To_Runtime(meth));
ins_pipe(pipe_slow);
%}
--- a/hotspot/src/share/vm/jvmci/vmStructs_jvmci.cpp Sat May 06 00:05:32 2017 +0000
+++ b/hotspot/src/share/vm/jvmci/vmStructs_jvmci.cpp Fri May 05 19:28:54 2017 -0700
@@ -719,7 +719,8 @@
declare_preprocessor_constant("VM_Version::CPU_AVX512BW", CPU_AVX512BW) \
declare_preprocessor_constant("VM_Version::CPU_AVX512VL", CPU_AVX512VL) \
declare_preprocessor_constant("VM_Version::CPU_SHA", CPU_SHA) \
- declare_preprocessor_constant("VM_Version::CPU_FMA", CPU_FMA)
+ declare_preprocessor_constant("VM_Version::CPU_FMA", CPU_FMA) \
+ declare_preprocessor_constant("VM_Version::CPU_VZEROUPPER", CPU_VZEROUPPER)
#endif