8178811: Minimize the AVX <-> SSE transition penalty through generation of vzeroupper instruction on x86
author	vdeshpande
Fri, 05 May 2017 19:28:54 -0700
changeset 46440 61025eecb743
parent 46439 6de560f6c1ad
child 46441 514c0eddaccc
8178811: Minimize the AVX <-> SSE transition penalty through generation of vzeroupper instruction on x86
Reviewed-by: kvn
hotspot/src/cpu/x86/vm/assembler_x86.cpp
hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp
hotspot/src/cpu/x86/vm/sharedRuntime_x86_32.cpp
hotspot/src/cpu/x86/vm/sharedRuntime_x86_64.cpp
hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp
hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp
hotspot/src/cpu/x86/vm/vmStructs_x86.hpp
hotspot/src/cpu/x86/vm/vm_version_x86.cpp
hotspot/src/cpu/x86/vm/vm_version_x86.hpp
hotspot/src/cpu/x86/vm/x86_32.ad
hotspot/src/cpu/x86/vm/x86_64.ad
hotspot/src/share/vm/jvmci/vmStructs_jvmci.cpp
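
In short: mixing 256-bit AVX (VEX-encoded) instructions with legacy-encoded SSE instructions while the upper 128 bits of the YMM registers are dirty incurs a transition penalty on many Intel cores; executing vzeroupper at the boundary clears the upper halves and avoids it. This changeset makes Assembler::vzeroupper() a self-guarded no-op keyed off the new VM_Version::supports_vzeroupper() feature, then emits it unconditionally at the points where JIT-compiled or stub code hands control to possibly SSE-only code: VM runtime calls, native wrappers, stub epilogues, and call sites in the .ad files. A minimal user-level illustration of the pattern, not part of the changeset (the legacy-encoded side in HotSpot's case is code built without AVX, such as the VM's own C++ runtime or native libraries):

    #include <immintrin.h>

    // Illustration only; requires a compiler targeting AVX (e.g. -mavx).
    // The 256-bit work below leaves the upper halves of the YMM registers
    // live; if the caller then executes legacy-encoded SSE code, many Intel
    // cores pay an AVX <-> SSE transition penalty. _mm256_zeroupper() emits
    // exactly the vzeroupper instruction this changeset places at such
    // boundaries.
    double sum4_then_handoff(const double* a, const double* b) {
      __m256d v = _mm256_add_pd(_mm256_loadu_pd(a), _mm256_loadu_pd(b)); // YMM uppers become dirty
      double out[4];
      _mm256_storeu_pd(out, v);
      _mm256_zeroupper();   // clear the YMM upper halves before handing control back
      return out[0] + out[1] + out[2] + out[3];
    }
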
--- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp	Sat May 06 00:05:32 2017 +0000
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp	Fri May 05 19:28:54 2017 -0700
@@ -2103,12 +2103,20 @@
 }
 
 void Assembler::ldmxcsr( Address src) {
-  NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  InstructionMark im(this);
-  prefix(src);
-  emit_int8(0x0F);
-  emit_int8((unsigned char)0xAE);
-  emit_operand(as_Register(2), src);
+  if (UseAVX > 0 ) {
+    InstructionMark im(this);
+    InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
+    vex_prefix(src, 0, 0, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
+    emit_int8((unsigned char)0xAE);
+    emit_operand(as_Register(2), src);
+  } else {
+    NOT_LP64(assert(VM_Version::supports_sse(), ""));
+    InstructionMark im(this);
+    prefix(src);
+    emit_int8(0x0F);
+    emit_int8((unsigned char)0xAE);
+    emit_operand(as_Register(2), src);
+  }
 }
 
 void Assembler::leal(Register dst, Address src) {
@@ -4416,12 +4424,21 @@
 }
 
 void Assembler::stmxcsr( Address dst) {
-  NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  InstructionMark im(this);
-  prefix(dst);
-  emit_int8(0x0F);
-  emit_int8((unsigned char)0xAE);
-  emit_operand(as_Register(3), dst);
+  if (UseAVX > 0 ) {
+    assert(VM_Version::supports_avx(), "");
+    InstructionMark im(this);
+    InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
+    vex_prefix(dst, 0, 0, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
+    emit_int8((unsigned char)0xAE);
+    emit_operand(as_Register(3), dst);
+  } else {
+    NOT_LP64(assert(VM_Version::supports_sse(), ""));
+    InstructionMark im(this);
+    prefix(dst);
+    emit_int8(0x0F);
+    emit_int8((unsigned char)0xAE);
+    emit_operand(as_Register(3), dst);
+  }
 }
 
 void Assembler::subl(Address dst, int32_t imm32) {
@@ -6620,10 +6637,11 @@
 }
 
 void Assembler::vzeroupper() {
-  assert(VM_Version::supports_avx(), "");
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
-  (void)vex_prefix_and_encode(0, 0, 0, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
-  emit_int8(0x77);
+  if (VM_Version::supports_vzeroupper()) {
+    InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
+    (void)vex_prefix_and_encode(0, 0, 0, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
+    emit_int8(0x77);
+  }
 }
 
 #ifndef _LP64
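
Two things to note in the assembler changes above: with UseAVX > 0, ldmxcsr()/stmxcsr() now emit the VEX-encoded vldmxcsr/vstmxcsr forms instead of the legacy 0F AE encodings, so MXCSR save/restore no longer drops legacy SSE instructions into otherwise VEX-encoded code; and vzeroupper() only emits anything when VM_Version::supports_vzeroupper() holds, which is what lets the call sites in the rest of this changeset invoke it unconditionally. A hedged byte-level illustration of the stmxcsr change (the operand bytes assume a [rsp+4] memory operand chosen purely for the example):

    // Legacy SSE encoding (0F AE /3) versus the VEX encoding (C5 F8 AE /3)
    // now emitted when UseAVX > 0; both store MXCSR to dword ptr [rsp+4].
    static const unsigned char stmxcsr_legacy[] = { 0x0F, 0xAE, 0x5C, 0x24, 0x04 };
    static const unsigned char vstmxcsr_vex[]   = { 0xC5, 0xF8, 0xAE, 0x5C, 0x24, 0x04 };
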
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp	Sat May 06 00:05:32 2017 +0000
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp	Fri May 05 19:28:54 2017 -0700
@@ -763,11 +763,13 @@
 
   // Always clear the pc because it could have been set by make_walkable()
   movptr(Address(r15_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
+  vzeroupper();
 }
 
 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                          Register last_java_fp,
                                          address  last_java_pc) {
+  vzeroupper();
   // determine last_java_sp register
   if (!last_java_sp->is_valid()) {
     last_java_sp = rsp;
@@ -3672,6 +3674,7 @@
   // Always clear the pc because it could have been set by make_walkable()
   movptr(Address(java_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
 
+  vzeroupper();
 }
 
 void MacroAssembler::restore_rax(Register tmp) {
@@ -3714,6 +3717,7 @@
                                          Register last_java_sp,
                                          Register last_java_fp,
                                          address  last_java_pc) {
+  vzeroupper();
   // determine java_thread register
   if (!java_thread->is_valid()) {
     java_thread = rdi;
@@ -6524,10 +6528,8 @@
       call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
     }
   }
-  if (VM_Version::supports_avx()) {
-    // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
-    vzeroupper();
-  }
+  // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
+  vzeroupper();
 
 #ifndef _LP64
   // Either restore the x87 floating pointer control word after returning
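
The hooks in set_last_Java_frame()/reset_last_Java_frame() above are a deliberate choke point: calls from generated code into the VM that need a walkable last Java frame are bracketed by these helpers, so one vzeroupper on each side covers all of those transitions at once. A hypothetical wrapper, for illustration only (the register and argument choices are assumptions, not taken from the changeset):

    // Sketch of the bracket that call_VM-style paths set up around a runtime
    // call. With this change the two helpers themselves emit vzeroupper, so
    // the C++ runtime code running in between never observes dirty YMM
    // upper halves.
    void call_into_vm_sketch(MacroAssembler* masm, address entry) {
      masm->set_last_Java_frame(rsp, rbp, NULL);   // now also emits vzeroupper
      masm->call(RuntimeAddress(entry));           // VM runtime / native code, possibly SSE-only
      masm->reset_last_Java_frame(true);           // vzeroupper again on the way back
    }
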
--- a/hotspot/src/cpu/x86/vm/sharedRuntime_x86_32.cpp	Sat May 06 00:05:32 2017 +0000
+++ b/hotspot/src/cpu/x86/vm/sharedRuntime_x86_32.cpp	Fri May 05 19:28:54 2017 -0700
@@ -41,6 +41,7 @@
 #ifdef COMPILER2
 #include "opto/runtime.hpp"
 #endif
+#include "vm_version_x86.hpp"
 
 #define __ masm->
 
@@ -120,8 +121,8 @@
   int zmm_bytes = num_xmm_regs * 32;
 #ifdef COMPILER2
   if (save_vectors) {
-    assert(UseAVX > 0, "up to 512bit vectors are supported with EVEX");
-    assert(MaxVectorSize <= 64, "up to 512bit vectors are supported now");
+    assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
+    assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
     // Save upper half of YMM registers
     int vect_bytes = ymm_bytes;
     if (UseAVX > 2) {
@@ -219,6 +220,7 @@
       }
     }
   }
+  __ vzeroupper();
 
   // Set an oopmap for the call site.  This oopmap will map all
   // oop-registers and debug-info registers as callee-saved.  This
@@ -269,8 +271,8 @@
   int additional_frame_bytes = 0;
 #ifdef COMPILER2
   if (restore_vectors) {
-    assert(UseAVX > 0, "up to 512bit vectors are supported with EVEX");
-    assert(MaxVectorSize <= 64, "up to 512bit vectors are supported now");
+    assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
+    assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
     // Save upper half of YMM registers
     additional_frame_bytes = ymm_bytes;
     if (UseAVX > 2) {
@@ -285,6 +287,8 @@
   int off = xmm0_off;
   int delta = xmm1_off - off;
 
+  __ vzeroupper();
+
   if (UseSSE == 1) {
     // Restore XMM registers
     assert(additional_frame_bytes == 0, "");
@@ -2123,6 +2127,8 @@
     // preserved and correspond to the bcp/locals pointers. So we do a runtime call
     // by hand.
     //
+    __ vzeroupper();
+
     save_native_result(masm, ret_type, stack_slots);
     __ push(thread);
     if (!is_critical_native) {
@@ -2304,7 +2310,7 @@
 
     // BEGIN Slow path unlock
     __ bind(slow_path_unlock);
-
+    __ vzeroupper();
     // Slow path unlock
 
     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
@@ -2349,6 +2355,7 @@
   // SLOW PATH Reguard the stack if needed
 
   __ bind(reguard);
+  __ vzeroupper();
   save_native_result(masm, ret_type, stack_slots);
   {
     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
--- a/hotspot/src/cpu/x86/vm/sharedRuntime_x86_64.cpp	Sat May 06 00:05:32 2017 +0000
+++ b/hotspot/src/cpu/x86/vm/sharedRuntime_x86_64.cpp	Fri May 05 19:28:54 2017 -0700
@@ -47,6 +47,7 @@
 #if INCLUDE_JVMCI
 #include "jvmci/jvmciJavaClasses.hpp"
 #endif
+#include "vm_version_x86.hpp"
 
 #define __ masm->
 
@@ -151,8 +152,8 @@
   }
 #if defined(COMPILER2) || INCLUDE_JVMCI
   if (save_vectors) {
-    assert(UseAVX > 0, "up to 512bit vectors are supported with EVEX");
-    assert(MaxVectorSize <= 64, "up to 512bit vectors are supported now");
+    assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
+    assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
   }
 #else
   assert(!save_vectors, "vectors are generated only by C2 and JVMCI");
@@ -206,6 +207,7 @@
       }
     }
   }
+  __ vzeroupper();
   if (frame::arg_reg_save_area_bytes != 0) {
     // Allocate argument register save area
     __ subptr(rsp, frame::arg_reg_save_area_bytes);
@@ -322,13 +324,15 @@
 
 #if defined(COMPILER2) || INCLUDE_JVMCI
   if (restore_vectors) {
-    assert(UseAVX > 0, "up to 512bit vectors are supported with EVEX");
-    assert(MaxVectorSize <= 64, "up to 512bit vectors are supported now");
+    assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
+    assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
   }
 #else
   assert(!restore_vectors, "vectors are generated only by C2");
 #endif
 
+  __ vzeroupper();
+
   // On EVEX enabled targets everything is handled in pop fpu state
   if (restore_vectors) {
     // Restore upper half of YMM registers (0..15)
@@ -528,7 +532,7 @@
   // align stack so push_CPU_state doesn't fault
   __ andptr(rsp, -(StackAlignmentInBytes));
   __ push_CPU_state();
-
+  __ vzeroupper();
   // VM needs caller's callsite
   // VM needs target method
   // This needs to be a long call since we will relocate this adapter to
@@ -547,6 +551,7 @@
     __ addptr(rsp, frame::arg_reg_save_area_bytes);
   }
 
+  __ vzeroupper();
   __ pop_CPU_state();
   // restore sp
   __ mov(rsp, r13);
@@ -1465,7 +1470,6 @@
 
   save_or_restore_arguments(masm, stack_slots, total_in_args,
                             arg_save_area, NULL, in_regs, in_sig_bt);
-
   __ bind(cont);
 #ifdef ASSERT
   if (StressCriticalJNINatives) {
@@ -2485,6 +2489,7 @@
     // preserved and correspond to the bcp/locals pointers. So we do a runtime call
     // by hand.
     //
+    __ vzeroupper();
     save_native_result(masm, ret_type, stack_slots);
     __ mov(c_rarg0, r15_thread);
     __ mov(r12, rsp); // remember sp
@@ -2658,7 +2663,7 @@
 
     // If we haven't already saved the native result we must save it now as xmm registers
     // are still exposed.
-
+    __ vzeroupper();
     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
       save_native_result(masm, ret_type, stack_slots);
     }
@@ -2704,6 +2709,7 @@
   // SLOW PATH Reguard the stack if needed
 
   __ bind(reguard);
+  __ vzeroupper();
   save_native_result(masm, ret_type, stack_slots);
   __ mov(r12, rsp); // remember sp
   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Sat May 06 00:05:32 2017 +0000
+++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Fri May 05 19:28:54 2017 -0700
@@ -1012,6 +1012,7 @@
     __ pop(rdi);
     __ pop(rsi);
     __ leave(); // required for proper stackwalking of RuntimeStub frame
+    __ vzeroupper();
     __ xorptr(rax, rax); // return 0
     __ ret(0);
     return start;
@@ -1247,6 +1248,7 @@
     }
     inc_copy_counter_np(T_LONG);
     __ leave(); // required for proper stackwalking of RuntimeStub frame
+    __ vzeroupper();
     __ xorptr(rax, rax); // return 0
     __ ret(0);
     return start;
@@ -3365,6 +3367,7 @@
     __ pop(rbx);
     __ pop(rdi);
     __ pop(rsi);
+    __ vzeroupper();
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);
 
@@ -3422,6 +3425,7 @@
       __ pop(h);
       __ pop(g);
       __ pop(d);
+    __ vzeroupper();
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);
 
--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Sat May 06 00:05:32 2017 +0000
+++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Fri May 05 19:28:54 2017 -0700
@@ -402,6 +402,7 @@
     __ addptr(rsp, -rsp_after_call_off * wordSize);
 
     // return
+    __ vzeroupper();
     __ pop(rbp);
     __ ret(0);
 
@@ -1554,6 +1555,7 @@
     restore_arg_regs();
     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
     __ xorptr(rax, rax); // return 0
+    __ vzeroupper();
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);
 
@@ -1643,6 +1645,7 @@
     restore_arg_regs();
     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
     __ xorptr(rax, rax); // return 0
+    __ vzeroupper();
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);
 
@@ -1652,6 +1655,7 @@
     restore_arg_regs();
     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
     __ xorptr(rax, rax); // return 0
+    __ vzeroupper();
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);
 
@@ -1746,6 +1750,7 @@
     restore_arg_regs();
     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
     __ xorptr(rax, rax); // return 0
+    __ vzeroupper();
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);
 
@@ -1771,6 +1776,7 @@
 
     __ generate_fill(t, aligned, to, value, count, rax, xmm0);
 
+    __ vzeroupper();
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);
     return start;
@@ -1847,6 +1853,7 @@
     restore_arg_regs();
     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
     __ xorptr(rax, rax); // return 0
+    __ vzeroupper();
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);
 
@@ -1856,6 +1863,7 @@
     restore_arg_regs();
     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
     __ xorptr(rax, rax); // return 0
+    __ vzeroupper();
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);
 
@@ -1945,6 +1953,7 @@
     }
     restore_arg_regs();
     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
+    __ vzeroupper();
     __ xorptr(rax, rax); // return 0
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);
@@ -2030,6 +2039,7 @@
     restore_arg_regs();
     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
     __ xorptr(rax, rax); // return 0
+    __ vzeroupper();
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);
 
@@ -2043,6 +2053,7 @@
     restore_arg_regs();
     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
     __ xorptr(rax, rax); // return 0
+    __ vzeroupper();
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);
 
@@ -2120,6 +2131,7 @@
       restore_arg_regs();
       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
       __ xorptr(rax, rax); // return 0
+      __ vzeroupper();
       __ leave(); // required for proper stackwalking of RuntimeStub frame
       __ ret(0);
     }
@@ -2137,6 +2149,7 @@
     } else {
       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
     }
+    __ vzeroupper();
     __ xorptr(rax, rax); // return 0
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);
@@ -2203,6 +2216,7 @@
       restore_arg_regs();
       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
       __ xorptr(rax, rax); // return 0
+      __ vzeroupper();
       __ leave(); // required for proper stackwalking of RuntimeStub frame
       __ ret(0);
     }
@@ -2220,6 +2234,7 @@
     } else {
       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
     }
+    __ vzeroupper();
     __ xorptr(rax, rax); // return 0
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);
@@ -3774,7 +3789,7 @@
         buf, state, ofs, limit, rsp, multi_block, shuf_mask);
     }
     __ addptr(rsp, 4 * wordSize);
-
+    __ vzeroupper();
     __ leave();
     __ ret(0);
     return start;
@@ -3808,6 +3823,7 @@
     __ sha512_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
     buf, state, ofs, limit, rsp, multi_block, shuf_mask);
 
+    __ vzeroupper();
     __ leave();
     __ ret(0);
     return start;
@@ -4281,7 +4297,6 @@
     __ BIND(L_exit);
     __ pshufb(xmm_temp6, xmm_temp10);          // Byte swap 16-byte result
     __ movdqu(Address(state, 0), xmm_temp6);   // store the result
-
     __ leave();
     __ ret(0);
     return start;
@@ -4321,6 +4336,7 @@
     __ kernel_crc32(crc, buf, len, table, tmp);
 
     __ movl(rax, crc);
+    __ vzeroupper();
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);
 
@@ -4380,6 +4396,7 @@
       __ pop(z);
       __ pop(y);
 #endif
+      __ vzeroupper();
       __ leave(); // required for proper stackwalking of RuntimeStub frame
       __ ret(0);
 
@@ -4494,6 +4511,7 @@
 
     __ vectorized_mismatch(obja, objb, length, scale, result, tmp1, tmp2, vec0, vec1, vec2);
 
+    __ vzeroupper();
     __ leave();
     __ ret(0);
 
@@ -4618,7 +4636,7 @@
     BLOCK_COMMENT("Entry:");
     __ enter(); // required for proper stackwalking of RuntimeStub frame
 
-      __ fast_exp(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
+    __ fast_exp(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
 
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);
--- a/hotspot/src/cpu/x86/vm/vmStructs_x86.hpp	Sat May 06 00:05:32 2017 +0000
+++ b/hotspot/src/cpu/x86/vm/vmStructs_x86.hpp	Fri May 05 19:28:54 2017 -0700
@@ -74,6 +74,7 @@
   declare_preprocessor_constant("VM_Version::CPU_AVX512BW", CPU_AVX512BW) \
   declare_preprocessor_constant("VM_Version::CPU_AVX512VL", CPU_AVX512VL) \
   declare_preprocessor_constant("VM_Version::CPU_SHA", CPU_SHA)           \
-  declare_preprocessor_constant("VM_Version::CPU_FMA", CPU_FMA)
+  declare_preprocessor_constant("VM_Version::CPU_FMA", CPU_FMA)           \
+  declare_preprocessor_constant("VM_Version::CPU_VZEROUPPER", CPU_VZEROUPPER)
 
 #endif // CPU_X86_VM_VMSTRUCTS_X86_HPP
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.cpp	Sat May 06 00:05:32 2017 +0000
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.cpp	Fri May 05 19:28:54 2017 -0700
@@ -436,14 +436,14 @@
     __ movl(rax, 0x10000);
     __ andl(rax, Address(rsi, 4));
     __ cmpl(rax, 0x10000);
-    __ jccb(Assembler::notEqual, legacy_save_restore);
+    __ jcc(Assembler::notEqual, legacy_save_restore);
     // check _cpuid_info.xem_xcr0_eax.bits.opmask
     // check _cpuid_info.xem_xcr0_eax.bits.zmm512
     // check _cpuid_info.xem_xcr0_eax.bits.zmm32
     __ movl(rax, 0xE0);
     __ andl(rax, Address(rbp, in_bytes(VM_Version::xem_xcr0_offset()))); // xcr0 bits sse | ymm
     __ cmpl(rax, 0xE0);
-    __ jccb(Assembler::notEqual, legacy_save_restore);
+    __ jcc(Assembler::notEqual, legacy_save_restore);
 
     // If UseAVX is unitialized or is set by the user to include EVEX
     if (use_evex) {
@@ -469,11 +469,12 @@
       __ evmovdqul(xmm7, Address(rsp, 0), Assembler::AVX_512bit);
       __ addptr(rsp, 64);
 #endif // _WINDOWS
+      generate_vzeroupper(wrapup);
       VM_Version::clean_cpuFeatures();
       UseAVX = saved_useavx;
       UseSSE = saved_usesse;
       __ jmp(wrapup);
-    }
+   }
 
     __ bind(legacy_save_restore);
     // AVX check
@@ -498,6 +499,7 @@
     __ vmovdqu(xmm7, Address(rsp, 0));
     __ addptr(rsp, 32);
 #endif // _WINDOWS
+    generate_vzeroupper(wrapup);
     VM_Version::clean_cpuFeatures();
     UseAVX = saved_useavx;
     UseSSE = saved_usesse;
@@ -513,6 +515,21 @@
 
     return start;
   };
+  void generate_vzeroupper(Label& L_wrapup) {
+#   define __ _masm->
+    __ lea(rsi, Address(rbp, in_bytes(VM_Version::std_cpuid0_offset())));
+    __ cmpl(Address(rsi, 4), 0x756e6547);  // 'uneG'
+    __ jcc(Assembler::notEqual, L_wrapup);
+    __ movl(rcx, 0x0FFF0FF0);
+    __ lea(rsi, Address(rbp, in_bytes(VM_Version::std_cpuid1_offset())));
+    __ andl(rcx, Address(rsi, 0));
+    __ cmpl(rcx, 0x00050670);              // If it is Xeon Phi 3200/5200/7200
+    __ jcc(Assembler::equal, L_wrapup);
+    __ cmpl(rcx, 0x00080650);              // If it is Future Xeon Phi
+    __ jcc(Assembler::equal, L_wrapup);
+    __ vzeroupper();
+#   undef __
+  }
 };
 
 void VM_Version::get_processor_features() {
@@ -619,8 +636,10 @@
   if (UseAVX < 2)
     _features &= ~CPU_AVX2;
 
-  if (UseAVX < 1)
+  if (UseAVX < 1) {
     _features &= ~CPU_AVX;
+    _features &= ~CPU_VZEROUPPER;
+  }
 
   if (!UseAES && !FLAG_IS_DEFAULT(UseAES))
     _features &= ~CPU_AES;
@@ -630,6 +649,14 @@
     _features &= ~CPU_HT;
   }
 
+  if( is_intel() ) { // Intel cpus specific settings
+    if ((cpu_family() == 0x06) &&
+        ((extended_cpu_model() == 0x57) ||   // Xeon Phi 3200/5200/7200
+        (extended_cpu_model() == 0x85))) {  // Future Xeon Phi
+      _features &= ~CPU_VZEROUPPER;
+    }
+  }
+
   char buf[256];
   jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
                cores_per_cpu(), threads_per_core(),
@@ -918,16 +945,36 @@
       warning("MaxVectorSize must be a power of 2");
       FLAG_SET_DEFAULT(MaxVectorSize, 64);
     }
-    if (MaxVectorSize > 64) {
-      FLAG_SET_DEFAULT(MaxVectorSize, 64);
-    }
-    if (MaxVectorSize > 16 && (UseAVX == 0 || !os_supports_avx_vectors())) {
-      // 32 bytes vectors (in YMM) are only supported with AVX+
-      FLAG_SET_DEFAULT(MaxVectorSize, 16);
-    }
     if (UseSSE < 2) {
       // Vectors (in XMM) are only supported with SSE2+
-      FLAG_SET_DEFAULT(MaxVectorSize, 0);
+      if (MaxVectorSize > 0) {
+        if (!FLAG_IS_DEFAULT(MaxVectorSize))
+          warning("MaxVectorSize must be 0");
+        FLAG_SET_DEFAULT(MaxVectorSize, 0);
+      }
+    }
+    else if (UseAVX == 0 || !os_supports_avx_vectors()) {
+      // 32 bytes vectors (in YMM) are only supported with AVX+
+      if (MaxVectorSize > 16) {
+        if (!FLAG_IS_DEFAULT(MaxVectorSize))
+          warning("MaxVectorSize must be <= 16");
+        FLAG_SET_DEFAULT(MaxVectorSize, 16);
+      }
+    }
+    else if (UseAVX == 1 || UseAVX == 2) {
+      // 64 bytes vectors (in ZMM) are only supported with AVX 3
+      if (MaxVectorSize > 32) {
+        if (!FLAG_IS_DEFAULT(MaxVectorSize))
+          warning("MaxVectorSize must be <= 32");
+        FLAG_SET_DEFAULT(MaxVectorSize, 32);
+      }
+    }
+    else if (UseAVX > 2 ) {
+      if (MaxVectorSize > 64) {
+        if (!FLAG_IS_DEFAULT(MaxVectorSize))
+          warning("MaxVectorSize must be <= 64");
+        FLAG_SET_DEFAULT(MaxVectorSize, 64);
+      }
     }
 #if defined(COMPILER2) && defined(ASSERT)
     if (supports_avx() && PrintMiscellaneous && Verbose && TraceNewVectors) {
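
Two details in the vm_version_x86.cpp changes above are worth spelling out. First, the MaxVectorSize clamping is restructured so that a user-specified value the CPU cannot honour now produces a warning before being lowered (to 0 without SSE2, 16 bytes without usable AVX, 32 bytes for AVX/AVX2, 64 bytes for AVX-512). Second, vzeroupper is deliberately not used on the Xeon Phi parts, both in generate_vzeroupper() during feature probing and by clearing CPU_VZEROUPPER in get_processor_features(), presumably because the instruction is expensive on those cores; the 0x0FFF0FF0 mask applied to CPUID leaf-1 EAX keeps just the family/model fields that identify them. A small decoding sketch, not part of the changeset, showing how those magic constants map to the models named in the comments:

    #include <cstdint>

    // Decodes the fields of CPUID leaf-1 EAX that the 0x0FFF0FF0 mask
    // preserves (family, model, extended model, extended family); stepping
    // and the processor-type/reserved bits are dropped. Illustration only.
    struct CpuSignature {
      uint32_t model;       // bits  7:4
      uint32_t family;      // bits 11:8
      uint32_t ext_model;   // bits 19:16
      uint32_t ext_family;  // bits 27:20
    };

    static CpuSignature decode_leaf1_eax(uint32_t eax) {
      CpuSignature s;
      s.model      = (eax >> 4)  & 0xF;
      s.family     = (eax >> 8)  & 0xF;
      s.ext_model  = (eax >> 16) & 0xF;
      s.ext_family = (eax >> 20) & 0xFF;
      return s;
    }

    // eax & 0x0FFF0FF0 == 0x00050670 -> family 0x6, model 0x7, ext_model 0x5,
    //   i.e. display model 0x57: Knights Landing (Xeon Phi 3200/5200/7200).
    // eax & 0x0FFF0FF0 == 0x00080650 -> family 0x6, model 0x5, ext_model 0x8,
    //   i.e. display model 0x85: the "Future Xeon Phi" (Knights Mill).
    // This matches the (cpu_family() == 0x06 && extended_cpu_model() == 0x57/0x85)
    // test in get_processor_features().
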
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.hpp	Sat May 06 00:05:32 2017 +0000
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.hpp	Fri May 05 19:28:54 2017 -0700
@@ -291,6 +291,7 @@
 #define CPU_AVX512VL ((uint64_t)UCONST64(0x200000000)) // EVEX instructions with smaller vector length
 #define CPU_SHA ((uint64_t)UCONST64(0x400000000))      // SHA instructions
 #define CPU_FMA ((uint64_t)UCONST64(0x800000000))      // FMA instructions
+#define CPU_VZEROUPPER ((uint64_t)UCONST64(0x1000000000))      // Vzeroupper instruction
 
   enum Extended_Family {
     // AMD
@@ -468,6 +469,7 @@
         _cpuid_info.xem_xcr0_eax.bits.sse != 0 &&
         _cpuid_info.xem_xcr0_eax.bits.ymm != 0) {
       result |= CPU_AVX;
+      result |= CPU_VZEROUPPER;
       if (_cpuid_info.sef_cpuid7_ebx.bits.avx2 != 0)
         result |= CPU_AVX2;
       if (_cpuid_info.sef_cpuid7_ebx.bits.avx512f != 0 &&
@@ -605,8 +607,8 @@
   static address  cpuinfo_cont_addr()           { return _cpuinfo_cont_addr; }
 
   static void clean_cpuFeatures()   { _features = 0; }
-  static void set_avx_cpuFeatures() { _features = (CPU_SSE | CPU_SSE2 | CPU_AVX); }
-  static void set_evex_cpuFeatures() { _features = (CPU_AVX512F | CPU_SSE | CPU_SSE2 ); }
+  static void set_avx_cpuFeatures() { _features = (CPU_SSE | CPU_SSE2 | CPU_AVX | CPU_VZEROUPPER ); }
+  static void set_evex_cpuFeatures() { _features = (CPU_AVX512F | CPU_SSE | CPU_SSE2 | CPU_VZEROUPPER ); }
 
 
   // Initialization
@@ -731,6 +733,8 @@
   static bool supports_avxonly()    { return ((supports_avx2() || supports_avx()) && !supports_evex()); }
   static bool supports_sha()        { return (_features & CPU_SHA) != 0; }
   static bool supports_fma()        { return (_features & CPU_FMA) != 0; }
+  static bool supports_vzeroupper() { return (_features & CPU_VZEROUPPER) != 0; }
+
   // Intel features
   static bool is_intel_family_core() { return is_intel() &&
                                        extended_cpu_family() == CPU_FAMILY_INTEL_CORE; }
--- a/hotspot/src/cpu/x86/vm/x86_32.ad	Sat May 06 00:05:32 2017 +0000
+++ b/hotspot/src/cpu/x86/vm/x86_32.ad	Fri May 05 19:28:54 2017 -0700
@@ -290,7 +290,7 @@
   if (C->in_24_bit_fp_mode()) {
     size += 6; // fldcw
   }
-  if (C->max_vector_size() > 16) {
+  if (VM_Version::supports_vzeroupper()) {
     size += 3; // vzeroupper
   }
   return size;
@@ -1884,7 +1884,6 @@
     }
   %}
 
-
   enc_class pre_call_resets %{
     // If method sets FPU control word restore it here
     debug_only(int off0 = cbuf.insts_size());
@@ -1892,12 +1891,10 @@
       MacroAssembler _masm(&cbuf);
       __ fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std()));
     }
-    if (ra_->C->max_vector_size() > 16) {
-      // Clear upper bits of YMM registers when current compiled code uses
-      // wide vectors to avoid AVX <-> SSE transition penalty during call.
-      MacroAssembler _masm(&cbuf);
-      __ vzeroupper();
-    }
+    // Clear upper bits of YMM registers when current compiled code uses
+    // wide vectors to avoid AVX <-> SSE transition penalty during call.
+    MacroAssembler _masm(&cbuf);
+    __ vzeroupper();
     debug_only(int off1 = cbuf.insts_size());
     assert(off1 - off0 == pre_call_resets_size(), "correct size prediction");
   %}
@@ -13072,7 +13069,7 @@
   ins_cost(300);
   format %{ "CALL_LEAF_NOFP,runtime " %}
   opcode(0xE8); /* E8 cd */
-  ins_encode(Java_To_Runtime(meth));
+  ins_encode(pre_call_resets, Java_To_Runtime(meth));
   ins_pipe( pipe_slow );
 %}
 
--- a/hotspot/src/cpu/x86/vm/x86_64.ad	Sat May 06 00:05:32 2017 +0000
+++ b/hotspot/src/cpu/x86/vm/x86_64.ad	Fri May 05 19:28:54 2017 -0700
@@ -536,7 +536,7 @@
 #define __ _masm.
 
 static int clear_avx_size() {
-  return (Compile::current()->max_vector_size() > 16) ? 3 : 0;  // vzeroupper
+  return (VM_Version::supports_vzeroupper()) ? 3: 0;  // vzeroupper
 }
 
 // !!!!! Special hack to get all types of calls to specify the byte offset
@@ -919,7 +919,7 @@
 void MachEpilogNode::format(PhaseRegAlloc* ra_, outputStream* st) const
 {
   Compile* C = ra_->C;
-  if (C->max_vector_size() > 16) {
+  if (VM_Version::supports_vzeroupper()) {
     st->print("vzeroupper");
     st->cr(); st->print("\t");
   }
@@ -955,11 +955,9 @@
   Compile* C = ra_->C;
   MacroAssembler _masm(&cbuf);
 
-  if (C->max_vector_size() > 16) {
-    // Clear upper bits of YMM registers when current compiled code uses
-    // wide vectors to avoid AVX <-> SSE transition penalty during call.
-    __ vzeroupper();
-  }
+  // Clear upper bits of YMM registers when current compiled code uses
+  // wide vectors to avoid AVX <-> SSE transition penalty during call.
+  __ vzeroupper();
 
   int framesize = C->frame_size_in_bytes();
   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
@@ -2092,12 +2090,11 @@
 
   enc_class clear_avx %{
     debug_only(int off0 = cbuf.insts_size());
-    if (ra_->C->max_vector_size() > 16) {
-      // Clear upper bits of YMM registers when current compiled code uses
-      // wide vectors to avoid AVX <-> SSE transition penalty during call.
-      MacroAssembler _masm(&cbuf);
-      __ vzeroupper();
-    }
+    // Clear upper bits of YMM registers to avoid AVX <-> SSE transition penalty
+    // Clear upper bits of YMM registers when current compiled code uses
+    // wide vectors to avoid AVX <-> SSE transition penalty during call.
+    MacroAssembler _masm(&cbuf);
+    __ vzeroupper();
     debug_only(int off1 = cbuf.insts_size());
     assert(off1 - off0 == clear_avx_size(), "correct size prediction");
   %}
@@ -12116,7 +12113,7 @@
 
   ins_cost(300);
   format %{ "call_leaf_nofp,runtime " %}
-  ins_encode(Java_To_Runtime(meth));
+  ins_encode(clear_avx, Java_To_Runtime(meth));
   ins_pipe(pipe_slow);
 %}
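
In the two .ad files, vzeroupper emission in method epilogues and around calls is now keyed off VM_Version::supports_vzeroupper() rather than max_vector_size() > 16, i.e. it is emitted whenever the feature is available (AVX present and not a Xeon Phi part), not only when the current compiled method happened to use wide vectors; the CallLeafNoFP runtime-call instructs also pick up the reset (pre_call_resets on x86_32, clear_avx on x86_64). The 3-byte budget in pre_call_resets_size()/clear_avx_size() matches the instruction's fixed encoding; a small note, not from the changeset:

    // vzeroupper encodes as a fixed 3-byte sequence: the two-byte VEX prefix
    // C5 F8 followed by opcode 77, which is where the "size += 3" / "? 3 : 0"
    // accounting in x86_32.ad and x86_64.ad comes from.
    static const unsigned char vzeroupper_encoding[] = { 0xC5, 0xF8, 0x77 };
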
 
--- a/hotspot/src/share/vm/jvmci/vmStructs_jvmci.cpp	Sat May 06 00:05:32 2017 +0000
+++ b/hotspot/src/share/vm/jvmci/vmStructs_jvmci.cpp	Fri May 05 19:28:54 2017 -0700
@@ -719,7 +719,8 @@
   declare_preprocessor_constant("VM_Version::CPU_AVX512BW", CPU_AVX512BW) \
   declare_preprocessor_constant("VM_Version::CPU_AVX512VL", CPU_AVX512VL) \
   declare_preprocessor_constant("VM_Version::CPU_SHA", CPU_SHA)           \
-  declare_preprocessor_constant("VM_Version::CPU_FMA", CPU_FMA)
+  declare_preprocessor_constant("VM_Version::CPU_FMA", CPU_FMA)           \
+  declare_preprocessor_constant("VM_Version::CPU_VZEROUPPER", CPU_VZEROUPPER)
 
 #endif