8011102: Clear AVX registers after return from JNI call
Summary: Execute the vzeroupper instruction after JNI calls and on exits from JIT-compiled code that uses 256-bit vectors.
Reviewed-by: roland
--- a/hotspot/src/cpu/x86/vm/cppInterpreter_x86.cpp Tue Apr 02 09:30:07 2013 +0200
+++ b/hotspot/src/cpu/x86/vm/cppInterpreter_x86.cpp Wed Apr 03 11:12:57 2013 -0700
@@ -1299,25 +1299,8 @@
__ push(rdx);
#endif // _LP64
- // Either restore the MXCSR register after returning from the JNI Call
- // or verify that it wasn't changed.
- if (VM_Version::supports_sse()) {
- if (RestoreMXCSROnJNICalls) {
- __ ldmxcsr(ExternalAddress(StubRoutines::addr_mxcsr_std()));
- }
- else if (CheckJNICalls ) {
- __ call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
- }
- }
-
-#ifndef _LP64
- // Either restore the x87 floating pointer control word after returning
- // from the JNI call or verify that it wasn't changed.
- if (CheckJNICalls) {
- __ call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry()));
- }
-#endif // _LP64
-
+ // Verify or restore cpu control state after JNI call
+ __ restore_cpu_control_state_after_jni();
// change thread state
__ movl(Address(thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp Tue Apr 02 09:30:07 2013 +0200
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp Wed Apr 03 11:12:57 2013 -0700
@@ -4765,6 +4765,31 @@
pop_CPU_state();
}
+void MacroAssembler::restore_cpu_control_state_after_jni() {
+ // Either restore the MXCSR register after returning from the JNI call
+ // or verify that it wasn't changed (with -Xcheck:jni flag).
+ if (VM_Version::supports_sse()) {
+ if (RestoreMXCSROnJNICalls) {
+ ldmxcsr(ExternalAddress(StubRoutines::addr_mxcsr_std()));
+ } else if (CheckJNICalls) {
+ call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
+ }
+ }
+ if (VM_Version::supports_avx()) {
+ // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
+ vzeroupper();
+ }
+
+#ifndef _LP64
+ // With CheckJNICalls, call the stub that verifies the x87 floating-point
+ // control word after returning from the JNI call; nothing is emitted otherwise.
+ if (CheckJNICalls) {
+ call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry()));
+ }
+#endif // _LP64
+}
+
+
void MacroAssembler::load_klass(Register dst, Register src) {
#ifdef _LP64
if (UseCompressedKlassPointers) {
@@ -5759,6 +5784,8 @@
addptr(result, stride2);
subl(cnt2, stride2);
jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
+ // clean upper bits of YMM registers
+ vzeroupper();
// compare wide vectors tail
bind(COMPARE_WIDE_TAIL);
@@ -5772,6 +5799,8 @@
// Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors.
bind(VECTOR_NOT_EQUAL);
+ // clean upper bits of YMM registers
+ vzeroupper();
lea(str1, Address(str1, result, scale));
lea(str2, Address(str2, result, scale));
jmp(COMPARE_16_CHARS);
@@ -6028,6 +6057,10 @@
// That's it
bind(DONE);
+ if (UseAVX >= 2) {
+ // clean upper bits of YMM registers
+ vzeroupper();
+ }
}
void MacroAssembler::generate_fill(BasicType t, bool aligned,
@@ -6157,6 +6190,10 @@
vmovdqu(Address(to, 0), xtmp);
addptr(to, 32);
subl(count, 8 << shift);
+
+ BIND(L_check_fill_8_bytes);
+ // clean upper bits of YMM registers
+ vzeroupper();
} else {
// Fill 32-byte chunks
pshufd(xtmp, xtmp, 0);
@@ -6180,8 +6217,9 @@
addptr(to, 32);
subl(count, 8 << shift);
jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
+
+ BIND(L_check_fill_8_bytes);
}
- BIND(L_check_fill_8_bytes);
addl(count, 8 << shift);
jccb(Assembler::zero, L_exit);
jmpb(L_fill_8_bytes);
@@ -6316,6 +6354,10 @@
jccb(Assembler::lessEqual, L_copy_16_chars);
bind(L_copy_16_chars_exit);
+ if (UseAVX >= 2) {
+ // clean upper bits of YMM registers
+ vzeroupper();
+ }
subptr(len, 8);
jccb(Assembler::greater, L_copy_8_chars_exit);
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp Tue Apr 02 09:30:07 2013 +0200
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp Wed Apr 03 11:12:57 2013 -0700
@@ -582,6 +582,9 @@
// only if +VerifyFPU
void verify_FPU(int stack_depth, const char* s = "illegal FPU state");
+ // Verify or restore cpu control state after JNI call
+ void restore_cpu_control_state_after_jni();
+
// prints msg, dumps registers and stops execution
void stop(const char* msg);
--- a/hotspot/src/cpu/x86/vm/sharedRuntime_x86_32.cpp Tue Apr 02 09:30:07 2013 +0200
+++ b/hotspot/src/cpu/x86/vm/sharedRuntime_x86_32.cpp Wed Apr 03 11:12:57 2013 -0700
@@ -2065,6 +2065,9 @@
__ call(RuntimeAddress(native_func));
+ // Verify or restore cpu control state after JNI call
+ __ restore_cpu_control_state_after_jni();
+
// WARNING - on Windows Java Natives use pascal calling convention and pop the
// arguments off of the stack. We could just re-adjust the stack pointer here
// and continue to do SP relative addressing but we instead switch to FP
--- a/hotspot/src/cpu/x86/vm/sharedRuntime_x86_64.cpp Tue Apr 02 09:30:07 2013 +0200
+++ b/hotspot/src/cpu/x86/vm/sharedRuntime_x86_64.cpp Wed Apr 03 11:12:57 2013 -0700
@@ -2315,16 +2315,8 @@
__ call(RuntimeAddress(native_func));
- // Either restore the MXCSR register after returning from the JNI Call
- // or verify that it wasn't changed.
- if (RestoreMXCSROnJNICalls) {
- __ ldmxcsr(ExternalAddress(StubRoutines::x86::mxcsr_std()));
-
- }
- else if (CheckJNICalls ) {
- __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::verify_mxcsr_entry())));
- }
-
+ // Verify or restore cpu control state after JNI call
+ __ restore_cpu_control_state_after_jni();
// Unpack native results.
switch (ret_type) {
--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp Tue Apr 02 09:30:07 2013 +0200
+++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp Wed Apr 03 11:12:57 2013 -0700
@@ -835,6 +835,11 @@
__ BIND(L_copy_64_bytes);
__ subl(qword_count, 8);
__ jcc(Assembler::greaterEqual, L_copy_64_bytes_loop);
+
+ if (UseUnalignedLoadStores && (UseAVX >= 2)) {
+ // clean upper bits of YMM registers
+ __ vzeroupper();
+ }
__ addl(qword_count, 8);
__ jccb(Assembler::zero, L_exit);
//
--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp Tue Apr 02 09:30:07 2013 +0200
+++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp Wed Apr 03 11:12:57 2013 -0700
@@ -1331,6 +1331,10 @@
}
__ addptr(qword_count, 4);
__ BIND(L_end);
+ if (UseAVX >= 2) {
+ // clean upper bits of YMM registers
+ __ vzeroupper();
+ }
} else {
// Copy 32-bytes per iteration
__ BIND(L_loop);
@@ -1404,6 +1408,10 @@
}
__ subptr(qword_count, 4);
__ BIND(L_end);
+ if (UseAVX >= 2) {
+ // clean upper bits of YMM registers
+ __ vzeroupper();
+ }
} else {
// Copy 32-bytes per iteration
__ BIND(L_loop);
--- a/hotspot/src/cpu/x86/vm/templateInterpreter_x86_32.cpp Tue Apr 02 09:30:07 2013 +0200
+++ b/hotspot/src/cpu/x86/vm/templateInterpreter_x86_32.cpp Wed Apr 03 11:12:57 2013 -0700
@@ -1080,22 +1080,8 @@
// result potentially in rdx:rax or ST0
- // Either restore the MXCSR register after returning from the JNI Call
- // or verify that it wasn't changed.
- if (VM_Version::supports_sse()) {
- if (RestoreMXCSROnJNICalls) {
- __ ldmxcsr(ExternalAddress(StubRoutines::addr_mxcsr_std()));
- }
- else if (CheckJNICalls ) {
- __ call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
- }
- }
-
- // Either restore the x87 floating pointer control word after returning
- // from the JNI call or verify that it wasn't changed.
- if (CheckJNICalls) {
- __ call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry()));
- }
+ // Verify or restore cpu control state after JNI call
+ __ restore_cpu_control_state_after_jni();
// save potential result in ST(0) & rdx:rax
// (if result handler is the T_FLOAT or T_DOUBLE handler, result must be in ST0 -
--- a/hotspot/src/cpu/x86/vm/templateInterpreter_x86_64.cpp Tue Apr 02 09:30:07 2013 +0200
+++ b/hotspot/src/cpu/x86/vm/templateInterpreter_x86_64.cpp Wed Apr 03 11:12:57 2013 -0700
@@ -1079,15 +1079,8 @@
__ call(rax);
// result potentially in rax or xmm0
- // Depending on runtime options, either restore the MXCSR
- // register after returning from the JNI Call or verify that
- // it wasn't changed during -Xcheck:jni.
- if (RestoreMXCSROnJNICalls) {
- __ ldmxcsr(ExternalAddress(StubRoutines::x86::mxcsr_std()));
- }
- else if (CheckJNICalls) {
- __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::verify_mxcsr_entry())));
- }
+ // Verify or restore cpu control state after JNI call
+ __ restore_cpu_control_state_after_jni();
// NOTE: The order of these pushes is known to frame::interpreter_frame_result
// in order to extract the result of a method call. If the order of these
--- a/hotspot/src/cpu/x86/vm/x86_32.ad Tue Apr 02 09:30:07 2013 +0200
+++ b/hotspot/src/cpu/x86/vm/x86_32.ad Wed Apr 03 11:12:57 2013 -0700
@@ -228,10 +228,16 @@
static jlong *double_signflip_pool = double_quadword(&fp_signmask_pool[4*2], CONST64(0x8000000000000000), CONST64(0x8000000000000000));
// Offset hacking within calls.
-static int pre_call_FPU_size() {
- if (Compile::current()->in_24_bit_fp_mode())
- return 6; // fldcw
- return 0;
+static int pre_call_resets_size() {
+ int size = 0; // bytes emitted by enc_class pre_call_resets ahead of the call; must match its size assert
+ Compile* C = Compile::current();
+ if (C->in_24_bit_fp_mode()) {
+ size += 6; // fldcw restoring the standard FPU control word
+ }
+ if (C->max_vector_size() > 16) {
+ size += 3; // vzeroupper, emitted when compiled code uses wide vectors
+ }
+ return size;
}
static int preserve_SP_size() {
@@ -242,21 +248,21 @@
// from the start of the call to the point where the return address
// will point.
int MachCallStaticJavaNode::ret_addr_offset() {
- int offset = 5 + pre_call_FPU_size(); // 5 bytes from start of call to where return address points
+ int offset = 5 + pre_call_resets_size(); // 5 bytes from start of call to where return address points
if (_method_handle_invoke)
offset += preserve_SP_size();
return offset;
}
int MachCallDynamicJavaNode::ret_addr_offset() {
- return 10 + pre_call_FPU_size(); // 10 bytes from start of call to where return address points
+ return 10 + pre_call_resets_size(); // 10 bytes from start of call to where return address points
}
static int sizeof_FFree_Float_Stack_All = -1;
int MachCallRuntimeNode::ret_addr_offset() {
assert(sizeof_FFree_Float_Stack_All != -1, "must have been emitted already");
- return sizeof_FFree_Float_Stack_All + 5 + pre_call_FPU_size();
+ return sizeof_FFree_Float_Stack_All + 5 + pre_call_resets_size();
}
// Indicate if the safepoint node needs the polling page as an input.
@@ -272,7 +278,7 @@
// The address of the call instruction needs to be 4-byte aligned to
// ensure that it does not span a cache line so that it can be patched.
int CallStaticJavaDirectNode::compute_padding(int current_offset) const {
- current_offset += pre_call_FPU_size(); // skip fldcw, if any
+ current_offset += pre_call_resets_size(); // skip fldcw and vzeroupper, if any
current_offset += 1; // skip call opcode byte
return round_to(current_offset, alignment_required()) - current_offset;
}
@@ -280,7 +286,7 @@
// The address of the call instruction needs to be 4-byte aligned to
// ensure that it does not span a cache line so that it can be patched.
int CallStaticJavaHandleNode::compute_padding(int current_offset) const {
- current_offset += pre_call_FPU_size(); // skip fldcw, if any
+ current_offset += pre_call_resets_size(); // skip fldcw and vzeroupper, if any
current_offset += preserve_SP_size(); // skip mov rbp, rsp
current_offset += 1; // skip call opcode byte
return round_to(current_offset, alignment_required()) - current_offset;
@@ -289,7 +295,7 @@
// The address of the call instruction needs to be 4-byte aligned to
// ensure that it does not span a cache line so that it can be patched.
int CallDynamicJavaDirectNode::compute_padding(int current_offset) const {
- current_offset += pre_call_FPU_size(); // skip fldcw, if any
+ current_offset += pre_call_resets_size(); // skip fldcw and vzeroupper, if any
current_offset += 5; // skip MOV instruction
current_offset += 1; // skip call opcode byte
return round_to(current_offset, alignment_required()) - current_offset;
@@ -583,16 +589,20 @@
// Remove two words for return addr and rbp,
framesize -= 2*wordSize;
- if( C->in_24_bit_fp_mode() ) {
+ if (C->max_vector_size() > 16) {
+ st->print("VZEROUPPER");
+ st->cr(); st->print("\t");
+ }
+ if (C->in_24_bit_fp_mode()) {
st->print("FLDCW standard control word");
st->cr(); st->print("\t");
}
- if( framesize ) {
+ if (framesize) {
st->print("ADD ESP,%d\t# Destroy frame",framesize);
st->cr(); st->print("\t");
}
st->print_cr("POPL EBP"); st->print("\t");
- if( do_polling() && C->is_method_compilation() ) {
+ if (do_polling() && C->is_method_compilation()) {
st->print("TEST PollPage,EAX\t! Poll Safepoint");
st->cr(); st->print("\t");
}
@@ -602,8 +612,14 @@
void MachEpilogNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
Compile *C = ra_->C;
+ if (C->max_vector_size() > 16) {
+ // Clear upper bits of YMM registers when current compiled code uses
+ // wide vectors to avoid AVX <-> SSE transition penalty after method exit.
+ MacroAssembler masm(&cbuf);
+ masm.vzeroupper();
+ }
// If method set FPU control word, restore to standard control word
- if( C->in_24_bit_fp_mode() ) {
+ if (C->in_24_bit_fp_mode()) {
MacroAssembler masm(&cbuf);
masm.fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std()));
}
@@ -615,12 +631,11 @@
// Note that VerifyStackAtCalls' Majik cookie does not change the frame size popped here
- if( framesize >= 128 ) {
+ if (framesize >= 128) {
emit_opcode(cbuf, 0x81); // add SP, #framesize
emit_rm(cbuf, 0x3, 0x00, ESP_enc);
emit_d32(cbuf, framesize);
- }
- else if( framesize ) {
+ } else if (framesize) {
emit_opcode(cbuf, 0x83); // add SP, #framesize
emit_rm(cbuf, 0x3, 0x00, ESP_enc);
emit_d8(cbuf, framesize);
@@ -628,7 +643,7 @@
emit_opcode(cbuf, 0x58 | EBP_enc);
- if( do_polling() && C->is_method_compilation() ) {
+ if (do_polling() && C->is_method_compilation()) {
cbuf.relocate(cbuf.insts_end(), relocInfo::poll_return_type, 0);
emit_opcode(cbuf,0x85);
emit_rm(cbuf, 0x0, EAX_enc, 0x5); // EAX
@@ -640,7 +655,8 @@
Compile *C = ra_->C;
// If method set FPU control word, restore to standard control word
int size = C->in_24_bit_fp_mode() ? 6 : 0;
- if( do_polling() && C->is_method_compilation() ) size += 6;
+ if (C->max_vector_size() > 16) size += 3; // vzeroupper
+ if (do_polling() && C->is_method_compilation()) size += 6;
int framesize = C->frame_slots() << LogBytesPerInt;
assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
@@ -649,7 +665,7 @@
size++; // popl rbp,
- if( framesize >= 128 ) {
+ if (framesize >= 128) {
size += 6;
} else {
size += framesize ? 3 : 0;
@@ -1853,20 +1869,26 @@
%}
- enc_class pre_call_FPU %{
+ enc_class pre_call_resets %{
// If method sets FPU control word restore it here
debug_only(int off0 = cbuf.insts_size());
- if( Compile::current()->in_24_bit_fp_mode() ) {
- MacroAssembler masm(&cbuf);
- masm.fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std()));
+ if (ra_->C->in_24_bit_fp_mode()) {
+ MacroAssembler _masm(&cbuf);
+ __ fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std()));
+ }
+ if (ra_->C->max_vector_size() > 16) {
+ // Clear upper bits of YMM registers when current compiled code uses
+ // wide vectors to avoid AVX <-> SSE transition penalty during call.
+ MacroAssembler _masm(&cbuf);
+ __ vzeroupper();
}
debug_only(int off1 = cbuf.insts_size());
- assert(off1 - off0 == pre_call_FPU_size(), "correct size prediction");
+ assert(off1 - off0 == pre_call_resets_size(), "correct size prediction");
%}
enc_class post_call_FPU %{
// If method sets FPU control word do it here also
- if( Compile::current()->in_24_bit_fp_mode() ) {
+ if (Compile::current()->in_24_bit_fp_mode()) {
MacroAssembler masm(&cbuf);
masm.fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24()));
}
@@ -1877,17 +1899,17 @@
// who we intended to call.
cbuf.set_insts_mark();
$$$emit8$primary;
- if ( !_method ) {
+ if (!_method) {
emit_d32_reloc(cbuf, ($meth$$method - (int)(cbuf.insts_end()) - 4),
runtime_call_Relocation::spec(), RELOC_IMM32 );
- } else if(_optimized_virtual) {
+ } else if (_optimized_virtual) {
emit_d32_reloc(cbuf, ($meth$$method - (int)(cbuf.insts_end()) - 4),
opt_virtual_call_Relocation::spec(), RELOC_IMM32 );
} else {
emit_d32_reloc(cbuf, ($meth$$method - (int)(cbuf.insts_end()) - 4),
static_call_Relocation::spec(), RELOC_IMM32 );
}
- if( _method ) { // Emit stub for static call
+ if (_method) { // Emit stub for static call
emit_java_to_interp(cbuf);
}
%}
@@ -12828,7 +12850,7 @@
ins_cost(300);
format %{ "CALL,static " %}
opcode(0xE8); /* E8 cd */
- ins_encode( pre_call_FPU,
+ ins_encode( pre_call_resets,
Java_Static_Call( meth ),
call_epilog,
post_call_FPU );
@@ -12849,7 +12871,7 @@
ins_cost(300);
format %{ "CALL,static/MethodHandle " %}
opcode(0xE8); /* E8 cd */
- ins_encode( pre_call_FPU,
+ ins_encode( pre_call_resets,
preserve_SP,
Java_Static_Call( meth ),
restore_SP,
@@ -12870,7 +12892,7 @@
format %{ "MOV EAX,(oop)-1\n\t"
"CALL,dynamic" %}
opcode(0xE8); /* E8 cd */
- ins_encode( pre_call_FPU,
+ ins_encode( pre_call_resets,
Java_Dynamic_Call( meth ),
call_epilog,
post_call_FPU );
@@ -12887,7 +12909,7 @@
format %{ "CALL,runtime " %}
opcode(0xE8); /* E8 cd */
// Use FFREEs to clear entries in float stack
- ins_encode( pre_call_FPU,
+ ins_encode( pre_call_resets,
FFree_Float_Stack_All,
Java_To_Runtime( meth ),
post_call_FPU );
@@ -12902,7 +12924,7 @@
ins_cost(300);
format %{ "CALL_LEAF,runtime " %}
opcode(0xE8); /* E8 cd */
- ins_encode( pre_call_FPU,
+ ins_encode( pre_call_resets,
FFree_Float_Stack_All,
Java_To_Runtime( meth ),
Verify_FPU_For_Leaf, post_call_FPU );
--- a/hotspot/src/cpu/x86/vm/x86_64.ad Tue Apr 02 09:30:07 2013 +0200
+++ b/hotspot/src/cpu/x86/vm/x86_64.ad Wed Apr 03 11:12:57 2013 -0700
@@ -399,6 +399,9 @@
static int preserve_SP_size() {
return 3; // rex.w, op, rm(reg/reg)
}
+static int clear_avx_size() { // size in bytes of the code emitted by enc_class clear_avx
+ return (Compile::current()->max_vector_size() > 16) ? 3 : 0; // 3 bytes for vzeroupper, if emitted
+}
// !!!!! Special hack to get all types of calls to specify the byte offset
// from the start of the call to the point where the return address
@@ -406,6 +409,7 @@
int MachCallStaticJavaNode::ret_addr_offset()
{
int offset = 5; // 5 bytes from start of call to where return address points
+ offset += clear_avx_size();
if (_method_handle_invoke)
offset += preserve_SP_size();
return offset;
@@ -413,11 +417,16 @@
int MachCallDynamicJavaNode::ret_addr_offset()
{
- return 15; // 15 bytes from start of call to where return address points
+ int offset = 15; // 15 bytes from start of call to where return address points
+ offset += clear_avx_size();
+ return offset;
}
-// In os_cpu .ad file
-// int MachCallRuntimeNode::ret_addr_offset()
+int MachCallRuntimeNode::ret_addr_offset() {
+ int offset = 13; // movq r10,#addr; callq (r10)
+ offset += clear_avx_size(); // NOTE(review): the call_leaf_nofp instruct encodes Java_To_Runtime without clear_avx - confirm this offset is never consulted for it
+ return offset;
+}
// Indicate if the safepoint node needs the polling page as an input,
// it does if the polling page is more than disp32 away.
@@ -434,6 +443,7 @@
// ensure that it does not span a cache line so that it can be patched.
int CallStaticJavaDirectNode::compute_padding(int current_offset) const
{
+ current_offset += clear_avx_size(); // skip vzeroupper
current_offset += 1; // skip call opcode byte
return round_to(current_offset, alignment_required()) - current_offset;
}
@@ -443,6 +453,7 @@
int CallStaticJavaHandleNode::compute_padding(int current_offset) const
{
current_offset += preserve_SP_size(); // skip mov rbp, rsp
+ current_offset += clear_avx_size(); // skip vzeroupper
current_offset += 1; // skip call opcode byte
return round_to(current_offset, alignment_required()) - current_offset;
}
@@ -451,6 +462,7 @@
// ensure that it does not span a cache line so that it can be patched.
int CallDynamicJavaDirectNode::compute_padding(int current_offset) const
{
+ current_offset += clear_avx_size(); // skip vzeroupper
current_offset += 11; // skip movq instruction + call opcode byte
return round_to(current_offset, alignment_required()) - current_offset;
}
@@ -764,6 +776,11 @@
void MachEpilogNode::format(PhaseRegAlloc* ra_, outputStream* st) const
{
Compile* C = ra_->C;
+ if (C->max_vector_size() > 16) {
+ st->print("vzeroupper");
+ st->cr(); st->print("\t");
+ }
+
int framesize = C->frame_slots() << LogBytesPerInt;
assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
// Remove word for return adr already pushed
@@ -793,6 +810,13 @@
void MachEpilogNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const
{
Compile* C = ra_->C;
+ if (C->max_vector_size() > 16) {
+ // Clear upper bits of YMM registers when current compiled code uses
+ // wide vectors to avoid AVX <-> SSE transition penalty after method exit.
+ MacroAssembler _masm(&cbuf);
+ __ vzeroupper();
+ }
+
int framesize = C->frame_slots() << LogBytesPerInt;
assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
// Remove word for return adr already pushed
@@ -2008,6 +2032,25 @@
__ bind(miss);
%}
+ enc_class clear_avx %{
+ debug_only(int off0 = cbuf.insts_size());
+ if (ra_->C->max_vector_size() > 16) {
+ // Clear upper bits of YMM registers when current compiled code uses
+ // wide vectors to avoid AVX <-> SSE transition penalty during call.
+ MacroAssembler _masm(&cbuf);
+ __ vzeroupper();
+ }
+ debug_only(int off1 = cbuf.insts_size());
+ assert(off1 - off0 == clear_avx_size(), "correct size prediction");
+ %}
+
+ enc_class Java_To_Runtime(method meth) %{
+ // No relocation needed; MachCallRuntimeNode::ret_addr_offset() assumes this sequence is 13 bytes
+ MacroAssembler _masm(&cbuf);
+ __ mov64(r10, (int64_t) $meth$$method); // movq r10, <meth>
+ __ call(r10); // call (r10)
+ %}
+
enc_class Java_To_Interpreter(method meth)
%{
// CALL Java_To_Interpreter
@@ -11366,7 +11409,7 @@
ins_cost(300);
format %{ "call,static " %}
opcode(0xE8); /* E8 cd */
- ins_encode(Java_Static_Call(meth), call_epilog);
+ ins_encode(clear_avx, Java_Static_Call(meth), call_epilog);
ins_pipe(pipe_slow);
ins_alignment(4);
%}
@@ -11384,7 +11427,7 @@
ins_cost(300);
format %{ "call,static/MethodHandle " %}
opcode(0xE8); /* E8 cd */
- ins_encode(preserve_SP,
+ ins_encode(clear_avx, preserve_SP,
Java_Static_Call(meth),
restore_SP,
call_epilog);
@@ -11403,7 +11446,7 @@
ins_cost(300);
format %{ "movq rax, #Universe::non_oop_word()\n\t"
"call,dynamic " %}
- ins_encode(Java_Dynamic_Call(meth), call_epilog);
+ ins_encode(clear_avx, Java_Dynamic_Call(meth), call_epilog);
ins_pipe(pipe_slow);
ins_alignment(4);
%}
@@ -11416,8 +11459,7 @@
ins_cost(300);
format %{ "call,runtime " %}
- opcode(0xE8); /* E8 cd */
- ins_encode(Java_To_Runtime(meth));
+ ins_encode(clear_avx, Java_To_Runtime(meth));
ins_pipe(pipe_slow);
%}
@@ -11429,8 +11471,7 @@
ins_cost(300);
format %{ "call_leaf,runtime " %}
- opcode(0xE8); /* E8 cd */
- ins_encode(Java_To_Runtime(meth));
+ ins_encode(clear_avx, Java_To_Runtime(meth));
ins_pipe(pipe_slow);
%}
@@ -11442,7 +11483,6 @@
ins_cost(300);
format %{ "call_leaf_nofp,runtime " %}
- opcode(0xE8); /* E8 cd */
ins_encode(Java_To_Runtime(meth));
ins_pipe(pipe_slow);
%}
--- a/hotspot/src/os_cpu/bsd_x86/vm/bsd_x86_64.ad Tue Apr 02 09:30:07 2013 +0200
+++ b/hotspot/src/os_cpu/bsd_x86/vm/bsd_x86_64.ad Wed Apr 03 11:12:57 2013 -0700
@@ -55,20 +55,6 @@
// adding a syntax that specifies the sizes of fields in an order,
// so that the adlc can build the emit functions automagically
- enc_class Java_To_Runtime(method meth) %{
- // No relocation needed
-
- // movq r10, <meth>
- emit_opcode(cbuf, Assembler::REX_WB);
- emit_opcode(cbuf, 0xB8 | (R10_enc - 8));
- emit_d64(cbuf, (int64_t) $meth$$method);
-
- // call (r10)
- emit_opcode(cbuf, Assembler::REX_B);
- emit_opcode(cbuf, 0xFF);
- emit_opcode(cbuf, 0xD0 | (R10_enc - 8));
- %}
-
%}
@@ -76,8 +62,4 @@
source %{
-int MachCallRuntimeNode::ret_addr_offset() {
- return 13; // movq r10,#addr; callq (r10)
-}
-
%}
--- a/hotspot/src/os_cpu/linux_x86/vm/linux_x86_64.ad Tue Apr 02 09:30:07 2013 +0200
+++ b/hotspot/src/os_cpu/linux_x86/vm/linux_x86_64.ad Wed Apr 03 11:12:57 2013 -0700
@@ -55,20 +55,6 @@
// adding a syntax that specifies the sizes of fields in an order,
// so that the adlc can build the emit functions automagically
- enc_class Java_To_Runtime(method meth) %{
- // No relocation needed
-
- // movq r10, <meth>
- emit_opcode(cbuf, Assembler::REX_WB);
- emit_opcode(cbuf, 0xB8 | (R10_enc - 8));
- emit_d64(cbuf, (int64_t) $meth$$method);
-
- // call (r10)
- emit_opcode(cbuf, Assembler::REX_B);
- emit_opcode(cbuf, 0xFF);
- emit_opcode(cbuf, 0xD0 | (R10_enc - 8));
- %}
-
%}
@@ -76,8 +62,4 @@
source %{
-int MachCallRuntimeNode::ret_addr_offset() {
- return 13; // movq r10,#addr; callq (r10)
-}
-
%}
--- a/hotspot/src/os_cpu/solaris_x86/vm/solaris_x86_64.ad Tue Apr 02 09:30:07 2013 +0200
+++ b/hotspot/src/os_cpu/solaris_x86/vm/solaris_x86_64.ad Wed Apr 03 11:12:57 2013 -0700
@@ -54,39 +54,10 @@
// main source block for now. In future, we can generalize this by
// adding a syntax that specifies the sizes of fields in an order,
// so that the adlc can build the emit functions automagically
-
- enc_class Java_To_Runtime(method meth) %{
- // No relocation needed
-
- // movq r10, <meth>
- emit_opcode(cbuf, Assembler::REX_WB);
- emit_opcode(cbuf, 0xB8 | (R10_enc - 8));
- emit_d64(cbuf, (int64_t) $meth$$method);
-
- // call (r10)
- emit_opcode(cbuf, Assembler::REX_B);
- emit_opcode(cbuf, 0xFF);
- emit_opcode(cbuf, 0xD0 | (R10_enc - 8));
- %}
-
- enc_class post_call_verify_mxcsr %{
- MacroAssembler _masm(&cbuf);
- if (RestoreMXCSROnJNICalls) {
- __ ldmxcsr(ExternalAddress(StubRoutines::amd64::mxcsr_std()));
- }
- else if (CheckJNICalls) {
- __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::amd64::verify_mxcsr_entry())));
- }
- %}
%}
// Platform dependent source
source %{
-
-int MachCallRuntimeNode::ret_addr_offset() {
- return 13; // movq r10,#addr; callq (r10)
-}
-
%}
--- a/hotspot/src/os_cpu/windows_x86/vm/windows_x86_64.ad Tue Apr 02 09:30:07 2013 +0200
+++ b/hotspot/src/os_cpu/windows_x86/vm/windows_x86_64.ad Wed Apr 03 11:12:57 2013 -0700
@@ -53,30 +53,11 @@
// adding a syntax that specifies the sizes of fields in an order,
// so that the adlc can build the emit functions automagically
- enc_class Java_To_Runtime (method meth) %{ // CALL Java_To_Runtime
- // No relocation needed
+%}
+
- // movq r10, <meth>
- emit_opcode(cbuf, Assembler::REX_WB);
- emit_opcode(cbuf, 0xB8 | (R10_enc - 8));
- emit_d64(cbuf, (int64_t) $meth$$method);
+// Platform dependent source
- // call (r10)
- emit_opcode(cbuf, Assembler::REX_B);
- emit_opcode(cbuf, 0xFF);
- emit_opcode(cbuf, 0xD0 | (R10_enc - 8));
- %}
+source %{
%}
-
-//
-// Platform dependent source
-//
-source %{
-
-int MachCallRuntimeNode::ret_addr_offset()
-{
- return 13; // movq r10,#addr; callq (r10)
-}
-
-%}