--- a/hotspot/src/cpu/x86/vm/sharedRuntime_x86_64.cpp Wed Nov 11 23:47:41 2015 +0000
+++ b/hotspot/src/cpu/x86/vm/sharedRuntime_x86_64.cpp Fri Nov 13 13:31:48 2015 +0100
@@ -72,45 +72,28 @@
class RegisterSaver {
// Capture info about frame layout. Layout offsets are in jint
// units because compiler frame slots are jints.
-#define HALF_ZMM_BANK_WORDS 128
+#define XSAVE_AREA_BEGIN 160
+#define XSAVE_AREA_YMM_BEGIN 576
+#define XSAVE_AREA_ZMM_BEGIN 1152
+#define XSAVE_AREA_UPPERBANK 1664
#define DEF_XMM_OFFS(regnum) xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
+#define DEF_YMM_OFFS(regnum) ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
#define DEF_ZMM_OFFS(regnum) zmm ## regnum ## _off = zmm_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
enum layout {
fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
- xmm_off = fpu_state_off + 160/BytesPerInt, // offset in fxsave save area
+ xmm_off = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt, // offset in fxsave save area
DEF_XMM_OFFS(0),
DEF_XMM_OFFS(1),
- DEF_XMM_OFFS(2),
- DEF_XMM_OFFS(3),
- DEF_XMM_OFFS(4),
- DEF_XMM_OFFS(5),
- DEF_XMM_OFFS(6),
- DEF_XMM_OFFS(7),
- DEF_XMM_OFFS(8),
- DEF_XMM_OFFS(9),
- DEF_XMM_OFFS(10),
- DEF_XMM_OFFS(11),
- DEF_XMM_OFFS(12),
- DEF_XMM_OFFS(13),
- DEF_XMM_OFFS(14),
- DEF_XMM_OFFS(15),
- zmm_off = fpu_state_off + ((FPUStateSizeInWords - (HALF_ZMM_BANK_WORDS + 1))*wordSize / BytesPerInt),
+ // 2..15 are implied in range usage
+ ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
+ DEF_YMM_OFFS(0),
+ DEF_YMM_OFFS(1),
+ // 2..15 are implied in range usage
+ zmm_high = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
+ zmm_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
DEF_ZMM_OFFS(16),
DEF_ZMM_OFFS(17),
- DEF_ZMM_OFFS(18),
- DEF_ZMM_OFFS(19),
- DEF_ZMM_OFFS(20),
- DEF_ZMM_OFFS(21),
- DEF_ZMM_OFFS(22),
- DEF_ZMM_OFFS(23),
- DEF_ZMM_OFFS(24),
- DEF_ZMM_OFFS(25),
- DEF_ZMM_OFFS(26),
- DEF_ZMM_OFFS(27),
- DEF_ZMM_OFFS(28),
- DEF_ZMM_OFFS(29),
- DEF_ZMM_OFFS(30),
- DEF_ZMM_OFFS(31),
+ // 18..31 are implied in range usage
fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
fpu_stateH_end,
r15_off, r15H_off,
@@ -160,8 +143,6 @@
};
OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors) {
- int vect_words = 0;
- int ymmhi_offset = -1;
int off = 0;
int num_xmm_regs = XMMRegisterImpl::number_of_registers;
if (UseAVX < 3) {
@@ -171,24 +152,15 @@
if (save_vectors) {
assert(UseAVX > 0, "512bit vectors are supported only with EVEX");
assert(MaxVectorSize == 64, "only 512bit vectors are supported now");
- // Save upper half of YMM registers
- vect_words = 16 * num_xmm_regs / wordSize;
- if (UseAVX < 3) {
- ymmhi_offset = additional_frame_words;
- additional_frame_words += vect_words;
- }
}
#else
assert(!save_vectors, "vectors are generated only by C2 and JVMCI");
#endif
- // Always make the frame size 16-byte aligned
- int frame_size_in_bytes = round_to(additional_frame_words*wordSize +
- reg_save_size*BytesPerInt, num_xmm_regs);
+ // Always make the frame size 16-byte aligned; both vector and non-vector stacks are always allocated
+ int frame_size_in_bytes = round_to(reg_save_size*BytesPerInt, num_xmm_regs);
// OopMap frame size is in compiler stack slots (jint's) not bytes or words
int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
- // The caller will allocate additional_frame_words
- int additional_frame_slots = additional_frame_words*wordSize / BytesPerInt;
// CodeBlob frame size is in words.
int frame_size_in_words = frame_size_in_bytes / wordSize;
*total_frame_words = frame_size_in_words;
@@ -203,12 +175,34 @@
__ push_CPU_state(); // Push a multiple of 16 bytes
// push cpu state handles this on EVEX enabled targets
- if ((vect_words > 0) && (UseAVX < 3)) {
- assert(vect_words*wordSize >= 256, "");
- // Save upper half of YMM registes(0..num_xmm_regs)
- __ subptr(rsp, num_xmm_regs*16);
- for (int n = 0; n < num_xmm_regs; n++) {
- __ vextractf128h(Address(rsp, off++*16), as_XMMRegister(n));
+ if (save_vectors) {
+ // Save upper half of YMM registers (0..15)
+ int base_addr = XSAVE_AREA_YMM_BEGIN;
+ for (int n = 0; n < 16; n++) {
+ __ vextractf128h(Address(rsp, base_addr+n*16), as_XMMRegister(n));
+ }
+ if (VM_Version::supports_evex()) {
+ // Save upper half of ZMM registers (0..15)
+ base_addr = XSAVE_AREA_ZMM_BEGIN;
+ for (int n = 0; n < 16; n++) {
+ __ vextractf64x4h(Address(rsp, base_addr+n*32), as_XMMRegister(n), 1);
+ }
+ // Save full ZMM registers (16..num_xmm_regs)
+ base_addr = XSAVE_AREA_UPPERBANK;
+ int off = 0;
+ int vector_len = Assembler::AVX_512bit;
+ for (int n = 16; n < num_xmm_regs; n++) {
+ __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
+ }
+ }
+ } else {
+ if (VM_Version::supports_evex()) {
+ // Save upper bank of ZMM registers(16..31) for double/float usage
+ int base_addr = XSAVE_AREA_UPPERBANK;
+ int off = 0;
+ for (int n = 16; n < num_xmm_regs; n++) {
+ __ movsd(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n));
+ }
}
}
if (frame::arg_reg_save_area_bytes != 0) {
@@ -224,8 +218,7 @@
OopMapSet *oop_maps = new OopMapSet();
OopMap* map = new OopMap(frame_size_in_slots, 0);
-#define STACK_OFFSET(x) VMRegImpl::stack2reg((x) + additional_frame_slots)
-#define YMMHI_STACK_OFFSET(x) VMRegImpl::stack2reg((x / VMRegImpl::stack_slot_size) + ymmhi_offset)
+#define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
@@ -257,31 +250,21 @@
off = zmm16_off;
delta = zmm17_off - off;
for (int n = 16; n < num_xmm_regs; n++) {
- XMMRegister xmm_name = as_XMMRegister(n);
- map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
+ XMMRegister zmm_name = as_XMMRegister(n);
+ map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
off += delta;
}
}
#if defined(COMPILER2) || INCLUDE_JVMCI
if (save_vectors) {
- assert(ymmhi_offset != -1, "save area must exist");
- map->set_callee_saved(YMMHI_STACK_OFFSET( 0), xmm0->as_VMReg()->next(4));
- map->set_callee_saved(YMMHI_STACK_OFFSET( 16), xmm1->as_VMReg()->next(4));
- map->set_callee_saved(YMMHI_STACK_OFFSET( 32), xmm2->as_VMReg()->next(4));
- map->set_callee_saved(YMMHI_STACK_OFFSET( 48), xmm3->as_VMReg()->next(4));
- map->set_callee_saved(YMMHI_STACK_OFFSET( 64), xmm4->as_VMReg()->next(4));
- map->set_callee_saved(YMMHI_STACK_OFFSET( 80), xmm5->as_VMReg()->next(4));
- map->set_callee_saved(YMMHI_STACK_OFFSET( 96), xmm6->as_VMReg()->next(4));
- map->set_callee_saved(YMMHI_STACK_OFFSET(112), xmm7->as_VMReg()->next(4));
- map->set_callee_saved(YMMHI_STACK_OFFSET(128), xmm8->as_VMReg()->next(4));
- map->set_callee_saved(YMMHI_STACK_OFFSET(144), xmm9->as_VMReg()->next(4));
- map->set_callee_saved(YMMHI_STACK_OFFSET(160), xmm10->as_VMReg()->next(4));
- map->set_callee_saved(YMMHI_STACK_OFFSET(176), xmm11->as_VMReg()->next(4));
- map->set_callee_saved(YMMHI_STACK_OFFSET(192), xmm12->as_VMReg()->next(4));
- map->set_callee_saved(YMMHI_STACK_OFFSET(208), xmm13->as_VMReg()->next(4));
- map->set_callee_saved(YMMHI_STACK_OFFSET(224), xmm14->as_VMReg()->next(4));
- map->set_callee_saved(YMMHI_STACK_OFFSET(240), xmm15->as_VMReg()->next(4));
+ off = ymm0_off;
+ int delta = ymm1_off - off;
+ for (int n = 0; n < 16; n++) {
+ XMMRegister ymm_name = as_XMMRegister(n);
+ map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
+ off += delta;
+ }
}
#endif // COMPILER2 || INCLUDE_JVMCI
@@ -316,8 +299,8 @@
off = zmm16H_off;
delta = zmm17H_off - off;
for (int n = 16; n < num_xmm_regs; n++) {
- XMMRegister xmm_name = as_XMMRegister(n);
- map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
+ XMMRegister zmm_name = as_XMMRegister(n);
+ map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
off += delta;
}
}
@@ -335,21 +318,48 @@
// Pop arg register save area
__ addptr(rsp, frame::arg_reg_save_area_bytes);
}
+
#if defined(COMPILER2) || INCLUDE_JVMCI
- // On EVEX enabled targets everything is handled in pop fpu state
- if ((restore_vectors) && (UseAVX < 3)) {
- assert(UseAVX > 0, "256/512-bit vectors are supported only with AVX");
- assert(MaxVectorSize == 64, "up to 512bit vectors are supported now");
- int off = 0;
- // Restore upper half of YMM registes (0..num_xmm_regs)
- for (int n = 0; n < num_xmm_regs; n++) {
- __ vinsertf128h(as_XMMRegister(n), Address(rsp, off++*16));
- }
- __ addptr(rsp, num_xmm_regs*16);
+ if (restore_vectors) {
+ assert(UseAVX > 0, "512bit vectors are supported only with EVEX");
+ assert(MaxVectorSize == 64, "only 512bit vectors are supported now");
}
#else
- assert(!restore_vectors, "vectors are generated only by C2 and JVMCI");
+ assert(!restore_vectors, "vectors are generated only by C2");
#endif
+
+ // Explicitly restore vector register state (including on EVEX targets) before popping the CPU state
+ if (restore_vectors) {
+ // Restore upper half of YMM registers (0..15)
+ int base_addr = XSAVE_AREA_YMM_BEGIN;
+ for (int n = 0; n < 16; n++) {
+ __ vinsertf128h(as_XMMRegister(n), Address(rsp, base_addr+n*16));
+ }
+ if (VM_Version::supports_evex()) {
+ // Restore upper half of ZMM registers (0..15)
+ base_addr = XSAVE_AREA_ZMM_BEGIN;
+ for (int n = 0; n < 16; n++) {
+ __ vinsertf64x4h(as_XMMRegister(n), Address(rsp, base_addr+n*32), 1);
+ }
+ // Restore full ZMM registers (16..num_xmm_regs)
+ base_addr = XSAVE_AREA_UPPERBANK;
+ int vector_len = Assembler::AVX_512bit;
+ int off = 0;
+ for (int n = 16; n < num_xmm_regs; n++) {
+ __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
+ }
+ }
+ } else {
+ if (VM_Version::supports_evex()) {
+ // Restore upper bank of ZMM registers (16..31) for double/float usage
+ int base_addr = XSAVE_AREA_UPPERBANK;
+ int off = 0;
+ for (int n = 16; n < num_xmm_regs; n++) {
+ __ movsd(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)));
+ }
+ }
+ }
+
// Recover CPU state
__ pop_CPU_state();
// Get the rbp described implicitly by the calling convention (no oopMap)
@@ -2819,6 +2829,7 @@
__ movl(r14, (int32_t)Deoptimization::Unpack_reexecute);
__ mov(c_rarg0, r15_thread);
+ __ movl(c_rarg2, r14); // exec mode
__ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
@@ -2905,6 +2916,7 @@
}
#endif // ASSERT
__ mov(c_rarg0, r15_thread);
+ __ movl(c_rarg1, r14); // exec_mode
__ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
// Need to have an oopmap that tells fetch_unroll_info where to
@@ -2922,6 +2934,7 @@
// Load UnrollBlock* into rdi
__ mov(rdi, rax);
+ __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()));
Label noException;
__ cmpl(r14, Deoptimization::Unpack_exception); // Was exception pending?
__ jcc(Assembler::notEqual, noException);
@@ -3140,6 +3153,7 @@
// UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index);
__ mov(c_rarg0, r15_thread);
+ __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap);
__ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
// Set an oopmap for the call site
@@ -3155,6 +3169,16 @@
// Load UnrollBlock* into rdi
__ mov(rdi, rax);
+#ifdef ASSERT
+ { Label L;
+ __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()),
+ (int32_t)Deoptimization::Unpack_uncommon_trap);
+ __ jcc(Assembler::equal, L);
+ __ stop("SharedRuntime::generate_deopt_blob: expected Unpack_uncommon_trap");
+ __ bind(L);
+ }
+#endif
+
// Pop all the frames we must move/replace.
//
// Frame picture (youngest to oldest)