8180659: [s390] micro-optimization in resize_frame_absolute()
author lucy
Thu, 27 Jul 2017 15:36:15 +0200
changeset 46726 7801367e3cc9
parent 46725 537540cbebb3
child 46727 6e4a84748e2c
8180659: [s390] micro-optimization in resize_frame_absolute() Reviewed-by: simonis, mdoerr
hotspot/src/cpu/s390/vm/c1_LIRAssembler_s390.cpp
hotspot/src/cpu/s390/vm/c1_MacroAssembler_s390.cpp
hotspot/src/cpu/s390/vm/macroAssembler_s390.cpp
hotspot/src/cpu/s390/vm/macroAssembler_s390.hpp
hotspot/src/cpu/s390/vm/s390.ad
hotspot/src/cpu/s390/vm/sharedRuntime_s390.cpp
hotspot/src/cpu/s390/vm/stubGenerator_s390.cpp
hotspot/src/cpu/s390/vm/templateInterpreterGenerator_s390.cpp
--- a/hotspot/src/cpu/s390/vm/c1_LIRAssembler_s390.cpp	Fri Jul 21 12:43:47 2017 +0200
+++ b/hotspot/src/cpu/s390/vm/c1_LIRAssembler_s390.cpp	Thu Jul 27 15:36:15 2017 +0200
@@ -1139,14 +1139,7 @@
   __ load_const_optimized(Z_R1_scratch, pp);
 
   // Pop the frame before the safepoint code.
-  int retPC_offset = initial_frame_size_in_bytes() + _z_abi16(return_pc);
-  if (Displacement::is_validDisp(retPC_offset)) {
-    __ z_lg(Z_R14, retPC_offset, Z_SP);
-    __ add2reg(Z_SP, initial_frame_size_in_bytes());
-  } else {
-    __ add2reg(Z_SP, initial_frame_size_in_bytes());
-    __ restore_return_pc();
-  }
+  __ pop_frame_restore_retPC(initial_frame_size_in_bytes());
 
   if (StackReservedPages > 0 && compilation()->has_reserved_stack_access()) {
     __ reserved_stack_check(Z_R14);
--- a/hotspot/src/cpu/s390/vm/c1_MacroAssembler_s390.cpp	Fri Jul 21 12:43:47 2017 +0200
+++ b/hotspot/src/cpu/s390/vm/c1_MacroAssembler_s390.cpp	Thu Jul 27 15:36:15 2017 +0200
@@ -70,7 +70,7 @@
   assert(bang_size_in_bytes >= frame_size_in_bytes, "stack bang size incorrect");
   generate_stack_overflow_check(bang_size_in_bytes);
   save_return_pc();
-  push_frame(frame_size_in_bytes); // TODO: Must we add z_abi_160?
+  push_frame(frame_size_in_bytes);
 }
 
 void C1_MacroAssembler::unverified_entry(Register receiver, Register ic_klass) {
--- a/hotspot/src/cpu/s390/vm/macroAssembler_s390.cpp	Fri Jul 21 12:43:47 2017 +0200
+++ b/hotspot/src/cpu/s390/vm/macroAssembler_s390.cpp	Thu Jul 27 15:36:15 2017 +0200
@@ -2022,17 +2022,41 @@
   z_stg(fp, _z_abi(callers_sp), Z_SP);
 }
 
-// Resize_frame with SP(new) = [addr].
-void MacroAssembler::resize_frame_absolute(Register addr, Register fp, bool load_fp) {
-  assert_different_registers(addr, fp, Z_SP);
-  if (load_fp) { z_lg(fp, _z_abi(callers_sp), Z_SP); }
-
-  if (addr != Z_R0) {
-    // Minimize stalls by not using Z_SP immediately after update.
-    z_stg(fp, _z_abi(callers_sp), addr);
-    z_lgr(Z_SP, addr);
+// Resize_frame with SP(new) = [newSP] + offset.
+//   This emitter is useful if we already have calculated a pointer
+//   into the to-be-allocated stack space, e.g. with special alignment properties,
+//   but need some additional space, e.g. for spilling.
+//   newSP    is the pre-calculated pointer. It must not be modified.
+//   fp       holds, or is filled with, the frame pointer.
+//   offset   is the additional increment which is added to newSP to form the new SP.
+//            Note: specify a negative value to reserve more space!
+//   load_fp == true  only indicates that fp is not pre-filled with the frame pointer.
+//                    It does not guarantee that fp contains the frame pointer at the end.
+void MacroAssembler::resize_frame_abs_with_offset(Register newSP, Register fp, int offset, bool load_fp) {
+  assert_different_registers(newSP, fp, Z_SP);
+
+  if (load_fp) {
+    z_lg(fp, _z_abi(callers_sp), Z_SP);
+  }
+
+  add2reg(Z_SP, offset, newSP);
+  z_stg(fp, _z_abi(callers_sp), Z_SP);
+}
+
+// Resize_frame with SP(new) = [newSP].
+//   load_fp == true  only indicates that fp is not pre-filled with the frame pointer.
+//                    It does not guarantee that fp contains the frame pointer at the end.
+void MacroAssembler::resize_frame_absolute(Register newSP, Register fp, bool load_fp) {
+  assert_different_registers(newSP, fp, Z_SP);
+
+  if (load_fp) {
+    z_lg(fp, _z_abi(callers_sp), Z_SP); // need to use load/store.
+  }
+
+  z_lgr(Z_SP, newSP);
+  if (newSP != Z_R0) { // make sure we generate correct code, no matter what register newSP uses.
+    z_stg(fp, _z_abi(callers_sp), newSP);
   } else {
-    z_lgr(Z_SP, addr);
     z_stg(fp, _z_abi(callers_sp), Z_SP);
   }
 }
@@ -2040,17 +2064,12 @@
 // Resize_frame with SP(new) = SP(old) + offset.
 void MacroAssembler::resize_frame(RegisterOrConstant offset, Register fp, bool load_fp) {
   assert_different_registers(fp, Z_SP);
-  if (load_fp) z_lg(fp, _z_abi(callers_sp), Z_SP);
-
-  if (Displacement::is_validDisp((int)_z_abi(callers_sp) + offset.constant_or_zero())) {
-    // Minimize stalls by first using, then updating Z_SP.
-    // Do that only if we have a small positive offset or if ExtImm are available.
-    z_stg(fp, Address(Z_SP, offset, _z_abi(callers_sp)));
-    add64(Z_SP, offset);
-  } else {
-    add64(Z_SP, offset);
-    z_stg(fp, _z_abi(callers_sp), Z_SP);
-  }
+
+  if (load_fp) {
+    z_lg(fp, _z_abi(callers_sp), Z_SP);
+  }
+  add64(Z_SP, offset);
+  z_stg(fp, _z_abi(callers_sp), Z_SP);
 }
 
 void MacroAssembler::push_frame(Register bytes, Register old_sp, bool copy_sp, bool bytes_with_inverted_sign) {
@@ -2063,32 +2082,32 @@
 #endif
   if (copy_sp) { z_lgr(old_sp, Z_SP); }
   if (bytes_with_inverted_sign) {
-    z_stg(old_sp, 0, bytes, Z_SP);
-    add2reg_with_index(Z_SP, 0, bytes, Z_SP);
+    z_agr(Z_SP, bytes);
   } else {
     z_sgr(Z_SP, bytes); // Z_sgfr sufficient, but probably not faster.
-    z_stg(old_sp, 0, Z_SP);
-  }
+  }
+  z_stg(old_sp, _z_abi(callers_sp), Z_SP);
 }
 
 unsigned int MacroAssembler::push_frame(unsigned int bytes, Register scratch) {
   long offset = Assembler::align(bytes, frame::alignment_in_bytes);
-
-  if (Displacement::is_validDisp(-offset)) {
-    // Minimize stalls by first using, then updating Z_SP.
-    // Do that only if we have ExtImm available.
-    z_stg(Z_SP, -offset, Z_SP);
-    add2reg(Z_SP, -offset);
-  } else {
-    if (scratch != Z_R0 && scratch != Z_R1) {
-      z_stg(Z_SP, -offset, Z_SP);
-      add2reg(Z_SP, -offset);
-    } else {   // scratch == Z_R0 || scratch == Z_R1
-      z_lgr(scratch, Z_SP);
-      add2reg(Z_SP, -offset);
-      z_stg(scratch, 0, Z_SP);
-    }
-  }
+  assert(offset > 0, "should push a frame with positive size, size = %ld.", offset);
+  assert(Displacement::is_validDisp(-offset), "frame size out of range, size = %ld", offset);
+
+  // We must not write outside the current stack bounds (given by Z_SP).
+  // Thus, we have to first update Z_SP and then store the previous SP as stack linkage.
+  // We rely on Z_R0 by default to be available as scratch.
+  z_lgr(scratch, Z_SP);
+  add2reg(Z_SP, -offset);
+  z_stg(scratch, _z_abi(callers_sp), Z_SP);
+#ifdef ASSERT
+  // Just make sure nobody uses the value in the default scratch register.
+  // When another register is used, the caller might rely on it containing the frame pointer.
+  if (scratch == Z_R0) {
+    z_iihf(scratch, 0xbaadbabe);
+    z_iilf(scratch, 0xdeadbeef);
+  }
+#endif
   return offset;
 }
 
@@ -2106,6 +2125,20 @@
   Assembler::z_lg(Z_SP, _z_abi(callers_sp), Z_SP);
 }
 
+// Pop current C frame and restore return PC register (Z_R14).
+void MacroAssembler::pop_frame_restore_retPC(int frame_size_in_bytes) {
+  BLOCK_COMMENT("pop_frame_restore_retPC:");
+  int retPC_offset = _z_abi16(return_pc) + frame_size_in_bytes;
+  // If possible, pop frame by add instead of load (a penny saved is a penny got :-).
+  if (Displacement::is_validDisp(retPC_offset)) {
+    z_lg(Z_R14, retPC_offset, Z_SP);
+    add2reg(Z_SP, frame_size_in_bytes);
+  } else {
+    add2reg(Z_SP, frame_size_in_bytes);
+    restore_return_pc();
+  }
+}
+
 void MacroAssembler::call_VM_leaf_base(address entry_point, bool allow_relocation) {
   if (allow_relocation) {
     call_c(entry_point);
@@ -3485,6 +3518,17 @@
 // Purpose: record the previous value if it is not null.
 // All non-tmps are preserved.
 //------------------------------------------------------
+// Note: Rpre_val needs special attention.
+//   The flag pre_val_needed indicates that the caller of this emitter function
+//   relies on Rpre_val containing the correct value, that is:
+//     either the value it contained on entry to this code segment
+//     or the value that was loaded into the register from (Robj+offset).
+//
+//   Independent from this requirement, the contents of Rpre_val must survive
+//   the push_frame() operation. push_frame() uses Z_R0_scratch by default
+//   to temporarily remember the frame pointer.
+//   If Rpre_val is assigned Z_R0_scratch by the caller, code must be emitted to
+//   save its value.
 void MacroAssembler::g1_write_barrier_pre(Register           Robj,
                                           RegisterOrConstant offset,
                                           Register           Rpre_val,      // Ideally, this is a non-volatile register.
@@ -3498,6 +3542,16 @@
   const int buffer_offset = in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_buf());
   const int index_offset  = in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_index());
   assert_different_registers(Rtmp1, Rtmp2, Z_R0_scratch); // None of the Rtmp<i> must be Z_R0!!
+  assert_different_registers(Robj, Z_R0_scratch);         // Used for addressing. Furthermore, push_frame destroys Z_R0!!
+  assert_different_registers(Rval, Z_R0_scratch);         // push_frame destroys Z_R0!!
+
+#ifdef ASSERT
+  // make sure the register is not Z_R0. Used for addressing. Furthermore, would be destroyed by push_frame.
+  if (offset.is_register() && offset.as_register()->encoding() == 0) {
+    tty->print_cr("Roffset(g1_write_barrier_pre)  = %%r%d", offset.as_register()->encoding());
+    assert(false, "bad register for offset");
+  }
+#endif
 
   BLOCK_COMMENT("g1_write_barrier_pre {");
 
@@ -3511,7 +3565,10 @@
   }
   z_bre(filtered); // Activity indicator is zero, so there is no marking going on currently.
 
-  // Do we need to load the previous value into Rpre_val?
+  assert(Rpre_val != noreg, "must have a real register");
+
+
+  // If an object is given, we need to load the previous value into Rpre_val.
   if (Robj != noreg) {
     // Load the previous value...
     Register ixReg = offset.is_register() ? offset.register_or_noreg() : Z_R0;
@@ -3521,9 +3578,9 @@
       z_lg(Rpre_val, offset.constant_or_zero(), ixReg, Robj);
     }
   }
-  assert(Rpre_val != noreg, "must have a real register");
 
   // Is the previous value NULL?
+  // If so, we don't need to record it and we're done.
   // Note: pre_val is loaded, decompressed and stored (directly or via runtime call).
   //       Register contents is preserved across runtime call if caller requests to do so.
   z_ltgr(Rpre_val, Rpre_val);
@@ -3540,6 +3597,7 @@
   // only if index > 0. Otherwise, we need runtime to handle.
   // (The index field is typed as size_t.)
   Register Rbuffer = Rtmp1, Rindex = Rtmp2;
+  assert_different_registers(Rbuffer, Rindex, Rpre_val);
 
   z_lg(Rbuffer, buffer_offset, Z_thread);
 
@@ -3558,16 +3616,8 @@
 
   bind(callRuntime);
 
-  // Save Rpre_val (result) over runtime call.
-  // Requires Rtmp1, Rtmp2, or Rpre_val to be non-volatile.
-  Register Rpre_save = Rpre_val;
-  if (pre_val_needed && Rpre_val->is_volatile()) {
-    guarantee(!Rtmp1->is_volatile() || !Rtmp2->is_volatile(), "oops!");
-    Rpre_save = !Rtmp1->is_volatile() ? Rtmp1 : Rtmp2;
-  }
-  lgr_if_needed(Rpre_save, Rpre_val);
-
-  // Preserve inputs by spilling them into the top frame.
+  // Save some registers (inputs and result) over runtime call
+  // by spilling them into the top frame.
   if (Robj != noreg && Robj->is_volatile()) {
     z_stg(Robj, Robj->encoding()*BytesPerWord, Z_SP);
   }
@@ -3579,11 +3629,20 @@
     z_stg(Rval, Rval->encoding()*BytesPerWord, Z_SP);
   }
 
+  // Save Rpre_val (result) over runtime call.
+  Register Rpre_save = Rpre_val;
+  if ((Rpre_val == Z_R0_scratch) || (pre_val_needed && Rpre_val->is_volatile())) {
+    guarantee(!Rtmp1->is_volatile() || !Rtmp2->is_volatile(), "oops!");
+    Rpre_save = !Rtmp1->is_volatile() ? Rtmp1 : Rtmp2;
+  }
+  lgr_if_needed(Rpre_save, Rpre_val);
+
   // Push frame to protect top frame with return pc and spilled register values.
   save_return_pc();
-  push_frame_abi160(0); // Will use Z_R0 as tmp on old CPUs.
-
-  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), Rpre_val, Z_thread);
+  push_frame_abi160(0); // Will use Z_R0 as tmp.
+
+  // Rpre_val may be destroyed by push_frame().
+  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), Rpre_save, Z_thread);
 
   pop_frame();
   restore_return_pc();
@@ -3599,9 +3658,9 @@
   if (Rval != noreg && Rval->is_volatile()) {
     z_lg(Rval, Rval->encoding()*BytesPerWord, Z_SP);
   }
-
-  // Restore Rpre_val (result) after runtime call.
-  lgr_if_needed(Rpre_val, Rpre_save);
+  if (pre_val_needed && Rpre_val->is_volatile()) {
+    lgr_if_needed(Rpre_val, Rpre_save);
+  }
 
   bind(filtered);
   BLOCK_COMMENT("} g1_write_barrier_pre");
@@ -3654,7 +3713,7 @@
   // calculate address of card
   load_const_optimized(Rbase, (address)bs->byte_map_base);        // Card table base.
   z_srlg(Rcard_addr, Rstore_addr, CardTableModRefBS::card_shift); // Index into card table.
-  add2reg_with_index(Rcard_addr, 0, Rcard_addr, Rbase);           // Explicit calculation needed for cli.
+  z_algr(Rcard_addr, Rbase);                                      // Explicit calculation needed for cli.
   Rbase = noreg; // end of lifetime
 
   // Filter young.
@@ -3698,6 +3757,7 @@
 
   // TODO: do we need a frame? Introduced to be on the safe side.
   bool needs_frame = true;
+  lgr_if_needed(Rcard_addr, Rcard_addr_x); // copy back asap. push_frame will destroy Z_R0_scratch!
 
   // VM call need frame to access(write) O register.
   if (needs_frame) {
@@ -3706,7 +3766,7 @@
   }
 
   // Save the live input values.
-  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), Rcard_addr_x, Z_thread);
+  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), Rcard_addr, Z_thread);
 
   if (needs_frame) {
     pop_frame();
@@ -4062,7 +4122,12 @@
 void MacroAssembler::store_klass_gap(Register s, Register d) {
   if (UseCompressedClassPointers) {
     assert(s != d, "not enough registers");
-    z_st(s, Address(d, oopDesc::klass_gap_offset_in_bytes()));
+    // Support s = noreg.
+    if (s != noreg) {
+      z_st(s, Address(d, oopDesc::klass_gap_offset_in_bytes()));
+    } else {
+      z_mvhi(Address(d, oopDesc::klass_gap_offset_in_bytes()), 0);
+    }
   }
 }
 
@@ -6621,11 +6686,12 @@
 
   BLOCK_COMMENT("verify_oop {");
   Register tmp = Z_R0;
-  unsigned int nbytes_save = 6 *8;
+  unsigned int nbytes_save = 5*BytesPerWord;
   address entry = StubRoutines::verify_oop_subroutine_entry_address();
+
   save_return_pc();
   push_frame_abi160(nbytes_save);
-  z_stmg(Z_R0, Z_R5, 160, Z_SP);
+  z_stmg(Z_R1, Z_R5, frame::z_abi_160_size, Z_SP);
 
   z_lgr(Z_ARG2, oop);
   load_const(Z_ARG1, (address) msg);
@@ -6633,10 +6699,10 @@
   z_lg(Z_R1, 0, Z_R1);
   call_c(Z_R1);
 
-  z_lmg(Z_R0, Z_R5, 160, Z_SP);
+  z_lmg(Z_R1, Z_R5, frame::z_abi_160_size, Z_SP);
   pop_frame();
-
   restore_return_pc();
+
   BLOCK_COMMENT("} verify_oop ");
 }
 
@@ -6658,8 +6724,8 @@
   // Setup arguments.
   load_const(Z_ARG1, (void*) stop_types[type%stop_end]);
   load_const(Z_ARG2, (void*) msg);
-  get_PC(Z_R14); // Following code pushes a frame without entering a new function. Use current pc as return address.
-  save_return_pc();    // Saves return pc Z_R14.
+  get_PC(Z_R14);     // Following code pushes a frame without entering a new function. Use current pc as return address.
+  save_return_pc();  // Saves return pc Z_R14.
   push_frame_abi160(0);
   call_VM_leaf(CAST_FROM_FN_PTR(address, stop_on_request), Z_ARG1, Z_ARG2);
   // The plain disassembler does not recognize illtrap. It instead displays
--- a/hotspot/src/cpu/s390/vm/macroAssembler_s390.hpp	Fri Jul 21 12:43:47 2017 +0200
+++ b/hotspot/src/cpu/s390/vm/macroAssembler_s390.hpp	Thu Jul 27 15:36:15 2017 +0200
@@ -440,9 +440,21 @@
   // Get current PC + offset. Offset given in bytes, must be even!
   address get_PC(Register result, int64_t offset);
 
+  // Accessing, and in particular modifying, a stack location is only safe if
+  // the stack pointer (Z_SP) is set such that the accessed stack location is
+  // in the reserved range.
+  //
+  // From a performance point of view, it is desirable not to change the SP
+  // first and then immediately use it to access the freshly reserved space.
+  // That opens a small gap, though. If, just after storing some value (the
+  // frame pointer) into the to-be-reserved space, an interrupt is caught,
+  // the handler might use the space beyond Z_SP for its own purpose.
+  // If that happens, the stored value might get altered.
+
   // Resize current frame either relatively wrt to current SP or absolute.
   void resize_frame_sub(Register offset, Register fp, bool load_fp=true);
-  void resize_frame_absolute(Register addr, Register fp, bool load_fp=true);
+  void resize_frame_abs_with_offset(Register newSP, Register fp, int offset, bool load_fp);
+  void resize_frame_absolute(Register addr, Register fp, bool load_fp);
   void resize_frame(RegisterOrConstant offset, Register fp, bool load_fp=true);
 
   // Push a frame of size bytes, if copy_sp is false, old_sp must already
@@ -461,6 +473,8 @@
 
   // Pop current C frame.
   void pop_frame();
+  // Pop current C frame and restore return PC register (Z_R14).
+  void pop_frame_restore_retPC(int frame_size_in_bytes);
 
   //
   // Calls
--- a/hotspot/src/cpu/s390/vm/s390.ad	Fri Jul 21 12:43:47 2017 +0200
+++ b/hotspot/src/cpu/s390/vm/s390.ad	Thu Jul 27 15:36:15 2017 +0200
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017, Oracle and/or its affiliates. All rights reserved.
-// Copyright (c) 2016 SAP SE. All rights reserved.
+// Copyright (c) 2017, SAP SE. All rights reserved.
 // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 //
 // This code is free software; you can redistribute it and/or modify it
@@ -910,16 +910,8 @@
   bool need_polling = do_polling() && C->is_method_compilation();
 
   // Pop frame, restore return_pc, and all stuff needed by interpreter.
-  // Pop frame by add instead of load (a penny saved is a penny got :-).
   int frame_size_in_bytes = Assembler::align((C->frame_slots() << LogBytesPerInt), frame::alignment_in_bytes);
-  int retPC_offset        = frame_size_in_bytes + _z_abi16(return_pc);
-  if (Displacement::is_validDisp(retPC_offset)) {
-    __ z_lg(Z_R14, retPC_offset, Z_SP);
-    __ add2reg(Z_SP, frame_size_in_bytes);
-  } else {
-    __ add2reg(Z_SP, frame_size_in_bytes);
-    __ restore_return_pc();
-  }
+  __ pop_frame_restore_retPC(frame_size_in_bytes);
 
   if (StackReservedPages > 0 && C->has_reserved_stack_access()) {
     __ reserved_stack_check(Z_R14);
--- a/hotspot/src/cpu/s390/vm/sharedRuntime_s390.cpp	Fri Jul 21 12:43:47 2017 +0200
+++ b/hotspot/src/cpu/s390/vm/sharedRuntime_s390.cpp	Thu Jul 27 15:36:15 2017 +0200
@@ -312,7 +312,13 @@
   __ save_return_pc(return_pc);
 
   // Push a new frame (includes stack linkage).
-  __ push_frame(frame_size_in_bytes);
+  // use return_pc as scratch for push_frame. Z_R0_scratch (the default) and Z_R1_scratch are
+  // illegally used to pass parameters (SAPJVM extension) by RangeCheckStub::emit_code().
+  __ push_frame(frame_size_in_bytes, return_pc);
+  // We have to restore return_pc right away.
+  // Nobody else will. Furthermore, return_pc isn't necessarily the default (Z_R14).
+  // Nobody else knows which register we saved.
+  __ z_lg(return_pc, _z_abi16(return_pc) + frame_size_in_bytes, Z_SP);
 
   // Register save area in new frame starts above z_abi_160 area.
   int offset = register_save_offset;
--- a/hotspot/src/cpu/s390/vm/stubGenerator_s390.cpp	Fri Jul 21 12:43:47 2017 +0200
+++ b/hotspot/src/cpu/s390/vm/stubGenerator_s390.cpp	Thu Jul 27 15:36:15 2017 +0200
@@ -291,7 +291,7 @@
       // Restore frame pointer.
       __ z_lg(r_entryframe_fp, _z_abi(callers_sp), Z_SP);
       // Pop frame. Done here to minimize stalls.
-      __ z_lg(Z_SP, _z_abi(callers_sp), Z_SP);
+      __ pop_frame();
 
       // Reload some volatile registers which we've spilled before the call
       // to frame manager / native entry.
@@ -563,6 +563,9 @@
   address generate_throw_exception(const char* name, address runtime_entry,
                                    bool restore_saved_exception_pc,
                                    Register arg1 = noreg, Register arg2 = noreg) {
+    assert_different_registers(arg1, Z_R0_scratch);  // would be destroyed by push_frame()
+    assert_different_registers(arg2, Z_R0_scratch);  // would be destroyed by push_frame()
+
     int insts_size = 256;
     int locs_size  = 0;
     CodeBuffer      code(name, insts_size, locs_size);
@@ -693,11 +696,13 @@
     BarrierSet* const bs = Universe::heap()->barrier_set();
     switch (bs->kind()) {
       case BarrierSet::G1SATBCTLogging:
-        // With G1, don't generate the call if we statically know that the target in uninitialized.
+        // With G1, don't generate the call if we statically know that the target is uninitialized.
         if (!dest_uninitialized) {
           // Is marking active?
           Label filtered;
-          Register Rtmp1 = Z_R0;
+          assert_different_registers(addr,  Z_R0_scratch);  // would be destroyed by push_frame()
+          assert_different_registers(count, Z_R0_scratch);  // would be destroyed by push_frame()
+          Register Rtmp1 = Z_R0_scratch;
           const int active_offset = in_bytes(JavaThread::satb_mark_queue_offset() +
                                              SATBMarkQueue::byte_offset_of_active());
           if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) {
@@ -708,11 +713,11 @@
           }
           __ z_bre(filtered); // Activity indicator is zero, so there is no marking going on currently.
 
-          // __ push_frame_abi160(0);
+          // __ push_frame_abi160(0);  // implicitly done in save_live_registers()
           (void) RegisterSaver::save_live_registers(_masm, RegisterSaver::arg_registers);
           __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), addr, count);
           (void) RegisterSaver::restore_live_registers(_masm, RegisterSaver::arg_registers);
-          // __ pop_frame();
+          // __ pop_frame();  // implicitly done in restore_live_registers()
 
           __ bind(filtered);
         }
@@ -739,16 +744,18 @@
       case BarrierSet::G1SATBCTLogging:
         {
           if (branchToEnd) {
-            // __ push_frame_abi160(0);
+            assert_different_registers(addr,  Z_R0_scratch);  // would be destroyed by push_frame()
+            assert_different_registers(count, Z_R0_scratch);  // would be destroyed by push_frame()
+            // __ push_frame_abi160(0);  // implicitly done in save_live_registers()
             (void) RegisterSaver::save_live_registers(_masm, RegisterSaver::arg_registers);
             __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), addr, count);
             (void) RegisterSaver::restore_live_registers(_masm, RegisterSaver::arg_registers);
-            // __ pop_frame();
+            // __ pop_frame();   // implicitly done in restore_live_registers()
           } else {
             // Tail call: call c and return to stub caller.
             address entry_point = CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post);
-            if (Z_ARG1 != addr) __ z_lgr(Z_ARG1, addr);
-            if (Z_ARG2 != count) __ z_lgr(Z_ARG2, count);
+            __ lgr_if_needed(Z_ARG1, addr);
+            __ lgr_if_needed(Z_ARG2, count);
             __ load_const(Z_R1, entry_point);
             __ z_br(Z_R1); // Branch without linking, callee will return to stub caller.
           }
@@ -1677,7 +1684,7 @@
 
   // Helper function which generates code to
   //  - load the function code in register fCode (== Z_R0)
-  //  - load the data block length (depends on cipher function) in register srclen if requested.
+  //  - load the data block length (depends on cipher function) into register srclen if requested.
   //  - is_decipher switches between cipher/decipher function codes
   //  - set_len requests (if true) loading the data block length in register srclen
   void generate_load_AES_fCode(Register keylen, Register fCode, Register srclen, bool is_decipher) {
@@ -1689,6 +1696,7 @@
                                   && (VM_Version::Cipher::_AES128_dataBlk == VM_Version::Cipher::_AES256_dataBlk);
       // Expanded key length is 44/52/60 * 4 bytes for AES-128/AES-192/AES-256.
       __ z_cghi(keylen, 52);
+
       __ z_lghi(fCode, VM_Version::Cipher::_AES256 + mode);
       if (!identical_dataBlk_len) {
         __ z_lghi(srclen, VM_Version::Cipher::_AES256_dataBlk);
@@ -1706,6 +1714,7 @@
         __ z_lghi(srclen, VM_Version::Cipher::_AES128_dataBlk);
       }
       // __ z_brl(fCode_set);  // keyLen <  52: AES128           // fallthru
+
       __ bind(fCode_set);
       if (identical_dataBlk_len) {
         __ z_lghi(srclen, VM_Version::Cipher::_AES128_dataBlk);
--- a/hotspot/src/cpu/s390/vm/templateInterpreterGenerator_s390.cpp	Fri Jul 21 12:43:47 2017 +0200
+++ b/hotspot/src/cpu/s390/vm/templateInterpreterGenerator_s390.cpp	Thu Jul 27 15:36:15 2017 +0200
@@ -121,9 +121,8 @@
 
   // We use target_sp for storing arguments in the C frame.
   __ save_return_pc();
-
-  __ z_stmg(Z_R10,Z_R13,-32,Z_SP);
-  __ push_frame_abi160(32);
+  __ push_frame_abi160(4*BytesPerWord);                 // Reserve space to save the tmp_[1..4] registers.
+  __ z_stmg(Z_R10, Z_R13, frame::z_abi_160_size, Z_SP); // Save registers only after frame is pushed.
 
   __ z_lgr(arg_java, Z_ARG1);
 
@@ -341,9 +340,9 @@
 
   // Method exit, all arguments proocessed.
   __ bind(loop_end);
+  __ z_lmg(Z_R10, Z_R13, frame::z_abi_160_size, Z_SP); // restore registers before frame is popped.
   __ pop_frame();
   __ restore_return_pc();
-  __ z_lmg(Z_R10,Z_R13,-32,Z_SP);
   __ z_br(Z_R14);
 
   // Copy int arguments.
@@ -1232,13 +1231,9 @@
 
     // Advance local_addr to point behind locals (creates positive incr. in loop).
     __ z_lg(Z_R1_scratch, Address(Z_method, Method::const_offset()));
-    __ z_llgh(Z_R0_scratch,
-              Address(Z_R1_scratch, ConstMethod::size_of_locals_offset()));
-    if (Z_R0_scratch == Z_R0) {
-      __ z_aghi(Z_R0_scratch, -1);
-    } else {
-      __ add2reg(Z_R0_scratch, -1);
-    }
+    __ z_llgh(Z_R0_scratch, Address(Z_R1_scratch, ConstMethod::size_of_locals_offset()));
+    __ add2reg(Z_R0_scratch, -1);
+
     __ z_lgr(local_addr/*locals*/, Z_locals);
     __ z_sllg(Z_R0_scratch, Z_R0_scratch, LogBytesPerWord);
     __ z_sllg(local_count, local_count, LogBytesPerWord); // Local_count are non param locals.