8193257: PPC64, s390 implementation for Thread-local handshakes
author mdoerr
date Thu, 14 Dec 2017 13:05:20 +0100
changeset 48332 651a95f30dfb
parent 48331 a8e39cc7b88f
child 48333 f47c18852172
child 55982 b6ff245c0db6
8193257: PPC64, s390 implementation for Thread-local handshakes Reviewed-by: goetz, lucy
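In both ports the fast path is the same idea: read the per-thread polling word from the thread and test a designated poll bit; an armed word sends the thread into the runtime. Below is a minimal C++ sketch of that decision for orientation only — the struct, constant and function names are illustrative, not HotSpot API, and the real checks are emitted as PPC64/s390 assembly by the MacroAssembler::safepoint_poll and dispatch changes in the diff that follows.

    #include <cstdint>

    // Illustrative model of the per-thread poll used by thread-local handshakes.
    // All names here are hypothetical; the real check tests
    // SafepointMechanism::poll_bit() in the word at Thread::polling_page_offset().
    struct ThreadModel {
      uintptr_t polling_word;  // polling page address; a designated bit is set when armed
    };

    static const uintptr_t kPollBit = 1;  // stands in for SafepointMechanism::poll_bit()

    inline bool poll_armed(const ThreadModel* self) {
      return (self->polling_word & kPollBit) != 0;
    }

    void at_poll_site(ThreadModel* self) {
      if (poll_armed(self)) {
        // Slow path: call into the runtime so this thread can take part in a
        // handshake or safepoint (e.g. the interpreter's at_safepoint entry).
        return;
      }
      // Fast path: fall through and keep executing.
    }

The interpreter dispatch, return templates and compiled-code polls in the diff all use this bit test; the safepoint handler blob additionally steps the stashed return pc forward over the poll instruction when the runtime has not modified it.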
src/hotspot/cpu/ppc/c1_LIRAssembler_ppc.cpp
src/hotspot/cpu/ppc/globalDefinitions_ppc.hpp
src/hotspot/cpu/ppc/globals_ppc.hpp
src/hotspot/cpu/ppc/interp_masm_ppc.hpp
src/hotspot/cpu/ppc/interp_masm_ppc_64.cpp
src/hotspot/cpu/ppc/macroAssembler_ppc.cpp
src/hotspot/cpu/ppc/macroAssembler_ppc.hpp
src/hotspot/cpu/ppc/ppc.ad
src/hotspot/cpu/ppc/sharedRuntime_ppc.cpp
src/hotspot/cpu/ppc/templateInterpreterGenerator_ppc.cpp
src/hotspot/cpu/ppc/templateTable_ppc_64.cpp
src/hotspot/cpu/s390/c1_LIRAssembler_s390.cpp
src/hotspot/cpu/s390/globalDefinitions_s390.hpp
src/hotspot/cpu/s390/globals_s390.hpp
src/hotspot/cpu/s390/interp_masm_s390.cpp
src/hotspot/cpu/s390/interp_masm_s390.hpp
src/hotspot/cpu/s390/macroAssembler_s390.cpp
src/hotspot/cpu/s390/macroAssembler_s390.hpp
src/hotspot/cpu/s390/s390.ad
src/hotspot/cpu/s390/sharedRuntime_s390.cpp
src/hotspot/cpu/s390/templateInterpreterGenerator_s390.cpp
src/hotspot/cpu/s390/templateTable_s390.cpp
--- a/src/hotspot/cpu/ppc/c1_LIRAssembler_ppc.cpp	Thu Dec 14 12:02:16 2017 +0100
+++ b/src/hotspot/cpu/ppc/c1_LIRAssembler_ppc.cpp	Thu Dec 14 13:05:20 2017 +0100
@@ -36,6 +36,7 @@
 #include "gc/shared/cardTableModRefBS.hpp"
 #include "nativeInst_ppc.hpp"
 #include "oops/objArrayKlass.hpp"
+#include "runtime/safepointMechanism.inline.hpp"
 #include "runtime/sharedRuntime.hpp"
 
 #define __ _masm->
@@ -1314,11 +1315,10 @@
     __ pop_frame();
   }
 
-  if (LoadPollAddressFromThread) {
-    // TODO: PPC port __ ld(polling_page, in_bytes(JavaThread::poll_address_offset()), R16_thread);
-    Unimplemented();
+  if (SafepointMechanism::uses_thread_local_poll()) {
+    __ ld(polling_page, in_bytes(Thread::polling_page_offset()), R16_thread);
   } else {
-    __ load_const_optimized(polling_page, (long)(address) os::get_polling_page(), R0); // TODO: PPC port: get_standard_polling_page()
+    __ load_const_optimized(polling_page, (long)(address) os::get_polling_page(), R0);
   }
 
   // Restore return pc relative to callers' sp.
@@ -1341,26 +1341,18 @@
 
 
 int LIR_Assembler::safepoint_poll(LIR_Opr tmp, CodeEmitInfo* info) {
-
-  if (LoadPollAddressFromThread) {
-    const Register poll_addr = tmp->as_register();
-    // TODO: PPC port __ ld(poll_addr, in_bytes(JavaThread::poll_address_offset()), R16_thread);
-    Unimplemented();
-    __ relocate(relocInfo::poll_type); // XXX
-    guarantee(info != NULL, "Shouldn't be NULL");
-    int offset = __ offset();
-    add_debug_info_for_branch(info);
-    __ load_from_polling_page(poll_addr);
-    return offset;
+  const Register poll_addr = tmp->as_register();
+  if (SafepointMechanism::uses_thread_local_poll()) {
+    __ ld(poll_addr, in_bytes(Thread::polling_page_offset()), R16_thread);
+  } else {
+    __ load_const_optimized(poll_addr, (intptr_t)os::get_polling_page(), R0);
   }
-
-  __ load_const_optimized(tmp->as_register(), (intptr_t)os::get_polling_page(), R0); // TODO: PPC port: get_standard_polling_page()
   if (info != NULL) {
     add_debug_info_for_branch(info);
   }
   int offset = __ offset();
   __ relocate(relocInfo::poll_type);
-  __ load_from_polling_page(tmp->as_register());
+  __ load_from_polling_page(poll_addr);
 
   return offset;
 }
--- a/src/hotspot/cpu/ppc/globalDefinitions_ppc.hpp	Thu Dec 14 12:02:16 2017 +0100
+++ b/src/hotspot/cpu/ppc/globalDefinitions_ppc.hpp	Thu Dec 14 13:05:20 2017 +0100
@@ -54,4 +54,6 @@
 
 #define SUPPORT_RESERVED_STACK_AREA
 
+#define THREAD_LOCAL_POLL
+
 #endif // CPU_PPC_VM_GLOBALDEFINITIONS_PPC_HPP
--- a/src/hotspot/cpu/ppc/globals_ppc.hpp	Thu Dec 14 12:02:16 2017 +0100
+++ b/src/hotspot/cpu/ppc/globals_ppc.hpp	Thu Dec 14 13:05:20 2017 +0100
@@ -83,7 +83,7 @@
 // 2x unrolled loop is shorter with more than 9 HeapWords.
 define_pd_global(intx, InitArrayShortSize, 9*BytesPerLong);
 
-define_pd_global(bool, ThreadLocalHandshakes, false);
+define_pd_global(bool, ThreadLocalHandshakes, true);
 
 // Platform dependent flag handling: flags only defined on this platform.
 #define ARCH_FLAGS(develop, \
@@ -95,12 +95,6 @@
                    constraint, \
                    writeable)  \
                                                                             \
-  /* Load poll address from thread. This is used to implement per-thread */ \
-  /* safepoints on platforms != IA64. */                                    \
-  product(bool, LoadPollAddressFromThread, false,                           \
-          "Load polling page address from thread object (required for "     \
-          "per-thread safepoints on platforms != IA64)")                    \
-                                                                            \
   product(uintx, PowerArchitecturePPC64, 0,                                 \
           "CPU Version: x for PowerX. Currently recognizes Power5 to "      \
           "Power8. Default is 0. Newer CPUs will be recognized as Power8.") \
--- a/src/hotspot/cpu/ppc/interp_masm_ppc.hpp	Thu Dec 14 12:02:16 2017 +0100
+++ b/src/hotspot/cpu/ppc/interp_masm_ppc.hpp	Thu Dec 14 13:05:20 2017 +0100
@@ -57,10 +57,10 @@
   static const Address d_tmp;
 
   // dispatch routines
-  void dispatch_next(TosState state, int step = 0);
+  void dispatch_next(TosState state, int step = 0, bool generate_poll = false);
   void dispatch_via (TosState state, address* table);
   void load_dispatch_table(Register dst, address* table);
-  void dispatch_Lbyte_code(TosState state, Register bytecode, address* table, bool verify = false);
+  void dispatch_Lbyte_code(TosState state, Register bytecode, address* table, bool generate_poll = false);
 
   // Called by shared interpreter generator.
   void dispatch_prolog(TosState state, int step = 0);
--- a/src/hotspot/cpu/ppc/interp_masm_ppc_64.cpp	Thu Dec 14 12:02:16 2017 +0100
+++ b/src/hotspot/cpu/ppc/interp_masm_ppc_64.cpp	Thu Dec 14 13:05:20 2017 +0100
@@ -29,6 +29,7 @@
 #include "interp_masm_ppc.hpp"
 #include "interpreter/interpreterRuntime.hpp"
 #include "prims/jvmtiThreadState.hpp"
+#include "runtime/safepointMechanism.hpp"
 #include "runtime/sharedRuntime.hpp"
 
 #ifdef PRODUCT
@@ -53,7 +54,7 @@
   }
 }
 
-void InterpreterMacroAssembler::dispatch_next(TosState state, int bcp_incr) {
+void InterpreterMacroAssembler::dispatch_next(TosState state, int bcp_incr, bool generate_poll) {
   Register bytecode = R12_scratch2;
   if (bcp_incr != 0) {
     lbzu(bytecode, bcp_incr, R14_bcp);
@@ -61,7 +62,7 @@
     lbz(bytecode, 0, R14_bcp);
   }
 
-  dispatch_Lbyte_code(state, bytecode, Interpreter::dispatch_table(state));
+  dispatch_Lbyte_code(state, bytecode, Interpreter::dispatch_table(state), generate_poll);
 }
 
 void InterpreterMacroAssembler::dispatch_via(TosState state, address* table) {
@@ -203,16 +204,26 @@
 }
 
 void InterpreterMacroAssembler::dispatch_Lbyte_code(TosState state, Register bytecode,
-                                                    address* table, bool verify) {
-  if (verify) {
-    unimplemented("dispatch_Lbyte_code: verify"); // See Sparc Implementation to implement this
-  }
-
+                                                    address* table, bool generate_poll) {
   assert_different_registers(bytecode, R11_scratch1);
 
   // Calc dispatch table address.
   load_dispatch_table(R11_scratch1, table);
 
+  if (SafepointMechanism::uses_thread_local_poll() && generate_poll) {
+    address *sfpt_tbl = Interpreter::safept_table(state);
+    if (table != sfpt_tbl) {
+      Label dispatch;
+      ld(R0, in_bytes(Thread::polling_page_offset()), R16_thread);
+      // Armed page has poll_bit set, if poll bit is cleared just continue.
+      andi_(R0, R0, SafepointMechanism::poll_bit());
+      beq(CCR0, dispatch);
+      load_dispatch_table(R11_scratch1, sfpt_tbl);
+      align(32, 16);
+      bind(dispatch);
+    }
+  }
+
   sldi(R12_scratch2, bytecode, LogBytesPerWord);
   ldx(R11_scratch1, R11_scratch1, R12_scratch2);
 
--- a/src/hotspot/cpu/ppc/macroAssembler_ppc.cpp	Thu Dec 14 12:02:16 2017 +0100
+++ b/src/hotspot/cpu/ppc/macroAssembler_ppc.cpp	Thu Dec 14 13:05:20 2017 +0100
@@ -37,6 +37,8 @@
 #include "runtime/interfaceSupport.hpp"
 #include "runtime/objectMonitor.hpp"
 #include "runtime/os.hpp"
+#include "runtime/safepoint.hpp"
+#include "runtime/safepointMechanism.hpp"
 #include "runtime/sharedRuntime.hpp"
 #include "runtime/stubRoutines.hpp"
 #include "utilities/macros.hpp"
@@ -3019,6 +3021,18 @@
   stwx(R0, tmp1, tmp2);
 }
 
+void MacroAssembler::safepoint_poll(Label& slow_path, Register temp_reg) {
+  if (SafepointMechanism::uses_thread_local_poll()) {
+    ld(temp_reg, in_bytes(Thread::polling_page_offset()), R16_thread);
+    // Armed page has poll_bit set.
+    andi_(temp_reg, temp_reg, SafepointMechanism::poll_bit());
+  } else {
+    lwz(temp_reg, (RegisterOrConstant)(intptr_t)SafepointSynchronize::address_of_state());
+    cmpwi(CCR0, temp_reg, SafepointSynchronize::_not_synchronized);
+  }
+  bne(CCR0, slow_path);
+}
+
 
 // GC barrier helper macros
 
--- a/src/hotspot/cpu/ppc/macroAssembler_ppc.hpp	Thu Dec 14 12:02:16 2017 +0100
+++ b/src/hotspot/cpu/ppc/macroAssembler_ppc.hpp	Thu Dec 14 13:05:20 2017 +0100
@@ -647,6 +647,9 @@
   // Support for serializing memory accesses between threads
   void serialize_memory(Register thread, Register tmp1, Register tmp2);
 
+  // Check if safepoint requested and if so branch
+  void safepoint_poll(Label& slow_path, Register temp_reg);
+
   // GC barrier support.
   void card_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp);
   void card_table_write(jbyte* byte_map_base, Register Rtmp, Register Robj);
--- a/src/hotspot/cpu/ppc/ppc.ad	Thu Dec 14 12:02:16 2017 +0100
+++ b/src/hotspot/cpu/ppc/ppc.ad	Thu Dec 14 13:05:20 2017 +0100
@@ -1577,11 +1577,10 @@
   }
 
   if (method_needs_polling) {
-    if (LoadPollAddressFromThread) {
-      // TODO: PPC port __ ld(polling_page, in_bytes(JavaThread::poll_address_offset()), R16_thread);
-      Unimplemented();
+    if (SafepointMechanism::uses_thread_local_poll()) {
+      __ ld(polling_page, in_bytes(JavaThread::polling_page_offset()), R16_thread);
     } else {
-      __ load_const_optimized(polling_page, (long)(address) os::get_polling_page()); // TODO: PPC port: get_standard_polling_page()
+      __ load_const_optimized(polling_page, (long)(address) os::get_polling_page());
     }
   }
 
@@ -14147,7 +14146,6 @@
 
 instruct safePoint_poll(iRegPdst poll) %{
   match(SafePoint poll);
-  predicate(LoadPollAddressFromThread);
 
   // It caused problems to add the effect that r0 is killed, but this
   // effect no longer needs to be mentioned, since r0 is not contained
@@ -14159,24 +14157,6 @@
   ins_pipe(pipe_class_default);
 %}
 
-// Safepoint without per-thread support. Load address of page to poll
-// as constant.
-// Rscratch2RegP is R12.
-// LoadConPollAddr node is added in pd_post_matching_hook(). It must be
-// a seperate node so that the oop map is at the right location.
-instruct safePoint_poll_conPollAddr(rscratch2RegP poll) %{
-  match(SafePoint poll);
-  predicate(!LoadPollAddressFromThread);
-
-  // It caused problems to add the effect that r0 is killed, but this
-  // effect no longer needs to be mentioned, since r0 is not contained
-  // in a reg_class.
-
-  format %{ "LD      R0, #0, R12 \t// Safepoint poll for GC" %}
-  ins_encode( enc_poll(0x0, poll) );
-  ins_pipe(pipe_class_default);
-%}
-
 // ============================================================================
 // Call Instructions
 
--- a/src/hotspot/cpu/ppc/sharedRuntime_ppc.cpp	Thu Dec 14 12:02:16 2017 +0100
+++ b/src/hotspot/cpu/ppc/sharedRuntime_ppc.cpp	Thu Dec 14 13:05:20 2017 +0100
@@ -214,6 +214,7 @@
   // StackFrameStream construction (needed for deoptimization; see
   // compiledVFrame::create_stack_value).
   // If return_pc_adjustment != 0 adjust the return pc by return_pc_adjustment.
+  // Updated return pc is returned in R31 (if not return_pc_is_pre_saved).
 
   int i;
   int offset;
@@ -233,16 +234,17 @@
 
   BLOCK_COMMENT("push_frame_reg_args_and_save_live_registers {");
 
-  // Save r31 in the last slot of the not yet pushed frame so that we
-  // can use it as scratch reg.
-  __ std(R31, -reg_size, R1_SP);
+  // Save some registers in the last slots of the not yet pushed frame so that we
+  // can use them as scratch regs.
+  __ std(R31, -  reg_size, R1_SP);
+  __ std(R30, -2*reg_size, R1_SP);
   assert(-reg_size == register_save_offset - frame_size_in_bytes + ((regstosave_num-1)*reg_size),
          "consistency check");
 
   // save the flags
   // Do the save_LR_CR by hand and adjust the return pc if requested.
-  __ mfcr(R31);
-  __ std(R31, _abi(cr), R1_SP);
+  __ mfcr(R30);
+  __ std(R30, _abi(cr), R1_SP);
   switch (return_pc_location) {
     case return_pc_is_lr: __ mflr(R31); break;
     case return_pc_is_pre_saved: assert(return_pc_adjustment == 0, "unsupported"); break;
@@ -257,7 +259,7 @@
   }
 
   // push a new frame
-  __ push_frame(frame_size_in_bytes, R31);
+  __ push_frame(frame_size_in_bytes, R30);
 
   // save all registers (ints and floats)
   offset = register_save_offset;
@@ -267,7 +269,7 @@
 
     switch (reg_type) {
       case RegisterSaver::int_reg: {
-        if (reg_num != 31) { // We spilled R31 right at the beginning.
+        if (reg_num < 30) { // We spilled R30-31 right at the beginning.
           __ std(as_Register(reg_num), offset, R1_SP);
         }
         break;
@@ -278,8 +280,8 @@
       }
       case RegisterSaver::special_reg: {
         if (reg_num == SR_CTR_SpecialRegisterEnumValue) {
-          __ mfctr(R31);
-          __ std(R31, offset, R1_SP);
+          __ mfctr(R30);
+          __ std(R30, offset, R1_SP);
         } else {
           Unimplemented();
         }
@@ -2364,23 +2366,14 @@
     Register sync_state      = r_temp_5;
     Register suspend_flags   = r_temp_6;
 
-    __ load_const(sync_state_addr, SafepointSynchronize::address_of_state(), /*temp*/ sync_state);
-
-    // TODO: PPC port assert(4 == SafepointSynchronize::sz_state(), "unexpected field size");
-    __ lwz(sync_state, 0, sync_state_addr);
-
+    // No synchronization in progress nor yet synchronized
+    // (cmp-br-isync on one path, release (same as acquire on PPC64) on the other path).
+    __ safepoint_poll(sync, sync_state);
+
+    // Not suspended.
     // TODO: PPC port assert(4 == Thread::sz_suspend_flags(), "unexpected field size");
     __ lwz(suspend_flags, thread_(suspend_flags));
-
-    __ acquire();
-
-    Label do_safepoint;
-    // No synchronization in progress nor yet synchronized.
-    __ cmpwi(CCR0, sync_state, SafepointSynchronize::_not_synchronized);
-    // Not suspended.
     __ cmpwi(CCR1, suspend_flags, 0);
-
-    __ bne(CCR0, sync);
     __ beq(CCR1, no_block);
 
     // Block. Save any potential method result value before the operation and
@@ -2388,6 +2381,7 @@
     // lets us share the oopMap we used when we went native rather than create
     // a distinct one for this pc.
     __ bind(sync);
+    __ isync();
 
     address entry_point = is_critical_native
       ? CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans_and_transition)
@@ -2410,7 +2404,7 @@
 
   // Transition from _thread_in_native_trans to _thread_in_Java.
   __ li(R0, _thread_in_Java);
-  __ release();
+  __ lwsync(); // Acquire safepoint and suspend state, release thread state.
   // TODO: PPC port assert(4 == JavaThread::sz_thread_state(), "unexpected field size");
   __ stw(R0, thread_(thread_state));
   __ bind(after_transition);
@@ -3093,7 +3087,7 @@
     return_pc_location = RegisterSaver::return_pc_is_thread_saved_exception_pc;
   }
 
-  // Save registers, fpu state, and flags.
+  // Save registers, fpu state, and flags. Set R31 = return pc.
   map = RegisterSaver::push_frame_reg_args_and_save_live_registers(masm,
                                                                    &frame_size_in_bytes,
                                                                    /*generate_oop_map=*/ true,
@@ -3142,6 +3136,19 @@
   // No exception case.
   __ BIND(noException);
 
+  if (SafepointMechanism::uses_thread_local_poll() && !cause_return) {
+    Label no_adjust;
+    // If our stashed return pc was modified by the runtime we avoid touching it
+    __ ld(R0, frame_size_in_bytes + _abi(lr), R1_SP);
+    __ cmpd(CCR0, R0, R31);
+    __ bne(CCR0, no_adjust);
+
+    // Adjust return pc forward to step over the safepoint poll instruction
+    __ addi(R31, R31, 4);
+    __ std(R31, frame_size_in_bytes + _abi(lr), R1_SP);
+
+    __ bind(no_adjust);
+  }
 
   // Normal exit, restore registers and exit.
   RegisterSaver::restore_live_registers_and_pop_frame(masm,
--- a/src/hotspot/cpu/ppc/templateInterpreterGenerator_ppc.cpp	Thu Dec 14 12:02:16 2017 +0100
+++ b/src/hotspot/cpu/ppc/templateInterpreterGenerator_ppc.cpp	Thu Dec 14 13:05:20 2017 +0100
@@ -1535,23 +1535,17 @@
   // Acquire isn't strictly necessary here because of the fence, but
   // sync_state is declared to be volatile, so we do it anyway
   // (cmp-br-isync on one path, release (same as acquire on PPC64) on the other path).
-  int sync_state_offs = __ load_const_optimized(sync_state_addr, SafepointSynchronize::address_of_state(), /*temp*/R0, true);
 
-  // TODO PPC port assert(4 == SafepointSynchronize::sz_state(), "unexpected field size");
-  __ lwz(sync_state, sync_state_offs, sync_state_addr);
+  Label do_safepoint, sync_check_done;
+  // No synchronization in progress nor yet synchronized.
+  __ safepoint_poll(do_safepoint, sync_state);
 
+  // Not suspended.
   // TODO PPC port assert(4 == Thread::sz_suspend_flags(), "unexpected field size");
   __ lwz(suspend_flags, thread_(suspend_flags));
+  __ cmpwi(CCR1, suspend_flags, 0);
+  __ beq(CCR1, sync_check_done);
 
-  Label sync_check_done;
-  Label do_safepoint;
-  // No synchronization in progress nor yet synchronized.
-  __ cmpwi(CCR0, sync_state, SafepointSynchronize::_not_synchronized);
-  // Not suspended.
-  __ cmpwi(CCR1, suspend_flags, 0);
-
-  __ bne(CCR0, do_safepoint);
-  __ beq(CCR1, sync_check_done);
   __ bind(do_safepoint);
   __ isync();
   // Block. We do the call directly and leave the current
@@ -1592,7 +1586,7 @@
   // we don't want the current thread to continue until all our prior memory
   // accesses (including the new thread state) are visible to other threads.
   __ li(R0/*thread_state*/, _thread_in_Java);
-  __ release();
+  __ lwsync(); // Acquire safepoint and suspend state, release thread state.
   __ stw(R0/*thread_state*/, thread_(thread_state));
 
   if (CheckJNICalls) {
@@ -1858,10 +1852,7 @@
 
     // Safepoint check
     const Register sync_state = R11_scratch1;
-    int sync_state_offs = __ load_const_optimized(sync_state, SafepointSynchronize::address_of_state(), /*temp*/R0, true);
-    __ lwz(sync_state, sync_state_offs, sync_state);
-    __ cmpwi(CCR0, sync_state, SafepointSynchronize::_not_synchronized);
-    __ bne(CCR0, slow_path);
+    __ safepoint_poll(slow_path, sync_state);
 
     // We don't generate local frame and don't align stack because
     // we not even call stub code (we generate the code inline)
@@ -1918,10 +1909,7 @@
 
     // Safepoint check
     const Register sync_state = R11_scratch1;
-    int sync_state_offs = __ load_const_optimized(sync_state, SafepointSynchronize::address_of_state(), /*temp*/R0, true);
-    __ lwz(sync_state, sync_state_offs, sync_state);
-    __ cmpwi(CCR0, sync_state, SafepointSynchronize::_not_synchronized);
-    __ bne(CCR0, slow_path);
+    __ safepoint_poll(slow_path, sync_state);
 
     // We don't generate local frame and don't align stack because
     // we not even call stub code (we generate the code inline)
--- a/src/hotspot/cpu/ppc/templateTable_ppc_64.cpp	Thu Dec 14 12:02:16 2017 +0100
+++ b/src/hotspot/cpu/ppc/templateTable_ppc_64.cpp	Thu Dec 14 13:05:20 2017 +0100
@@ -1630,7 +1630,7 @@
     // Push returnAddress for "ret" on stack.
     __ push_ptr(R17_tos);
     // And away we go!
-    __ dispatch_next(vtos);
+    __ dispatch_next(vtos, 0 ,true);
     return;
   }
 
@@ -1643,7 +1643,6 @@
   const bool increment_invocation_counter_for_backward_branches = UseCompiler && UseLoopCounter;
   if (increment_invocation_counter_for_backward_branches) {
     Label Lforward;
-    __ dispatch_prolog(vtos);
 
     // Check branch direction.
     __ cmpdi(CCR0, Rdisp, 0);
@@ -1744,11 +1743,8 @@
     }
 
     __ bind(Lforward);
-    __ dispatch_epilog(vtos);
-
-  } else {
-    __ dispatch_next(vtos);
   }
+  __ dispatch_next(vtos, 0, true);
 }
 
 // Helper function for if_cmp* methods below.
@@ -1829,7 +1825,7 @@
   __ ld(R11_scratch1, in_bytes(Method::const_offset()), R19_method);
   __ add(R11_scratch1, R17_tos, R11_scratch1);
   __ addi(R14_bcp, R11_scratch1, in_bytes(ConstMethod::codes_offset()));
-  __ dispatch_next(vtos);
+  __ dispatch_next(vtos, 0, true);
 }
 
 void TemplateTable::wide_ret() {
@@ -1846,7 +1842,7 @@
   __ ld(Rscratch1, in_bytes(Method::const_offset()), R19_method);
   __ addi(Rscratch2, R17_tos, in_bytes(ConstMethod::codes_offset()));
   __ add(R14_bcp, Rscratch1, Rscratch2);
-  __ dispatch_next(vtos);
+  __ dispatch_next(vtos, 0, true);
 }
 
 void TemplateTable::tableswitch() {
@@ -1896,7 +1892,7 @@
   __ bind(Ldispatch);
 
   __ add(R14_bcp, Roffset, R14_bcp);
-  __ dispatch_next(vtos);
+  __ dispatch_next(vtos, 0, true);
 }
 
 void TemplateTable::lookupswitch() {
@@ -1960,7 +1956,7 @@
 
   __ bind(Lcontinue_execution);
   __ add(R14_bcp, Roffset, R14_bcp);
-  __ dispatch_next(vtos);
+  __ dispatch_next(vtos, 0, true);
 }
 
 // Table switch using binary search (value/offset pairs are ordered).
@@ -2093,7 +2089,7 @@
 
   __ extsw(Rj, Rj);
   __ add(R14_bcp, Rj, R14_bcp);
-  __ dispatch_next(vtos);
+  __ dispatch_next(vtos, 0 , true);
 }
 
 void TemplateTable::_return(TosState state) {
@@ -2124,6 +2120,17 @@
     __ bind(Lskip_register_finalizer);
   }
 
+  if (SafepointMechanism::uses_thread_local_poll() && _desc->bytecode() != Bytecodes::_return_register_finalizer) {
+    Label no_safepoint;
+    __ ld(R11_scratch1, in_bytes(Thread::polling_page_offset()), R16_thread);
+    __ andi_(R11_scratch1, R11_scratch1, SafepointMechanism::poll_bit());
+    __ beq(CCR0, no_safepoint);
+    __ push(state);
+    __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::at_safepoint));
+    __ pop(state);
+    __ bind(no_safepoint);
+  }
+
   // Move the result value into the correct register and remove memory stack frame.
   __ remove_activation(state, /* throw_monitor_exception */ true);
   // Restoration of lr done by remove_activation.
--- a/src/hotspot/cpu/s390/c1_LIRAssembler_s390.cpp	Thu Dec 14 12:02:16 2017 +0100
+++ b/src/hotspot/cpu/s390/c1_LIRAssembler_s390.cpp	Thu Dec 14 13:05:20 2017 +0100
@@ -36,6 +36,7 @@
 #include "gc/shared/cardTableModRefBS.hpp"
 #include "nativeInst_s390.hpp"
 #include "oops/objArrayKlass.hpp"
+#include "runtime/safepointMechanism.inline.hpp"
 #include "runtime/sharedRuntime.hpp"
 #include "vmreg_s390.inline.hpp"
 
@@ -1135,8 +1136,12 @@
          (result->is_single_fpu() && result->as_float_reg() == Z_F0) ||
          (result->is_double_fpu() && result->as_double_reg() == Z_F0), "convention");
 
-  AddressLiteral pp(os::get_polling_page());
-  __ load_const_optimized(Z_R1_scratch, pp);
+  if (SafepointMechanism::uses_thread_local_poll()) {
+    __ z_lg(Z_R1_scratch, Address(Z_thread, Thread::polling_page_offset()));
+  } else {
+    AddressLiteral pp(os::get_polling_page());
+    __ load_const_optimized(Z_R1_scratch, pp);
+  }
 
   // Pop the frame before the safepoint code.
   __ pop_frame_restore_retPC(initial_frame_size_in_bytes());
@@ -1154,13 +1159,18 @@
 }
 
 int LIR_Assembler::safepoint_poll(LIR_Opr tmp, CodeEmitInfo* info) {
-  AddressLiteral pp(os::get_polling_page());
-  __ load_const_optimized(tmp->as_register_lo(), pp);
+  const Register poll_addr = tmp->as_register_lo();
+  if (SafepointMechanism::uses_thread_local_poll()) {
+    __ z_lg(poll_addr, Address(Z_thread, Thread::polling_page_offset()));
+  } else {
+    AddressLiteral pp(os::get_polling_page());
+    __ load_const_optimized(poll_addr, pp);
+  }
   guarantee(info != NULL, "Shouldn't be NULL");
   add_debug_info_for_branch(info);
   int offset = __ offset();
   __ relocate(relocInfo::poll_type);
-  __ load_from_polling_page(tmp->as_register_lo());
+  __ load_from_polling_page(poll_addr);
   return offset;
 }
 
--- a/src/hotspot/cpu/s390/globalDefinitions_s390.hpp	Thu Dec 14 12:02:16 2017 +0100
+++ b/src/hotspot/cpu/s390/globalDefinitions_s390.hpp	Thu Dec 14 13:05:20 2017 +0100
@@ -54,4 +54,6 @@
 
 #define SUPPORT_RESERVED_STACK_AREA
 
+#define THREAD_LOCAL_POLL
+
 #endif // CPU_S390_VM_GLOBALDEFINITIONS_S390_HPP
--- a/src/hotspot/cpu/s390/globals_s390.hpp	Thu Dec 14 12:02:16 2017 +0100
+++ b/src/hotspot/cpu/s390/globals_s390.hpp	Thu Dec 14 13:05:20 2017 +0100
@@ -85,7 +85,7 @@
 // 8146801 (Short Array Allocation): No performance work done here yet.
 define_pd_global(intx, InitArrayShortSize, 1*BytesPerLong);
 
-define_pd_global(bool, ThreadLocalHandshakes, false);
+define_pd_global(bool, ThreadLocalHandshakes, true);
 
 #define ARCH_FLAGS(develop, product, diagnostic, experimental, notproduct, range, constraint, writeable) \
                                                                               \
--- a/src/hotspot/cpu/s390/interp_masm_s390.cpp	Thu Dec 14 12:02:16 2017 +0100
+++ b/src/hotspot/cpu/s390/interp_masm_s390.cpp	Thu Dec 14 13:05:20 2017 +0100
@@ -36,6 +36,7 @@
 #include "prims/jvmtiThreadState.hpp"
 #include "runtime/basicLock.hpp"
 #include "runtime/biasedLocking.hpp"
+#include "runtime/safepointMechanism.hpp"
 #include "runtime/sharedRuntime.hpp"
 #include "runtime/thread.inline.hpp"
 
@@ -74,16 +75,16 @@
   dispatch_next(state, step);
 }
 
-void InterpreterMacroAssembler::dispatch_next(TosState state, int bcp_incr) {
+void InterpreterMacroAssembler::dispatch_next(TosState state, int bcp_incr, bool generate_poll) {
   z_llgc(Z_bytecode, bcp_incr, Z_R0, Z_bcp);  // Load next bytecode.
   add2reg(Z_bcp, bcp_incr);                   // Advance bcp. Add2reg produces optimal code.
-  dispatch_base(state, Interpreter::dispatch_table(state));
+  dispatch_base(state, Interpreter::dispatch_table(state), generate_poll);
 }
 
 // Common code to dispatch and dispatch_only.
 // Dispatch value in Lbyte_code and increment Lbcp.
 
-void InterpreterMacroAssembler::dispatch_base(TosState state, address* table) {
+void InterpreterMacroAssembler::dispatch_base(TosState state, address* table, bool generate_poll) {
   verify_FPU(1, state);
 
 #ifdef ASSERT
@@ -109,7 +110,20 @@
   verify_oop(Z_tos, state);
 
   // Dispatch table to use.
-  load_absolute_address(Z_tmp_1, (address) table);  // Z_tmp_1 = table;
+  load_absolute_address(Z_tmp_1, (address)table);  // Z_tmp_1 = table;
+
+  if (SafepointMechanism::uses_thread_local_poll() && generate_poll) {
+    address *sfpt_tbl = Interpreter::safept_table(state);
+    if (table != sfpt_tbl) {
+      Label dispatch;
+      const Address poll_byte_addr(Z_thread, in_bytes(Thread::polling_page_offset()) + 7 /* Big Endian */);
+      // Armed page has poll_bit set, if poll bit is cleared just continue.
+      z_tm(poll_byte_addr, SafepointMechanism::poll_bit());
+      z_braz(dispatch);
+      load_absolute_address(Z_tmp_1, (address)sfpt_tbl);  // Z_tmp_1 = table;
+      bind(dispatch);
+    }
+  }
 
   // 0 <= Z_bytecode < 256 => Use a 32 bit shift, because it is shorter than sllg.
   // Z_bytecode must have been loaded zero-extended for this approach to be correct.
@@ -119,8 +133,8 @@
   z_br(Z_tmp_1);
 }
 
-void InterpreterMacroAssembler::dispatch_only(TosState state) {
-  dispatch_base(state, Interpreter::dispatch_table(state));
+void InterpreterMacroAssembler::dispatch_only(TosState state, bool generate_poll) {
+  dispatch_base(state, Interpreter::dispatch_table(state), generate_poll);
 }
 
 void InterpreterMacroAssembler::dispatch_only_normal(TosState state) {
--- a/src/hotspot/cpu/s390/interp_masm_s390.hpp	Thu Dec 14 12:02:16 2017 +0100
+++ b/src/hotspot/cpu/s390/interp_masm_s390.hpp	Thu Dec 14 13:05:20 2017 +0100
@@ -49,7 +49,7 @@
                             bool check_exceptions);
 
   // Base routine for all dispatches.
-  void dispatch_base(TosState state, address* table);
+  void dispatch_base(TosState state, address* table, bool generate_poll = false);
 
  public:
   InterpreterMacroAssembler(CodeBuffer* c)
@@ -78,11 +78,11 @@
   // dispatch routines
   void dispatch_prolog(TosState state, int step = 0);
   void dispatch_epilog(TosState state, int step = 0);
-  void dispatch_only(TosState state);
+  void dispatch_only(TosState state, bool generate_poll = false);
   // Dispatch normal table via Z_bytecode (assume Z_bytecode is loaded already).
   void dispatch_only_normal(TosState state);
   void dispatch_normal(TosState state);
-  void dispatch_next(TosState state, int step = 0);
+  void dispatch_next(TosState state, int step = 0, bool generate_poll = false);
   void dispatch_next_noverify_oop(TosState state, int step = 0);
   void dispatch_via(TosState state, address* table);
 
--- a/src/hotspot/cpu/s390/macroAssembler_s390.cpp	Thu Dec 14 12:02:16 2017 +0100
+++ b/src/hotspot/cpu/s390/macroAssembler_s390.cpp	Thu Dec 14 13:05:20 2017 +0100
@@ -43,6 +43,8 @@
 #include "runtime/interfaceSupport.hpp"
 #include "runtime/objectMonitor.hpp"
 #include "runtime/os.hpp"
+#include "runtime/safepoint.hpp"
+#include "runtime/safepointMechanism.hpp"
 #include "runtime/sharedRuntime.hpp"
 #include "runtime/stubRoutines.hpp"
 #include "utilities/events.hpp"
@@ -2019,6 +2021,15 @@
   return here + offset;
 }
 
+void MacroAssembler::instr_size(Register size, Register pc) {
+  // Extract 2 most significant bits of current instruction.
+  z_llgc(size, Address(pc));
+  z_srl(size, 6);
+  // Compute (x+3)&6 which translates 0->2, 1->4, 2->4, 3->6.
+  z_ahi(size, 3);
+  z_nill(size, 6);
+}
+
 // Resize_frame with SP(new) = SP(old) - [offset].
 void MacroAssembler::resize_frame_sub(Register offset, Register fp, bool load_fp)
 {
@@ -2705,6 +2716,19 @@
   z_st(Z_R0, 0, tmp2, tmp1);
 }
 
+void MacroAssembler::safepoint_poll(Label& slow_path, Register temp_reg) {
+  if (SafepointMechanism::uses_thread_local_poll()) {
+    const Address poll_byte_addr(Z_thread, in_bytes(Thread::polling_page_offset()) + 7 /* Big Endian */);
+    // Armed page has poll_bit set.
+    z_tm(poll_byte_addr, SafepointMechanism::poll_bit());
+    z_brnaz(slow_path);
+  } else {
+    load_const_optimized(temp_reg, SafepointSynchronize::address_of_state());
+    z_cli(/*SafepointSynchronize::sz_state()*/4-1, temp_reg, SafepointSynchronize::_not_synchronized);
+    z_brne(slow_path);
+  }
+}
+
 // Don't rely on register locking, always use Z_R1 as scratch register instead.
 void MacroAssembler::bang_stack_with_offset(int offset) {
   // Stack grows down, caller passes positive offset.
@@ -6457,27 +6481,6 @@
   Assembler::z_brc(Assembler::bcondOverflow /* CC==3 (iterate) */, retry);
 }
 
-void MacroAssembler::generate_safepoint_check(Label& slow_path, Register scratch, bool may_relocate) {
-  if (scratch == noreg) scratch = Z_R1;
-  address Astate = SafepointSynchronize::address_of_state();
-  BLOCK_COMMENT("safepoint check:");
-
-  if (may_relocate) {
-    ptrdiff_t total_distance = Astate - this->pc();
-    if (RelAddr::is_in_range_of_RelAddr32(total_distance)) {
-      RelocationHolder rspec = external_word_Relocation::spec(Astate);
-      (this)->relocate(rspec, relocInfo::pcrel_addr_format);
-      load_absolute_address(scratch, Astate);
-    } else {
-      load_const_optimized(scratch, Astate);
-    }
-  } else {
-    load_absolute_address(scratch, Astate);
-  }
-  z_cli(/*SafepointSynchronize::sz_state()*/4-1, scratch, SafepointSynchronize::_not_synchronized);
-  z_brne(slow_path);
-}
-
 
 void MacroAssembler::generate_type_profiling(const Register Rdata,
                                              const Register Rreceiver_klass,
--- a/src/hotspot/cpu/s390/macroAssembler_s390.hpp	Thu Dec 14 12:02:16 2017 +0100
+++ b/src/hotspot/cpu/s390/macroAssembler_s390.hpp	Thu Dec 14 13:05:20 2017 +0100
@@ -260,8 +260,6 @@
   //
   // Constants, loading constants, TOC support
   //
-  // Safepoint check factored out.
-  void generate_safepoint_check(Label& slow_path, Register scratch = noreg, bool may_relocate = true);
 
   // Load generic address: d <- base(a) + index(a) + disp(a).
   inline void load_address(Register d, const Address &a);
@@ -443,6 +441,9 @@
   // Get current PC + offset. Offset given in bytes, must be even!
   address get_PC(Register result, int64_t offset);
 
+  // Get size of instruction at pc (which must point to valid code).
+  void instr_size(Register size, Register pc);
+
   // Accessing, and in particular modifying, a stack location is only safe if
   // the stack pointer (Z_SP) is set such that the accessed stack location is
   // in the reserved range.
@@ -641,6 +642,9 @@
   // Support for serializing memory accesses between threads.
   void serialize_memory(Register thread, Register tmp1, Register tmp2);
 
+  // Check if safepoint requested and if so branch
+  void safepoint_poll(Label& slow_path, Register temp_reg);
+
   // Stack overflow checking
   void bang_stack_with_offset(int offset);
 
--- a/src/hotspot/cpu/s390/s390.ad	Thu Dec 14 12:02:16 2017 +0100
+++ b/src/hotspot/cpu/s390/s390.ad	Thu Dec 14 13:05:20 2017 +0100
@@ -919,8 +919,12 @@
 
   // Touch the polling page.
   if (need_polling) {
-    AddressLiteral pp(os::get_polling_page());
-    __ load_const_optimized(Z_R1_scratch, pp);
+    if (SafepointMechanism::uses_thread_local_poll()) {
+      __ z_lg(Z_R1_scratch, Address(Z_thread, Thread::polling_page_offset()));
+    } else {
+      AddressLiteral pp(os::get_polling_page());
+      __ load_const_optimized(Z_R1_scratch, pp);
+    }
     // We need to mark the code position where the load from the safepoint
     // polling page was emitted as relocInfo::poll_return_type here.
     __ relocate(relocInfo::poll_return_type);
--- a/src/hotspot/cpu/s390/sharedRuntime_s390.cpp	Thu Dec 14 12:02:16 2017 +0100
+++ b/src/hotspot/cpu/s390/sharedRuntime_s390.cpp	Thu Dec 14 13:05:20 2017 +0100
@@ -2165,7 +2165,7 @@
         __ serialize_memory(Z_thread, Z_R1, Z_R2);
       }
     }
-    __ generate_safepoint_check(sync, Z_R1, true);
+    __ safepoint_poll(sync, Z_R1);
 
     __ load_and_test_int(Z_R0, Address(Z_thread, JavaThread::suspend_flags_offset()));
     __ z_bre(no_block);
@@ -3190,12 +3190,18 @@
 
   bool cause_return = (poll_type == POLL_AT_RETURN);
   // Make room for return address (or push it again)
-  if (!cause_return)
+  if (!cause_return) {
     __ z_lg(Z_R14, Address(Z_thread, JavaThread::saved_exception_pc_offset()));
+  }
 
   // Save registers, fpu state, and flags
   map = RegisterSaver::save_live_registers(masm, RegisterSaver::all_registers);
 
+  if (SafepointMechanism::uses_thread_local_poll() && !cause_return) {
+    // Keep a copy of the return pc to detect if it gets modified.
+    __ z_lgr(Z_R6, Z_R14);
+  }
+
   // The following is basically a call_VM. However, we need the precise
   // address of the call in order to generate an oopmap. Hence, we do all the
   // work outselves.
@@ -3231,6 +3237,21 @@
   // No exception case
   __ bind(noException);
 
+  if (SafepointMechanism::uses_thread_local_poll() && !cause_return) {
+    Label no_adjust;
+     // If our stashed return pc was modified by the runtime we avoid touching it
+    const int offset_of_return_pc = _z_abi16(return_pc) + RegisterSaver::live_reg_frame_size(RegisterSaver::all_registers);
+    __ z_cg(Z_R6, offset_of_return_pc, Z_SP);
+    __ z_brne(no_adjust);
+
+    // Adjust return pc forward to step over the safepoint poll instruction
+    __ instr_size(Z_R1_scratch, Z_R6);
+    __ z_agr(Z_R6, Z_R1_scratch);
+    __ z_stg(Z_R6, offset_of_return_pc, Z_SP);
+
+    __ bind(no_adjust);
+  }
+
   // Normal exit, restore registers and exit.
   RegisterSaver::restore_live_registers(masm, RegisterSaver::all_registers);
 
--- a/src/hotspot/cpu/s390/templateInterpreterGenerator_s390.cpp	Thu Dec 14 12:02:16 2017 +0100
+++ b/src/hotspot/cpu/s390/templateInterpreterGenerator_s390.cpp	Thu Dec 14 13:05:20 2017 +0100
@@ -1633,7 +1633,7 @@
   // Check for safepoint operation in progress and/or pending suspend requests.
   {
     Label Continue, do_safepoint;
-    __ generate_safepoint_check(do_safepoint, Z_R1, true);
+    __ safepoint_poll(do_safepoint, Z_R1);
     // Check for suspend.
     __ load_and_test_int(Z_R0/*suspend_flags*/, thread_(suspend_flags));
     __ z_bre(Continue); // 0 -> no flag set -> not suspended
@@ -1937,7 +1937,7 @@
     Label    slow_path;
 
     // If we need a safepoint check, generate full interpreter entry.
-    __ generate_safepoint_check(slow_path, Z_R1, false);
+    __ safepoint_poll(slow_path, Z_R1);
 
     BLOCK_COMMENT("CRC32_update {");
 
@@ -1990,7 +1990,7 @@
     Label    slow_path;
 
     // If we need a safepoint check, generate full interpreter entry.
-    __ generate_safepoint_check(slow_path, Z_R1, false);
+    __ safepoint_poll(slow_path, Z_R1);
 
     // We don't generate local frame and don't align stack because
     // we call stub code and there is no safepoint on this path.
--- a/src/hotspot/cpu/s390/templateTable_s390.cpp	Thu Dec 14 12:02:16 2017 +0100
+++ b/src/hotspot/cpu/s390/templateTable_s390.cpp	Thu Dec 14 13:05:20 2017 +0100
@@ -1853,7 +1853,7 @@
     // Push return address for "ret" on stack.
     __ push_ptr(Z_tos);
     // And away we go!
-    __ dispatch_next(vtos);
+    __ dispatch_next(vtos, 0 , true);
     return;
   }
 
@@ -1961,7 +1961,7 @@
   // Z_tos: Return bci for jsr's, unused otherwise.
   // Z_bytecode: target bytecode
   // Z_bcp: target bcp
-  __ dispatch_only(vtos);
+  __ dispatch_only(vtos, true);
 
   // Out-of-line code runtime calls.
   if (UseLoopCounter) {
@@ -2072,7 +2072,7 @@
   __ get_method(Z_tos);
   __ mem2reg_opt(Z_R1_scratch, Address(Z_tos, Method::const_offset()));
   __ load_address(Z_bcp, Address(Z_R1_scratch, Z_tmp_1, ConstMethod::codes_offset()));
-  __ dispatch_next(vtos);
+  __ dispatch_next(vtos, 0 , true);
 }
 
 void TemplateTable::wide_ret() {
@@ -2085,7 +2085,7 @@
   __ get_method(Z_tos);
   __ mem2reg_opt(Z_R1_scratch, Address(Z_tos, Method::const_offset()));
   __ load_address(Z_bcp, Address(Z_R1_scratch, Z_tmp_1, ConstMethod::codes_offset()));
-  __ dispatch_next(vtos);
+  __ dispatch_next(vtos, 0, true);
 }
 
 void TemplateTable::tableswitch () {
@@ -2129,7 +2129,7 @@
   // Load next bytecode.
   __ z_llgc(Z_bytecode, Address(Z_bcp, index));
   __ z_agr(Z_bcp, index); // Advance bcp.
-  __ dispatch_only(vtos);
+  __ dispatch_only(vtos, true);
 
   // Handle default.
   __ bind(default_case);
@@ -2193,7 +2193,7 @@
   // Load next bytecode.
   __ z_llgc(Z_bytecode, Address(Z_bcp, offset, 0));
   __ z_agr(Z_bcp, offset); // Advance bcp.
-  __ dispatch_only(vtos);
+  __ dispatch_only(vtos, true);
 }
 
 
@@ -2302,7 +2302,7 @@
   // Load next bytecode.
   __ z_llgc(Z_bytecode, Address(Z_bcp, j));
   __ z_agr(Z_bcp, j);       // Advance bcp.
-  __ dispatch_only(vtos);
+  __ dispatch_only(vtos, true);
 
   // default case -> j = default offset
   __ bind(default_case);
@@ -2312,7 +2312,7 @@
   // Load next bytecode.
   __ z_llgc(Z_bytecode, Address(Z_bcp, j));
   __ z_agr(Z_bcp, j);       // Advance bcp.
-  __ dispatch_only(vtos);
+  __ dispatch_only(vtos, true);
 }
 
 void TemplateTable::_return(TosState state) {
@@ -2333,6 +2333,17 @@
     __ bind(skip_register_finalizer);
   }
 
+  if (SafepointMechanism::uses_thread_local_poll() && _desc->bytecode() != Bytecodes::_return_register_finalizer) {
+    Label no_safepoint;
+    const Address poll_byte_addr(Z_thread, in_bytes(Thread::polling_page_offset()) + 7 /* Big Endian */);
+    __ z_tm(poll_byte_addr, SafepointMechanism::poll_bit());
+    __ z_braz(no_safepoint);
+    __ push(state);
+    __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::at_safepoint));
+    __ pop(state);
+    __ bind(no_safepoint);
+  }
+
   if (state == itos) {
     // Narrow result if state is itos but result type is smaller.
     // Need to narrow in the return bytecode rather than in generate_return_entry