8218543: ThreadsList handling during error reporting can crash
authorrehn
Mon, 14 Oct 2019 08:59:12 +0200
changeset 58579 05dd6144d434
parent 58575 6941d77417f4
child 58580 adbd1504c998
8218543: ThreadsList handling during error reporting can crash Reviewed-by: dcubed, dholmes
src/hotspot/share/runtime/threadSMR.cpp
src/hotspot/share/utilities/decoder.cpp
src/hotspot/share/utilities/vmError.cpp
src/hotspot/share/utilities/vmError.hpp
--- a/src/hotspot/share/runtime/threadSMR.cpp	Sat Oct 12 00:22:53 2019 -0400
+++ b/src/hotspot/share/runtime/threadSMR.cpp	Mon Oct 14 08:59:12 2019 +0200
@@ -528,6 +528,22 @@
     return;
   }
 
+  if ( _thread == VM_Exit::shutdown_thread()) {
+    // The shutdown thread has removed itself from the Threads
+    // list and is safe to have a waiver from this check because
+    // VM_Exit::_shutdown_thread is not set until after the VMThread
+    // has started the final safepoint which holds the Threads_lock
+    // for the remainder of the VM's life.
+    return;
+  }
+
+  if (VMError::is_error_reported() &&
+      VMError::get_first_error_tid() == os::current_thread_id()) {
+    // If there is an error reported by this thread it may use ThreadsList even
+    // if it's unsafe.
+    return;
+  }
+
   // The closure will attempt to verify that the calling thread can
   // be found by threads_do() on the specified ThreadsList. If it
   // is successful, then the specified ThreadsList was acquired as
@@ -540,12 +556,6 @@
   // ThreadsList is not a stable hazard ptr and can be freed by
   // another thread from the to-be-deleted list at any time.
   //
-  // Note: The shutdown thread has removed itself from the Threads
-  // list and is safe to have a waiver from this check because
-  // VM_Exit::_shutdown_thread is not set until after the VMThread
-  // has started the final safepoint which holds the Threads_lock
-  // for the remainder of the VM's life.
-  //
   VerifyHazardPtrThreadClosure cl(_thread);
   ThreadsSMRSupport::threads_do(&cl, _list);
 
@@ -555,7 +565,7 @@
   // In either case, we won't get past this point with a badly placed
   // ThreadsListHandle.
 
-  assert(cl.found() || _thread == VM_Exit::shutdown_thread(), "Acquired a ThreadsList snapshot from a thread not recognized by the Thread-SMR protocol.");
+  assert(cl.found(), "Acquired a ThreadsList snapshot from a thread not recognized by the Thread-SMR protocol.");
 #endif
 }
 
--- a/src/hotspot/share/utilities/decoder.cpp	Sat Oct 12 00:22:53 2019 -0400
+++ b/src/hotspot/share/utilities/decoder.cpp	Mon Oct 14 08:59:12 2019 +0200
@@ -84,7 +84,7 @@
 }
 
 bool Decoder::decode(address addr, char* buf, int buflen, int* offset, const char* modulepath, bool demangle) {
-  bool error_handling_thread = os::current_thread_id() == VMError::first_error_tid;
+  bool error_handling_thread = os::current_thread_id() == VMError::get_first_error_tid();
   if (error_handling_thread) {
     return get_error_handler_instance()->decode(addr, buf, buflen, offset, modulepath, demangle);
   } else {
@@ -95,7 +95,7 @@
 }
 
 bool Decoder::decode(address addr, char* buf, int buflen, int* offset, const void* base) {
-  bool error_handling_thread = os::current_thread_id() == VMError::first_error_tid;
+  bool error_handling_thread = os::current_thread_id() == VMError::get_first_error_tid();
   if (error_handling_thread) {
     return get_error_handler_instance()->decode(addr, buf, buflen, offset, base);
   } else {
@@ -106,7 +106,7 @@
 
 
 bool Decoder::demangle(const char* symbol, char* buf, int buflen) {
-  bool error_handling_thread = os::current_thread_id() == VMError::first_error_tid;
+  bool error_handling_thread = os::current_thread_id() == VMError::get_first_error_tid();
   if (error_handling_thread) {
     return get_error_handler_instance()->demangle(symbol, buf, buflen);
   } else {
--- a/src/hotspot/share/utilities/vmError.cpp	Sat Oct 12 00:22:53 2019 -0400
+++ b/src/hotspot/share/utilities/vmError.cpp	Mon Oct 14 08:59:12 2019 +0200
@@ -1205,7 +1205,7 @@
   st->print_cr("END.");
 }
 
-volatile intptr_t VMError::first_error_tid = -1;
+volatile intptr_t VMError::_first_error_tid = -1;
 
 /** Expand a pattern into a buffer starting at pos and open a file using constructed path */
 static int expand_and_open(const char* pattern, bool overwrite_existing, char* buf, size_t buflen, size_t pos) {
@@ -1355,8 +1355,8 @@
       os::abort(CreateCoredumpOnCrash);
   }
   intptr_t mytid = os::current_thread_id();
-  if (first_error_tid == -1 &&
-      Atomic::cmpxchg(mytid, &first_error_tid, (intptr_t)-1) == -1) {
+  if (_first_error_tid == -1 &&
+      Atomic::cmpxchg(mytid, &_first_error_tid, (intptr_t)-1) == -1) {
 
     // Initialize time stamps to use the same base.
     out.time_stamp().update_to(1);
@@ -1416,7 +1416,7 @@
 
     // This is not the first error, see if it happened in a different thread
     // or in the same thread during error reporting.
-    if (first_error_tid != mytid) {
+    if (_first_error_tid != mytid) {
       char msgbuf[64];
       jio_snprintf(msgbuf, sizeof(msgbuf),
                    "[thread " INTX_FORMAT " also had an error]",
--- a/src/hotspot/share/utilities/vmError.hpp	Sat Oct 12 00:22:53 2019 -0400
+++ b/src/hotspot/share/utilities/vmError.hpp	Mon Oct 14 08:59:12 2019 +0200
@@ -32,8 +32,6 @@
 class VM_ReportJavaOutOfMemory;
 
 class VMError : public AllStatic {
-  friend class VM_ReportJavaOutOfMemory;
-  friend class Decoder;
   friend class VMStructs;
 
   static int         _id;               // Solaris/Linux signals: 0 - SIGRTMAX
@@ -65,7 +63,7 @@
 
   // Thread id of the first error. We must be able to handle native thread,
   // so use thread id instead of Thread* to identify thread.
-  static volatile intptr_t first_error_tid;
+  static volatile intptr_t _first_error_tid;
 
   // Core dump status, false if we have been unable to write a core/minidump for some reason
   static bool coredump_status;
@@ -177,9 +175,9 @@
   static address get_resetted_sighandler(int sig);
 
   // check to see if fatal error reporting is in progress
-  static bool fatal_error_in_progress() { return first_error_tid != -1; }
+  static bool fatal_error_in_progress() { return _first_error_tid != -1; }
 
-  static intptr_t get_first_error_tid() { return first_error_tid; }
+  static intptr_t get_first_error_tid() { return _first_error_tid; }
 
   // Called by the WatcherThread to check if error reporting has timed-out.
   //  Returns true if error reporting has not completed within the ErrorLogTimeout limit.