8166944: Hanging Error Reporting steps may lead to torn error logs.
authorstuefe
Thu, 23 Feb 2017 15:59:14 -0500
changeset 46284 ad578adff5df
parent 46283 09e328af295a
child 46285 5b673a9fa682
child 46287 ebfe616a604b
8166944: Hanging Error Reporting steps may lead to torn error logs. Summary: Interupt error reporting if reporting steps hang to enable subsequent reporting steps to run. Reviewed-by: cjplummer, dholmes
hotspot/src/os/posix/vm/vmError_posix.cpp
hotspot/src/os/windows/vm/vmError_windows.cpp
hotspot/src/share/vm/runtime/globals.hpp
hotspot/src/share/vm/runtime/thread.cpp
hotspot/src/share/vm/utilities/vmError.cpp
hotspot/src/share/vm/utilities/vmError.hpp
hotspot/test/TEST.groups
hotspot/test/runtime/ErrorHandling/TimeoutInErrorHandlingTest.java
--- a/hotspot/src/os/posix/vm/vmError_posix.cpp	Thu Feb 23 11:57:07 2017 +0100
+++ b/hotspot/src/os/posix/vm/vmError_posix.cpp	Thu Feb 23 15:59:14 2017 -0500
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2003, 2016, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2003, 2017, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -59,6 +59,21 @@
 static int resettedSigflags[NUM_SIGNALS];
 static address resettedSighandler[NUM_SIGNALS];
 
+// Needed for cancelable steps.
+static volatile pthread_t reporter_thread_id;
+
+void VMError::reporting_started() {
+  // record pthread id of reporter thread.
+  reporter_thread_id = ::pthread_self();
+}
+
+void VMError::interrupt_reporting_thread() {
+  // We misuse SIGILL here, but it does not really matter. We need
+  //  a signal which is handled by crash_handler and not likely to
+  //  occurr during error reporting itself.
+  ::pthread_kill(reporter_thread_id, SIGILL);
+}
+
 static void save_signal(int idx, int sig)
 {
   struct sigaction sa;
--- a/hotspot/src/os/windows/vm/vmError_windows.cpp	Thu Feb 23 11:57:07 2017 +0100
+++ b/hotspot/src/os/windows/vm/vmError_windows.cpp	Thu Feb 23 15:59:14 2017 -0500
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2003, 2017, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -66,3 +66,10 @@
     }
   }
 }
+
+// Error reporting cancellation: there is no easy way to implement this on Windows, because we do
+// not have an easy way to send signals to threads (aka to cause a win32 Exception in another
+// thread). We would need something like "RaiseException(HANDLE thread)"...
+void VMError::reporting_started() {}
+void VMError::interrupt_reporting_thread() {}
+
--- a/hotspot/src/share/vm/runtime/globals.hpp	Thu Feb 23 11:57:07 2017 +0100
+++ b/hotspot/src/share/vm/runtime/globals.hpp	Thu Feb 23 15:59:14 2017 -0500
@@ -935,6 +935,9 @@
   notproduct(bool, TestSafeFetchInErrorHandler, false,                      \
           "If true, tests SafeFetch inside error handler.")                 \
                                                                             \
+  notproduct(bool, TestUnresponsiveErrorHandler, false,                     \
+          "If true, simulates an unresponsive error handler.")              \
+                                                                            \
   develop(bool, Verbose, false,                                             \
           "Print additional debugging information from other modes")        \
                                                                             \
--- a/hotspot/src/share/vm/runtime/thread.cpp	Thu Feb 23 11:57:07 2017 +0100
+++ b/hotspot/src/share/vm/runtime/thread.cpp	Thu Feb 23 15:59:14 2017 -0500
@@ -1289,28 +1289,32 @@
     if (is_error_reported()) {
       // A fatal error has happened, the error handler(VMError::report_and_die)
       // should abort JVM after creating an error log file. However in some
-      // rare cases, the error handler itself might deadlock. Here we try to
-      // kill JVM if the fatal error handler fails to abort in 2 minutes.
-      //
+      // rare cases, the error handler itself might deadlock. Here periodically
+      // check for error reporting timeouts, and if it happens, just proceed to
+      // abort the VM.
+
       // This code is in WatcherThread because WatcherThread wakes up
       // periodically so the fatal error handler doesn't need to do anything;
       // also because the WatcherThread is less likely to crash than other
       // threads.
 
       for (;;) {
-        if (!ShowMessageBoxOnError
-            && (OnError == NULL || OnError[0] == '\0')
-            && Arguments::abort_hook() == NULL) {
-          os::sleep(this, (jlong)ErrorLogTimeout * 1000, false); // in seconds
+        // Note: we use naked sleep in this loop because we want to avoid using
+        // any kind of VM infrastructure which may be broken at this point.
+        if (VMError::check_timeout()) {
+          // We hit error reporting timeout. Error reporting was interrupted and
+          // will be wrapping things up now (closing files etc). Give it some more
+          // time, then quit the VM.
+          os::naked_short_sleep(200);
+          // Print a message to stderr.
           fdStream err(defaultStream::output_fd());
           err.print_raw_cr("# [ timer expired, abort... ]");
           // skip atexit/vm_exit/vm_abort hooks
           os::die();
         }
 
-        // Wake up 5 seconds later, the fatal handler may reset OnError or
-        // ShowMessageBoxOnError when it is ready to abort.
-        os::sleep(this, 5 * 1000, false);
+        // Wait a second, then recheck for timeout.
+        os::naked_short_sleep(999);
       }
     }
 
--- a/hotspot/src/share/vm/utilities/vmError.cpp	Thu Feb 23 11:57:07 2017 +0100
+++ b/hotspot/src/share/vm/utilities/vmError.cpp	Thu Feb 23 15:59:14 2017 -0500
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2003, 2016, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2003, 2017, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -38,6 +38,7 @@
 #include "runtime/thread.inline.hpp"
 #include "runtime/vmThread.hpp"
 #include "runtime/vm_operations.hpp"
+#include "runtime/vm_version.hpp"
 #include "services/memTracker.hpp"
 #include "trace/traceMacros.hpp"
 #include "utilities/debug.hpp"
@@ -312,10 +313,41 @@
 int          VMError::_current_step;
 const char*  VMError::_current_step_info;
 
+volatile jlong VMError::_reporting_start_time = -1;
+volatile bool VMError::_reporting_did_timeout = false;
+volatile jlong VMError::_step_start_time = -1;
+volatile bool VMError::_step_did_timeout = false;
+
+// Helper, return current timestamp for timeout handling.
+jlong VMError::get_current_timestamp() {
+  return os::javaTimeNanos();
+}
+// Factor to translate the timestamp to seconds.
+#define TIMESTAMP_TO_SECONDS_FACTOR (1000 * 1000 * 1000)
+
+void VMError::record_reporting_start_time() {
+  const jlong now = get_current_timestamp();
+  Atomic::store(now, &_reporting_start_time);
+}
+
+jlong VMError::get_reporting_start_time() {
+  return Atomic::load(&_reporting_start_time);
+}
+
+void VMError::record_step_start_time() {
+  const jlong now = get_current_timestamp();
+  Atomic::store(now, &_step_start_time);
+}
+
+jlong VMError::get_step_start_time() {
+  return Atomic::load(&_step_start_time);
+}
+
 void VMError::report(outputStream* st, bool _verbose) {
 
 # define BEGIN if (_current_step == 0) { _current_step = __LINE__;
-# define STEP(s) } if (_current_step < __LINE__) { _current_step = __LINE__; _current_step_info = s;
+# define STEP(s) } if (_current_step < __LINE__) { _current_step = __LINE__; _current_step_info = s; \
+  record_step_start_time(); _step_did_timeout = false;
 # define END }
 
   // don't allocate large buffer on stack
@@ -352,6 +384,18 @@
       controlled_crash(TestCrashInErrorHandler);
     }
 
+  // TestUnresponsiveErrorHandler: We want to test both step timeouts and global timeout.
+  // Step to global timeout ratio is 4:1, so in order to be absolutely sure we hit the
+  // global timeout, let's execute the timeout step five times.
+  // See corresponding test in test/runtime/ErrorHandling/TimeoutInErrorHandlingTest.java
+  #define TIMEOUT_TEST_STEP STEP("test unresponsive error reporting step") \
+    if (_verbose && TestUnresponsiveErrorHandler) { os::infinite_sleep(); }
+  TIMEOUT_TEST_STEP
+  TIMEOUT_TEST_STEP
+  TIMEOUT_TEST_STEP
+  TIMEOUT_TEST_STEP
+  TIMEOUT_TEST_STEP
+
   STEP("test safefetch in error handler")
     // test whether it is safe to use SafeFetch32 in Crash Handler. Test twice
     // to test that resetting the signal handler works correctly.
@@ -1176,6 +1220,9 @@
     // first time
     set_error_reported();
 
+    reporting_started();
+    record_reporting_start_time();
+
     if (ShowMessageBoxOnError || PauseAtExit) {
       show_message_box(buffer, sizeof(buffer));
 
@@ -1216,17 +1263,31 @@
         os::die();
       }
 
-      jio_snprintf(buffer, sizeof(buffer),
-                   "[error occurred during error reporting (%s), id 0x%x]",
+      outputStream* const st = log.is_open() ? &log : &out;
+      st->cr();
+
+      // Timeout handling.
+      if (_step_did_timeout) {
+        // The current step had a timeout. Lets continue reporting with the next step.
+        st->print_raw("[timeout occurred during error reporting in step \"");
+        st->print_raw(_current_step_info);
+        st->print_cr("\"] after " INT64_FORMAT " s.",
+          (get_current_timestamp() - _step_start_time) / TIMESTAMP_TO_SECONDS_FACTOR);
+      } else if (_reporting_did_timeout) {
+        // We hit ErrorLogTimeout. Reporting will stop altogether. Let's wrap things
+        // up, the process is about to be stopped by the WatcherThread.
+        st->print_cr("------ Timeout during error reporting after " INT64_FORMAT " s. ------",
+          (get_current_timestamp() - _reporting_start_time) / TIMESTAMP_TO_SECONDS_FACTOR);
+        st->flush();
+        // Watcherthread is about to call os::die. Lets just wait.
+        os::infinite_sleep();
+      } else {
+        // Crash or assert during error reporting. Lets continue reporting with the next step.
+        jio_snprintf(buffer, sizeof(buffer),
+           "[error occurred during error reporting (%s), id 0x%x]",
                    _current_step_info, _id);
-      if (log.is_open()) {
-        log.cr();
-        log.print_raw_cr(buffer);
-        log.cr();
-      } else {
-        out.cr();
-        out.print_raw_cr(buffer);
-        out.cr();
+        st->print_raw_cr(buffer);
+        st->cr();
       }
     }
   }
@@ -1421,3 +1482,50 @@
     yes = os::start_debugging(buf,buflen);
   } while (yes);
 }
+
+// Timeout handling: check if a timeout happened (either a single step did
+// timeout or the whole of error reporting hit ErrorLogTimeout). Interrupt
+// the reporting thread if that is the case.
+bool VMError::check_timeout() {
+
+  if (ErrorLogTimeout == 0) {
+    return false;
+  }
+
+  // Do not check for timeouts if we still have a message box to show to the
+  // user or if there are OnError handlers to be run.
+  if (ShowMessageBoxOnError
+      || (OnError != NULL && OnError[0] != '\0')
+      || Arguments::abort_hook() != NULL) {
+    return false;
+  }
+
+  const jlong reporting_start_time_l = get_reporting_start_time();
+  const jlong now = get_current_timestamp();
+  // Timestamp is stored in nanos.
+  if (reporting_start_time_l > 0) {
+    const jlong end = reporting_start_time_l + (jlong)ErrorLogTimeout * TIMESTAMP_TO_SECONDS_FACTOR;
+    if (end <= now) {
+      _reporting_did_timeout = true;
+      interrupt_reporting_thread();
+      return true; // global timeout
+    }
+  }
+
+  const jlong step_start_time_l = get_step_start_time();
+  if (step_start_time_l > 0) {
+    // A step times out after a quarter of the total timeout. Steps are mostly fast unless they
+    // hang for some reason, so this simple rule allows for three hanging step and still
+    // hopefully leaves time enough for the rest of the steps to finish.
+    const jlong end = step_start_time_l + (jlong)ErrorLogTimeout * TIMESTAMP_TO_SECONDS_FACTOR / 4;
+    if (end <= now) {
+      _step_did_timeout = true;
+      interrupt_reporting_thread();
+      return false; // (Not a global timeout)
+    }
+  }
+
+  return false;
+
+}
+
--- a/hotspot/src/share/vm/utilities/vmError.hpp	Thu Feb 23 11:57:07 2017 +0100
+++ b/hotspot/src/share/vm/utilities/vmError.hpp	Thu Feb 23 15:59:14 2017 -0500
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2003, 2017, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -74,6 +74,16 @@
   // no core/minidump has been written to disk
   static char coredump_message[O_BUFLEN];
 
+  // Timeout handling:
+  // Timestamp at which error reporting started; -1 if no error reporting in progress.
+  static volatile jlong _reporting_start_time;
+  // Whether or not error reporting did timeout.
+  static volatile bool _reporting_did_timeout;
+  // Timestamp at which the last error reporting step started; -1 if no error reporting
+  //   in progress.
+  static volatile jlong _step_start_time;
+  // Whether or not the last error reporting step did timeout.
+  static volatile bool _step_did_timeout;
 
   // set signal handlers on Solaris/Linux or the default exception filter
   // on Windows, to handle recursive crashes.
@@ -104,6 +114,20 @@
   static fdStream out;
   static fdStream log; // error log used by VMError::report_and_die()
 
+  // Timeout handling.
+  // Hook functions for platform dependend functionality:
+  static void reporting_started();
+  static void interrupt_reporting_thread();
+
+  // Helper function to get the current timestamp.
+  static jlong get_current_timestamp();
+
+  // Accessors to get/set the start times for step and total timeout.
+  static void record_reporting_start_time();
+  static jlong get_reporting_start_time();
+  static void record_step_start_time();
+  static jlong get_step_start_time();
+
 public:
 
   // return a string to describe the error
@@ -147,6 +171,11 @@
   static bool fatal_error_in_progress() { return first_error_tid != -1; }
 
   static intptr_t get_first_error_tid() { return first_error_tid; }
+
+  // Called by the WatcherThread to check if error reporting has timed-out.
+  //  Returns true if error reporting has not completed within the ErrorLogTimeout limit.
+  static bool check_timeout();
+
 };
 
 #endif // SHARE_VM_UTILITIES_VMERROR_HPP
--- a/hotspot/test/TEST.groups	Thu Feb 23 11:57:07 2017 +0100
+++ b/hotspot/test/TEST.groups	Thu Feb 23 15:59:14 2017 -0500
@@ -371,6 +371,7 @@
  -runtime/ConstantPool/IntfMethod.java \
  -runtime/ErrorHandling/CreateCoredumpOnCrash.java \
  -runtime/ErrorHandling/ErrorHandler.java \
+ -runtime/ErrorHandling/TimeoutInErrorHandlingTest.java \
  -runtime/logging/MonitorMismatchTest.java \
  -runtime/memory/ReserveMemory.java \
  -runtime/memory/RunUnitTestsConcurrently.java \
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/test/runtime/ErrorHandling/TimeoutInErrorHandlingTest.java	Thu Feb 23 15:59:14 2017 -0500
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2017, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStreamReader;
+import java.util.regex.Pattern;
+
+import jdk.test.lib.process.OutputAnalyzer;
+import jdk.test.lib.Platform;
+import jdk.test.lib.process.ProcessTools;
+
+/*
+ * @test
+ * @bug 8166944
+ * @summary Hanging Error Reporting steps may lead to torn error logs
+ * @modules java.base/jdk.internal.misc
+ * @library /test/lib
+ * @requires (vm.debug == true) & (os.family != "windows")
+ * @author Thomas Stuefe (SAP)
+ */
+
+public class TimeoutInErrorHandlingTest {
+
+
+    public static void main(String[] args) throws Exception {
+
+        /* Start the VM and let it crash. Specify TestUnresponsiveErrorHandler which will
+         * let five subsequent error reporting steps hang. The Timeout handling triggered
+         * by the WatcherThread should kick in and interrupt those steps. In theory, the
+         * text "timeout occurred during error reporting in step .." (the little timeouts)
+         * should occur in the error log up to four times, followed by the final big timeout
+         * "------ Timeout during error reporting after xx s. ------"
+         *
+         * Note that there are a number of uncertainties which make writing a 100% foolproof
+         * test challenging. The time the error reporting thread takes to react to the
+         * timeout triggers is unknown. So it is difficult to predict how many little timeouts
+         * will be visible before the big timeout kicks in. Also, once the big timeout hits,
+         * error reporting thread and Watcherthread will race. The former writes his last
+         * message to the error logs and flushes, the latter waits 200ms and then exits the
+         * process without further synchronization with the error reporting thread.
+         *
+         * Because of all this and the desire to write a bullet proof test which does
+         * not fail sporadically, we will not test for the final timeout message nor for all
+         * of the optimally expected little timeout messages. We just test for two of the
+         * little timeout messages to see that repeated timeout handling is basically working.
+         */
+
+        ProcessBuilder pb = ProcessTools.createJavaProcessBuilder(
+            "-XX:+UnlockDiagnosticVMOptions",
+            "-Xmx100M",
+            "-XX:ErrorHandlerTest=14",
+            "-XX:+TestUnresponsiveErrorHandler",
+            "-XX:ErrorLogTimeout=16", // 16 seconds big timeout = 4 seconds per little timeout
+            "-XX:-CreateCoredumpOnCrash",
+            "-version");
+
+        OutputAnalyzer output_detail = new OutputAnalyzer(pb.start());
+
+        // we should have crashed with a SIGSEGV
+        output_detail.shouldMatch("# A fatal error has been detected by the Java Runtime Environment:.*");
+        output_detail.shouldMatch("# +(?:SIGSEGV|EXCEPTION_ACCESS_VIOLATION).*");
+
+        // VM should have been aborted by WatcherThread
+        output_detail.shouldMatch(".*timer expired, abort.*");
+
+        // extract hs-err file
+        String hs_err_file = output_detail.firstMatch("# *(\\S*hs_err_pid\\d+\\.log)", 1);
+        if (hs_err_file == null) {
+            throw new RuntimeException("Did not find hs-err file in output.\n");
+        }
+
+        File f = new File(hs_err_file);
+        if (!f.exists()) {
+            throw new RuntimeException("hs-err file missing at "
+                + f.getAbsolutePath() + ".\n");
+        }
+
+        System.out.println("Found hs_err file. Scanning...");
+
+        FileInputStream fis = new FileInputStream(f);
+        BufferedReader br = new BufferedReader(new InputStreamReader(fis));
+        String line = null;
+
+
+
+        Pattern [] pattern = new Pattern[] {
+            Pattern.compile(".*timeout occurred during error reporting in step.*"),
+            Pattern.compile(".*timeout occurred during error reporting in step.*")
+        };
+        int currentPattern = 0;
+
+        String lastLine = null;
+        while ((line = br.readLine()) != null) {
+            if (currentPattern < pattern.length) {
+              if (pattern[currentPattern].matcher(line).matches()) {
+                System.out.println("Found: " + line + ".");
+                currentPattern ++;
+              }
+            }
+            lastLine = line;
+        }
+        br.close();
+
+        if (currentPattern < pattern.length) {
+            throw new RuntimeException("hs-err file incomplete (first missing pattern: " +  currentPattern + ")");
+        }
+
+        System.out.println("OK.");
+
+    }
+
+}
+