# HG changeset patch # User stuefe # Date 1487883554 18000 # Node ID ad578adff5dfc962c5cc3a9b73b6345d1859f8ee # Parent 09e328af295aa99df08f40a0f94727d7df09a2a4 8166944: Hanging Error Reporting steps may lead to torn error logs. Summary: Interupt error reporting if reporting steps hang to enable subsequent reporting steps to run. Reviewed-by: cjplummer, dholmes diff -r 09e328af295a -r ad578adff5df hotspot/src/os/posix/vm/vmError_posix.cpp --- a/hotspot/src/os/posix/vm/vmError_posix.cpp Thu Feb 23 11:57:07 2017 +0100 +++ b/hotspot/src/os/posix/vm/vmError_posix.cpp Thu Feb 23 15:59:14 2017 -0500 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003, 2016, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2003, 2017, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -59,6 +59,21 @@ static int resettedSigflags[NUM_SIGNALS]; static address resettedSighandler[NUM_SIGNALS]; +// Needed for cancelable steps. +static volatile pthread_t reporter_thread_id; + +void VMError::reporting_started() { + // record pthread id of reporter thread. + reporter_thread_id = ::pthread_self(); +} + +void VMError::interrupt_reporting_thread() { + // We misuse SIGILL here, but it does not really matter. We need + // a signal which is handled by crash_handler and not likely to + // occurr during error reporting itself. + ::pthread_kill(reporter_thread_id, SIGILL); +} + static void save_signal(int idx, int sig) { struct sigaction sa; diff -r 09e328af295a -r ad578adff5df hotspot/src/os/windows/vm/vmError_windows.cpp --- a/hotspot/src/os/windows/vm/vmError_windows.cpp Thu Feb 23 11:57:07 2017 +0100 +++ b/hotspot/src/os/windows/vm/vmError_windows.cpp Thu Feb 23 15:59:14 2017 -0500 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2003, 2017, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -66,3 +66,10 @@ } } } + +// Error reporting cancellation: there is no easy way to implement this on Windows, because we do +// not have an easy way to send signals to threads (aka to cause a win32 Exception in another +// thread). We would need something like "RaiseException(HANDLE thread)"... +void VMError::reporting_started() {} +void VMError::interrupt_reporting_thread() {} + diff -r 09e328af295a -r ad578adff5df hotspot/src/share/vm/runtime/globals.hpp --- a/hotspot/src/share/vm/runtime/globals.hpp Thu Feb 23 11:57:07 2017 +0100 +++ b/hotspot/src/share/vm/runtime/globals.hpp Thu Feb 23 15:59:14 2017 -0500 @@ -935,6 +935,9 @@ notproduct(bool, TestSafeFetchInErrorHandler, false, \ "If true, tests SafeFetch inside error handler.") \ \ + notproduct(bool, TestUnresponsiveErrorHandler, false, \ + "If true, simulates an unresponsive error handler.") \ + \ develop(bool, Verbose, false, \ "Print additional debugging information from other modes") \ \ diff -r 09e328af295a -r ad578adff5df hotspot/src/share/vm/runtime/thread.cpp --- a/hotspot/src/share/vm/runtime/thread.cpp Thu Feb 23 11:57:07 2017 +0100 +++ b/hotspot/src/share/vm/runtime/thread.cpp Thu Feb 23 15:59:14 2017 -0500 @@ -1289,28 +1289,32 @@ if (is_error_reported()) { // A fatal error has happened, the error handler(VMError::report_and_die) // should abort JVM after creating an error log file. However in some - // rare cases, the error handler itself might deadlock. Here we try to - // kill JVM if the fatal error handler fails to abort in 2 minutes. - // + // rare cases, the error handler itself might deadlock. Here periodically + // check for error reporting timeouts, and if it happens, just proceed to + // abort the VM. + // This code is in WatcherThread because WatcherThread wakes up // periodically so the fatal error handler doesn't need to do anything; // also because the WatcherThread is less likely to crash than other // threads. for (;;) { - if (!ShowMessageBoxOnError - && (OnError == NULL || OnError[0] == '\0') - && Arguments::abort_hook() == NULL) { - os::sleep(this, (jlong)ErrorLogTimeout * 1000, false); // in seconds + // Note: we use naked sleep in this loop because we want to avoid using + // any kind of VM infrastructure which may be broken at this point. + if (VMError::check_timeout()) { + // We hit error reporting timeout. Error reporting was interrupted and + // will be wrapping things up now (closing files etc). Give it some more + // time, then quit the VM. + os::naked_short_sleep(200); + // Print a message to stderr. fdStream err(defaultStream::output_fd()); err.print_raw_cr("# [ timer expired, abort... ]"); // skip atexit/vm_exit/vm_abort hooks os::die(); } - // Wake up 5 seconds later, the fatal handler may reset OnError or - // ShowMessageBoxOnError when it is ready to abort. - os::sleep(this, 5 * 1000, false); + // Wait a second, then recheck for timeout. + os::naked_short_sleep(999); } } diff -r 09e328af295a -r ad578adff5df hotspot/src/share/vm/utilities/vmError.cpp --- a/hotspot/src/share/vm/utilities/vmError.cpp Thu Feb 23 11:57:07 2017 +0100 +++ b/hotspot/src/share/vm/utilities/vmError.cpp Thu Feb 23 15:59:14 2017 -0500 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003, 2016, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2003, 2017, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -38,6 +38,7 @@ #include "runtime/thread.inline.hpp" #include "runtime/vmThread.hpp" #include "runtime/vm_operations.hpp" +#include "runtime/vm_version.hpp" #include "services/memTracker.hpp" #include "trace/traceMacros.hpp" #include "utilities/debug.hpp" @@ -312,10 +313,41 @@ int VMError::_current_step; const char* VMError::_current_step_info; +volatile jlong VMError::_reporting_start_time = -1; +volatile bool VMError::_reporting_did_timeout = false; +volatile jlong VMError::_step_start_time = -1; +volatile bool VMError::_step_did_timeout = false; + +// Helper, return current timestamp for timeout handling. +jlong VMError::get_current_timestamp() { + return os::javaTimeNanos(); +} +// Factor to translate the timestamp to seconds. +#define TIMESTAMP_TO_SECONDS_FACTOR (1000 * 1000 * 1000) + +void VMError::record_reporting_start_time() { + const jlong now = get_current_timestamp(); + Atomic::store(now, &_reporting_start_time); +} + +jlong VMError::get_reporting_start_time() { + return Atomic::load(&_reporting_start_time); +} + +void VMError::record_step_start_time() { + const jlong now = get_current_timestamp(); + Atomic::store(now, &_step_start_time); +} + +jlong VMError::get_step_start_time() { + return Atomic::load(&_step_start_time); +} + void VMError::report(outputStream* st, bool _verbose) { # define BEGIN if (_current_step == 0) { _current_step = __LINE__; -# define STEP(s) } if (_current_step < __LINE__) { _current_step = __LINE__; _current_step_info = s; +# define STEP(s) } if (_current_step < __LINE__) { _current_step = __LINE__; _current_step_info = s; \ + record_step_start_time(); _step_did_timeout = false; # define END } // don't allocate large buffer on stack @@ -352,6 +384,18 @@ controlled_crash(TestCrashInErrorHandler); } + // TestUnresponsiveErrorHandler: We want to test both step timeouts and global timeout. + // Step to global timeout ratio is 4:1, so in order to be absolutely sure we hit the + // global timeout, let's execute the timeout step five times. + // See corresponding test in test/runtime/ErrorHandling/TimeoutInErrorHandlingTest.java + #define TIMEOUT_TEST_STEP STEP("test unresponsive error reporting step") \ + if (_verbose && TestUnresponsiveErrorHandler) { os::infinite_sleep(); } + TIMEOUT_TEST_STEP + TIMEOUT_TEST_STEP + TIMEOUT_TEST_STEP + TIMEOUT_TEST_STEP + TIMEOUT_TEST_STEP + STEP("test safefetch in error handler") // test whether it is safe to use SafeFetch32 in Crash Handler. Test twice // to test that resetting the signal handler works correctly. @@ -1176,6 +1220,9 @@ // first time set_error_reported(); + reporting_started(); + record_reporting_start_time(); + if (ShowMessageBoxOnError || PauseAtExit) { show_message_box(buffer, sizeof(buffer)); @@ -1216,17 +1263,31 @@ os::die(); } - jio_snprintf(buffer, sizeof(buffer), - "[error occurred during error reporting (%s), id 0x%x]", + outputStream* const st = log.is_open() ? &log : &out; + st->cr(); + + // Timeout handling. + if (_step_did_timeout) { + // The current step had a timeout. Lets continue reporting with the next step. + st->print_raw("[timeout occurred during error reporting in step \""); + st->print_raw(_current_step_info); + st->print_cr("\"] after " INT64_FORMAT " s.", + (get_current_timestamp() - _step_start_time) / TIMESTAMP_TO_SECONDS_FACTOR); + } else if (_reporting_did_timeout) { + // We hit ErrorLogTimeout. Reporting will stop altogether. Let's wrap things + // up, the process is about to be stopped by the WatcherThread. + st->print_cr("------ Timeout during error reporting after " INT64_FORMAT " s. ------", + (get_current_timestamp() - _reporting_start_time) / TIMESTAMP_TO_SECONDS_FACTOR); + st->flush(); + // Watcherthread is about to call os::die. Lets just wait. + os::infinite_sleep(); + } else { + // Crash or assert during error reporting. Lets continue reporting with the next step. + jio_snprintf(buffer, sizeof(buffer), + "[error occurred during error reporting (%s), id 0x%x]", _current_step_info, _id); - if (log.is_open()) { - log.cr(); - log.print_raw_cr(buffer); - log.cr(); - } else { - out.cr(); - out.print_raw_cr(buffer); - out.cr(); + st->print_raw_cr(buffer); + st->cr(); } } } @@ -1421,3 +1482,50 @@ yes = os::start_debugging(buf,buflen); } while (yes); } + +// Timeout handling: check if a timeout happened (either a single step did +// timeout or the whole of error reporting hit ErrorLogTimeout). Interrupt +// the reporting thread if that is the case. +bool VMError::check_timeout() { + + if (ErrorLogTimeout == 0) { + return false; + } + + // Do not check for timeouts if we still have a message box to show to the + // user or if there are OnError handlers to be run. + if (ShowMessageBoxOnError + || (OnError != NULL && OnError[0] != '\0') + || Arguments::abort_hook() != NULL) { + return false; + } + + const jlong reporting_start_time_l = get_reporting_start_time(); + const jlong now = get_current_timestamp(); + // Timestamp is stored in nanos. + if (reporting_start_time_l > 0) { + const jlong end = reporting_start_time_l + (jlong)ErrorLogTimeout * TIMESTAMP_TO_SECONDS_FACTOR; + if (end <= now) { + _reporting_did_timeout = true; + interrupt_reporting_thread(); + return true; // global timeout + } + } + + const jlong step_start_time_l = get_step_start_time(); + if (step_start_time_l > 0) { + // A step times out after a quarter of the total timeout. Steps are mostly fast unless they + // hang for some reason, so this simple rule allows for three hanging step and still + // hopefully leaves time enough for the rest of the steps to finish. + const jlong end = step_start_time_l + (jlong)ErrorLogTimeout * TIMESTAMP_TO_SECONDS_FACTOR / 4; + if (end <= now) { + _step_did_timeout = true; + interrupt_reporting_thread(); + return false; // (Not a global timeout) + } + } + + return false; + +} + diff -r 09e328af295a -r ad578adff5df hotspot/src/share/vm/utilities/vmError.hpp --- a/hotspot/src/share/vm/utilities/vmError.hpp Thu Feb 23 11:57:07 2017 +0100 +++ b/hotspot/src/share/vm/utilities/vmError.hpp Thu Feb 23 15:59:14 2017 -0500 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2003, 2017, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -74,6 +74,16 @@ // no core/minidump has been written to disk static char coredump_message[O_BUFLEN]; + // Timeout handling: + // Timestamp at which error reporting started; -1 if no error reporting in progress. + static volatile jlong _reporting_start_time; + // Whether or not error reporting did timeout. + static volatile bool _reporting_did_timeout; + // Timestamp at which the last error reporting step started; -1 if no error reporting + // in progress. + static volatile jlong _step_start_time; + // Whether or not the last error reporting step did timeout. + static volatile bool _step_did_timeout; // set signal handlers on Solaris/Linux or the default exception filter // on Windows, to handle recursive crashes. @@ -104,6 +114,20 @@ static fdStream out; static fdStream log; // error log used by VMError::report_and_die() + // Timeout handling. + // Hook functions for platform dependend functionality: + static void reporting_started(); + static void interrupt_reporting_thread(); + + // Helper function to get the current timestamp. + static jlong get_current_timestamp(); + + // Accessors to get/set the start times for step and total timeout. + static void record_reporting_start_time(); + static jlong get_reporting_start_time(); + static void record_step_start_time(); + static jlong get_step_start_time(); + public: // return a string to describe the error @@ -147,6 +171,11 @@ static bool fatal_error_in_progress() { return first_error_tid != -1; } static intptr_t get_first_error_tid() { return first_error_tid; } + + // Called by the WatcherThread to check if error reporting has timed-out. + // Returns true if error reporting has not completed within the ErrorLogTimeout limit. + static bool check_timeout(); + }; #endif // SHARE_VM_UTILITIES_VMERROR_HPP diff -r 09e328af295a -r ad578adff5df hotspot/test/TEST.groups --- a/hotspot/test/TEST.groups Thu Feb 23 11:57:07 2017 +0100 +++ b/hotspot/test/TEST.groups Thu Feb 23 15:59:14 2017 -0500 @@ -371,6 +371,7 @@ -runtime/ConstantPool/IntfMethod.java \ -runtime/ErrorHandling/CreateCoredumpOnCrash.java \ -runtime/ErrorHandling/ErrorHandler.java \ + -runtime/ErrorHandling/TimeoutInErrorHandlingTest.java \ -runtime/logging/MonitorMismatchTest.java \ -runtime/memory/ReserveMemory.java \ -runtime/memory/RunUnitTestsConcurrently.java \ diff -r 09e328af295a -r ad578adff5df hotspot/test/runtime/ErrorHandling/TimeoutInErrorHandlingTest.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hotspot/test/runtime/ErrorHandling/TimeoutInErrorHandlingTest.java Thu Feb 23 15:59:14 2017 -0500 @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2017, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.InputStreamReader; +import java.util.regex.Pattern; + +import jdk.test.lib.process.OutputAnalyzer; +import jdk.test.lib.Platform; +import jdk.test.lib.process.ProcessTools; + +/* + * @test + * @bug 8166944 + * @summary Hanging Error Reporting steps may lead to torn error logs + * @modules java.base/jdk.internal.misc + * @library /test/lib + * @requires (vm.debug == true) & (os.family != "windows") + * @author Thomas Stuefe (SAP) + */ + +public class TimeoutInErrorHandlingTest { + + + public static void main(String[] args) throws Exception { + + /* Start the VM and let it crash. Specify TestUnresponsiveErrorHandler which will + * let five subsequent error reporting steps hang. The Timeout handling triggered + * by the WatcherThread should kick in and interrupt those steps. In theory, the + * text "timeout occurred during error reporting in step .." (the little timeouts) + * should occur in the error log up to four times, followed by the final big timeout + * "------ Timeout during error reporting after xx s. ------" + * + * Note that there are a number of uncertainties which make writing a 100% foolproof + * test challenging. The time the error reporting thread takes to react to the + * timeout triggers is unknown. So it is difficult to predict how many little timeouts + * will be visible before the big timeout kicks in. Also, once the big timeout hits, + * error reporting thread and Watcherthread will race. The former writes his last + * message to the error logs and flushes, the latter waits 200ms and then exits the + * process without further synchronization with the error reporting thread. + * + * Because of all this and the desire to write a bullet proof test which does + * not fail sporadically, we will not test for the final timeout message nor for all + * of the optimally expected little timeout messages. We just test for two of the + * little timeout messages to see that repeated timeout handling is basically working. + */ + + ProcessBuilder pb = ProcessTools.createJavaProcessBuilder( + "-XX:+UnlockDiagnosticVMOptions", + "-Xmx100M", + "-XX:ErrorHandlerTest=14", + "-XX:+TestUnresponsiveErrorHandler", + "-XX:ErrorLogTimeout=16", // 16 seconds big timeout = 4 seconds per little timeout + "-XX:-CreateCoredumpOnCrash", + "-version"); + + OutputAnalyzer output_detail = new OutputAnalyzer(pb.start()); + + // we should have crashed with a SIGSEGV + output_detail.shouldMatch("# A fatal error has been detected by the Java Runtime Environment:.*"); + output_detail.shouldMatch("# +(?:SIGSEGV|EXCEPTION_ACCESS_VIOLATION).*"); + + // VM should have been aborted by WatcherThread + output_detail.shouldMatch(".*timer expired, abort.*"); + + // extract hs-err file + String hs_err_file = output_detail.firstMatch("# *(\\S*hs_err_pid\\d+\\.log)", 1); + if (hs_err_file == null) { + throw new RuntimeException("Did not find hs-err file in output.\n"); + } + + File f = new File(hs_err_file); + if (!f.exists()) { + throw new RuntimeException("hs-err file missing at " + + f.getAbsolutePath() + ".\n"); + } + + System.out.println("Found hs_err file. Scanning..."); + + FileInputStream fis = new FileInputStream(f); + BufferedReader br = new BufferedReader(new InputStreamReader(fis)); + String line = null; + + + + Pattern [] pattern = new Pattern[] { + Pattern.compile(".*timeout occurred during error reporting in step.*"), + Pattern.compile(".*timeout occurred during error reporting in step.*") + }; + int currentPattern = 0; + + String lastLine = null; + while ((line = br.readLine()) != null) { + if (currentPattern < pattern.length) { + if (pattern[currentPattern].matcher(line).matches()) { + System.out.println("Found: " + line + "."); + currentPattern ++; + } + } + lastLine = line; + } + br.close(); + + if (currentPattern < pattern.length) { + throw new RuntimeException("hs-err file incomplete (first missing pattern: " + currentPattern + ")"); + } + + System.out.println("OK."); + + } + +} +