8181143: Introduce diagnostic flag to abort VM on too long VM operations
authorshade
Thu, 13 Dec 2018 16:45:24 +0100
changeset 53020 c403f39ec349
parent 53019 4ddd3c410a85
child 53021 cc4098b3bc10
8181143: Introduce diagnostic flag to abort VM on too long VM operations Reviewed-by: rkennke, zgu, dholmes, stuefe, rehn
src/hotspot/share/runtime/globals.hpp
src/hotspot/share/runtime/vmThread.cpp
src/hotspot/share/runtime/vmThread.hpp
test/hotspot/jtreg/runtime/Safepoint/TestAbortOnVMOperationTimeout.java
--- a/src/hotspot/share/runtime/globals.hpp	Thu Dec 13 10:35:09 2018 -0500
+++ b/src/hotspot/share/runtime/globals.hpp	Thu Dec 13 16:45:24 2018 +0100
@@ -501,6 +501,13 @@
   diagnostic(bool, AbortVMOnSafepointTimeout, false,                        \
           "Abort upon failure to reach safepoint (see SafepointTimeout)")   \
                                                                             \
+  diagnostic(bool, AbortVMOnVMOperationTimeout, false,                      \
+          "Abort upon failure to complete VM operation promptly")           \
+                                                                            \
+  diagnostic(intx, AbortVMOnVMOperationTimeoutDelay, 1000,                  \
+          "Delay in milliseconds for option AbortVMOnVMOperationTimeout")   \
+          range(0, max_intx)                                                \
+                                                                            \
   /* 50 retries * (5 * current_retry_count) millis = ~6.375 seconds */      \
   /* typically, at most a few retries are needed                    */      \
   product(intx, SuspendRetryCount, 50,                                      \
--- a/src/hotspot/share/runtime/vmThread.cpp	Thu Dec 13 10:35:09 2018 -0500
+++ b/src/hotspot/share/runtime/vmThread.cpp	Thu Dec 13 16:45:24 2018 +0100
@@ -28,6 +28,7 @@
 #include "jfr/jfrEvents.hpp"
 #include "jfr/support/jfrThreadId.hpp"
 #include "logging/log.hpp"
+#include "logging/logStream.hpp"
 #include "logging/logConfiguration.hpp"
 #include "memory/resourceArea.hpp"
 #include "oops/method.hpp"
@@ -197,6 +198,32 @@
   drain_list_oops_do(f);
 }
 
+//------------------------------------------------------------------------------------------------------------------
+// Timeout machinery
+
+void VMOperationTimeoutTask::task() {
+  assert(AbortVMOnVMOperationTimeout, "only if enabled");
+  if (is_armed()) {
+    jlong delay = (os::javaTimeMillis() - _arm_time);
+    if (delay > AbortVMOnVMOperationTimeoutDelay) {
+      fatal("VM operation took too long: " SIZE_FORMAT " ms (timeout: " SIZE_FORMAT " ms)",
+            delay, AbortVMOnVMOperationTimeoutDelay);
+    }
+  }
+}
+
+bool VMOperationTimeoutTask::is_armed() {
+  return OrderAccess::load_acquire(&_armed) != 0;
+}
+
+void VMOperationTimeoutTask::arm() {
+  _arm_time = os::javaTimeMillis();
+  OrderAccess::release_store_fence(&_armed, 1);
+}
+
+void VMOperationTimeoutTask::disarm() {
+  OrderAccess::release_store_fence(&_armed, 0);
+}
 
 //------------------------------------------------------------------------------------------------------------------
 // Implementation of VMThread stuff
@@ -209,12 +236,28 @@
 VMOperationQueue* VMThread::_vm_queue           = NULL;
 PerfCounter*      VMThread::_perf_accumulated_vm_operation_time = NULL;
 const char*       VMThread::_no_op_reason       = NULL;
+VMOperationTimeoutTask* VMThread::_timeout_task = NULL;
 
 
 void VMThread::create() {
   assert(vm_thread() == NULL, "we can only allocate one VMThread");
   _vm_thread = new VMThread();
 
+  if (AbortVMOnVMOperationTimeout) {
+    // Make sure we call the timeout task frequently enough, but not too frequent.
+    // Try to make the interval 10% of the timeout delay, so that we miss the timeout
+    // by those 10% at max. Periodic task also expects it to fit min/max intervals.
+    size_t interval = (size_t)AbortVMOnVMOperationTimeoutDelay / 10;
+    interval = interval / PeriodicTask::interval_gran * PeriodicTask::interval_gran;
+    interval = MAX2<size_t>(interval, PeriodicTask::min_interval);
+    interval = MIN2<size_t>(interval, PeriodicTask::max_interval);
+
+    _timeout_task = new VMOperationTimeoutTask(interval);
+    _timeout_task->enroll();
+  } else {
+    assert(_timeout_task == NULL, "sanity");
+  }
+
   // Create VM operation queue
   _vm_queue = new VMOperationQueue();
   guarantee(_vm_queue != NULL, "just checking");
@@ -492,6 +535,11 @@
         _vm_queue->set_drain_list(safepoint_ops); // ensure ops can be scanned
 
         SafepointSynchronize::begin();
+
+        if (_timeout_task != NULL) {
+          _timeout_task->arm();
+        }
+
         evaluate_operation(_cur_vm_operation);
         // now process all queued safepoint ops, iteratively draining
         // the queue until there are none left
@@ -533,6 +581,10 @@
 
         _vm_queue->set_drain_list(NULL);
 
+        if (_timeout_task != NULL) {
+          _timeout_task->disarm();
+        }
+
         // Complete safepoint synchronization
         SafepointSynchronize::end();
 
--- a/src/hotspot/share/runtime/vmThread.hpp	Thu Dec 13 10:35:09 2018 -0500
+++ b/src/hotspot/share/runtime/vmThread.hpp	Thu Dec 13 16:45:24 2018 +0100
@@ -27,6 +27,7 @@
 
 #include "runtime/perfData.hpp"
 #include "runtime/thread.hpp"
+#include "runtime/task.hpp"
 #include "runtime/vmOperations.hpp"
 
 //
@@ -84,6 +85,26 @@
 };
 
 
+// VM operation timeout handling: warn or abort the VM when VM operation takes
+// too long. Periodic tasks do not participate in safepoint protocol, and therefore
+// can fire when application threads are stopped.
+
+class VMOperationTimeoutTask : public PeriodicTask {
+private:
+  volatile int _armed;
+  jlong _arm_time;
+
+public:
+  VMOperationTimeoutTask(size_t interval_time) :
+          PeriodicTask(interval_time), _armed(0), _arm_time(0) {}
+
+  virtual void task();
+
+  bool is_armed();
+  void arm();
+  void disarm();
+};
+
 //
 // A single VMThread (the primordial thread) spawns all other threads
 // and is itself used by other threads to offload heavy vm operations
@@ -101,6 +122,8 @@
 
   static const char* _no_op_reason;
 
+  static VMOperationTimeoutTask* _timeout_task;
+
   static bool no_op_safepoint_needed(bool check_time);
 
   void evaluate_operation(VM_Operation* op);
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/hotspot/jtreg/runtime/Safepoint/TestAbortOnVMOperationTimeout.java	Thu Dec 13 16:45:24 2018 +0100
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2018, Red Hat, Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+import jdk.test.lib.*;
+import jdk.test.lib.process.*;
+
+/*
+ * @test TestAbortOnVMOperationTimeout
+ * @bug 8181143
+ * @summary Check abort on VM timeout is working
+ * @library /test/lib
+ * @modules java.base/jdk.internal.misc
+ *          java.management
+ */
+
+public class TestAbortOnVMOperationTimeout {
+
+    public static void main(String[] args) throws Exception {
+        if (args.length > 0) {
+            Object[] arr = new Object[10_000_000];
+            for (int i = 0; i < arr.length; i++) {
+               arr[i] = new Object();
+            }
+            return;
+        }
+
+        // These should definitely pass: more than a minute is enough for Serial to act.
+        // The values are deliberately non-round to trip off periodic task granularity.
+        for (int delay : new int[]{63423, 12388131}) {
+            testWith(delay, true);
+        }
+
+        // These should fail: Serial is not very fast. Traversing 10M objects in 5 ms
+        // means less than 0.5 ns per object, which is not doable.
+        for (int delay : new int[]{0, 1, 5}) {
+            testWith(delay, false);
+        }
+    }
+
+    public static void testWith(int delay, boolean shouldPass) throws Exception {
+        ProcessBuilder pb = ProcessTools.createJavaProcessBuilder(
+                "-XX:+UnlockDiagnosticVMOptions",
+                "-XX:+AbortVMOnVMOperationTimeout",
+                "-XX:AbortVMOnVMOperationTimeoutDelay=" + delay,
+                "-Xmx256m",
+                "-XX:+UseSerialGC",
+                "-XX:-CreateCoredumpOnCrash",
+                "TestAbortOnVMOperationTimeout",
+                "foo"
+        );
+
+        OutputAnalyzer output = new OutputAnalyzer(pb.start());
+        if (shouldPass) {
+            output.shouldHaveExitValue(0);
+        } else {
+            output.shouldMatch("VM operation took too long");
+            output.shouldNotHaveExitValue(0);
+        }
+    }
+}
+