8049717: expose L1_data_cache_line_size for diagnostic/sanity checks
authordcubed
Tue, 15 Jul 2014 07:33:49 -0700 (2014-07-15)
changeset 25633 4cd9c4622c8c
parent 25632 d200adafaee5
child 25634 999012f63df5
child 25712 1f37709c1c95
8049717: expose L1_data_cache_line_size for diagnostic/sanity checks Summary: Add support for VM_Version::L1_data_cache_line_size(). Reviewed-by: dsimms, kvn, dholmes
hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp
hotspot/src/cpu/x86/vm/vm_version_x86.cpp
hotspot/src/cpu/x86/vm/vm_version_x86.hpp
hotspot/src/share/vm/prims/jni.cpp
hotspot/src/share/vm/runtime/objectMonitor.cpp
hotspot/src/share/vm/runtime/objectMonitor.hpp
hotspot/src/share/vm/runtime/synchronizer.cpp
hotspot/src/share/vm/runtime/synchronizer.hpp
hotspot/src/share/vm/runtime/vm_version.cpp
hotspot/src/share/vm/runtime/vm_version.hpp
--- a/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp	Mon Jul 14 21:48:47 2014 +0000
+++ b/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp	Tue Jul 15 07:33:49 2014 -0700
@@ -251,6 +251,49 @@
   // buf is started with ", " or is empty
   _features_str = strdup(strlen(buf) > 2 ? buf + 2 : buf);
 
+  // There are three 64-bit SPARC families that do not overlap, e.g.,
+  // both is_ultra3() and is_sparc64() cannot be true at the same time.
+  // Within these families, there can be more than one chip, e.g.,
+  // is_T4() and is_T7() machines are also is_niagara().
+  if (is_ultra3()) {
+    assert(_L1_data_cache_line_size == 0, "overlap with Ultra3 family");
+    // Ref: UltraSPARC III Cu Processor
+    _L1_data_cache_line_size = 64;
+  }
+  if (is_niagara()) {
+    assert(_L1_data_cache_line_size == 0, "overlap with niagara family");
+    // All Niagara's are sun4v's, but not all sun4v's are Niagaras, e.g.,
+    // Fujitsu SPARC64 is sun4v, but we don't want it in this block.
+    //
+    // Ref: UltraSPARC T1 Supplement to the UltraSPARC Architecture 2005
+    // Appendix F.1.3.1 Cacheable Accesses
+    // -> 16-byte L1 cache line size
+    //
+    // Ref: UltraSPARC T2: A Highly-Threaded, Power-Efficient, SPARC SOC
+    // Section III: SPARC Processor Core
+    // -> 16-byte L1 cache line size
+    //
+    // Ref: Oracle's SPARC T4-1, SPARC T4-2, SPARC T4-4, and SPARC T4-1B Server Architecture
+    // Section SPARC T4 Processor Cache Architecture
+    // -> 32-byte L1 cache line size (no longer see that info on this ref)
+    //
+    // XXX - still need a T7 reference here
+    //
+    if (is_T7()) {  // T7 or newer
+      _L1_data_cache_line_size = 64;
+    } else if (is_T4()) {  // T4 or newer (until T7)
+      _L1_data_cache_line_size = 32;
+    } else {  // T1 or newer (until T4)
+      _L1_data_cache_line_size = 16;
+    }
+  }
+  if (is_sparc64()) {
+    guarantee(_L1_data_cache_line_size == 0, "overlap with SPARC64 family");
+    // Ref: Fujitsu SPARC64 VII Processor
+    // Section 4 Cache System
+    _L1_data_cache_line_size = 64;
+  }
+
   // UseVIS is set to the smallest of what hardware supports and what
   // the command line requires.  I.e., you cannot set UseVIS to 3 on
   // older UltraSparc which do not support it.
@@ -356,6 +399,7 @@
 
 #ifndef PRODUCT
   if (PrintMiscellaneous && Verbose) {
+    tty->print_cr("L1 data cache line size: %u", L1_data_cache_line_size());
     tty->print("Allocation");
     if (AllocatePrefetchStyle <= 0) {
       tty->print_cr(": no prefetching");
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.cpp	Mon Jul 14 21:48:47 2014 +0000
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.cpp	Tue Jul 15 07:33:49 2014 -0700
@@ -394,6 +394,8 @@
   _stepping = 0;
   _cpuFeatures = 0;
   _logical_processors_per_package = 1;
+  // i486 internal cache is both I&D and has a 16-byte line size
+  _L1_data_cache_line_size = 16;
 
   if (!Use486InstrsOnly) {
     // Get raw processor info
@@ -412,6 +414,7 @@
       // Logical processors are only available on P4s and above,
       // and only if hyperthreading is available.
       _logical_processors_per_package = logical_processor_count();
+      _L1_data_cache_line_size = L1_line_size();
     }
   }
 
@@ -924,6 +927,7 @@
   if (PrintMiscellaneous && Verbose) {
     tty->print_cr("Logical CPUs per core: %u",
                   logical_processors_per_package());
+    tty->print_cr("L1 data cache line size: %u", L1_data_cache_line_size());
     tty->print("UseSSE=%d", (int) UseSSE);
     if (UseAVX > 0) {
       tty->print("  UseAVX=%d", (int) UseAVX);
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.hpp	Mon Jul 14 21:48:47 2014 +0000
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.hpp	Tue Jul 15 07:33:49 2014 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -581,7 +581,7 @@
     return result;
   }
 
-  static intx prefetch_data_size()  {
+  static intx L1_line_size()  {
     intx result = 0;
     if (is_intel()) {
       result = (_cpuid_info.dcp_cpuid4_ebx.bits.L1_line_size + 1);
@@ -593,6 +593,10 @@
     return result;
   }
 
+  static intx prefetch_data_size()  {
+    return L1_line_size();
+  }
+
   //
   // Feature identification
   //
--- a/hotspot/src/share/vm/prims/jni.cpp	Mon Jul 14 21:48:47 2014 +0000
+++ b/hotspot/src/share/vm/prims/jni.cpp	Tue Jul 15 07:33:49 2014 -0700
@@ -3886,6 +3886,7 @@
     run_unit_test(TestKlass_test());
     run_unit_test(TestBitMap_test());
     run_unit_test(TestAsUtf8());
+    run_unit_test(ObjectMonitor::sanity_checks());
 #if INCLUDE_VM_STRUCTS
     run_unit_test(VMStructs::test());
 #endif
--- a/hotspot/src/share/vm/runtime/objectMonitor.cpp	Mon Jul 14 21:48:47 2014 +0000
+++ b/hotspot/src/share/vm/runtime/objectMonitor.cpp	Tue Jul 15 07:33:49 2014 -0700
@@ -2497,6 +2497,10 @@
   SETKNOB(FastHSSEC);
   #undef SETKNOB
 
+  if (Knob_Verbose) {
+    sanity_checks();
+  }
+
   if (os::is_MP()) {
      BackOffMask = (1 << Knob_SpinBackOff) - 1;
      if (Knob_ReportSettings) ::printf("BackOffMask=%X\n", BackOffMask);
@@ -2517,6 +2521,66 @@
   InitDone = 1;
 }
 
+void ObjectMonitor::sanity_checks() {
+  int error_cnt = 0;
+  int warning_cnt = 0;
+  bool verbose = Knob_Verbose != 0 NOT_PRODUCT(|| VerboseInternalVMTests);
+
+  if (verbose) {
+    tty->print_cr("INFO: sizeof(ObjectMonitor)=" SIZE_FORMAT,
+                  sizeof(ObjectMonitor));
+  }
+
+  uint cache_line_size = VM_Version::L1_data_cache_line_size();
+  if (verbose) {
+    tty->print_cr("INFO: L1_data_cache_line_size=%u", cache_line_size);
+  }
+
+  ObjectMonitor dummy;
+  u_char *addr_begin  = (u_char*)&dummy;
+  u_char *addr_header = (u_char*)&dummy._header;
+  u_char *addr_owner  = (u_char*)&dummy._owner;
+
+  uint offset_header = (uint)(addr_header - addr_begin);
+  if (verbose) tty->print_cr("INFO: offset(_header)=%u", offset_header);
+
+  uint offset_owner = (uint)(addr_owner - addr_begin);
+  if (verbose) tty->print_cr("INFO: offset(_owner)=%u", offset_owner);
+
+  if ((uint)(addr_header - addr_begin) != 0) {
+    tty->print_cr("ERROR: offset(_header) must be zero (0).");
+    error_cnt++;
+  }
+
+  if (cache_line_size != 0) {
+    // We were able to determine the L1 data cache line size so
+    // do some cache line specific sanity checks
+
+    if ((offset_owner - offset_header) < cache_line_size) {
+      tty->print_cr("WARNING: the _header and _owner fields are closer "
+                    "than a cache line which permits false sharing.");
+      warning_cnt++;
+    }
+
+    if ((sizeof(ObjectMonitor) % cache_line_size) != 0) {
+      tty->print_cr("WARNING: ObjectMonitor size is not a multiple of "
+                    "a cache line which permits false sharing.");
+      warning_cnt++;
+    }
+  }
+
+  ObjectSynchronizer::sanity_checks(verbose, cache_line_size, &error_cnt,
+                                    &warning_cnt);
+
+  if (verbose || error_cnt != 0 || warning_cnt != 0) {
+    tty->print_cr("INFO: error_cnt=%d", error_cnt);
+    tty->print_cr("INFO: warning_cnt=%d", warning_cnt);
+  }
+
+  guarantee(error_cnt == 0,
+            "Fatal error(s) found in ObjectMonitor::sanity_checks()");
+}
+
 #ifndef PRODUCT
 void ObjectMonitor::verify() {
 }
--- a/hotspot/src/share/vm/runtime/objectMonitor.hpp	Mon Jul 14 21:48:47 2014 +0000
+++ b/hotspot/src/share/vm/runtime/objectMonitor.hpp	Tue Jul 15 07:33:49 2014 -0700
@@ -189,6 +189,8 @@
   bool      check(TRAPS);       // true if the thread owns the monitor.
   void      check_slow(TRAPS);
   void      clear();
+  static void sanity_checks();  // public for -XX:+ExecuteInternalVMTests
+                                // in PRODUCT for -XX:SyncKnobs=Verbose=1
 #ifndef PRODUCT
   void      verify();
   void      print();
@@ -234,8 +236,6 @@
 
   // WARNING: this must be the very first word of ObjectMonitor
   // This means this class can't use any virtual member functions.
-  // TODO-FIXME: assert that offsetof(_header) is 0 or get rid of the
-  // implicit 0 offset in emitted code.
 
   volatile markOop   _header;       // displaced object header word - mark
   void*     volatile _object;       // backward object pointer - strong root
--- a/hotspot/src/share/vm/runtime/synchronizer.cpp	Mon Jul 14 21:48:47 2014 +0000
+++ b/hotspot/src/share/vm/runtime/synchronizer.cpp	Tue Jul 15 07:33:49 2014 -0700
@@ -392,19 +392,22 @@
 // Hash Code handling
 //
 // Performance concern:
-// OrderAccess::storestore() calls release() which STs 0 into the global volatile
-// OrderAccess::Dummy variable.  This store is unnecessary for correctness.
-// Many threads STing into a common location causes considerable cache migration
-// or "sloshing" on large SMP system.  As such, I avoid using OrderAccess::storestore()
-// until it's repaired.  In some cases OrderAccess::fence() -- which incurs local
-// latency on the executing processor -- is a better choice as it scales on SMP
-// systems.  See http://blogs.sun.com/dave/entry/biased_locking_in_hotspot for a
-// discussion of coherency costs.  Note that all our current reference platforms
-// provide strong ST-ST order, so the issue is moot on IA32, x64, and SPARC.
+// OrderAccess::storestore() calls release() which at one time stored 0
+// into the global volatile OrderAccess::dummy variable. This store was
+// unnecessary for correctness. Many threads storing into a common location
+// causes considerable cache migration or "sloshing" on large SMP systems.
+// As such, I avoided using OrderAccess::storestore(). In some cases
+// OrderAccess::fence() -- which incurs local latency on the executing
+// processor -- is a better choice as it scales on SMP systems.
+//
+// See http://blogs.oracle.com/dave/entry/biased_locking_in_hotspot for
+// a discussion of coherency costs. Note that all our current reference
+// platforms provide strong ST-ST order, so the issue is moot on IA32,
+// x64, and SPARC.
 //
 // As a general policy we use "volatile" to control compiler-based reordering
-// and explicit fences (barriers) to control for architectural reordering performed
-// by the CPU(s) or platform.
+// and explicit fences (barriers) to control for architectural reordering
+// performed by the CPU(s) or platform.
 
 struct SharedGlobals {
     // These are highly shared mostly-read variables.
@@ -1596,7 +1599,55 @@
 }
 
 //------------------------------------------------------------------------------
-// Non-product code
+// Debugging code
+
+void ObjectSynchronizer::sanity_checks(const bool verbose,
+                                       const uint cache_line_size,
+                                       int *error_cnt_ptr,
+                                       int *warning_cnt_ptr) {
+  u_char *addr_begin      = (u_char*)&GVars;
+  u_char *addr_stwRandom  = (u_char*)&GVars.stwRandom;
+  u_char *addr_hcSequence = (u_char*)&GVars.hcSequence;
+
+  if (verbose) {
+    tty->print_cr("INFO: sizeof(SharedGlobals)=" SIZE_FORMAT,
+                  sizeof(SharedGlobals));
+  }
+
+  uint offset_stwRandom = (uint)(addr_stwRandom - addr_begin);
+  if (verbose) tty->print_cr("INFO: offset(stwRandom)=%u", offset_stwRandom);
+
+  uint offset_hcSequence = (uint)(addr_hcSequence - addr_begin);
+  if (verbose) {
+    tty->print_cr("INFO: offset(_hcSequence)=%u", offset_hcSequence);
+  }
+
+  if (cache_line_size != 0) {
+    // We were able to determine the L1 data cache line size so
+    // do some cache line specific sanity checks
+
+    if (offset_stwRandom < cache_line_size) {
+      tty->print_cr("WARNING: the SharedGlobals.stwRandom field is closer "
+                    "to the struct beginning than a cache line which permits "
+                    "false sharing.");
+      (*warning_cnt_ptr)++;
+    }
+
+    if ((offset_hcSequence - offset_stwRandom) < cache_line_size) {
+      tty->print_cr("WARNING: the SharedGlobals.stwRandom and "
+                    "SharedGlobals.hcSequence fields are closer than a cache "
+                    "line which permits false sharing.");
+      (*warning_cnt_ptr)++;
+    }
+
+    if ((sizeof(SharedGlobals) - offset_hcSequence) < cache_line_size) {
+      tty->print_cr("WARNING: the SharedGlobals.hcSequence field is closer "
+                    "to the struct end than a cache line which permits false "
+                    "sharing.");
+      (*warning_cnt_ptr)++;
+    }
+  }
+}
 
 #ifndef PRODUCT
 
--- a/hotspot/src/share/vm/runtime/synchronizer.hpp	Mon Jul 14 21:48:47 2014 +0000
+++ b/hotspot/src/share/vm/runtime/synchronizer.hpp	Tue Jul 15 07:33:49 2014 -0700
@@ -121,6 +121,9 @@
   static void oops_do(OopClosure* f);
 
   // debugging
+  static void sanity_checks(const bool verbose,
+                            const unsigned int cache_line_size,
+                            int *error_cnt_ptr, int *warning_cnt_ptr);
   static void verify() PRODUCT_RETURN;
   static int  verify_objmon_isinpool(ObjectMonitor *addr) PRODUCT_RETURN0;
 
--- a/hotspot/src/share/vm/runtime/vm_version.cpp	Mon Jul 14 21:48:47 2014 +0000
+++ b/hotspot/src/share/vm/runtime/vm_version.cpp	Tue Jul 15 07:33:49 2014 -0700
@@ -50,6 +50,7 @@
 bool Abstract_VM_Version::_supports_atomic_getadd4 = false;
 bool Abstract_VM_Version::_supports_atomic_getadd8 = false;
 unsigned int Abstract_VM_Version::_logical_processors_per_package = 1U;
+unsigned int Abstract_VM_Version::_L1_data_cache_line_size = 0;
 int Abstract_VM_Version::_reserve_for_allocation_prefetch = 0;
 
 #ifndef HOTSPOT_RELEASE_VERSION
--- a/hotspot/src/share/vm/runtime/vm_version.hpp	Mon Jul 14 21:48:47 2014 +0000
+++ b/hotspot/src/share/vm/runtime/vm_version.hpp	Tue Jul 15 07:33:49 2014 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -42,6 +42,7 @@
   static bool         _supports_atomic_getadd4;
   static bool         _supports_atomic_getadd8;
   static unsigned int _logical_processors_per_package;
+  static unsigned int _L1_data_cache_line_size;
   static int          _vm_major_version;
   static int          _vm_minor_version;
   static int          _vm_micro_version;
@@ -98,6 +99,10 @@
     return _logical_processors_per_package;
   }
 
+  static unsigned int L1_data_cache_line_size() {
+    return _L1_data_cache_line_size;
+  }
+
   // Need a space at the end of TLAB for prefetch instructions
   // which may fault when accessing memory outside of heap.
   static int reserve_for_allocation_prefetch() {