6779436: NUMA allocator: libnuma expects certain size of the buffer in numa_node_to_cpus()
authoriveresov
Wed, 03 Dec 2008 14:18:57 -0800
changeset 1615 b46d9f19bde2
parent 1610 5dddd195cc86
child 1616 884d2c56df7d
6779436: NUMA allocator: libnuma expects certain size of the buffer in numa_node_to_cpus() Summary: In os::Linux::rebuild_cpu_to_node_map() fix the size of the CPU bitmap. Fixed arithmetic in MutableNUMASpace::adaptive_chunk_size() that could cause overflows and underflows of the chunk_size variable. Reviewed-by: apetrusenko
hotspot/src/os/linux/vm/os_linux.cpp
hotspot/src/os/linux/vm/os_linux.hpp
hotspot/src/os/solaris/vm/os_solaris.cpp
hotspot/src/os/solaris/vm/os_solaris.hpp
hotspot/src/os/windows/vm/os_windows.cpp
hotspot/src/share/vm/gc_implementation/shared/mutableNUMASpace.cpp
hotspot/src/share/vm/runtime/globals.hpp
--- a/hotspot/src/os/linux/vm/os_linux.cpp	Mon Dec 01 23:25:24 2008 -0800
+++ b/hotspot/src/os/linux/vm/os_linux.cpp	Wed Dec 03 14:18:57 2008 -0800
@@ -2272,7 +2272,9 @@
   uncommit_memory(addr, bytes);
 }
 
-void os::numa_make_global(char *addr, size_t bytes)    { }
+void os::numa_make_global(char *addr, size_t bytes) {
+  Linux::numa_interleave_memory(addr, bytes);
+}
 
 void os::numa_make_local(char *addr, size_t bytes, int lgrp_hint) {
   Linux::numa_tonode_memory(addr, bytes, lgrp_hint);
@@ -2314,7 +2316,7 @@
 extern "C" void numa_warn(int number, char *where, ...) { }
 extern "C" void numa_error(char *where) { }
 
-void os::Linux::libnuma_init() {
+bool os::Linux::libnuma_init() {
   // sched_getcpu() should be in libc.
   set_sched_getcpu(CAST_TO_FN_PTR(sched_getcpu_func_t,
                                   dlsym(RTLD_DEFAULT, "sched_getcpu")));
@@ -2330,31 +2332,51 @@
                                         dlsym(handle, "numa_available")));
       set_numa_tonode_memory(CAST_TO_FN_PTR(numa_tonode_memory_func_t,
                                             dlsym(handle, "numa_tonode_memory")));
+      set_numa_interleave_memory(CAST_TO_FN_PTR(numa_interleave_memory_func_t,
+                                            dlsym(handle, "numa_interleave_memory")));
+
+
       if (numa_available() != -1) {
+        set_numa_all_nodes((unsigned long*)dlsym(handle, "numa_all_nodes"));
         // Create a cpu -> node mapping
         _cpu_to_node = new (ResourceObj::C_HEAP) GrowableArray<int>(0, true);
         rebuild_cpu_to_node_map();
+        return true;
       }
     }
   }
+  return false;
 }
 
 // rebuild_cpu_to_node_map() constructs a table mapping cpud id to node id.
 // The table is later used in get_node_by_cpu().
 void os::Linux::rebuild_cpu_to_node_map() {
-  int cpu_num = os::active_processor_count();
+  const size_t NCPUS = 32768; // Since the buffer size computation is very obscure
+                              // in libnuma (possible values are starting from 16,
+                              // and continuing up with every other power of 2, but less
+                              // than the maximum number of CPUs supported by kernel), and
+                              // is a subject to change (in libnuma version 2 the requirements
+                              // are more reasonable) we'll just hardcode the number they use
+                              // in the library.
+  const size_t BitsPerCLong = sizeof(long) * CHAR_BIT;
+
+  size_t cpu_num = os::active_processor_count();
+  size_t cpu_map_size = NCPUS / BitsPerCLong;
+  size_t cpu_map_valid_size =
+    MIN2((cpu_num + BitsPerCLong - 1) / BitsPerCLong, cpu_map_size);
+
   cpu_to_node()->clear();
   cpu_to_node()->at_grow(cpu_num - 1);
-  int node_num = numa_get_groups_num();
-  int cpu_map_size = (cpu_num + BitsPerLong - 1) / BitsPerLong;
+  size_t node_num = numa_get_groups_num();
+
   unsigned long *cpu_map = NEW_C_HEAP_ARRAY(unsigned long, cpu_map_size);
-  for (int i = 0; i < node_num; i++) {
+  for (size_t i = 0; i < node_num; i++) {
     if (numa_node_to_cpus(i, cpu_map, cpu_map_size * sizeof(unsigned long)) != -1) {
-      for (int j = 0; j < cpu_map_size; j++) {
+      for (size_t j = 0; j < cpu_map_valid_size; j++) {
         if (cpu_map[j] != 0) {
-          for (int k = 0; k < BitsPerLong; k++) {
+          for (size_t k = 0; k < BitsPerCLong; k++) {
             if (cpu_map[j] & (1UL << k)) {
-              cpu_to_node()->at_put(j * BitsPerLong + k, i);
+              cpu_to_node()->at_put(j * BitsPerCLong + k, i);
             }
           }
         }
@@ -2377,7 +2399,8 @@
 os::Linux::numa_max_node_func_t os::Linux::_numa_max_node;
 os::Linux::numa_available_func_t os::Linux::_numa_available;
 os::Linux::numa_tonode_memory_func_t os::Linux::_numa_tonode_memory;
-
+os::Linux::numa_interleave_memory_func_t os::Linux::_numa_interleave_memory;
+unsigned long* os::Linux::_numa_all_nodes;
 
 bool os::uncommit_memory(char* addr, size_t size) {
   return ::mmap(addr, size,
@@ -3695,7 +3718,17 @@
   }
 
   if (UseNUMA) {
-    Linux::libnuma_init();
+    if (!Linux::libnuma_init()) {
+      UseNUMA = false;
+    } else {
+      if ((Linux::numa_max_node() < 1)) {
+        // There's only one node(they start from 0), disable NUMA.
+        UseNUMA = false;
+      }
+    }
+    if (!UseNUMA && ForceNUMA) {
+      UseNUMA = true;
+    }
   }
 
   if (MaxFDLimit) {
--- a/hotspot/src/os/linux/vm/os_linux.hpp	Mon Dec 01 23:25:24 2008 -0800
+++ b/hotspot/src/os/linux/vm/os_linux.hpp	Wed Dec 03 14:18:57 2008 -0800
@@ -146,7 +146,7 @@
   static bool is_floating_stack()             { return _is_floating_stack; }
 
   static void libpthread_init();
-  static void libnuma_init();
+  static bool libnuma_init();
 
   // Minimum stack size a thread can be created with (allowing
   // the VM to completely create the thread and enter user code)
@@ -240,20 +240,23 @@
   typedef int (*numa_max_node_func_t)(void);
   typedef int (*numa_available_func_t)(void);
   typedef int (*numa_tonode_memory_func_t)(void *start, size_t size, int node);
-
+  typedef void (*numa_interleave_memory_func_t)(void *start, size_t size, unsigned long *nodemask);
 
   static sched_getcpu_func_t _sched_getcpu;
   static numa_node_to_cpus_func_t _numa_node_to_cpus;
   static numa_max_node_func_t _numa_max_node;
   static numa_available_func_t _numa_available;
   static numa_tonode_memory_func_t _numa_tonode_memory;
+  static numa_interleave_memory_func_t _numa_interleave_memory;
+  static unsigned long* _numa_all_nodes;
 
   static void set_sched_getcpu(sched_getcpu_func_t func) { _sched_getcpu = func; }
   static void set_numa_node_to_cpus(numa_node_to_cpus_func_t func) { _numa_node_to_cpus = func; }
   static void set_numa_max_node(numa_max_node_func_t func) { _numa_max_node = func; }
   static void set_numa_available(numa_available_func_t func) { _numa_available = func; }
   static void set_numa_tonode_memory(numa_tonode_memory_func_t func) { _numa_tonode_memory = func; }
-
+  static void set_numa_interleave_memory(numa_interleave_memory_func_t func) { _numa_interleave_memory = func; }
+  static void set_numa_all_nodes(unsigned long* ptr) { _numa_all_nodes = ptr; }
 public:
   static int sched_getcpu()  { return _sched_getcpu != NULL ? _sched_getcpu() : -1; }
   static int numa_node_to_cpus(int node, unsigned long *buffer, int bufferlen) {
@@ -264,6 +267,11 @@
   static int numa_tonode_memory(void *start, size_t size, int node) {
     return _numa_tonode_memory != NULL ? _numa_tonode_memory(start, size, node) : -1;
   }
+  static void numa_interleave_memory(void *start, size_t size) {
+    if (_numa_interleave_memory != NULL && _numa_all_nodes != NULL) {
+      _numa_interleave_memory(start, size, _numa_all_nodes);
+    }
+  }
   static int get_node_by_cpu(int cpu_id);
 };
 
--- a/hotspot/src/os/solaris/vm/os_solaris.cpp	Mon Dec 01 23:25:24 2008 -0800
+++ b/hotspot/src/os/solaris/vm/os_solaris.cpp	Wed Dec 03 14:18:57 2008 -0800
@@ -4638,7 +4638,7 @@
   }
 }
 
-void os::Solaris::liblgrp_init() {
+bool os::Solaris::liblgrp_init() {
   void *handle = dlopen("liblgrp.so.1", RTLD_LAZY);
   if (handle != NULL) {
     os::Solaris::set_lgrp_home(CAST_TO_FN_PTR(lgrp_home_func_t, dlsym(handle, "lgrp_home")));
@@ -4653,9 +4653,9 @@
 
     lgrp_cookie_t c = lgrp_init(LGRP_VIEW_CALLER);
     set_lgrp_cookie(c);
-  } else {
-    warning("your OS does not support NUMA");
-  }
+    return true;
+  }
+  return false;
 }
 
 void os::Solaris::misc_sym_init() {
@@ -4824,9 +4824,25 @@
         vm_page_size()));
 
   Solaris::libthread_init();
+
   if (UseNUMA) {
-    Solaris::liblgrp_init();
-  }
+    if (!Solaris::liblgrp_init()) {
+      UseNUMA = false;
+    } else {
+      size_t lgrp_limit = os::numa_get_groups_num();
+      int *lgrp_ids = NEW_C_HEAP_ARRAY(int, lgrp_limit);
+      size_t lgrp_num = os::numa_get_leaf_groups(lgrp_ids, lgrp_limit);
+      FREE_C_HEAP_ARRAY(int, lgrp_ids);
+      if (lgrp_num < 2) {
+        // There's only one locality group, disable NUMA.
+        UseNUMA = false;
+      }
+    }
+    if (!UseNUMA && ForceNUMA) {
+      UseNUMA = true;
+    }
+  }
+
   Solaris::misc_sym_init();
   Solaris::signal_sets_init();
   Solaris::init_signal_mem();
--- a/hotspot/src/os/solaris/vm/os_solaris.hpp	Mon Dec 01 23:25:24 2008 -0800
+++ b/hotspot/src/os/solaris/vm/os_solaris.hpp	Wed Dec 03 14:18:57 2008 -0800
@@ -176,7 +176,7 @@
  public:
   static void libthread_init();
   static void synchronization_init();
-  static void liblgrp_init();
+  static bool liblgrp_init();
   // Load miscellaneous symbols.
   static void misc_sym_init();
   // This boolean allows users to forward their own non-matching signals
--- a/hotspot/src/os/windows/vm/os_windows.cpp	Mon Dec 01 23:25:24 2008 -0800
+++ b/hotspot/src/os/windows/vm/os_windows.cpp	Wed Dec 03 14:18:57 2008 -0800
@@ -3353,6 +3353,10 @@
   // initialize thread priority policy
   prio_init();
 
+  if (UseNUMA && !ForceNUMA) {
+    UseNUMA = false; // Currently unsupported.
+  }
+
   return JNI_OK;
 }
 
--- a/hotspot/src/share/vm/gc_implementation/shared/mutableNUMASpace.cpp	Mon Dec 01 23:25:24 2008 -0800
+++ b/hotspot/src/share/vm/gc_implementation/shared/mutableNUMASpace.cpp	Wed Dec 03 14:18:57 2008 -0800
@@ -414,9 +414,20 @@
   if (limit > 0) {
     limit = round_down(limit, page_size());
     if (chunk_size > current_chunk_size(i)) {
-      chunk_size = MIN2((off_t)chunk_size, (off_t)current_chunk_size(i) + (off_t)limit);
+      size_t upper_bound = pages_available * page_size();
+      if (upper_bound > limit &&
+          current_chunk_size(i) < upper_bound - limit) {
+        // The resulting upper bound should not exceed the available
+        // amount of memory (pages_available * page_size()).
+        upper_bound = current_chunk_size(i) + limit;
+      }
+      chunk_size = MIN2(chunk_size, upper_bound);
     } else {
-      chunk_size = MAX2((off_t)chunk_size, (off_t)current_chunk_size(i) - (off_t)limit);
+      size_t lower_bound = page_size();
+      if (current_chunk_size(i) > limit) { // lower_bound shouldn't underflow.
+        lower_bound = current_chunk_size(i) - limit;
+      }
+      chunk_size = MAX2(chunk_size, lower_bound);
     }
   }
   assert(chunk_size <= pages_available * page_size(), "Chunk size out of range");
--- a/hotspot/src/share/vm/runtime/globals.hpp	Mon Dec 01 23:25:24 2008 -0800
+++ b/hotspot/src/share/vm/runtime/globals.hpp	Wed Dec 03 14:18:57 2008 -0800
@@ -342,6 +342,9 @@
   product(bool, UseNUMA, false,                                             \
           "Use NUMA if available")                                          \
                                                                             \
+  product(bool, ForceNUMA, false,                                           \
+          "Force NUMA optimizations on single-node/UMA systems")            \
+                                                                            \
   product(intx, NUMAChunkResizeWeight, 20,                                  \
           "Percentage (0-100) used to weight the current sample when "      \
           "computing exponentially decaying average for "                   \