7082969: NUMA interleaving
authoriveresov
Thu, 25 Aug 2011 02:57:46 -0700
changeset 10494 3f347ed8bd3c
parent 10284 17533dd1b807
child 10495 d20531ba2b31
7082969: NUMA interleaving Summary: Support interleaving on NUMA systems for collectors that don't have NUMA-awareness. Reviewed-by: iveresov, ysr Contributed-by: Tom Deneau <tom.deneau@amd.com>
hotspot/src/os/linux/vm/os_linux.cpp
hotspot/src/os/solaris/vm/os_solaris.cpp
hotspot/src/os/windows/vm/os_windows.cpp
hotspot/src/os/windows/vm/os_windows.hpp
hotspot/src/share/vm/runtime/arguments.cpp
hotspot/src/share/vm/runtime/globals.hpp
--- a/hotspot/src/os/linux/vm/os_linux.cpp	Tue Aug 23 11:06:30 2011 +0200
+++ b/hotspot/src/os/linux/vm/os_linux.cpp	Thu Aug 25 02:57:46 2011 -0700
@@ -2502,7 +2502,13 @@
   int prot = exec ? PROT_READ|PROT_WRITE|PROT_EXEC : PROT_READ|PROT_WRITE;
   uintptr_t res = (uintptr_t) ::mmap(addr, size, prot,
                                    MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0);
-  return res != (uintptr_t) MAP_FAILED;
+  if (res != (uintptr_t) MAP_FAILED) {
+    if (UseNUMAInterleaving) {
+      numa_make_global(addr, size);
+    }
+    return true;
+  }
+  return false;
 }
 
 // Define MAP_HUGETLB here so we can build HotSpot on old systems.
@@ -2523,7 +2529,13 @@
       (uintptr_t) ::mmap(addr, size, prot,
                          MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS|MAP_HUGETLB,
                          -1, 0);
-    return res != (uintptr_t) MAP_FAILED;
+    if (res != (uintptr_t) MAP_FAILED) {
+      if (UseNUMAInterleaving) {
+        numa_make_global(addr, size);
+      }
+      return true;
+    }
+    return false;
   }
 
   return commit_memory(addr, size, exec);
@@ -3115,6 +3127,10 @@
      return NULL;
   }
 
+  if ((addr != NULL) && UseNUMAInterleaving) {
+    numa_make_global(addr, bytes);
+  }
+
   return addr;
 }
 
--- a/hotspot/src/os/solaris/vm/os_solaris.cpp	Tue Aug 23 11:06:30 2011 +0200
+++ b/hotspot/src/os/solaris/vm/os_solaris.cpp	Thu Aug 25 02:57:46 2011 -0700
@@ -2777,8 +2777,14 @@
 bool os::commit_memory(char* addr, size_t bytes, bool exec) {
   int prot = exec ? PROT_READ|PROT_WRITE|PROT_EXEC : PROT_READ|PROT_WRITE;
   size_t size = bytes;
-  return
-     NULL != Solaris::mmap_chunk(addr, size, MAP_PRIVATE|MAP_FIXED, prot);
+  char *res = Solaris::mmap_chunk(addr, size, MAP_PRIVATE|MAP_FIXED, prot);
+  if (res != NULL) {
+    if (UseNUMAInterleaving) {
+      numa_make_global(addr, bytes);
+    }
+    return true;
+  }
+  return false;
 }
 
 bool os::commit_memory(char* addr, size_t bytes, size_t alignment_hint,
@@ -3389,12 +3395,11 @@
   return true;
 }
 
-char* os::reserve_memory_special(size_t bytes, char* addr, bool exec) {
+char* os::reserve_memory_special(size_t size, char* addr, bool exec) {
   // "exec" is passed in but not used.  Creating the shared image for
   // the code cache doesn't have an SHM_X executable permission to check.
   assert(UseLargePages && UseISM, "only for ISM large pages");
 
-  size_t size = bytes;
   char* retAddr = NULL;
   int shmid;
   key_t ismKey;
@@ -3436,7 +3441,9 @@
     }
     return NULL;
   }
-
+  if ((retAddr != NULL) && UseNUMAInterleaving) {
+    numa_make_global(retAddr, size);
+  }
   return retAddr;
 }
 
--- a/hotspot/src/os/windows/vm/os_windows.cpp	Tue Aug 23 11:06:30 2011 +0200
+++ b/hotspot/src/os/windows/vm/os_windows.cpp	Thu Aug 25 02:57:46 2011 -0700
@@ -2614,6 +2614,57 @@
 static HANDLE    _hProcess;
 static HANDLE    _hToken;
 
+// Container for NUMA node list info
+class NUMANodeListHolder {
+private:
+  int *_numa_used_node_list;  // allocated below
+  int _numa_used_node_count;
+
+  void free_node_list() {
+    if (_numa_used_node_list != NULL) {
+      FREE_C_HEAP_ARRAY(int, _numa_used_node_list);
+    }
+  }
+
+public:
+  NUMANodeListHolder() {
+    _numa_used_node_count = 0;
+    _numa_used_node_list = NULL;
+    // do rest of initialization in build routine (after function pointers are set up)
+  }
+
+  ~NUMANodeListHolder() {
+    free_node_list();
+  }
+
+  bool build() {
+    DWORD_PTR proc_aff_mask;
+    DWORD_PTR sys_aff_mask;
+    if (!GetProcessAffinityMask(GetCurrentProcess(), &proc_aff_mask, &sys_aff_mask)) return false;
+    ULONG highest_node_number;
+    if (!os::Kernel32Dll::GetNumaHighestNodeNumber(&highest_node_number)) return false;
+    free_node_list();
+    _numa_used_node_list = NEW_C_HEAP_ARRAY(int, highest_node_number);
+    for (unsigned int i = 0; i <= highest_node_number; i++) {
+      ULONGLONG proc_mask_numa_node;
+      if (!os::Kernel32Dll::GetNumaNodeProcessorMask(i, &proc_mask_numa_node)) return false;
+      if ((proc_aff_mask & proc_mask_numa_node)!=0) {
+        _numa_used_node_list[_numa_used_node_count++] = i;
+      }
+    }
+    return (_numa_used_node_count > 1);
+  }
+
+  int get_count() {return _numa_used_node_count;}
+  int get_node_list_entry(int n) {
+    // for indexes out of range, returns -1
+    return (n < _numa_used_node_count ? _numa_used_node_list[n] : -1);
+  }
+
+} numa_node_list_holder;
+
+
+
 static size_t _large_page_size = 0;
 
 static bool resolve_functions_for_large_page_init() {
@@ -2653,6 +2704,154 @@
   _hToken = NULL;
 }
 
+static bool numa_interleaving_init() {
+  bool success = false;
+  bool use_numa_specified = !FLAG_IS_DEFAULT(UseNUMA);
+  bool use_numa_interleaving_specified = !FLAG_IS_DEFAULT(UseNUMAInterleaving);
+
+  // print a warning if UseNUMA or UseNUMAInterleaving flag is specified on command line
+  bool warn_on_failure =  use_numa_specified || use_numa_interleaving_specified;
+# define WARN(msg) if (warn_on_failure) { warning(msg); }
+
+  // NUMAInterleaveGranularity cannot be less than vm_allocation_granularity (or _large_page_size if using large pages)
+  size_t min_interleave_granularity = UseLargePages ? _large_page_size : os::vm_allocation_granularity();
+  NUMAInterleaveGranularity = align_size_up(NUMAInterleaveGranularity, min_interleave_granularity);
+
+  if (os::Kernel32Dll::NumaCallsAvailable()) {
+    if (numa_node_list_holder.build()) {
+      if (PrintMiscellaneous && Verbose) {
+        tty->print("NUMA UsedNodeCount=%d, namely ", os::numa_get_groups_num());
+        for (int i = 0; i < numa_node_list_holder.get_count(); i++) {
+          tty->print("%d ", numa_node_list_holder.get_node_list_entry(i));
+        }
+        tty->print("\n");
+      }
+      success = true;
+    } else {
+      WARN("Process does not cover multiple NUMA nodes.");
+    }
+  } else {
+    WARN("NUMA Interleaving is not supported by the operating system.");
+  }
+  if (!success) {
+    if (use_numa_specified) WARN("...Ignoring UseNUMA flag.");
+    if (use_numa_interleaving_specified) WARN("...Ignoring UseNUMAInterleaving flag.");
+  }
+  return success;
+#undef WARN
+}
+
+// this routine is used whenever we need to reserve a contiguous VA range
+// but we need to make separate VirtualAlloc calls for each piece of the range
+// Reasons for doing this:
+//  * UseLargePagesIndividualAllocation was set (normally only needed on WS2003 but possible to be set otherwise)
+//  * UseNUMAInterleaving requires a separate node for each piece
+static char* allocate_pages_individually(size_t bytes, char* addr, DWORD flags, DWORD prot,
+                                         bool should_inject_error=false) {
+  char * p_buf;
+  // note: at setup time we guaranteed that NUMAInterleaveGranularity was aligned up to a page size
+  size_t page_size = UseLargePages ? _large_page_size : os::vm_allocation_granularity();
+  size_t chunk_size = UseNUMAInterleaving ? NUMAInterleaveGranularity : page_size;
+
+  // first reserve enough address space in advance since we want to be
+  // able to break a single contiguous virtual address range into multiple
+  // large page commits but WS2003 does not allow reserving large page space
+  // so we just use 4K pages for reserve, this gives us a legal contiguous
+  // address space. then we will deallocate that reservation, and re alloc
+  // using large pages
+  const size_t size_of_reserve = bytes + chunk_size;
+  if (bytes > size_of_reserve) {
+    // Overflowed.
+    return NULL;
+  }
+  p_buf = (char *) VirtualAlloc(addr,
+                                size_of_reserve,  // size of Reserve
+                                MEM_RESERVE,
+                                PAGE_READWRITE);
+  // If reservation failed, return NULL
+  if (p_buf == NULL) return NULL;
+
+  os::release_memory(p_buf, bytes + chunk_size);
+
+  // we still need to round up to a page boundary (in case we are using large pages)
+  // but not to a chunk boundary (in case InterleavingGranularity doesn't align with page size)
+  // instead we handle this in the bytes_to_rq computation below
+  p_buf = (char *) align_size_up((size_t)p_buf, page_size);
+
+  // now go through and allocate one chunk at a time until all bytes are
+  // allocated
+  size_t  bytes_remaining = bytes;
+  // An overflow of align_size_up() would have been caught above
+  // in the calculation of size_of_reserve.
+  char * next_alloc_addr = p_buf;
+  HANDLE hProc = GetCurrentProcess();
+
+#ifdef ASSERT
+  // Variable for the failure injection
+  long ran_num = os::random();
+  size_t fail_after = ran_num % bytes;
+#endif
+
+  int count=0;
+  while (bytes_remaining) {
+    // select bytes_to_rq to get to the next chunk_size boundary
+
+    size_t bytes_to_rq = MIN2(bytes_remaining, chunk_size - ((size_t)next_alloc_addr % chunk_size));
+    // Note allocate and commit
+    char * p_new;
+
+#ifdef ASSERT
+    bool inject_error_now = should_inject_error && (bytes_remaining <= fail_after);
+#else
+    const bool inject_error_now = false;
+#endif
+
+    if (inject_error_now) {
+      p_new = NULL;
+    } else {
+      if (!UseNUMAInterleaving) {
+        p_new = (char *) VirtualAlloc(next_alloc_addr,
+                                      bytes_to_rq,
+                                      flags,
+                                      prot);
+      } else {
+        // get the next node to use from the used_node_list
+        DWORD node = numa_node_list_holder.get_node_list_entry(count % os::numa_get_groups_num());
+        p_new = (char *)os::Kernel32Dll::VirtualAllocExNuma(hProc,
+                                                            next_alloc_addr,
+                                                            bytes_to_rq,
+                                                            flags,
+                                                            prot,
+                                                            node);
+      }
+    }
+
+    if (p_new == NULL) {
+      // Free any allocated pages
+      if (next_alloc_addr > p_buf) {
+        // Some memory was committed so release it.
+        size_t bytes_to_release = bytes - bytes_remaining;
+        os::release_memory(p_buf, bytes_to_release);
+      }
+#ifdef ASSERT
+      if (should_inject_error) {
+        if (TracePageSizes && Verbose) {
+          tty->print_cr("Reserving pages individually failed.");
+        }
+      }
+#endif
+      return NULL;
+    }
+    bytes_remaining -= bytes_to_rq;
+    next_alloc_addr += bytes_to_rq;
+    count++;
+  }
+  // made it this far, success
+  return p_buf;
+}
+
+
+
 void os::large_page_init() {
   if (!UseLargePages) return;
 
@@ -2722,9 +2921,30 @@
   assert((size_t)addr % os::vm_allocation_granularity() == 0,
          "reserve alignment");
   assert(bytes % os::vm_allocation_granularity() == 0, "reserve block size");
-  char* res = (char*)VirtualAlloc(addr, bytes, MEM_RESERVE, PAGE_READWRITE);
+  char* res;
+  // note that if UseLargePages is on, all the areas that require interleaving
+  // will go thru reserve_memory_special rather than thru here.
+  bool use_individual = (UseNUMAInterleaving && !UseLargePages);
+  if (!use_individual) {
+    res = (char*)VirtualAlloc(addr, bytes, MEM_RESERVE, PAGE_READWRITE);
+  } else {
+    elapsedTimer reserveTimer;
+    if( Verbose && PrintMiscellaneous ) reserveTimer.start();
+    // in numa interleaving, we have to allocate pages individually
+    // (well really chunks of NUMAInterleaveGranularity size)
+    res = allocate_pages_individually(bytes, addr, MEM_RESERVE, PAGE_READWRITE);
+    if (res == NULL) {
+      warning("NUMA page allocation failed");
+    }
+    if( Verbose && PrintMiscellaneous ) {
+      reserveTimer.stop();
+      tty->print_cr("reserve_memory of %Ix bytes took %ld ms (%ld ticks)", bytes,
+                    reserveTimer.milliseconds(), reserveTimer.ticks());
+    }
+  }
   assert(res == NULL || addr == NULL || addr == res,
          "Unexpected address from reserve.");
+
   return res;
 }
 
@@ -2754,92 +2974,27 @@
 char* os::reserve_memory_special(size_t bytes, char* addr, bool exec) {
 
   const DWORD prot = exec ? PAGE_EXECUTE_READWRITE : PAGE_READWRITE;
-
-  if (UseLargePagesIndividualAllocation) {
+  const DWORD flags = MEM_RESERVE | MEM_COMMIT | MEM_LARGE_PAGES;
+
+  // with large pages, there are two cases where we need to use Individual Allocation
+  // 1) the UseLargePagesIndividualAllocation flag is set (set by default on WS2003)
+  // 2) NUMA Interleaving is enabled, in which case we use a different node for each page
+  if (UseLargePagesIndividualAllocation || UseNUMAInterleaving) {
     if (TracePageSizes && Verbose) {
        tty->print_cr("Reserving large pages individually.");
     }
-    char * p_buf;
-    // first reserve enough address space in advance since we want to be
-    // able to break a single contiguous virtual address range into multiple
-    // large page commits but WS2003 does not allow reserving large page space
-    // so we just use 4K pages for reserve, this gives us a legal contiguous
-    // address space. then we will deallocate that reservation, and re alloc
-    // using large pages
-    const size_t size_of_reserve = bytes + _large_page_size;
-    if (bytes > size_of_reserve) {
-      // Overflowed.
-      warning("Individually allocated large pages failed, "
-        "use -XX:-UseLargePagesIndividualAllocation to turn off");
+    char * p_buf = allocate_pages_individually(bytes, addr, flags, prot, LargePagesIndividualAllocationInjectError);
+    if (p_buf == NULL) {
+      // give an appropriate warning message
+      if (UseNUMAInterleaving) {
+        warning("NUMA large page allocation failed, UseLargePages flag ignored");
+      }
+      if (UseLargePagesIndividualAllocation) {
+        warning("Individually allocated large pages failed, "
+                "use -XX:-UseLargePagesIndividualAllocation to turn off");
+      }
       return NULL;
     }
-    p_buf = (char *) VirtualAlloc(addr,
-                                 size_of_reserve,  // size of Reserve
-                                 MEM_RESERVE,
-                                 PAGE_READWRITE);
-    // If reservation failed, return NULL
-    if (p_buf == NULL) return NULL;
-
-    release_memory(p_buf, bytes + _large_page_size);
-    // round up to page boundary.  If the size_of_reserve did not
-    // overflow and the reservation did not fail, this align up
-    // should not overflow.
-    p_buf = (char *) align_size_up((size_t)p_buf, _large_page_size);
-
-    // now go through and allocate one page at a time until all bytes are
-    // allocated
-    size_t  bytes_remaining = align_size_up(bytes, _large_page_size);
-    // An overflow of align_size_up() would have been caught above
-    // in the calculation of size_of_reserve.
-    char * next_alloc_addr = p_buf;
-
-#ifdef ASSERT
-    // Variable for the failure injection
-    long ran_num = os::random();
-    size_t fail_after = ran_num % bytes;
-#endif
-
-    while (bytes_remaining) {
-      size_t bytes_to_rq = MIN2(bytes_remaining, _large_page_size);
-      // Note allocate and commit
-      char * p_new;
-
-#ifdef ASSERT
-      bool inject_error = LargePagesIndividualAllocationInjectError &&
-          (bytes_remaining <= fail_after);
-#else
-      const bool inject_error = false;
-#endif
-
-      if (inject_error) {
-        p_new = NULL;
-      } else {
-        p_new = (char *) VirtualAlloc(next_alloc_addr,
-                                    bytes_to_rq,
-                                    MEM_RESERVE | MEM_COMMIT | MEM_LARGE_PAGES,
-                                    prot);
-      }
-
-      if (p_new == NULL) {
-        // Free any allocated pages
-        if (next_alloc_addr > p_buf) {
-          // Some memory was committed so release it.
-          size_t bytes_to_release = bytes - bytes_remaining;
-          release_memory(p_buf, bytes_to_release);
-        }
-#ifdef ASSERT
-        if (UseLargePagesIndividualAllocation &&
-            LargePagesIndividualAllocationInjectError) {
-          if (TracePageSizes && Verbose) {
-             tty->print_cr("Reserving large pages individually failed.");
-          }
-        }
-#endif
-        return NULL;
-      }
-      bytes_remaining -= bytes_to_rq;
-      next_alloc_addr += bytes_to_rq;
-    }
 
     return p_buf;
 
@@ -2867,14 +3022,43 @@
   assert(bytes % os::vm_page_size() == 0, "commit in page-sized chunks");
   // Don't attempt to print anything if the OS call fails. We're
   // probably low on resources, so the print itself may cause crashes.
-  bool result = VirtualAlloc(addr, bytes, MEM_COMMIT, PAGE_READWRITE) != 0;
-  if (result != NULL && exec) {
-    DWORD oldprot;
-    // Windows doc says to use VirtualProtect to get execute permissions
-    return VirtualProtect(addr, bytes, PAGE_EXECUTE_READWRITE, &oldprot) != 0;
+
+  // unless we have NUMAInterleaving enabled, the range of a commit
+  // is always within a reserve covered by a single VirtualAlloc
+  // in that case we can just do a single commit for the requested size
+  if (!UseNUMAInterleaving) {
+    if (VirtualAlloc(addr, bytes, MEM_COMMIT, PAGE_READWRITE) == NULL) return false;
+    if (exec) {
+      DWORD oldprot;
+      // Windows doc says to use VirtualProtect to get execute permissions
+      if (!VirtualProtect(addr, bytes, PAGE_EXECUTE_READWRITE, &oldprot)) return false;
+    }
+    return true;
   } else {
-    return result;
-  }
+
+    // when NUMAInterleaving is enabled, the commit might cover a range that
+    // came from multiple VirtualAlloc reserves (using allocate_pages_individually).
+    // VirtualQuery can help us determine that.  The RegionSize that VirtualQuery
+    // returns represents the number of bytes that can be committed in one step.
+    size_t bytes_remaining = bytes;
+    char * next_alloc_addr = addr;
+    while (bytes_remaining > 0) {
+      MEMORY_BASIC_INFORMATION alloc_info;
+      VirtualQuery(next_alloc_addr, &alloc_info, sizeof(alloc_info));
+      size_t bytes_to_rq = MIN2(bytes_remaining, (size_t)alloc_info.RegionSize);
+      if (VirtualAlloc(next_alloc_addr, bytes_to_rq, MEM_COMMIT, PAGE_READWRITE) == NULL)
+        return false;
+      if (exec) {
+        DWORD oldprot;
+        if (!VirtualProtect(next_alloc_addr, bytes_to_rq, PAGE_EXECUTE_READWRITE, &oldprot))
+          return false;
+      }
+      bytes_remaining -= bytes_to_rq;
+      next_alloc_addr += bytes_to_rq;
+    }
+  }
+  // if we made it this far, return true
+  return true;
 }
 
 bool os::commit_memory(char* addr, size_t size, size_t alignment_hint,
@@ -2948,14 +3132,15 @@
 void os::numa_make_global(char *addr, size_t bytes)    { }
 void os::numa_make_local(char *addr, size_t bytes, int lgrp_hint)    { }
 bool os::numa_topology_changed()                       { return false; }
-size_t os::numa_get_groups_num()                       { return 1; }
+size_t os::numa_get_groups_num()                       { return numa_node_list_holder.get_count(); }
 int os::numa_get_group_id()                            { return 0; }
 size_t os::numa_get_leaf_groups(int *ids, size_t size) {
-  if (size > 0) {
-    ids[0] = 0;
-    return 1;
-  }
-  return 0;
+  // check for size bigger than actual groups_num
+  size = MIN2(size, numa_get_groups_num());
+  for (int i = 0; i < (int)size; i++) {
+    ids[i] = numa_node_list_holder.get_node_list_entry(i);
+  }
+  return size;
 }
 
 bool os::get_page_info(char *start, page_info* info) {
@@ -3480,7 +3665,7 @@
     if(Verbose && PrintMiscellaneous)
       tty->print("[Memory Serialize  Page address: " INTPTR_FORMAT "]\n", (intptr_t)mem_serialize_page);
 #endif
-}
+  }
 
   os::large_page_init();
 
@@ -3583,8 +3768,10 @@
   // initialize thread priority policy
   prio_init();
 
-  if (UseNUMA && !ForceNUMA) {
-    UseNUMA = false; // Currently unsupported.
+  if (UseNUMAInterleaving) {
+    // first check whether this Windows OS supports VirtualAllocExNuma, if not ignore this flag
+    bool success = numa_interleaving_init();
+    if (!success) UseNUMAInterleaving = false;
   }
 
   return JNI_OK;
@@ -4758,7 +4945,14 @@
 
 // Kernel32 API
 typedef SIZE_T (WINAPI* GetLargePageMinimum_Fn)(void);
+typedef LPVOID (WINAPI *VirtualAllocExNuma_Fn) (HANDLE, LPVOID, SIZE_T, DWORD, DWORD, DWORD);
+typedef BOOL (WINAPI *GetNumaHighestNodeNumber_Fn) (PULONG);
+typedef BOOL (WINAPI *GetNumaNodeProcessorMask_Fn) (UCHAR, PULONGLONG);
+
 GetLargePageMinimum_Fn      os::Kernel32Dll::_GetLargePageMinimum = NULL;
+VirtualAllocExNuma_Fn       os::Kernel32Dll::_VirtualAllocExNuma = NULL;
+GetNumaHighestNodeNumber_Fn os::Kernel32Dll::_GetNumaHighestNodeNumber = NULL;
+GetNumaNodeProcessorMask_Fn os::Kernel32Dll::_GetNumaNodeProcessorMask = NULL;
 BOOL                        os::Kernel32Dll::initialized = FALSE;
 SIZE_T os::Kernel32Dll::GetLargePageMinimum() {
   assert(initialized && _GetLargePageMinimum != NULL,
@@ -4773,16 +4967,53 @@
   return _GetLargePageMinimum != NULL;
 }
 
+BOOL os::Kernel32Dll::NumaCallsAvailable() {
+  if (!initialized) {
+    initialize();
+  }
+  return _VirtualAllocExNuma != NULL;
+}
+
+LPVOID os::Kernel32Dll::VirtualAllocExNuma(HANDLE hProc, LPVOID addr, SIZE_T bytes, DWORD flags, DWORD prot, DWORD node) {
+  assert(initialized && _VirtualAllocExNuma != NULL,
+    "NUMACallsAvailable() not yet called");
+
+  return _VirtualAllocExNuma(hProc, addr, bytes, flags, prot, node);
+}
+
+BOOL os::Kernel32Dll::GetNumaHighestNodeNumber(PULONG ptr_highest_node_number) {
+  assert(initialized && _GetNumaHighestNodeNumber != NULL,
+    "NUMACallsAvailable() not yet called");
+
+  return _GetNumaHighestNodeNumber(ptr_highest_node_number);
+}
+
+BOOL os::Kernel32Dll::GetNumaNodeProcessorMask(UCHAR node, PULONGLONG proc_mask) {
+  assert(initialized && _GetNumaNodeProcessorMask != NULL,
+    "NUMACallsAvailable() not yet called");
+
+  return _GetNumaNodeProcessorMask(node, proc_mask);
+}
+
+
+void os::Kernel32Dll::initializeCommon() {
+  if (!initialized) {
+    HMODULE handle = ::GetModuleHandle("Kernel32.dll");
+    assert(handle != NULL, "Just check");
+    _GetLargePageMinimum = (GetLargePageMinimum_Fn)::GetProcAddress(handle, "GetLargePageMinimum");
+    _VirtualAllocExNuma = (VirtualAllocExNuma_Fn)::GetProcAddress(handle, "VirtualAllocExNuma");
+    _GetNumaHighestNodeNumber = (GetNumaHighestNodeNumber_Fn)::GetProcAddress(handle, "GetNumaHighestNodeNumber");
+    _GetNumaNodeProcessorMask = (GetNumaNodeProcessorMask_Fn)::GetProcAddress(handle, "GetNumaNodeProcessorMask");
+    initialized = TRUE;
+  }
+}
+
+
 
 #ifndef JDK6_OR_EARLIER
 
 void os::Kernel32Dll::initialize() {
-  if (!initialized) {
-    HMODULE handle = ::GetModuleHandle("Kernel32.dll");
-    assert(handle != NULL, "Just check");
-    _GetLargePageMinimum = (GetLargePageMinimum_Fn)::GetProcAddress(handle, "GetLargePageMinimum");
-    initialized = TRUE;
-  }
+  initializeCommon();
 }
 
 
@@ -4887,18 +5118,19 @@
 Module32Next_Fn             os::Kernel32Dll::_Module32Next = NULL;
 GetNativeSystemInfo_Fn      os::Kernel32Dll::_GetNativeSystemInfo = NULL;
 
+
 void os::Kernel32Dll::initialize() {
   if (!initialized) {
     HMODULE handle = ::GetModuleHandle("Kernel32.dll");
     assert(handle != NULL, "Just check");
 
     _SwitchToThread = (SwitchToThread_Fn)::GetProcAddress(handle, "SwitchToThread");
-    _GetLargePageMinimum = (GetLargePageMinimum_Fn)::GetProcAddress(handle, "GetLargePageMinimum");
     _CreateToolhelp32Snapshot = (CreateToolhelp32Snapshot_Fn)
       ::GetProcAddress(handle, "CreateToolhelp32Snapshot");
     _Module32First = (Module32First_Fn)::GetProcAddress(handle, "Module32First");
     _Module32Next = (Module32Next_Fn)::GetProcAddress(handle, "Module32Next");
     _GetNativeSystemInfo = (GetNativeSystemInfo_Fn)::GetProcAddress(handle, "GetNativeSystemInfo");
+    initializeCommon();  // resolve the functions that always need resolving
 
     initialized = TRUE;
   }
@@ -4964,6 +5196,8 @@
   _GetNativeSystemInfo(lpSystemInfo);
 }
 
+
+
 // PSAPI API
 
 
--- a/hotspot/src/os/windows/vm/os_windows.hpp	Tue Aug 23 11:06:30 2011 +0200
+++ b/hotspot/src/os/windows/vm/os_windows.hpp	Thu Aug 25 02:57:46 2011 -0700
@@ -173,13 +173,25 @@
   static BOOL GetNativeSystemInfoAvailable();
   static void GetNativeSystemInfo(LPSYSTEM_INFO);
 
+  // NUMA calls
+  static BOOL NumaCallsAvailable();
+  static LPVOID VirtualAllocExNuma(HANDLE, LPVOID, SIZE_T, DWORD, DWORD, DWORD);
+  static BOOL GetNumaHighestNodeNumber(PULONG);
+  static BOOL GetNumaNodeProcessorMask(UCHAR, PULONGLONG);
+
 private:
   // GetLargePageMinimum available on Windows Vista/Windows Server 2003
   // and later
+  // NUMA calls available Windows Vista/WS2008 and later
+
   static SIZE_T (WINAPI *_GetLargePageMinimum)(void);
+  static LPVOID (WINAPI *_VirtualAllocExNuma) (HANDLE, LPVOID, SIZE_T, DWORD, DWORD, DWORD);
+  static BOOL (WINAPI *_GetNumaHighestNodeNumber) (PULONG);
+  static BOOL (WINAPI *_GetNumaNodeProcessorMask) (UCHAR, PULONGLONG);
   static BOOL initialized;
 
   static void initialize();
+  static void initializeCommon();
 
 #ifdef JDK6_OR_EARLIER
 private:
--- a/hotspot/src/share/vm/runtime/arguments.cpp	Tue Aug 23 11:06:30 2011 +0200
+++ b/hotspot/src/share/vm/runtime/arguments.cpp	Thu Aug 25 02:57:46 2011 -0700
@@ -1423,6 +1423,9 @@
     if (FLAG_IS_DEFAULT(MinHeapDeltaBytes)) {
       FLAG_SET_DEFAULT(MinHeapDeltaBytes, 64*M);
     }
+    // For those collectors or operating systems (eg, Windows) that do
+    // not support full UseNUMA, we will map to UseNUMAInterleaving for now
+    UseNUMAInterleaving = true;
   }
 }
 
--- a/hotspot/src/share/vm/runtime/globals.hpp	Tue Aug 23 11:06:30 2011 +0200
+++ b/hotspot/src/share/vm/runtime/globals.hpp	Thu Aug 25 02:57:46 2011 -0700
@@ -475,6 +475,12 @@
   product(bool, UseNUMA, false,                                             \
           "Use NUMA if available")                                          \
                                                                             \
+  product(bool, UseNUMAInterleaving, false,                                 \
+          "Interleave memory across NUMA nodes if available")               \
+                                                                            \
+  product(uintx, NUMAInterleaveGranularity, 2*M,                            \
+          "Granularity to use for NUMA interleaving on Windows OS")         \
+                                                                            \
   product(bool, ForceNUMA, false,                                           \
           "Force NUMA optimizations on single-node/UMA systems")            \
                                                                             \