--- a/hotspot/src/os/windows/vm/os_windows.cpp Tue Aug 23 11:06:30 2011 +0200
+++ b/hotspot/src/os/windows/vm/os_windows.cpp Thu Aug 25 02:57:46 2011 -0700
@@ -2614,6 +2614,57 @@
static HANDLE _hProcess;
static HANDLE _hToken;
+// Container for NUMA node list info
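+// Queries which NUMA nodes the current process's affinity mask spans and
+// remembers their node numbers; allocate_pages_individually() below uses
+// this list to round-robin VirtualAllocExNuma calls across those nodes.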
+class NUMANodeListHolder {
+private:
+ int *_numa_used_node_list; // set to NULL in the constructor, allocated in build()
+ int _numa_used_node_count;
+
+ void free_node_list() {
+ if (_numa_used_node_list != NULL) {
+ FREE_C_HEAP_ARRAY(int, _numa_used_node_list);
+ }
+ }
+
+public:
+ NUMANodeListHolder() {
+ _numa_used_node_count = 0;
+ _numa_used_node_list = NULL;
+ // do rest of initialization in build routine (after function pointers are set up)
+ }
+
+ ~NUMANodeListHolder() {
+ free_node_list();
+ }
+
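+ // Interrogates the OS for the NUMA nodes whose processor masks intersect
+ // the process affinity mask and records their node numbers.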
+ bool build() {
+ DWORD_PTR proc_aff_mask;
+ DWORD_PTR sys_aff_mask;
+ if (!GetProcessAffinityMask(GetCurrentProcess(), &proc_aff_mask, &sys_aff_mask)) return false;
+ ULONG highest_node_number;
+ if (!os::Kernel32Dll::GetNumaHighestNodeNumber(&highest_node_number)) return false;
+ free_node_list();
+ // node numbers are 0-based, so the list needs highest_node_number + 1 entries
+ _numa_used_node_list = NEW_C_HEAP_ARRAY(int, highest_node_number + 1);
+ for (unsigned int i = 0; i <= highest_node_number; i++) {
+ ULONGLONG proc_mask_numa_node;
+ if (!os::Kernel32Dll::GetNumaNodeProcessorMask(i, &proc_mask_numa_node)) return false;
+ if ((proc_aff_mask & proc_mask_numa_node) != 0) {
+ _numa_used_node_list[_numa_used_node_count++] = i;
+ }
+ }
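+ // interleaving only pays off if the process actually spans more than one node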
+ return (_numa_used_node_count > 1);
+ }
+
+ int get_count() { return _numa_used_node_count; }
+ int get_node_list_entry(int n) {
+ // for indexes out of range, returns -1
+ return (0 <= n && n < _numa_used_node_count ? _numa_used_node_list[n] : -1);
+ }
+
+} numa_node_list_holder;
+
+
+
static size_t _large_page_size = 0;
static bool resolve_functions_for_large_page_init() {
@@ -2653,6 +2704,154 @@
_hToken = NULL;
}
+static bool numa_interleaving_init() {
+ bool success = false;
+ bool use_numa_specified = !FLAG_IS_DEFAULT(UseNUMA);
+ bool use_numa_interleaving_specified = !FLAG_IS_DEFAULT(UseNUMAInterleaving);
+
+ // warn on failure only if the UseNUMA or UseNUMAInterleaving flag was explicitly specified on the command line
+ bool warn_on_failure = use_numa_specified || use_numa_interleaving_specified;
+# define WARN(msg) if (warn_on_failure) { warning(msg); }
+
+ // NUMAInterleaveGranularity cannot be less than vm_allocation_granularity (or _large_page_size if using large pages)
+ size_t min_interleave_granularity = UseLargePages ? _large_page_size : os::vm_allocation_granularity();
+ NUMAInterleaveGranularity = align_size_up(NUMAInterleaveGranularity, min_interleave_granularity);
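+ // e.g. with the usual 64K Windows allocation granularity, a requested value
+ // of 1000 is rounded up to 64K; with large pages it is rounded up to a
+ // multiple of the large page size instead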
+
+ if (os::Kernel32Dll::NumaCallsAvailable()) {
+ if (numa_node_list_holder.build()) {
+ if (PrintMiscellaneous && Verbose) {
+ tty->print("NUMA UsedNodeCount=%d, namely ", os::numa_get_groups_num());
+ for (int i = 0; i < numa_node_list_holder.get_count(); i++) {
+ tty->print("%d ", numa_node_list_holder.get_node_list_entry(i));
+ }
+ tty->print("\n");
+ }
+ success = true;
+ } else {
+ WARN("Process does not cover multiple NUMA nodes.");
+ }
+ } else {
+ WARN("NUMA Interleaving is not supported by the operating system.");
+ }
+ if (!success) {
+ if (use_numa_specified) WARN("...Ignoring UseNUMA flag.");
+ if (use_numa_interleaving_specified) WARN("...Ignoring UseNUMAInterleaving flag.");
+ }
+ return success;
+#undef WARN
+}
+
+// this routine is used whenever we need to reserve a contiguous VA range
+// but must make a separate VirtualAlloc call for each piece of the range
+// Reasons for doing this:
+// * UseLargePagesIndividualAllocation was set (normally only needed on WS2003 but possible to be set otherwise)
+// * UseNUMAInterleaving requires a separate node for each piece
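+// On success returns the start of the usable range; on failure returns NULL
+// after releasing whatever portion had already been allocated.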
+static char* allocate_pages_individually(size_t bytes, char* addr, DWORD flags, DWORD prot,
+ bool should_inject_error=false) {
+ char * p_buf;
+ // note: at setup time we guaranteed that NUMAInterleaveGranularity was aligned up to a page size
+ size_t page_size = UseLargePages ? _large_page_size : os::vm_allocation_granularity();
+ size_t chunk_size = UseNUMAInterleaving ? NUMAInterleaveGranularity : page_size;
+
+ // first reserve enough address space in advance since we want to be
+ // able to break a single contiguous virtual address range into multiple
+ // large page commits, but WS2003 does not allow reserving large page space,
+ // so we just use 4K pages for the reserve.  This gives us a legal contiguous
+ // address space.  Then we deallocate that reservation and re-allocate
+ // using large pages.
+ const size_t size_of_reserve = bytes + chunk_size;
+ if (bytes > size_of_reserve) {
+ // Overflowed.
+ return NULL;
+ }
+ p_buf = (char *) VirtualAlloc(addr,
+ size_of_reserve, // size of Reserve
+ MEM_RESERVE,
+ PAGE_READWRITE);
+ // If reservation failed, return NULL
+ if (p_buf == NULL) return NULL;
+
+ os::release_memory(p_buf, bytes + chunk_size);
+
+ // we still need to round up to a page boundary (in case we are using large pages)
+ // but not to a chunk boundary (in case NUMAInterleaveGranularity doesn't align
+ // with the page size); instead we handle this in the bytes_to_rq computation below
+ p_buf = (char *) align_size_up((size_t)p_buf, page_size);
+
+ // now go through and allocate one chunk at a time until all bytes are
+ // allocated
+ size_t bytes_remaining = bytes;
+ // An overflow of align_size_up() cannot happen here: the reservation of
+ // bytes + chunk_size succeeded, so rounding p_buf up to a page boundary
+ // stays within the reserved range.
+ char * next_alloc_addr = p_buf;
+ HANDLE hProc = GetCurrentProcess();
+
+#ifdef ASSERT
+ // Variable for the failure injection
+ long ran_num = os::random();
+ size_t fail_after = ran_num % bytes;
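+ // when error injection is requested, the loop below simulates an allocation
+ // failure once bytes_remaining falls to or below fail_after, exercising the
+ // cleanup path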
+#endif
+
+ int count = 0;
+ while (bytes_remaining) {
+ // select bytes_to_rq to get to the next chunk_size boundary
+
+ size_t bytes_to_rq = MIN2(bytes_remaining, chunk_size - ((size_t)next_alloc_addr % chunk_size));
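+ // e.g. if next_alloc_addr sits 0x1000 bytes into a chunk, the first request
+ // is chunk_size - 0x1000 bytes, so every later request starts exactly on a
+ // chunk boundary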
+ // allocate (and commit, if the caller's flags include MEM_COMMIT) this chunk
+ char * p_new;
+
+#ifdef ASSERT
+ bool inject_error_now = should_inject_error && (bytes_remaining <= fail_after);
+#else
+ const bool inject_error_now = false;
+#endif
+
+ if (inject_error_now) {
+ p_new = NULL;
+ } else {
+ if (!UseNUMAInterleaving) {
+ p_new = (char *) VirtualAlloc(next_alloc_addr,
+ bytes_to_rq,
+ flags,
+ prot);
+ } else {
+ // get the next node to use from the used_node_list
+ DWORD node = numa_node_list_holder.get_node_list_entry(count % numa_node_list_holder.get_count());
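+ // VirtualAllocExNuma takes this as the preferred node for the physical pages
+ // backing the chunk, which spreads consecutive chunks across the node list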
+ p_new = (char *)os::Kernel32Dll::VirtualAllocExNuma(hProc,
+ next_alloc_addr,
+ bytes_to_rq,
+ flags,
+ prot,
+ node);
+ }
+ }
+
+ if (p_new == NULL) {
+ // Free any allocated pages
+ if (next_alloc_addr > p_buf) {
+ // Some memory was committed so release it.
+ size_t bytes_to_release = bytes - bytes_remaining;
+ os::release_memory(p_buf, bytes_to_release);
+ }
+#ifdef ASSERT
+ if (should_inject_error) {
+ if (TracePageSizes && Verbose) {
+ tty->print_cr("Reserving pages individually failed.");
+ }
+ }
+#endif
+ return NULL;
+ }
+ bytes_remaining -= bytes_to_rq;
+ next_alloc_addr += bytes_to_rq;
+ count++;
+ }
+ // made it this far, success
+ return p_buf;
+}
+
+
+
void os::large_page_init() {
if (!UseLargePages) return;
@@ -2722,9 +2921,30 @@
assert((size_t)addr % os::vm_allocation_granularity() == 0,
"reserve alignment");
assert(bytes % os::vm_allocation_granularity() == 0, "reserve block size");
- char* res = (char*)VirtualAlloc(addr, bytes, MEM_RESERVE, PAGE_READWRITE);
+ char* res;
+ // note that if UseLargePages is on, all the areas that require interleaving
+ // will go through reserve_memory_special rather than through here.
+ bool use_individual = (UseNUMAInterleaving && !UseLargePages);
+ if (!use_individual) {
+ res = (char*)VirtualAlloc(addr, bytes, MEM_RESERVE, PAGE_READWRITE);
+ } else {
+ elapsedTimer reserveTimer;
+ if( Verbose && PrintMiscellaneous ) reserveTimer.start();
+ // in numa interleaving, we have to allocate pages individually
+ // (well really chunks of NUMAInterleaveGranularity size)
+ res = allocate_pages_individually(bytes, addr, MEM_RESERVE, PAGE_READWRITE);
+ if (res == NULL) {
+ warning("NUMA page allocation failed");
+ }
+ if( Verbose && PrintMiscellaneous ) {
+ reserveTimer.stop();
+ tty->print_cr("reserve_memory of %Ix bytes took %ld ms (%ld ticks)", bytes,
+ reserveTimer.milliseconds(), reserveTimer.ticks());
+ }
+ }
assert(res == NULL || addr == NULL || addr == res,
"Unexpected address from reserve.");
+
return res;
}
@@ -2754,92 +2974,27 @@
char* os::reserve_memory_special(size_t bytes, char* addr, bool exec) {
const DWORD prot = exec ? PAGE_EXECUTE_READWRITE : PAGE_READWRITE;
-
- if (UseLargePagesIndividualAllocation) {
+ const DWORD flags = MEM_RESERVE | MEM_COMMIT | MEM_LARGE_PAGES;
+
+ // with large pages, there are two cases where we need to use Individual Allocation
+ // 1) the UseLargePagesIndividualAllocation flag is set (set by default on WS2003)
+ // 2) NUMA Interleaving is enabled, in which case we use a different node for each page
+ if (UseLargePagesIndividualAllocation || UseNUMAInterleaving) {
if (TracePageSizes && Verbose) {
tty->print_cr("Reserving large pages individually.");
}
- char * p_buf;
- // first reserve enough address space in advance since we want to be
- // able to break a single contiguous virtual address range into multiple
- // large page commits but WS2003 does not allow reserving large page space
- // so we just use 4K pages for reserve, this gives us a legal contiguous
- // address space. then we will deallocate that reservation, and re alloc
- // using large pages
- const size_t size_of_reserve = bytes + _large_page_size;
- if (bytes > size_of_reserve) {
- // Overflowed.
- warning("Individually allocated large pages failed, "
- "use -XX:-UseLargePagesIndividualAllocation to turn off");
+ char * p_buf = allocate_pages_individually(bytes, addr, flags, prot, LargePagesIndividualAllocationInjectError);
+ if (p_buf == NULL) {
+ // give an appropriate warning message
+ if (UseNUMAInterleaving) {
+ warning("NUMA large page allocation failed, UseLargePages flag ignored");
+ }
+ if (UseLargePagesIndividualAllocation) {
+ warning("Individually allocated large pages failed, "
+ "use -XX:-UseLargePagesIndividualAllocation to turn off");
+ }
return NULL;
}
- p_buf = (char *) VirtualAlloc(addr,
- size_of_reserve, // size of Reserve
- MEM_RESERVE,
- PAGE_READWRITE);
- // If reservation failed, return NULL
- if (p_buf == NULL) return NULL;
-
- release_memory(p_buf, bytes + _large_page_size);
- // round up to page boundary. If the size_of_reserve did not
- // overflow and the reservation did not fail, this align up
- // should not overflow.
- p_buf = (char *) align_size_up((size_t)p_buf, _large_page_size);
-
- // now go through and allocate one page at a time until all bytes are
- // allocated
- size_t bytes_remaining = align_size_up(bytes, _large_page_size);
- // An overflow of align_size_up() would have been caught above
- // in the calculation of size_of_reserve.
- char * next_alloc_addr = p_buf;
-
-#ifdef ASSERT
- // Variable for the failure injection
- long ran_num = os::random();
- size_t fail_after = ran_num % bytes;
-#endif
-
- while (bytes_remaining) {
- size_t bytes_to_rq = MIN2(bytes_remaining, _large_page_size);
- // Note allocate and commit
- char * p_new;
-
-#ifdef ASSERT
- bool inject_error = LargePagesIndividualAllocationInjectError &&
- (bytes_remaining <= fail_after);
-#else
- const bool inject_error = false;
-#endif
-
- if (inject_error) {
- p_new = NULL;
- } else {
- p_new = (char *) VirtualAlloc(next_alloc_addr,
- bytes_to_rq,
- MEM_RESERVE | MEM_COMMIT | MEM_LARGE_PAGES,
- prot);
- }
-
- if (p_new == NULL) {
- // Free any allocated pages
- if (next_alloc_addr > p_buf) {
- // Some memory was committed so release it.
- size_t bytes_to_release = bytes - bytes_remaining;
- release_memory(p_buf, bytes_to_release);
- }
-#ifdef ASSERT
- if (UseLargePagesIndividualAllocation &&
- LargePagesIndividualAllocationInjectError) {
- if (TracePageSizes && Verbose) {
- tty->print_cr("Reserving large pages individually failed.");
- }
- }
-#endif
- return NULL;
- }
- bytes_remaining -= bytes_to_rq;
- next_alloc_addr += bytes_to_rq;
- }
return p_buf;
@@ -2867,14 +3022,43 @@
assert(bytes % os::vm_page_size() == 0, "commit in page-sized chunks");
// Don't attempt to print anything if the OS call fails. We're
// probably low on resources, so the print itself may cause crashes.
- bool result = VirtualAlloc(addr, bytes, MEM_COMMIT, PAGE_READWRITE) != 0;
- if (result != NULL && exec) {
- DWORD oldprot;
- // Windows doc says to use VirtualProtect to get execute permissions
- return VirtualProtect(addr, bytes, PAGE_EXECUTE_READWRITE, &oldprot) != 0;
+
+ // unless we have NUMAInterleaving enabled, the range of a commit
+ // is always within a reserve covered by a single VirtualAlloc;
+ // in that case we can just do a single commit for the requested size
+ if (!UseNUMAInterleaving) {
+ if (VirtualAlloc(addr, bytes, MEM_COMMIT, PAGE_READWRITE) == NULL) return false;
+ if (exec) {
+ DWORD oldprot;
+ // Windows doc says to use VirtualProtect to get execute permissions
+ if (!VirtualProtect(addr, bytes, PAGE_EXECUTE_READWRITE, &oldprot)) return false;
+ }
+ return true;
} else {
- return result;
- }
+
+ // when NUMAInterleaving is enabled, the commit might cover a range that
+ // came from multiple VirtualAlloc reserves (using allocate_pages_individually).
+ // VirtualQuery can help us determine that. The RegionSize that VirtualQuery
+ // returns represents the number of bytes that can be committed in one step.
+ size_t bytes_remaining = bytes;
+ char * next_alloc_addr = addr;
+ while (bytes_remaining > 0) {
+ MEMORY_BASIC_INFORMATION alloc_info;
+ VirtualQuery(next_alloc_addr, &alloc_info, sizeof(alloc_info));
+ size_t bytes_to_rq = MIN2(bytes_remaining, (size_t)alloc_info.RegionSize);
+ if (VirtualAlloc(next_alloc_addr, bytes_to_rq, MEM_COMMIT, PAGE_READWRITE) == NULL)
+ return false;
+ if (exec) {
+ DWORD oldprot;
+ if (!VirtualProtect(next_alloc_addr, bytes_to_rq, PAGE_EXECUTE_READWRITE, &oldprot))
+ return false;
+ }
+ bytes_remaining -= bytes_to_rq;
+ next_alloc_addr += bytes_to_rq;
+ }
+ }
+ // if we made it this far, return true
+ return true;
}
bool os::commit_memory(char* addr, size_t size, size_t alignment_hint,
@@ -2948,14 +3132,15 @@
void os::numa_make_global(char *addr, size_t bytes) { }
void os::numa_make_local(char *addr, size_t bytes, int lgrp_hint) { }
bool os::numa_topology_changed() { return false; }
-size_t os::numa_get_groups_num() { return 1; }
+size_t os::numa_get_groups_num() { return numa_node_list_holder.get_count(); }
int os::numa_get_group_id() { return 0; }
size_t os::numa_get_leaf_groups(int *ids, size_t size) {
- if (size > 0) {
- ids[0] = 0;
- return 1;
- }
- return 0;
+ // cap size at the actual number of groups
+ size = MIN2(size, numa_get_groups_num());
+ for (int i = 0; i < (int)size; i++) {
+ ids[i] = numa_node_list_holder.get_node_list_entry(i);
+ }
+ return size;
}
bool os::get_page_info(char *start, page_info* info) {
@@ -3480,7 +3665,7 @@
if(Verbose && PrintMiscellaneous)
tty->print("[Memory Serialize Page address: " INTPTR_FORMAT "]\n", (intptr_t)mem_serialize_page);
#endif
-}
+ }
os::large_page_init();
@@ -3583,8 +3768,10 @@
// initialize thread priority policy
prio_init();
- if (UseNUMA && !ForceNUMA) {
- UseNUMA = false; // Currently unsupported.
+ if (UseNUMAInterleaving) {
+ // first check whether this Windows OS supports VirtualAllocExNuma; if not, ignore this flag
+ bool success = numa_interleaving_init();
+ if (!success) UseNUMAInterleaving = false;
}
return JNI_OK;
@@ -4758,7 +4945,14 @@
// Kernel32 API
typedef SIZE_T (WINAPI* GetLargePageMinimum_Fn)(void);
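+// NUMA allocation APIs are not present on older Windows releases, so they are
+// resolved at runtime rather than linked against directly.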
+typedef LPVOID (WINAPI *VirtualAllocExNuma_Fn) (HANDLE, LPVOID, SIZE_T, DWORD, DWORD, DWORD);
+typedef BOOL (WINAPI *GetNumaHighestNodeNumber_Fn) (PULONG);
+typedef BOOL (WINAPI *GetNumaNodeProcessorMask_Fn) (UCHAR, PULONGLONG);
+
GetLargePageMinimum_Fn os::Kernel32Dll::_GetLargePageMinimum = NULL;
+VirtualAllocExNuma_Fn os::Kernel32Dll::_VirtualAllocExNuma = NULL;
+GetNumaHighestNodeNumber_Fn os::Kernel32Dll::_GetNumaHighestNodeNumber = NULL;
+GetNumaNodeProcessorMask_Fn os::Kernel32Dll::_GetNumaNodeProcessorMask = NULL;
BOOL os::Kernel32Dll::initialized = FALSE;
SIZE_T os::Kernel32Dll::GetLargePageMinimum() {
assert(initialized && _GetLargePageMinimum != NULL,
@@ -4773,16 +4967,53 @@
return _GetLargePageMinimum != NULL;
}
+BOOL os::Kernel32Dll::NumaCallsAvailable() {
+ if (!initialized) {
+ initialize();
+ }
+ // all three NUMA entry points must have resolved
+ return _VirtualAllocExNuma != NULL &&
+ _GetNumaHighestNodeNumber != NULL &&
+ _GetNumaNodeProcessorMask != NULL;
+}
+
+LPVOID os::Kernel32Dll::VirtualAllocExNuma(HANDLE hProc, LPVOID addr, SIZE_T bytes, DWORD flags, DWORD prot, DWORD node) {
+ assert(initialized && _VirtualAllocExNuma != NULL,
+ "NUMACallsAvailable() not yet called");
+
+ return _VirtualAllocExNuma(hProc, addr, bytes, flags, prot, node);
+}
+
+BOOL os::Kernel32Dll::GetNumaHighestNodeNumber(PULONG ptr_highest_node_number) {
+ assert(initialized && _GetNumaHighestNodeNumber != NULL,
+ "NUMACallsAvailable() not yet called");
+
+ return _GetNumaHighestNodeNumber(ptr_highest_node_number);
+}
+
+BOOL os::Kernel32Dll::GetNumaNodeProcessorMask(UCHAR node, PULONGLONG proc_mask) {
+ assert(initialized && _GetNumaNodeProcessorMask != NULL,
+ "NUMACallsAvailable() not yet called");
+
+ return _GetNumaNodeProcessorMask(node, proc_mask);
+}
+
+
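+// Resolves the Kernel32 entry points that are needed in every build
+// configuration; called from both variants of initialize() below.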
+void os::Kernel32Dll::initializeCommon() {
+ if (!initialized) {
+ HMODULE handle = ::GetModuleHandle("Kernel32.dll");
+ assert(handle != NULL, "Just check");
+ _GetLargePageMinimum = (GetLargePageMinimum_Fn)::GetProcAddress(handle, "GetLargePageMinimum");
+ _VirtualAllocExNuma = (VirtualAllocExNuma_Fn)::GetProcAddress(handle, "VirtualAllocExNuma");
+ _GetNumaHighestNodeNumber = (GetNumaHighestNodeNumber_Fn)::GetProcAddress(handle, "GetNumaHighestNodeNumber");
+ _GetNumaNodeProcessorMask = (GetNumaNodeProcessorMask_Fn)::GetProcAddress(handle, "GetNumaNodeProcessorMask");
+ initialized = TRUE;
+ }
+}
+
+
#ifndef JDK6_OR_EARLIER
void os::Kernel32Dll::initialize() {
- if (!initialized) {
- HMODULE handle = ::GetModuleHandle("Kernel32.dll");
- assert(handle != NULL, "Just check");
- _GetLargePageMinimum = (GetLargePageMinimum_Fn)::GetProcAddress(handle, "GetLargePageMinimum");
- initialized = TRUE;
- }
+ initializeCommon();
}
@@ -4887,18 +5118,19 @@
Module32Next_Fn os::Kernel32Dll::_Module32Next = NULL;
GetNativeSystemInfo_Fn os::Kernel32Dll::_GetNativeSystemInfo = NULL;
+
void os::Kernel32Dll::initialize() {
if (!initialized) {
HMODULE handle = ::GetModuleHandle("Kernel32.dll");
assert(handle != NULL, "Just check");
_SwitchToThread = (SwitchToThread_Fn)::GetProcAddress(handle, "SwitchToThread");
- _GetLargePageMinimum = (GetLargePageMinimum_Fn)::GetProcAddress(handle, "GetLargePageMinimum");
_CreateToolhelp32Snapshot = (CreateToolhelp32Snapshot_Fn)
::GetProcAddress(handle, "CreateToolhelp32Snapshot");
_Module32First = (Module32First_Fn)::GetProcAddress(handle, "Module32First");
_Module32Next = (Module32Next_Fn)::GetProcAddress(handle, "Module32Next");
_GetNativeSystemInfo = (GetNativeSystemInfo_Fn)::GetProcAddress(handle, "GetNativeSystemInfo");
+ initializeCommon(); // resolve the functions that always need resolving
initialized = TRUE;
}
@@ -4964,6 +5196,8 @@
_GetNativeSystemInfo(lpSystemInfo);
}
+
+
// PSAPI API