# HG changeset patch # User tschatzl # Date 1547674324 -3600 # Node ID ef09fd205ef01bfc429924f61e44a74b06a614b0 # Parent 4a59f7042325bba08e73ed2c8f325220e0653aa5 8213827: NUMA heap allocation does not respect process membind/interleave settings Summary: Optionally use libnuma v2 API to query for and support NUMA membind/interleave process configuration. Reviewed-by: tschatzl, sangheki Contributed-by: Amith Pawar diff -r 4a59f7042325 -r ef09fd205ef0 src/hotspot/os/linux/os_linux.cpp --- a/src/hotspot/os/linux/os_linux.cpp Wed Jan 16 21:37:12 2019 +0100 +++ b/src/hotspot/os/linux/os_linux.cpp Wed Jan 16 22:32:04 2019 +0100 @@ -33,6 +33,7 @@ #include "compiler/disassembler.hpp" #include "interpreter/interpreter.hpp" #include "logging/log.hpp" +#include "logging/logStream.hpp" #include "memory/allocation.inline.hpp" #include "memory/filemap.hpp" #include "oops/oop.inline.hpp" @@ -2780,7 +2781,7 @@ // Get the total number of nodes in the system including nodes without memory. for (node = 0; node <= highest_node_number; node++) { - if (isnode_in_existing_nodes(node)) { + if (is_node_in_existing_nodes(node)) { num_nodes++; } } @@ -2796,7 +2797,7 @@ // node number. If the nodes have been bound explicitly using numactl membind, // then allocate memory from those nodes only. for (int node = 0; node <= highest_node_number; node++) { - if (Linux::isnode_in_bound_nodes((unsigned int)node)) { + if (Linux::is_node_in_bound_nodes((unsigned int)node)) { ids[i++] = node; } } @@ -2899,11 +2900,15 @@ libnuma_dlsym(handle, "numa_distance"))); set_numa_get_membind(CAST_TO_FN_PTR(numa_get_membind_func_t, libnuma_v2_dlsym(handle, "numa_get_membind"))); + set_numa_get_interleave_mask(CAST_TO_FN_PTR(numa_get_interleave_mask_func_t, + libnuma_v2_dlsym(handle, "numa_get_interleave_mask"))); if (numa_available() != -1) { set_numa_all_nodes((unsigned long*)libnuma_dlsym(handle, "numa_all_nodes")); set_numa_all_nodes_ptr((struct bitmask **)libnuma_dlsym(handle, "numa_all_nodes_ptr")); set_numa_nodes_ptr((struct bitmask **)libnuma_dlsym(handle, "numa_nodes_ptr")); + set_numa_interleave_bitmask(_numa_get_interleave_mask()); + set_numa_membind_bitmask(_numa_get_membind()); // Create an index -> node mapping, since nodes are not always consecutive _nindex_to_node = new (ResourceObj::C_HEAP, mtInternal) GrowableArray(0, true); rebuild_nindex_to_node_map(); @@ -2929,7 +2934,7 @@ nindex_to_node()->clear(); for (int node = 0; node <= highest_node_number; node++) { - if (Linux::isnode_in_existing_nodes(node)) { + if (Linux::is_node_in_existing_nodes(node)) { nindex_to_node()->append(node); } } @@ -2966,16 +2971,16 @@ // the closest configured node. Check also if node is bound, i.e. it's allowed // to allocate memory from the node. If it's not allowed, map cpus in that node // to the closest node from which memory allocation is allowed. - if (!isnode_in_configured_nodes(nindex_to_node()->at(i)) || - !isnode_in_bound_nodes(nindex_to_node()->at(i))) { + if (!is_node_in_configured_nodes(nindex_to_node()->at(i)) || + !is_node_in_bound_nodes(nindex_to_node()->at(i))) { closest_distance = INT_MAX; // Check distance from all remaining nodes in the system. Ignore distance // from itself, from another non-configured node, and from another non-bound // node. for (size_t m = 0; m < node_num; m++) { if (m != i && - isnode_in_configured_nodes(nindex_to_node()->at(m)) && - isnode_in_bound_nodes(nindex_to_node()->at(m))) { + is_node_in_configured_nodes(nindex_to_node()->at(m)) && + is_node_in_bound_nodes(nindex_to_node()->at(m))) { distance = numa_distance(nindex_to_node()->at(i), nindex_to_node()->at(m)); // If a closest node is found, update. There is always at least one // configured and bound node in the system so there is always at least @@ -3030,9 +3035,13 @@ os::Linux::numa_bitmask_isbitset_func_t os::Linux::_numa_bitmask_isbitset; os::Linux::numa_distance_func_t os::Linux::_numa_distance; os::Linux::numa_get_membind_func_t os::Linux::_numa_get_membind; +os::Linux::numa_get_interleave_mask_func_t os::Linux::_numa_get_interleave_mask; +os::Linux::NumaAllocationPolicy os::Linux::_current_numa_policy; unsigned long* os::Linux::_numa_all_nodes; struct bitmask* os::Linux::_numa_all_nodes_ptr; struct bitmask* os::Linux::_numa_nodes_ptr; +struct bitmask* os::Linux::_numa_interleave_bitmask; +struct bitmask* os::Linux::_numa_membind_bitmask; bool os::pd_uncommit_memory(char* addr, size_t size) { uintptr_t res = (uintptr_t) ::mmap(addr, size, PROT_NONE, @@ -4936,6 +4945,74 @@ OSContainer::init(); } +void os::Linux::numa_init() { + + // Java can be invoked as + // 1. Without numactl and heap will be allocated/configured on all nodes as + // per the system policy. + // 2. With numactl --interleave: + // Use numa_get_interleave_mask(v2) API to get nodes bitmask. The same + // API for membind case bitmask is reset. + // Interleave is only hint and Kernel can fallback to other nodes if + // no memory is available on the target nodes. + // 3. With numactl --membind: + // Use numa_get_membind(v2) API to get nodes bitmask. The same API for + // interleave case returns bitmask of all nodes. + // numa_all_nodes_ptr holds bitmask of all nodes. + // numa_get_interleave_mask(v2) and numa_get_membind(v2) APIs returns correct + // bitmask when externally configured to run on all or fewer nodes. + + if (!Linux::libnuma_init()) { + UseNUMA = false; + } else { + if ((Linux::numa_max_node() < 1) || Linux::is_bound_to_single_node()) { + // If there's only one node (they start from 0) or if the process + // is bound explicitly to a single node using membind, disable NUMA. + UseNUMA = false; + } else { + + LogTarget(Info,os) log; + LogStream ls(log); + + Linux::set_configured_numa_policy(Linux::identify_numa_policy()); + + struct bitmask* bmp = Linux::_numa_membind_bitmask; + const char* numa_mode = "membind"; + + if (Linux::is_running_in_interleave_mode()) { + bmp = Linux::_numa_interleave_bitmask; + numa_mode = "interleave"; + } + + ls.print("UseNUMA is enabled and invoked in '%s' mode." + " Heap will be configured using NUMA memory nodes:", numa_mode); + + for (int node = 0; node <= Linux::numa_max_node(); node++) { + if (Linux::_numa_bitmask_isbitset(bmp, node)) { + ls.print(" %d", node); + } + } + } + } + + if (UseParallelGC && UseNUMA && UseLargePages && !can_commit_large_page_memory()) { + // With SHM and HugeTLBFS large pages we cannot uncommit a page, so there's no way + // we can make the adaptive lgrp chunk resizing work. If the user specified both + // UseNUMA and UseLargePages (or UseSHM/UseHugeTLBFS) on the command line - warn + // and disable adaptive resizing. + if (UseAdaptiveSizePolicy || UseAdaptiveNUMAChunkSizing) { + warning("UseNUMA is not fully compatible with SHM/HugeTLBFS large pages, " + "disabling adaptive resizing (-XX:-UseAdaptiveSizePolicy -XX:-UseAdaptiveNUMAChunkSizing)"); + UseAdaptiveSizePolicy = false; + UseAdaptiveNUMAChunkSizing = false; + } + } + + if (!UseNUMA && ForceNUMA) { + UseNUMA = true; + } +} + // this is called _after_ the global arguments have been parsed jint os::init_2(void) { @@ -4980,32 +5057,7 @@ Linux::glibc_version(), Linux::libpthread_version()); if (UseNUMA) { - if (!Linux::libnuma_init()) { - UseNUMA = false; - } else { - if ((Linux::numa_max_node() < 1) || Linux::isbound_to_single_node()) { - // If there's only one node (they start from 0) or if the process - // is bound explicitly to a single node using membind, disable NUMA. - UseNUMA = false; - } - } - - if (UseParallelGC && UseNUMA && UseLargePages && !can_commit_large_page_memory()) { - // With SHM and HugeTLBFS large pages we cannot uncommit a page, so there's no way - // we can make the adaptive lgrp chunk resizing work. If the user specified both - // UseNUMA and UseLargePages (or UseSHM/UseHugeTLBFS) on the command line - warn - // and disable adaptive resizing. - if (UseAdaptiveSizePolicy || UseAdaptiveNUMAChunkSizing) { - warning("UseNUMA is not fully compatible with SHM/HugeTLBFS large pages, " - "disabling adaptive resizing (-XX:-UseAdaptiveSizePolicy -XX:-UseAdaptiveNUMAChunkSizing)"); - UseAdaptiveSizePolicy = false; - UseAdaptiveNUMAChunkSizing = false; - } - } - - if (!UseNUMA && ForceNUMA) { - UseNUMA = true; - } + Linux::numa_init(); } if (MaxFDLimit) { diff -r 4a59f7042325 -r ef09fd205ef0 src/hotspot/os/linux/os_linux.hpp --- a/src/hotspot/os/linux/os_linux.hpp Wed Jan 16 21:37:12 2019 +0100 +++ b/src/hotspot/os/linux/os_linux.hpp Wed Jan 16 22:32:04 2019 +0100 @@ -211,6 +211,7 @@ // none present private: + static void numa_init(); static void expand_stack_to(address bottom); typedef int (*sched_getcpu_func_t)(void); @@ -222,6 +223,7 @@ typedef void (*numa_interleave_memory_func_t)(void *start, size_t size, unsigned long *nodemask); typedef void (*numa_interleave_memory_v2_func_t)(void *start, size_t size, struct bitmask* mask); typedef struct bitmask* (*numa_get_membind_func_t)(void); + typedef struct bitmask* (*numa_get_interleave_mask_func_t)(void); typedef void (*numa_set_bind_policy_func_t)(int policy); typedef int (*numa_bitmask_isbitset_func_t)(struct bitmask *bmp, unsigned int n); @@ -239,9 +241,12 @@ static numa_bitmask_isbitset_func_t _numa_bitmask_isbitset; static numa_distance_func_t _numa_distance; static numa_get_membind_func_t _numa_get_membind; + static numa_get_interleave_mask_func_t _numa_get_interleave_mask; static unsigned long* _numa_all_nodes; static struct bitmask* _numa_all_nodes_ptr; static struct bitmask* _numa_nodes_ptr; + static struct bitmask* _numa_interleave_bitmask; + static struct bitmask* _numa_membind_bitmask; static void set_sched_getcpu(sched_getcpu_func_t func) { _sched_getcpu = func; } static void set_numa_node_to_cpus(numa_node_to_cpus_func_t func) { _numa_node_to_cpus = func; } @@ -255,10 +260,21 @@ static void set_numa_bitmask_isbitset(numa_bitmask_isbitset_func_t func) { _numa_bitmask_isbitset = func; } static void set_numa_distance(numa_distance_func_t func) { _numa_distance = func; } static void set_numa_get_membind(numa_get_membind_func_t func) { _numa_get_membind = func; } + static void set_numa_get_interleave_mask(numa_get_interleave_mask_func_t func) { _numa_get_interleave_mask = func; } static void set_numa_all_nodes(unsigned long* ptr) { _numa_all_nodes = ptr; } static void set_numa_all_nodes_ptr(struct bitmask **ptr) { _numa_all_nodes_ptr = (ptr == NULL ? NULL : *ptr); } static void set_numa_nodes_ptr(struct bitmask **ptr) { _numa_nodes_ptr = (ptr == NULL ? NULL : *ptr); } + static void set_numa_interleave_bitmask(struct bitmask* ptr) { _numa_interleave_bitmask = ptr ; } + static void set_numa_membind_bitmask(struct bitmask* ptr) { _numa_membind_bitmask = ptr ; } static int sched_getcpu_syscall(void); + + enum NumaAllocationPolicy{ + NotInitialized, + Membind, + Interleave + }; + static NumaAllocationPolicy _current_numa_policy; + public: static int sched_getcpu() { return _sched_getcpu != NULL ? _sched_getcpu() : -1; } static int numa_node_to_cpus(int node, unsigned long *buffer, int bufferlen) { @@ -272,11 +288,33 @@ static int numa_tonode_memory(void *start, size_t size, int node) { return _numa_tonode_memory != NULL ? _numa_tonode_memory(start, size, node) : -1; } + + static bool is_running_in_interleave_mode() { + return _current_numa_policy == Interleave; + } + + static void set_configured_numa_policy(NumaAllocationPolicy numa_policy) { + _current_numa_policy = numa_policy; + } + + static NumaAllocationPolicy identify_numa_policy() { + for (int node = 0; node <= Linux::numa_max_node(); node++) { + if (Linux::_numa_bitmask_isbitset(Linux::_numa_interleave_bitmask, node)) { + return Interleave; + } + } + return Membind; + } + static void numa_interleave_memory(void *start, size_t size) { - // Use v2 api if available - if (_numa_interleave_memory_v2 != NULL && _numa_all_nodes_ptr != NULL) { - _numa_interleave_memory_v2(start, size, _numa_all_nodes_ptr); - } else if (_numa_interleave_memory != NULL && _numa_all_nodes != NULL) { + // Prefer v2 API + if (_numa_interleave_memory_v2 != NULL) { + if (is_running_in_interleave_mode()) { + _numa_interleave_memory_v2(start, size, _numa_interleave_bitmask); + } else if (_numa_membind_bitmask != NULL) { + _numa_interleave_memory_v2(start, size, _numa_membind_bitmask); + } + } else if (_numa_interleave_memory != NULL) { _numa_interleave_memory(start, size, _numa_all_nodes); } } @@ -291,14 +329,14 @@ static int get_node_by_cpu(int cpu_id); static int get_existing_num_nodes(); // Check if numa node is configured (non-zero memory node). - static bool isnode_in_configured_nodes(unsigned int n) { + static bool is_node_in_configured_nodes(unsigned int n) { if (_numa_bitmask_isbitset != NULL && _numa_all_nodes_ptr != NULL) { return _numa_bitmask_isbitset(_numa_all_nodes_ptr, n); } else return false; } // Check if numa node exists in the system (including zero memory nodes). - static bool isnode_in_existing_nodes(unsigned int n) { + static bool is_node_in_existing_nodes(unsigned int n) { if (_numa_bitmask_isbitset != NULL && _numa_nodes_ptr != NULL) { return _numa_bitmask_isbitset(_numa_nodes_ptr, n); } else if (_numa_bitmask_isbitset != NULL && _numa_all_nodes_ptr != NULL) { @@ -317,16 +355,19 @@ return false; } // Check if node is in bound node set. - static bool isnode_in_bound_nodes(int node) { - if (_numa_get_membind != NULL && _numa_bitmask_isbitset != NULL) { - return _numa_bitmask_isbitset(_numa_get_membind(), node); - } else { - return false; + static bool is_node_in_bound_nodes(int node) { + if (_numa_bitmask_isbitset != NULL) { + if (is_running_in_interleave_mode()) { + return _numa_bitmask_isbitset(_numa_interleave_bitmask, node); + } else { + return _numa_membind_bitmask != NULL ? _numa_bitmask_isbitset(_numa_membind_bitmask, node) : false; + } } + return false; } // Check if bound to only one numa node. // Returns true if bound to a single numa node, otherwise returns false. - static bool isbound_to_single_node() { + static bool is_bound_to_single_node() { int nodes = 0; struct bitmask* bmp = NULL; unsigned int node = 0;