--- a/hotspot/src/cpu/sparc/vm/assembler_sparc.cpp Fri Aug 26 16:11:25 2011 -0700
+++ b/hotspot/src/cpu/sparc/vm/assembler_sparc.cpp Wed Aug 31 23:55:58 2011 -0700
@@ -2161,29 +2161,6 @@
#endif
}
-void MacroAssembler::br_on_reg_cond( RCondition rc, bool a, Predict p,
- Register s1, address d,
- relocInfo::relocType rt ) {
- assert_not_delayed();
- if (VM_Version::v9_instructions_work()) {
- bpr(rc, a, p, s1, d, rt);
- } else {
- tst(s1);
- br(reg_cond_to_cc_cond(rc), a, p, d, rt);
- }
-}
-
-void MacroAssembler::br_on_reg_cond( RCondition rc, bool a, Predict p,
- Register s1, Label& L ) {
- assert_not_delayed();
- if (VM_Version::v9_instructions_work()) {
- bpr(rc, a, p, s1, L);
- } else {
- tst(s1);
- br(reg_cond_to_cc_cond(rc), a, p, L);
- }
-}
-
// Compare registers and branch with nop in delay slot or cbcond without delay slot.
// Compare integer (32 bit) values (icc only).
@@ -4340,22 +4317,29 @@
} else {
pre_val = O0;
}
+
int satb_q_index_byte_offset =
in_bytes(JavaThread::satb_mark_queue_offset() +
PtrQueue::byte_offset_of_index());
+
int satb_q_buf_byte_offset =
in_bytes(JavaThread::satb_mark_queue_offset() +
PtrQueue::byte_offset_of_buf());
+
assert(in_bytes(PtrQueue::byte_width_of_index()) == sizeof(intptr_t) &&
in_bytes(PtrQueue::byte_width_of_buf()) == sizeof(intptr_t),
"check sizes in assembly below");
__ bind(restart);
+
+ // Load the index into the SATB buffer. PtrQueue::_index is a size_t
+ // so ld_ptr is appropriate.
__ ld_ptr(G2_thread, satb_q_index_byte_offset, L0);
- __ br_on_reg_cond(Assembler::rc_z, /*annul*/false, Assembler::pn, L0, refill);
- // If the branch is taken, no harm in executing this in the delay slot.
- __ delayed()->ld_ptr(G2_thread, satb_q_buf_byte_offset, L1);
+ // index == 0?
+ __ cmp_and_brx_short(L0, G0, Assembler::equal, Assembler::pn, refill);
+
+ __ ld_ptr(G2_thread, satb_q_buf_byte_offset, L1);
__ sub(L0, oopSize, L0);
__ st_ptr(pre_val, L1, L0); // [_buf + index] := I0
@@ -4466,9 +4450,8 @@
tmp);
}
- // Check on whether to annul.
- br_on_reg_cond(rc_z, /*annul*/false, Assembler::pt, tmp, filtered);
- delayed()->nop();
+ // Is marking active?
+ cmp_and_br_short(tmp, G0, Assembler::equal, Assembler::pt, filtered);
// Do we need to load the previous value?
if (obj != noreg) {
@@ -4490,9 +4473,7 @@
assert(pre_val != noreg, "must have a real register");
// Is the previous value null?
- // Check on whether to annul.
- br_on_reg_cond(rc_z, /*annul*/false, Assembler::pt, pre_val, filtered);
- delayed()->nop();
+ cmp_and_brx_short(pre_val, G0, Assembler::equal, Assembler::pt, filtered);
// OK, it's not filtered, so we'll need to call enqueue. In the normal
// case, pre_val will be a scratch G-reg, but there are some cases in
@@ -4519,39 +4500,6 @@
bind(filtered);
}
-static jint num_ct_writes = 0;
-static jint num_ct_writes_filtered_in_hr = 0;
-static jint num_ct_writes_filtered_null = 0;
-static G1CollectedHeap* g1 = NULL;
-
-static Thread* count_ct_writes(void* filter_val, void* new_val) {
- Atomic::inc(&num_ct_writes);
- if (filter_val == NULL) {
- Atomic::inc(&num_ct_writes_filtered_in_hr);
- } else if (new_val == NULL) {
- Atomic::inc(&num_ct_writes_filtered_null);
- } else {
- if (g1 == NULL) {
- g1 = G1CollectedHeap::heap();
- }
- }
- if ((num_ct_writes % 1000000) == 0) {
- jint num_ct_writes_filtered =
- num_ct_writes_filtered_in_hr +
- num_ct_writes_filtered_null;
-
- tty->print_cr("%d potential CT writes: %5.2f%% filtered\n"
- " (%5.2f%% intra-HR, %5.2f%% null).",
- num_ct_writes,
- 100.0*(float)num_ct_writes_filtered/(float)num_ct_writes,
- 100.0*(float)num_ct_writes_filtered_in_hr/
- (float)num_ct_writes,
- 100.0*(float)num_ct_writes_filtered_null/
- (float)num_ct_writes);
- }
- return Thread::current();
-}
-
static address dirty_card_log_enqueue = 0;
static u_char* dirty_card_log_enqueue_end = 0;
@@ -4574,11 +4522,8 @@
__ set(addrlit, O1); // O1 := <card table base>
__ ldub(O0, O1, O2); // O2 := [O0 + O1]
- __ br_on_reg_cond(Assembler::rc_nz, /*annul*/false, Assembler::pt,
- O2, not_already_dirty);
- // Get O1 + O2 into a reg by itself -- useful in the take-the-branch
- // case, harmless if not.
- __ delayed()->add(O0, O1, O3);
+ assert(CardTableModRefBS::dirty_card_val() == 0, "otherwise check this code");
+ __ cmp_and_br_short(O2, G0, Assembler::notEqual, Assembler::pt, not_already_dirty);
// We didn't take the branch, so we're already dirty: return.
// Use return-from-leaf
@@ -4587,8 +4532,13 @@
// Not dirty.
__ bind(not_already_dirty);
+
+ // Get O0 + O1 into a reg by itself
+ __ add(O0, O1, O3);
+
// First, dirty it.
__ stb(G0, O3, G0); // [cardPtr] := 0 (i.e., dirty).
+
int dirty_card_q_index_byte_offset =
in_bytes(JavaThread::dirty_card_queue_offset() +
PtrQueue::byte_offset_of_index());
@@ -4596,12 +4546,15 @@
in_bytes(JavaThread::dirty_card_queue_offset() +
PtrQueue::byte_offset_of_buf());
__ bind(restart);
+
+ // Load the index into the update buffer. PtrQueue::_index is
+ // a size_t so ld_ptr is appropriate here.
__ ld_ptr(G2_thread, dirty_card_q_index_byte_offset, L0);
- __ br_on_reg_cond(Assembler::rc_z, /*annul*/false, Assembler::pn,
- L0, refill);
- // If the branch is taken, no harm in executing this in the delay slot.
- __ delayed()->ld_ptr(G2_thread, dirty_card_q_buf_byte_offset, L1);
+ // index == 0?
+ __ cmp_and_brx_short(L0, G0, Assembler::equal, Assembler::pn, refill);
+
+ __ ld_ptr(G2_thread, dirty_card_q_buf_byte_offset, L1);
__ sub(L0, oopSize, L0);
__ st_ptr(O3, L1, L0); // [_buf + index] := I0
@@ -4664,6 +4617,7 @@
G1SATBCardTableModRefBS* bs = (G1SATBCardTableModRefBS*) Universe::heap()->barrier_set();
assert(bs->kind() == BarrierSet::G1SATBCT ||
bs->kind() == BarrierSet::G1SATBCTLogging, "wrong barrier");
+
if (G1RSBarrierRegionFilter) {
xor3(store_addr, new_val, tmp);
#ifdef _LP64
@@ -4672,33 +4626,8 @@
srl(tmp, HeapRegion::LogOfHRGrainBytes, tmp);
#endif
- if (G1PrintCTFilterStats) {
- guarantee(tmp->is_global(), "Or stats won't work...");
- // This is a sleazy hack: I'm temporarily hijacking G2, which I
- // promise to restore.
- mov(new_val, G2);
- save_frame(0);
- mov(tmp, O0);
- mov(G2, O1);
- // Save G-regs that target may use.
- mov(G1, L1);
- mov(G2, L2);
- mov(G3, L3);
- mov(G4, L4);
- mov(G5, L5);
- call(CAST_FROM_FN_PTR(address, &count_ct_writes));
- delayed()->nop();
- mov(O0, G2);
- // Restore G-regs that target may have used.
- mov(L1, G1);
- mov(L3, G3);
- mov(L4, G4);
- mov(L5, G5);
- restore(G0, G0, G0);
- }
- // XXX Should I predict this taken or not? Does it mattern?
- br_on_reg_cond(rc_z, /*annul*/false, Assembler::pt, tmp, filtered);
- delayed()->nop();
+ // XXX Should I predict this taken or not? Does it matter?
+ cmp_and_brx_short(tmp, G0, Assembler::equal, Assembler::pt, filtered);
}
// If the "store_addr" register is an "in" or "local" register, move it to
@@ -4723,7 +4652,6 @@
restore();
bind(filtered);
-
}
#endif // SERIALGC
--- a/hotspot/src/cpu/sparc/vm/assembler_sparc.hpp Fri Aug 26 16:11:25 2011 -0700
+++ b/hotspot/src/cpu/sparc/vm/assembler_sparc.hpp Wed Aug 31 23:55:58 2011 -0700
@@ -1940,12 +1940,6 @@
void br_null ( Register s1, bool a, Predict p, Label& L );
void br_notnull( Register s1, bool a, Predict p, Label& L );
- // These versions will do the most efficient thing on v8 and v9. Perhaps
- // this is what the routine above was meant to do, but it didn't (and
- // didn't cover both target address kinds.)
- void br_on_reg_cond( RCondition c, bool a, Predict p, Register s1, address d, relocInfo::relocType rt = relocInfo::none );
- void br_on_reg_cond( RCondition c, bool a, Predict p, Register s1, Label& L);
-
//
// Compare registers and branch with nop in delay slot or cbcond without delay slot.
//
--- a/hotspot/src/cpu/sparc/vm/c1_CodeStubs_sparc.cpp Fri Aug 26 16:11:25 2011 -0700
+++ b/hotspot/src/cpu/sparc/vm/c1_CodeStubs_sparc.cpp Wed Aug 31 23:55:58 2011 -0700
@@ -421,8 +421,7 @@
}
if (__ is_in_wdisp16_range(_continuation)) {
- __ br_on_reg_cond(Assembler::rc_z, /*annul*/false, Assembler::pt,
- pre_val_reg, _continuation);
+ __ br_null(pre_val_reg, /*annul*/false, Assembler::pt, _continuation);
} else {
__ cmp(pre_val_reg, G0);
__ brx(Assembler::equal, false, Assembler::pn, _continuation);
@@ -458,8 +457,7 @@
// The original src operand was not a constant.
// Generate src == null?
if (__ is_in_wdisp16_range(_continuation)) {
- __ br_on_reg_cond(Assembler::rc_z, /*annul*/false, Assembler::pt,
- src_reg, _continuation);
+ __ br_null(src_reg, /*annul*/false, Assembler::pt, _continuation);
} else {
__ cmp(src_reg, G0);
__ brx(Assembler::equal, false, Assembler::pt, _continuation);
@@ -476,13 +474,9 @@
Address ref_type_adr(tmp_reg, instanceKlass::reference_type_offset_in_bytes() + sizeof(oopDesc));
__ ld(ref_type_adr, tmp_reg);
- if (__ is_in_wdisp16_range(_continuation)) {
- __ br_on_reg_cond(Assembler::rc_z, /*annul*/false, Assembler::pt,
- tmp_reg, _continuation);
- } else {
- __ cmp(tmp_reg, G0);
- __ brx(Assembler::equal, false, Assembler::pt, _continuation);
- }
+ // _reference_type field is of type ReferenceType (enum)
+ assert(REF_NONE == 0, "check this code");
+ __ cmp_zero_and_br(Assembler::equal, tmp_reg, _continuation, /*annul*/false, Assembler::pt);
__ delayed()->nop();
// Is marking active?
@@ -498,13 +492,8 @@
assert(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption");
__ ldsb(in_progress, tmp_reg);
}
- if (__ is_in_wdisp16_range(_continuation)) {
- __ br_on_reg_cond(Assembler::rc_z, /*annul*/false, Assembler::pt,
- tmp_reg, _continuation);
- } else {
- __ cmp(tmp_reg, G0);
- __ brx(Assembler::equal, false, Assembler::pt, _continuation);
- }
+
+ __ cmp_zero_and_br(Assembler::equal, tmp_reg, _continuation, /*annul*/false, Assembler::pt);
__ delayed()->nop();
// val == null?
@@ -512,8 +501,7 @@
Register val_reg = val()->as_register();
if (__ is_in_wdisp16_range(_continuation)) {
- __ br_on_reg_cond(Assembler::rc_z, /*annul*/false, Assembler::pt,
- val_reg, _continuation);
+ __ br_null(val_reg, /*annul*/false, Assembler::pt, _continuation);
} else {
__ cmp(val_reg, G0);
__ brx(Assembler::equal, false, Assembler::pt, _continuation);
@@ -542,9 +530,9 @@
assert(new_val()->is_register(), "Precondition.");
Register addr_reg = addr()->as_pointer_register();
Register new_val_reg = new_val()->as_register();
+
if (__ is_in_wdisp16_range(_continuation)) {
- __ br_on_reg_cond(Assembler::rc_z, /*annul*/false, Assembler::pt,
- new_val_reg, _continuation);
+ __ br_null(new_val_reg, /*annul*/false, Assembler::pt, _continuation);
} else {
__ cmp(new_val_reg, G0);
__ brx(Assembler::equal, false, Assembler::pn, _continuation);
--- a/hotspot/src/cpu/sparc/vm/c1_Runtime1_sparc.cpp Fri Aug 26 16:11:25 2011 -0700
+++ b/hotspot/src/cpu/sparc/vm/c1_Runtime1_sparc.cpp Wed Aug 31 23:55:58 2011 -0700
@@ -834,14 +834,16 @@
int satb_q_buf_byte_offset =
in_bytes(JavaThread::satb_mark_queue_offset() +
PtrQueue::byte_offset_of_buf());
+
__ bind(restart);
+ // Load the index into the SATB buffer. PtrQueue::_index is a
+ // size_t so ld_ptr is appropriate
__ ld_ptr(G2_thread, satb_q_index_byte_offset, tmp);
- __ br_on_reg_cond(Assembler::rc_z, /*annul*/false,
- Assembler::pn, tmp, refill);
+ // index == 0?
+ __ cmp_and_brx_short(tmp, G0, Assembler::equal, Assembler::pn, refill);
- // If the branch is taken, no harm in executing this in the delay slot.
- __ delayed()->ld_ptr(G2_thread, satb_q_buf_byte_offset, tmp2);
+ __ ld_ptr(G2_thread, satb_q_buf_byte_offset, tmp2);
__ sub(tmp, oopSize, tmp);
__ st_ptr(pre_val, tmp2, tmp); // [_buf + index] := <address_of_card>
@@ -901,11 +903,8 @@
__ set(rs, cardtable); // cardtable := <card table base>
__ ldub(addr, cardtable, tmp); // tmp := [addr + cardtable]
- __ br_on_reg_cond(Assembler::rc_nz, /*annul*/false, Assembler::pt,
- tmp, not_already_dirty);
- // Get cardtable + tmp into a reg by itself -- useful in the take-the-branch
- // case, harmless if not.
- __ delayed()->add(addr, cardtable, tmp2);
+ assert(CardTableModRefBS::dirty_card_val() == 0, "otherwise check this code");
+ __ cmp_and_br_short(tmp, G0, Assembler::notEqual, Assembler::pt, not_already_dirty);
// We didn't take the branch, so we're already dirty: return.
// Use return-from-leaf
@@ -914,6 +913,10 @@
// Not dirty.
__ bind(not_already_dirty);
+
+ // Get addr + cardtable into a reg by itself
+ __ add(addr, cardtable, tmp2);
+
// First, dirty it.
__ stb(G0, tmp2, 0); // [cardPtr] := 0 (i.e., dirty).
@@ -929,13 +932,17 @@
int dirty_card_q_buf_byte_offset =
in_bytes(JavaThread::dirty_card_queue_offset() +
PtrQueue::byte_offset_of_buf());
+
__ bind(restart);
+
+ // Get the index into the update buffer. PtrQueue::_index is
+ // a size_t so ld_ptr is appropriate here.
__ ld_ptr(G2_thread, dirty_card_q_index_byte_offset, tmp3);
- __ br_on_reg_cond(Assembler::rc_z, /*annul*/false, Assembler::pn,
- tmp3, refill);
- // If the branch is taken, no harm in executing this in the delay slot.
- __ delayed()->ld_ptr(G2_thread, dirty_card_q_buf_byte_offset, tmp4);
+ // index == 0?
+ __ cmp_and_brx_short(tmp3, G0, Assembler::equal, Assembler::pn, refill);
+
+ __ ld_ptr(G2_thread, dirty_card_q_buf_byte_offset, tmp4);
__ sub(tmp3, oopSize, tmp3);
__ st_ptr(tmp2, tmp4, tmp3); // [_buf + index] := <address_of_card>
--- a/hotspot/src/os/linux/vm/os_linux.cpp Fri Aug 26 16:11:25 2011 -0700
+++ b/hotspot/src/os/linux/vm/os_linux.cpp Wed Aug 31 23:55:58 2011 -0700
@@ -125,10 +125,6 @@
# include <inttypes.h>
# include <sys/ioctl.h>
-#ifdef AMD64
-#include <asm/vsyscall.h>
-#endif
-
#define MAX_PATH (2 * K)
// for timer info max values which include all bits
@@ -2502,7 +2498,13 @@
int prot = exec ? PROT_READ|PROT_WRITE|PROT_EXEC : PROT_READ|PROT_WRITE;
uintptr_t res = (uintptr_t) ::mmap(addr, size, prot,
MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0);
- return res != (uintptr_t) MAP_FAILED;
+ if (res != (uintptr_t) MAP_FAILED) {
+ if (UseNUMAInterleaving) {
+ numa_make_global(addr, size);
+ }
+ return true;
+ }
+ return false;
}
// Define MAP_HUGETLB here so we can build HotSpot on old systems.
@@ -2523,7 +2525,13 @@
(uintptr_t) ::mmap(addr, size, prot,
MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS|MAP_HUGETLB,
-1, 0);
- return res != (uintptr_t) MAP_FAILED;
+ if (res != (uintptr_t) MAP_FAILED) {
+ if (UseNUMAInterleaving) {
+ numa_make_global(addr, size);
+ }
+ return true;
+ }
+ return false;
}
return commit_memory(addr, size, exec);
@@ -2588,8 +2596,17 @@
int retval = -1;
#if defined(IA32)
+# ifndef SYS_getcpu
+# define SYS_getcpu 318
+# endif
retval = syscall(SYS_getcpu, &cpu, NULL, NULL);
#elif defined(AMD64)
+// Unfortunately we have to bring all these macros here from vsyscall.h
+// to be able to compile on old linuxes.
+# define __NR_vgetcpu 2
+# define VSYSCALL_START (-10UL << 20)
+# define VSYSCALL_SIZE 1024
+# define VSYSCALL_ADDR(vsyscall_nr) (VSYSCALL_START+VSYSCALL_SIZE*(vsyscall_nr))
typedef long (*vgetcpu_t)(unsigned int *cpu, unsigned int *node, unsigned long *tcache);
vgetcpu_t vgetcpu = (vgetcpu_t)VSYSCALL_ADDR(__NR_vgetcpu);
retval = vgetcpu(&cpu, NULL, NULL);
@@ -3115,6 +3132,10 @@
return NULL;
}
+ if ((addr != NULL) && UseNUMAInterleaving) {
+ numa_make_global(addr, bytes);
+ }
+
return addr;
}
--- a/hotspot/src/os/solaris/vm/os_solaris.cpp Fri Aug 26 16:11:25 2011 -0700
+++ b/hotspot/src/os/solaris/vm/os_solaris.cpp Wed Aug 31 23:55:58 2011 -0700
@@ -2777,8 +2777,14 @@
bool os::commit_memory(char* addr, size_t bytes, bool exec) {
int prot = exec ? PROT_READ|PROT_WRITE|PROT_EXEC : PROT_READ|PROT_WRITE;
size_t size = bytes;
- return
- NULL != Solaris::mmap_chunk(addr, size, MAP_PRIVATE|MAP_FIXED, prot);
+ char *res = Solaris::mmap_chunk(addr, size, MAP_PRIVATE|MAP_FIXED, prot);
+ if (res != NULL) {
+ if (UseNUMAInterleaving) {
+ numa_make_global(addr, bytes);
+ }
+ return true;
+ }
+ return false;
}
bool os::commit_memory(char* addr, size_t bytes, size_t alignment_hint,
@@ -3389,12 +3395,11 @@
return true;
}
-char* os::reserve_memory_special(size_t bytes, char* addr, bool exec) {
+char* os::reserve_memory_special(size_t size, char* addr, bool exec) {
// "exec" is passed in but not used. Creating the shared image for
// the code cache doesn't have an SHM_X executable permission to check.
assert(UseLargePages && UseISM, "only for ISM large pages");
- size_t size = bytes;
char* retAddr = NULL;
int shmid;
key_t ismKey;
@@ -3436,7 +3441,9 @@
}
return NULL;
}
-
+ if ((retAddr != NULL) && UseNUMAInterleaving) {
+ numa_make_global(retAddr, size);
+ }
return retAddr;
}
--- a/hotspot/src/os/windows/vm/os_windows.cpp Fri Aug 26 16:11:25 2011 -0700
+++ b/hotspot/src/os/windows/vm/os_windows.cpp Wed Aug 31 23:55:58 2011 -0700
@@ -2614,6 +2614,57 @@
static HANDLE _hProcess;
static HANDLE _hToken;
+// Container for the list of NUMA nodes covered by the process affinity mask
+class NUMANodeListHolder {
+private:
+  int *_numa_used_node_list;  // C-heap array, allocated in build()
+  int _numa_used_node_count;  // number of valid entries in the list
+
+  void free_node_list() {
+    if (_numa_used_node_list != NULL) {
+      FREE_C_HEAP_ARRAY(int, _numa_used_node_list);
+    }
+  }
+
+public:
+  NUMANodeListHolder() {
+    _numa_used_node_count = 0;
+    _numa_used_node_list = NULL;
+    // do rest of initialization in build routine (after function pointers are set up)
+  }
+
+  ~NUMANodeListHolder() { free_node_list(); }
+
+  // (Re)builds the list; returns true only if more than one usable node was found.
+  bool build() {
+    DWORD_PTR proc_aff_mask;
+    DWORD_PTR sys_aff_mask;
+    if (!GetProcessAffinityMask(GetCurrentProcess(), &proc_aff_mask, &sys_aff_mask)) return false;
+    ULONG highest_node_number;
+    if (!os::Kernel32Dll::GetNumaHighestNodeNumber(&highest_node_number)) return false;
+    free_node_list();
+    _numa_used_node_count = 0;  // a rebuild must not append behind stale entries
+    // the loop below visits nodes 0..highest_node_number inclusive, hence the "+ 1"
+    _numa_used_node_list = NEW_C_HEAP_ARRAY(int, highest_node_number + 1);
+    for (unsigned int i = 0; i <= highest_node_number; i++) {
+      ULONGLONG proc_mask_numa_node;
+      if (!os::Kernel32Dll::GetNumaNodeProcessorMask(i, &proc_mask_numa_node)) return false;
+      if ((proc_aff_mask & proc_mask_numa_node)!=0) {
+        _numa_used_node_list[_numa_used_node_count++] = i;
+      }
+    }
+    return (_numa_used_node_count > 1);
+  }
+
+  int get_count() {return _numa_used_node_count;}
+  int get_node_list_entry(int n) {
+    // for indexes out of range, returns -1
+    return (n < _numa_used_node_count ? _numa_used_node_list[n] : -1);
+  }
+} numa_node_list_holder;
+
+
+
static size_t _large_page_size = 0;
static bool resolve_functions_for_large_page_init() {
@@ -2653,6 +2704,154 @@
_hToken = NULL;
}
+// Sets up NUMA interleaving; returns false (after optional warnings) if it cannot be used.
+static bool numa_interleaving_init() {
+  bool success = false;
+  bool use_numa_specified = !FLAG_IS_DEFAULT(UseNUMA);
+  bool use_numa_interleaving_specified = !FLAG_IS_DEFAULT(UseNUMAInterleaving);
+  // print a warning if UseNUMA or UseNUMAInterleaving flag is specified on command line
+  bool warn_on_failure = use_numa_specified || use_numa_interleaving_specified;
+# define WARN(msg) if (warn_on_failure) { warning(msg); }
+
+  // NUMAInterleaveGranularity cannot be less than vm_allocation_granularity (or _large_page_size if using large pages)
+  size_t min_interleave_granularity = UseLargePages ? _large_page_size : os::vm_allocation_granularity();
+  NUMAInterleaveGranularity = align_size_up(NUMAInterleaveGranularity, min_interleave_granularity);
+
+  if (os::Kernel32Dll::NumaCallsAvailable()) {
+    if (numa_node_list_holder.build()) {
+      if (PrintMiscellaneous && Verbose) {
+        tty->print("NUMA UsedNodeCount=%d, namely ", numa_node_list_holder.get_count()); // int, matches %d
+        for (int i = 0; i < numa_node_list_holder.get_count(); i++) {
+          tty->print("%d ", numa_node_list_holder.get_node_list_entry(i));
+        }
+        tty->print("\n");
+      }
+      success = true;
+    } else {
+      WARN("Process does not cover multiple NUMA nodes.");
+    }
+  } else {
+    WARN("NUMA Interleaving is not supported by the operating system.");
+  }
+  if (!success) {
+    if (use_numa_specified) WARN("...Ignoring UseNUMA flag.");
+    if (use_numa_interleaving_specified) WARN("...Ignoring UseNUMAInterleaving flag.");
+  }
+  return success;
+#undef WARN
+}
+
+// Allocates a contiguous VA range of 'bytes' bytes with a separate VirtualAlloc or
+// VirtualAllocExNuma call for each piece of the range, passing 'flags' and 'prot' to each call.
+// Returns the start of the range, or NULL on failure. Reasons for doing this:
+// * UseLargePagesIndividualAllocation was set (normally only needed on WS2003 but possible to be set otherwise)
+// * UseNUMAInterleaving requires a separate node for each piece
+static char* allocate_pages_individually(size_t bytes, char* addr, DWORD flags, DWORD prot,
+ bool should_inject_error=false) {
+ char * p_buf;
+ // note: at setup time we guaranteed that NUMAInterleaveGranularity was aligned up to a page size
+ size_t page_size = UseLargePages ? _large_page_size : os::vm_allocation_granularity();
+ size_t chunk_size = UseNUMAInterleaving ? NUMAInterleaveGranularity : page_size;
+
+ // first reserve enough address space in advance since we want to be
+ // able to break a single contiguous virtual address range into multiple
+ // per-chunk allocations but WS2003 does not allow reserving large page space
+ // so we just use a plain MEM_RESERVE for this probe, this gives us a legal contiguous
+ // address space. then we will deallocate that reservation, and re alloc
+ // it chunk by chunk in the loop below
+ const size_t size_of_reserve = bytes + chunk_size;
+ if (bytes > size_of_reserve) {
+ // Overflowed.
+ return NULL;
+ }
+ p_buf = (char *) VirtualAlloc(addr,
+ size_of_reserve, // size of Reserve
+ MEM_RESERVE,
+ PAGE_READWRITE);
+ // If reservation failed, return NULL
+ if (p_buf == NULL) return NULL;
+
+ os::release_memory(p_buf, bytes + chunk_size);
+
+ // we still need to round up to a page boundary (in case we are using large pages)
+ // but not to a chunk boundary (in case InterleavingGranularity doesn't align with page size)
+ // instead we handle this in the bytes_to_rq computation below
+ p_buf = (char *) align_size_up((size_t)p_buf, page_size);
+
+ // now go through and allocate one chunk at a time until all bytes are
+ // allocated
+ size_t bytes_remaining = bytes;
+ // An overflow of align_size_up() would have been caught above
+ // in the calculation of size_of_reserve.
+ char * next_alloc_addr = p_buf;
+ HANDLE hProc = GetCurrentProcess();
+
+#ifdef ASSERT
+ // Variable for the failure injection
+ long ran_num = os::random();
+ size_t fail_after = ran_num % bytes;
+#endif
+
+ int count=0;
+ while (bytes_remaining) {
+ // select bytes_to_rq to get to the next chunk_size boundary
+
+ size_t bytes_to_rq = MIN2(bytes_remaining, chunk_size - ((size_t)next_alloc_addr % chunk_size));
+ // Note: whether this call only reserves or also commits depends on 'flags'
+ char * p_new;
+
+#ifdef ASSERT
+ bool inject_error_now = should_inject_error && (bytes_remaining <= fail_after);
+#else
+ const bool inject_error_now = false;
+#endif
+
+ if (inject_error_now) {
+ p_new = NULL;
+ } else {
+ if (!UseNUMAInterleaving) {
+ p_new = (char *) VirtualAlloc(next_alloc_addr,
+ bytes_to_rq,
+ flags,
+ prot);
+ } else {
+ // get the next node to use from the used_node_list (round-robin via count)
+ DWORD node = numa_node_list_holder.get_node_list_entry(count % os::numa_get_groups_num());
+ p_new = (char *)os::Kernel32Dll::VirtualAllocExNuma(hProc,
+ next_alloc_addr,
+ bytes_to_rq,
+ flags,
+ prot,
+ node);
+ }
+ }
+
+ if (p_new == NULL) {
+ // Undo the chunks allocated by earlier iterations of this loop
+ if (next_alloc_addr > p_buf) {
+ // NOTE(review): chunks came from separate calls; confirm one release_memory call frees them all
+ size_t bytes_to_release = bytes - bytes_remaining;
+ os::release_memory(p_buf, bytes_to_release);
+ }
+#ifdef ASSERT
+ if (should_inject_error) {
+ if (TracePageSizes && Verbose) {
+ tty->print_cr("Reserving pages individually failed.");
+ }
+ }
+#endif
+ return NULL;
+ }
+ bytes_remaining -= bytes_to_rq;
+ next_alloc_addr += bytes_to_rq;
+ count++;
+ }
+ // made it this far, success
+ return p_buf;
+}
+
+
+
void os::large_page_init() {
if (!UseLargePages) return;
@@ -2722,9 +2921,30 @@
assert((size_t)addr % os::vm_allocation_granularity() == 0,
"reserve alignment");
assert(bytes % os::vm_allocation_granularity() == 0, "reserve block size");
- char* res = (char*)VirtualAlloc(addr, bytes, MEM_RESERVE, PAGE_READWRITE);
+ char* res;
+ // note that if UseLargePages is on, all the areas that require interleaving
+ // will go thru reserve_memory_special rather than thru here.
+ bool use_individual = (UseNUMAInterleaving && !UseLargePages);
+ if (!use_individual) {
+ res = (char*)VirtualAlloc(addr, bytes, MEM_RESERVE, PAGE_READWRITE);
+ } else {
+ elapsedTimer reserveTimer;
+ if( Verbose && PrintMiscellaneous ) reserveTimer.start();
+ // in numa interleaving, we have to allocate pages individually
+ // (well really chunks of NUMAInterleaveGranularity size)
+ res = allocate_pages_individually(bytes, addr, MEM_RESERVE, PAGE_READWRITE);
+ if (res == NULL) {
+ warning("NUMA page allocation failed");
+ }
+ if( Verbose && PrintMiscellaneous ) {
+ reserveTimer.stop();
+ tty->print_cr("reserve_memory of %Ix bytes took %ld ms (%ld ticks)", bytes,
+ reserveTimer.milliseconds(), reserveTimer.ticks());
+ }
+ }
assert(res == NULL || addr == NULL || addr == res,
"Unexpected address from reserve.");
+
return res;
}
@@ -2754,92 +2974,27 @@
char* os::reserve_memory_special(size_t bytes, char* addr, bool exec) {
const DWORD prot = exec ? PAGE_EXECUTE_READWRITE : PAGE_READWRITE;
-
- if (UseLargePagesIndividualAllocation) {
+ const DWORD flags = MEM_RESERVE | MEM_COMMIT | MEM_LARGE_PAGES;
+
+ // with large pages, there are two cases where we need to use Individual Allocation
+ // 1) the UseLargePagesIndividualAllocation flag is set (set by default on WS2003)
+ // 2) NUMA Interleaving is enabled, in which case we use a different node for each page
+ if (UseLargePagesIndividualAllocation || UseNUMAInterleaving) {
if (TracePageSizes && Verbose) {
tty->print_cr("Reserving large pages individually.");
}
- char * p_buf;
- // first reserve enough address space in advance since we want to be
- // able to break a single contiguous virtual address range into multiple
- // large page commits but WS2003 does not allow reserving large page space
- // so we just use 4K pages for reserve, this gives us a legal contiguous
- // address space. then we will deallocate that reservation, and re alloc
- // using large pages
- const size_t size_of_reserve = bytes + _large_page_size;
- if (bytes > size_of_reserve) {
- // Overflowed.
- warning("Individually allocated large pages failed, "
- "use -XX:-UseLargePagesIndividualAllocation to turn off");
+ char * p_buf = allocate_pages_individually(bytes, addr, flags, prot, LargePagesIndividualAllocationInjectError);
+ if (p_buf == NULL) {
+ // give an appropriate warning message
+ if (UseNUMAInterleaving) {
+ warning("NUMA large page allocation failed, UseLargePages flag ignored");
+ }
+ if (UseLargePagesIndividualAllocation) {
+ warning("Individually allocated large pages failed, "
+ "use -XX:-UseLargePagesIndividualAllocation to turn off");
+ }
return NULL;
}
- p_buf = (char *) VirtualAlloc(addr,
- size_of_reserve, // size of Reserve
- MEM_RESERVE,
- PAGE_READWRITE);
- // If reservation failed, return NULL
- if (p_buf == NULL) return NULL;
-
- release_memory(p_buf, bytes + _large_page_size);
- // round up to page boundary. If the size_of_reserve did not
- // overflow and the reservation did not fail, this align up
- // should not overflow.
- p_buf = (char *) align_size_up((size_t)p_buf, _large_page_size);
-
- // now go through and allocate one page at a time until all bytes are
- // allocated
- size_t bytes_remaining = align_size_up(bytes, _large_page_size);
- // An overflow of align_size_up() would have been caught above
- // in the calculation of size_of_reserve.
- char * next_alloc_addr = p_buf;
-
-#ifdef ASSERT
- // Variable for the failure injection
- long ran_num = os::random();
- size_t fail_after = ran_num % bytes;
-#endif
-
- while (bytes_remaining) {
- size_t bytes_to_rq = MIN2(bytes_remaining, _large_page_size);
- // Note allocate and commit
- char * p_new;
-
-#ifdef ASSERT
- bool inject_error = LargePagesIndividualAllocationInjectError &&
- (bytes_remaining <= fail_after);
-#else
- const bool inject_error = false;
-#endif
-
- if (inject_error) {
- p_new = NULL;
- } else {
- p_new = (char *) VirtualAlloc(next_alloc_addr,
- bytes_to_rq,
- MEM_RESERVE | MEM_COMMIT | MEM_LARGE_PAGES,
- prot);
- }
-
- if (p_new == NULL) {
- // Free any allocated pages
- if (next_alloc_addr > p_buf) {
- // Some memory was committed so release it.
- size_t bytes_to_release = bytes - bytes_remaining;
- release_memory(p_buf, bytes_to_release);
- }
-#ifdef ASSERT
- if (UseLargePagesIndividualAllocation &&
- LargePagesIndividualAllocationInjectError) {
- if (TracePageSizes && Verbose) {
- tty->print_cr("Reserving large pages individually failed.");
- }
- }
-#endif
- return NULL;
- }
- bytes_remaining -= bytes_to_rq;
- next_alloc_addr += bytes_to_rq;
- }
return p_buf;
@@ -2867,14 +3022,43 @@
assert(bytes % os::vm_page_size() == 0, "commit in page-sized chunks");
// Don't attempt to print anything if the OS call fails. We're
// probably low on resources, so the print itself may cause crashes.
- bool result = VirtualAlloc(addr, bytes, MEM_COMMIT, PAGE_READWRITE) != 0;
- if (result != NULL && exec) {
- DWORD oldprot;
- // Windows doc says to use VirtualProtect to get execute permissions
- return VirtualProtect(addr, bytes, PAGE_EXECUTE_READWRITE, &oldprot) != 0;
+
+ // unless we have NUMAInterleaving enabled, the range of a commit
+ // is always within a reserve covered by a single VirtualAlloc
+ // in that case we can just do a single commit for the requested size
+ if (!UseNUMAInterleaving) {
+ if (VirtualAlloc(addr, bytes, MEM_COMMIT, PAGE_READWRITE) == NULL) return false;
+ if (exec) {
+ DWORD oldprot;
+ // Windows doc says to use VirtualProtect to get execute permissions
+ if (!VirtualProtect(addr, bytes, PAGE_EXECUTE_READWRITE, &oldprot)) return false;
+ }
+ return true;
} else {
- return result;
- }
+
+ // when NUMAInterleaving is enabled, the commit might cover a range that
+ // came from multiple VirtualAlloc reserves (using allocate_pages_individually).
+ // VirtualQuery can help us determine that. The RegionSize that VirtualQuery
+ // returns represents the number of bytes that can be committed in one step.
+ size_t bytes_remaining = bytes;
+ char * next_alloc_addr = addr;
+ while (bytes_remaining > 0) {
+ MEMORY_BASIC_INFORMATION alloc_info;
+ VirtualQuery(next_alloc_addr, &alloc_info, sizeof(alloc_info));
+ size_t bytes_to_rq = MIN2(bytes_remaining, (size_t)alloc_info.RegionSize);
+ if (VirtualAlloc(next_alloc_addr, bytes_to_rq, MEM_COMMIT, PAGE_READWRITE) == NULL)
+ return false;
+ if (exec) {
+ DWORD oldprot;
+ if (!VirtualProtect(next_alloc_addr, bytes_to_rq, PAGE_EXECUTE_READWRITE, &oldprot))
+ return false;
+ }
+ bytes_remaining -= bytes_to_rq;
+ next_alloc_addr += bytes_to_rq;
+ }
+ }
+ // if we made it this far, return true
+ return true;
}
bool os::commit_memory(char* addr, size_t size, size_t alignment_hint,
@@ -2948,14 +3132,15 @@
void os::numa_make_global(char *addr, size_t bytes) { }
void os::numa_make_local(char *addr, size_t bytes, int lgrp_hint) { }
bool os::numa_topology_changed() { return false; }
-size_t os::numa_get_groups_num() { return 1; }
+size_t os::numa_get_groups_num() { return numa_node_list_holder.get_count(); }
int os::numa_get_group_id() { return 0; }
size_t os::numa_get_leaf_groups(int *ids, size_t size) {
- if (size > 0) {
- ids[0] = 0;
- return 1;
- }
- return 0;
+ // check for size bigger than actual groups_num
+ size = MIN2(size, numa_get_groups_num());
+ for (int i = 0; i < (int)size; i++) {
+ ids[i] = numa_node_list_holder.get_node_list_entry(i);
+ }
+ return size;
}
bool os::get_page_info(char *start, page_info* info) {
@@ -3480,7 +3665,7 @@
if(Verbose && PrintMiscellaneous)
tty->print("[Memory Serialize Page address: " INTPTR_FORMAT "]\n", (intptr_t)mem_serialize_page);
#endif
-}
+ }
os::large_page_init();
@@ -3583,8 +3768,10 @@
// initialize thread priority policy
prio_init();
- if (UseNUMA && !ForceNUMA) {
- UseNUMA = false; // Currently unsupported.
+ if (UseNUMAInterleaving) {
+ // first check whether this Windows OS supports VirtualAllocExNuma, if not ignore this flag
+ bool success = numa_interleaving_init();
+ if (!success) UseNUMAInterleaving = false;
}
return JNI_OK;
@@ -4758,7 +4945,14 @@
// Kernel32 API
typedef SIZE_T (WINAPI* GetLargePageMinimum_Fn)(void);
+typedef LPVOID (WINAPI *VirtualAllocExNuma_Fn) (HANDLE, LPVOID, SIZE_T, DWORD, DWORD, DWORD);
+typedef BOOL (WINAPI *GetNumaHighestNodeNumber_Fn) (PULONG);
+typedef BOOL (WINAPI *GetNumaNodeProcessorMask_Fn) (UCHAR, PULONGLONG);
+
GetLargePageMinimum_Fn os::Kernel32Dll::_GetLargePageMinimum = NULL;
+VirtualAllocExNuma_Fn os::Kernel32Dll::_VirtualAllocExNuma = NULL;
+GetNumaHighestNodeNumber_Fn os::Kernel32Dll::_GetNumaHighestNodeNumber = NULL;
+GetNumaNodeProcessorMask_Fn os::Kernel32Dll::_GetNumaNodeProcessorMask = NULL;
BOOL os::Kernel32Dll::initialized = FALSE;
SIZE_T os::Kernel32Dll::GetLargePageMinimum() {
assert(initialized && _GetLargePageMinimum != NULL,
@@ -4773,16 +4967,53 @@
return _GetLargePageMinimum != NULL;
}
+BOOL os::Kernel32Dll::NumaCallsAvailable() {
+  // Resolve on first use; every NUMA wrapper needs its own pointer resolved.
+  if (!initialized) initialize();
+  return _VirtualAllocExNuma != NULL && _GetNumaHighestNodeNumber != NULL &&
+         _GetNumaNodeProcessorMask != NULL;
+}
+
+LPVOID os::Kernel32Dll::VirtualAllocExNuma(HANDLE hProc, LPVOID addr, SIZE_T bytes, DWORD flags, DWORD prot, DWORD node) {
+  assert(initialized && _VirtualAllocExNuma != NULL,
+         "NumaCallsAvailable() not yet called");
+  // Forward unchanged to the entry point looked up from Kernel32.dll.
+  return _VirtualAllocExNuma(hProc, addr, bytes, flags, prot, node);
+}
+
+BOOL os::Kernel32Dll::GetNumaHighestNodeNumber(PULONG ptr_highest_node_number) {
+  assert(initialized && _GetNumaHighestNodeNumber != NULL,
+         "NumaCallsAvailable() not yet called");
+  // Forward unchanged to the entry point looked up from Kernel32.dll.
+  return _GetNumaHighestNodeNumber(ptr_highest_node_number);
+}
+
+BOOL os::Kernel32Dll::GetNumaNodeProcessorMask(UCHAR node, PULONGLONG proc_mask) {
+  assert(initialized && _GetNumaNodeProcessorMask != NULL,
+         "NumaCallsAvailable() not yet called");
+  // Forward unchanged to the entry point looked up from Kernel32.dll.
+  return _GetNumaNodeProcessorMask(node, proc_mask);
+}
+
+
+void os::Kernel32Dll::initializeCommon() {
+  // Look the Kernel32 entry points up once; later calls are no-ops.
+  if (initialized) return;
+  HMODULE kernel32 = ::GetModuleHandle("Kernel32.dll");
+  assert(kernel32 != NULL, "Just check");
+  _GetLargePageMinimum = (GetLargePageMinimum_Fn)::GetProcAddress(kernel32, "GetLargePageMinimum");
+  _VirtualAllocExNuma = (VirtualAllocExNuma_Fn)::GetProcAddress(kernel32, "VirtualAllocExNuma");
+  _GetNumaHighestNodeNumber = (GetNumaHighestNodeNumber_Fn)::GetProcAddress(kernel32, "GetNumaHighestNodeNumber");
+  _GetNumaNodeProcessorMask = (GetNumaNodeProcessorMask_Fn)::GetProcAddress(kernel32, "GetNumaNodeProcessorMask");
+  initialized = TRUE;
+}
+
+
#ifndef JDK6_OR_EARLIER
void os::Kernel32Dll::initialize() {
- if (!initialized) {
- HMODULE handle = ::GetModuleHandle("Kernel32.dll");
- assert(handle != NULL, "Just check");
- _GetLargePageMinimum = (GetLargePageMinimum_Fn)::GetProcAddress(handle, "GetLargePageMinimum");
- initialized = TRUE;
- }
+ initializeCommon();
}
@@ -4887,18 +5118,19 @@
Module32Next_Fn os::Kernel32Dll::_Module32Next = NULL;
GetNativeSystemInfo_Fn os::Kernel32Dll::_GetNativeSystemInfo = NULL;
+
void os::Kernel32Dll::initialize() {
if (!initialized) {
HMODULE handle = ::GetModuleHandle("Kernel32.dll");
assert(handle != NULL, "Just check");
_SwitchToThread = (SwitchToThread_Fn)::GetProcAddress(handle, "SwitchToThread");
- _GetLargePageMinimum = (GetLargePageMinimum_Fn)::GetProcAddress(handle, "GetLargePageMinimum");
_CreateToolhelp32Snapshot = (CreateToolhelp32Snapshot_Fn)
::GetProcAddress(handle, "CreateToolhelp32Snapshot");
_Module32First = (Module32First_Fn)::GetProcAddress(handle, "Module32First");
_Module32Next = (Module32Next_Fn)::GetProcAddress(handle, "Module32Next");
_GetNativeSystemInfo = (GetNativeSystemInfo_Fn)::GetProcAddress(handle, "GetNativeSystemInfo");
+ initializeCommon(); // resolve the functions that always need resolving
initialized = TRUE;
}
@@ -4964,6 +5196,8 @@
_GetNativeSystemInfo(lpSystemInfo);
}
+
+
// PSAPI API
--- a/hotspot/src/os/windows/vm/os_windows.hpp Fri Aug 26 16:11:25 2011 -0700
+++ b/hotspot/src/os/windows/vm/os_windows.hpp Wed Aug 31 23:55:58 2011 -0700
@@ -173,13 +173,25 @@
static BOOL GetNativeSystemInfoAvailable();
static void GetNativeSystemInfo(LPSYSTEM_INFO);
+ // NUMA calls
+ static BOOL NumaCallsAvailable();
+ static LPVOID VirtualAllocExNuma(HANDLE, LPVOID, SIZE_T, DWORD, DWORD, DWORD);
+ static BOOL GetNumaHighestNodeNumber(PULONG);
+ static BOOL GetNumaNodeProcessorMask(UCHAR, PULONGLONG);
+
private:
// GetLargePageMinimum available on Windows Vista/Windows Server 2003
// and later
+ // NUMA calls available on Windows Vista/Windows Server 2008 and later
+
static SIZE_T (WINAPI *_GetLargePageMinimum)(void);
+ static LPVOID (WINAPI *_VirtualAllocExNuma) (HANDLE, LPVOID, SIZE_T, DWORD, DWORD, DWORD);
+ static BOOL (WINAPI *_GetNumaHighestNodeNumber) (PULONG);
+ static BOOL (WINAPI *_GetNumaNodeProcessorMask) (UCHAR, PULONGLONG);
static BOOL initialized;
static void initialize();
+ static void initializeCommon();
#ifdef JDK6_OR_EARLIER
private:
--- a/hotspot/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp Fri Aug 26 16:11:25 2011 -0700
+++ b/hotspot/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp Wed Aug 31 23:55:58 2011 -0700
@@ -4069,6 +4069,23 @@
}
#endif // PRODUCT
+G1ParGCAllocBuffer::G1ParGCAllocBuffer(size_t gclab_word_size) :
+  ParGCAllocBuffer(gclab_word_size),
+  _retired(false),
+  _should_mark_objects(false),
+  _bitmap(G1CollectedHeap::heap()->reserved_region().start(), gclab_word_size)
+{
+  // Initializers are listed in member declaration order, which is the order
+  // in which they run.
+  // _should_mark_objects tells G1ParCopyHelper that the forwarded location
+  // of an evacuated object has to be marked. That is needed while marking is
+  // active (a mark must be propagated) and during an initial mark pause
+  // (objects immediately reachable from the roots must be marked).
+  G1CollectedHeap* const g1h = G1CollectedHeap::heap();
+  _should_mark_objects = g1h->mark_in_progress() ||
+                         g1h->g1_policy()->during_initial_mark_pause();
+}
+
G1ParScanThreadState::G1ParScanThreadState(G1CollectedHeap* g1h, int queue_num)
: _g1h(g1h),
_refs(g1h->task_queue(queue_num)),
@@ -4184,12 +4201,14 @@
G1ParClosureSuper::G1ParClosureSuper(G1CollectedHeap* g1, G1ParScanThreadState* par_scan_state) :
_g1(g1), _g1_rem(_g1->g1_rem_set()), _cm(_g1->concurrent_mark()),
- _par_scan_state(par_scan_state) { }
-
-template <class T> void G1ParCopyHelper::mark_forwardee(T* p) {
- // This is called _after_ do_oop_work has been called, hence after
- // the object has been relocated to its new location and *p points
- // to its new location.
+ _par_scan_state(par_scan_state),
+ _during_initial_mark(_g1->g1_policy()->during_initial_mark_pause()),
+ _mark_in_progress(_g1->mark_in_progress()) { }
+
+template <class T> void G1ParCopyHelper::mark_object(T* p) {
+ // This is called from do_oop_work for objects that are not
+ // in the collection set. Objects in the collection set
+ // are marked after they have been evacuated.
T heap_oop = oopDesc::load_heap_oop(p);
if (!oopDesc::is_null(heap_oop)) {
@@ -4201,7 +4220,7 @@
}
}
-oop G1ParCopyHelper::copy_to_survivor_space(oop old) {
+oop G1ParCopyHelper::copy_to_survivor_space(oop old, bool should_mark_copy) {
size_t word_sz = old->size();
HeapRegion* from_region = _g1->heap_region_containing_raw(old);
// +1 to make the -1 indexes valid...
@@ -4257,8 +4276,8 @@
obj->set_mark(m);
}
- // preserve "next" mark bit
- if (_g1->mark_in_progress() && !_g1->is_obj_ill(old)) {
+ // Mark the evacuated object or propagate "next" mark bit
+ if (should_mark_copy) {
if (!use_local_bitmaps ||
!_par_scan_state->alloc_buffer(alloc_purpose)->mark(obj_ptr)) {
// if we couldn't mark it on the local bitmap (this happens when
@@ -4266,11 +4285,12 @@
// the bullet and do the standard parallel mark
_cm->markAndGrayObjectIfNecessary(obj);
}
-#if 1
+
if (_g1->isMarkedNext(old)) {
+ // Unmark the object's old location so that marking
+ // doesn't think the old object is alive.
_cm->nextMarkBitMap()->parClear((HeapWord*)old);
}
-#endif
}
size_t* surv_young_words = _par_scan_state->surviving_young_words();
@@ -4293,26 +4313,62 @@
return obj;
}
-template <bool do_gen_barrier, G1Barrier barrier, bool do_mark_forwardee>
+template <bool do_gen_barrier, G1Barrier barrier, bool do_mark_object>
template <class T>
-void G1ParCopyClosure <do_gen_barrier, barrier, do_mark_forwardee>
+void G1ParCopyClosure<do_gen_barrier, barrier, do_mark_object>
::do_oop_work(T* p) {
oop obj = oopDesc::load_decode_heap_oop(p);
assert(barrier != G1BarrierRS || obj != NULL,
"Precondition: G1BarrierRS implies obj is nonNull");
+ // Marking:
+ // If the object is in the collection set, then the thread
+ // that copies the object should mark, or propagate the
+ // mark to, the evacuated object.
+ // If the object is not in the collection set then we
+ // should call the mark_object() method depending on the
+ // value of the template parameter do_mark_object (which will
+ // be true for root scanning closures during an initial mark
+ // pause).
+ // The mark_object() method first checks whether the object
+ // is marked and, if not, attempts to mark the object.
+
// here the null check is implicit in the cset_fast_test() test
if (_g1->in_cset_fast_test(obj)) {
if (obj->is_forwarded()) {
oopDesc::encode_store_heap_oop(p, obj->forwardee());
+ // If we are a root scanning closure during an initial
+ // mark pause (i.e. do_mark_object will be true) then
+ // we also need to handle marking of roots in the
+ // event of an evacuation failure. In the event of an
+ // evacuation failure, the object is forwarded to itself
+ // and not copied so let's mark it here.
+ if (do_mark_object && obj->forwardee() == obj) {
+ mark_object(p);
+ }
} else {
- oop copy_oop = copy_to_survivor_space(obj);
+ // We need to mark the copied object if we're a root scanning
+ // closure during an initial mark pause (i.e. do_mark_object
+ // will be true), or the object is already marked and we need
+ // to propagate the mark to the evacuated copy.
+ bool should_mark_copy = do_mark_object ||
+ _during_initial_mark ||
+ (_mark_in_progress && !_g1->is_obj_ill(obj));
+
+ oop copy_oop = copy_to_survivor_space(obj, should_mark_copy);
oopDesc::encode_store_heap_oop(p, copy_oop);
}
// When scanning the RS, we only care about objs in CS.
if (barrier == G1BarrierRS) {
_par_scan_state->update_rs(_from, p, _par_scan_state->queue_num());
}
+ } else {
+ // The object is not in collection set. If we're a root scanning
+ // closure during an initial mark pause (i.e. do_mark_object will
+ // be true) then attempt to mark the object.
+ if (do_mark_object) {
+ mark_object(p);
+ }
}
if (barrier == G1BarrierEvac && obj != NULL) {
--- a/hotspot/src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp Fri Aug 26 16:11:25 2011 -0700
+++ b/hotspot/src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp Wed Aug 31 23:55:58 2011 -0700
@@ -1715,26 +1715,22 @@
class G1ParGCAllocBuffer: public ParGCAllocBuffer {
private:
bool _retired;
- bool _during_marking;
+ bool _should_mark_objects;
GCLabBitMap _bitmap;
public:
- G1ParGCAllocBuffer(size_t gclab_word_size) :
- ParGCAllocBuffer(gclab_word_size),
- _during_marking(G1CollectedHeap::heap()->mark_in_progress()),
- _bitmap(G1CollectedHeap::heap()->reserved_region().start(), gclab_word_size),
- _retired(false)
- { }
+ G1ParGCAllocBuffer(size_t gclab_word_size);
inline bool mark(HeapWord* addr) {
guarantee(use_local_bitmaps, "invariant");
- assert(_during_marking, "invariant");
+ assert(_should_mark_objects, "invariant");
return _bitmap.mark(addr);
}
inline void set_buf(HeapWord* buf) {
- if (use_local_bitmaps && _during_marking)
+ if (use_local_bitmaps && _should_mark_objects) {
_bitmap.set_buffer(buf);
+ }
ParGCAllocBuffer::set_buf(buf);
_retired = false;
}
@@ -1742,7 +1738,7 @@
inline void retire(bool end_of_gc, bool retain) {
if (_retired)
return;
- if (use_local_bitmaps && _during_marking) {
+ if (use_local_bitmaps && _should_mark_objects) {
_bitmap.retire();
}
ParGCAllocBuffer::retire(end_of_gc, retain);
--- a/hotspot/src/share/vm/gc_implementation/g1/g1OopClosures.hpp Fri Aug 26 16:11:25 2011 -0700
+++ b/hotspot/src/share/vm/gc_implementation/g1/g1OopClosures.hpp Wed Aug 31 23:55:58 2011 -0700
@@ -50,6 +50,8 @@
G1RemSet* _g1_rem;
ConcurrentMark* _cm;
G1ParScanThreadState* _par_scan_state;
+ bool _during_initial_mark;
+ bool _mark_in_progress;
public:
G1ParClosureSuper(G1CollectedHeap* g1, G1ParScanThreadState* par_scan_state);
bool apply_to_weak_ref_discovered_field() { return true; }
@@ -102,8 +104,8 @@
class G1ParCopyHelper : public G1ParClosureSuper {
G1ParScanClosure *_scanner;
protected:
- template <class T> void mark_forwardee(T* p);
- oop copy_to_survivor_space(oop obj);
+ template <class T> void mark_object(T* p);
+ oop copy_to_survivor_space(oop obj, bool should_mark_copy);
public:
G1ParCopyHelper(G1CollectedHeap* g1, G1ParScanThreadState* par_scan_state,
G1ParScanClosure *scanner) :
@@ -111,7 +113,7 @@
};
template<bool do_gen_barrier, G1Barrier barrier,
- bool do_mark_forwardee>
+ bool do_mark_object>
class G1ParCopyClosure : public G1ParCopyHelper {
G1ParScanClosure _scanner;
template <class T> void do_oop_work(T* p);
@@ -120,8 +122,6 @@
_scanner(g1, par_scan_state), G1ParCopyHelper(g1, par_scan_state, &_scanner) { }
template <class T> void do_oop_nv(T* p) {
do_oop_work(p);
- if (do_mark_forwardee)
- mark_forwardee(p);
}
virtual void do_oop(oop* p) { do_oop_nv(p); }
virtual void do_oop(narrowOop* p) { do_oop_nv(p); }
--- a/hotspot/src/share/vm/gc_implementation/g1/g1_globals.hpp Fri Aug 26 16:11:25 2011 -0700
+++ b/hotspot/src/share/vm/gc_implementation/g1/g1_globals.hpp Wed Aug 31 23:55:58 2011 -0700
@@ -124,9 +124,6 @@
develop(bool, G1RSBarrierNullFilter, true, \
"If true, generate null-pointer filtering code in RS barrier") \
\
- develop(bool, G1PrintCTFilterStats, false, \
- "If true, print stats on RS filtering effectiveness") \
- \
develop(bool, G1DeferredRSUpdate, true, \
"If true, use deferred RS updates") \
\
--- a/hotspot/src/share/vm/gc_implementation/g1/g1_specialized_oop_closures.hpp Fri Aug 26 16:11:25 2011 -0700
+++ b/hotspot/src/share/vm/gc_implementation/g1/g1_specialized_oop_closures.hpp Wed Aug 31 23:55:58 2011 -0700
@@ -36,7 +36,7 @@
};
template<bool do_gen_barrier, G1Barrier barrier,
- bool do_mark_forwardee>
+ bool do_mark_object>
class G1ParCopyClosure;
class G1ParScanClosure;
class G1ParPushHeapRSClosure;
--- a/hotspot/src/share/vm/runtime/arguments.cpp Fri Aug 26 16:11:25 2011 -0700
+++ b/hotspot/src/share/vm/runtime/arguments.cpp Wed Aug 31 23:55:58 2011 -0700
@@ -1423,6 +1423,9 @@
if (FLAG_IS_DEFAULT(MinHeapDeltaBytes)) {
FLAG_SET_DEFAULT(MinHeapDeltaBytes, 64*M);
}
+ // For those collectors or operating systems (e.g., Windows) that do
+ // not support full UseNUMA, we will map to UseNUMAInterleaving for now
+ UseNUMAInterleaving = true;
}
}
--- a/hotspot/src/share/vm/runtime/globals.hpp Fri Aug 26 16:11:25 2011 -0700
+++ b/hotspot/src/share/vm/runtime/globals.hpp Wed Aug 31 23:55:58 2011 -0700
@@ -475,6 +475,12 @@
product(bool, UseNUMA, false, \
"Use NUMA if available") \
\
+ product(bool, UseNUMAInterleaving, false, \
+ "Interleave memory across NUMA nodes if available") \
+ \
+ product(uintx, NUMAInterleaveGranularity, 2*M, \
+ "Granularity to use for NUMA interleaving on Windows OS") \
+ \
product(bool, ForceNUMA, false, \
"Force NUMA optimizations on single-node/UMA systems") \
\