# HG changeset patch
# User tschatzl
# Date 1474018427 -7200
# Node ID e567be0973152935bc98fe1f90dcf8a02b57dc86
# Parent 3869072fc2e1c23c855f70af9e23fc6796d5648e
8157952: Parallelize Memory Pretouch
Summary: Use multiple threads to pre-touch memory for -XX:+AlwaysPreTouch, exploiting more of the available memory bandwidth
Reviewed-by: jmasa, sangheki

diff -r 3869072fc2e1 -r e567be097315 hotspot/src/share/vm/gc/g1/g1CollectedHeap.cpp
--- a/hotspot/src/share/vm/gc/g1/g1CollectedHeap.cpp	Thu Sep 15 12:10:43 2016 -0400
+++ b/hotspot/src/share/vm/gc/g1/g1CollectedHeap.cpp	Fri Sep 16 11:33:47 2016 +0200
@@ -1479,7 +1479,7 @@
                               "Capacity: " SIZE_FORMAT "B occupancy: " SIZE_FORMAT "B min_desired_capacity: " SIZE_FORMAT "B (" UINTX_FORMAT " %%)",
                               capacity_after_gc, used_after_gc, minimum_desired_capacity, MinHeapFreeRatio);
 
-    expand(expand_bytes);
+    expand(expand_bytes, _workers);
 
     // No expansion, now see if we want to shrink
   } else if (capacity_after_gc > maximum_desired_capacity) {
@@ -1599,7 +1599,7 @@
                             word_size * HeapWordSize);
 
 
-  if (expand(expand_bytes)) {
+  if (expand(expand_bytes, _workers)) {
     _hrm.verify_optional();
     _verifier->verify_region_sets_optional();
     return attempt_allocation_at_safepoint(word_size,
@@ -1609,7 +1609,7 @@
   return NULL;
 }
 
-bool G1CollectedHeap::expand(size_t expand_bytes, double* expand_time_ms) {
+bool G1CollectedHeap::expand(size_t expand_bytes, WorkGang* pretouch_workers, double* expand_time_ms) {
   size_t aligned_expand_bytes = ReservedSpace::page_align_size_up(expand_bytes);
   aligned_expand_bytes = align_size_up(aligned_expand_bytes,
                                        HeapRegion::GrainBytes);
@@ -1626,7 +1626,7 @@
   uint regions_to_expand = (uint)(aligned_expand_bytes / HeapRegion::GrainBytes);
   assert(regions_to_expand > 0, "Must expand by at least one region");
 
-  uint expanded_by = _hrm.expand_by(regions_to_expand);
+  uint expanded_by = _hrm.expand_by(regions_to_expand, pretouch_workers);
   if (expand_time_ms != NULL) {
     *expand_time_ms = (os::elapsedTime() - expand_heap_start_time_sec) * MILLIUNITS;
   }
@@ -1927,7 +1927,7 @@
   _cmThread = _cm->cmThread();
 
   // Now expand into the initial heap size.
-  if (!expand(init_byte_size)) {
+  if (!expand(init_byte_size, _workers)) {
     vm_shutdown_during_initialization("Failed to allocate initial heap.");
     return JNI_ENOMEM;
   }
@@ -3240,7 +3240,7 @@
       // No need for an ergo logging here,
       // expansion_amount() does this when it returns a value > 0.
       double expand_ms;
-      if (!expand(expand_bytes, &expand_ms)) {
+      if (!expand(expand_bytes, _workers, &expand_ms)) {
        // We failed to expand the heap. Cannot do anything about it.
       }
       g1_policy()->phase_times()->record_expand_heap_time(expand_ms);
diff -r 3869072fc2e1 -r e567be097315 hotspot/src/share/vm/gc/g1/g1CollectedHeap.hpp
--- a/hotspot/src/share/vm/gc/g1/g1CollectedHeap.hpp	Thu Sep 15 12:10:43 2016 -0400
+++ b/hotspot/src/share/vm/gc/g1/g1CollectedHeap.hpp	Fri Sep 16 11:33:47 2016 +0200
@@ -557,7 +557,7 @@
   // Returns true if the heap was expanded by the requested amount;
   // false otherwise.
   // (Rounds up to a HeapRegion boundary.)
-  bool expand(size_t expand_bytes, double* expand_time_ms = NULL);
+  bool expand(size_t expand_bytes, WorkGang* pretouch_workers = NULL, double* expand_time_ms = NULL);
 
   // Returns the PLAB statistics for a given destination.
  inline G1EvacStats* alloc_buffer_stats(InCSetState dest);
diff -r 3869072fc2e1 -r e567be097315 hotspot/src/share/vm/gc/g1/g1PageBasedVirtualSpace.cpp
--- a/hotspot/src/share/vm/gc/g1/g1PageBasedVirtualSpace.cpp	Thu Sep 15 12:10:43 2016 -0400
+++ b/hotspot/src/share/vm/gc/g1/g1PageBasedVirtualSpace.cpp	Fri Sep 16 11:33:47 2016 +0200
@@ -24,8 +24,10 @@
 
 #include "precompiled.hpp"
 #include "gc/g1/g1PageBasedVirtualSpace.hpp"
+#include "gc/shared/workgroup.hpp"
 #include "oops/markOop.hpp"
 #include "oops/oop.inline.hpp"
+#include "runtime/atomic.hpp"
 #include "runtime/os.inline.hpp"
 #include "services/memTracker.hpp"
 #include "utilities/bitMap.inline.hpp"
@@ -177,7 +179,7 @@
   guarantee(start_page < end_page,
             "Given start page " SIZE_FORMAT " is larger or equal to end page " SIZE_FORMAT, start_page, end_page);
 
-  os::pretouch_memory(page_start(start_page), bounded_end_addr(end_page));
+  os::pretouch_memory(page_start(start_page), bounded_end_addr(end_page), _page_size);
 }
 
 bool G1PageBasedVirtualSpace::commit(size_t start_page, size_t size_in_pages) {
@@ -198,9 +200,6 @@
   }
   _committed.set_range(start_page, end_page);
 
-  if (AlwaysPreTouch) {
-    pretouch_internal(start_page, end_page);
-  }
   return zero_filled;
 }
 
@@ -227,6 +226,53 @@
   _committed.clear_range(start_page, end_page);
 }
 
+class G1PretouchTask : public AbstractGangTask {
+private:
+  char* volatile _cur_addr;
+  char* const _start_addr;
+  char* const _end_addr;
+  size_t const _page_size;
+public:
+  G1PretouchTask(char* start_address, char* end_address, size_t page_size) :
+    AbstractGangTask("G1 PreTouch",
+                     Universe::is_fully_initialized() ? GCId::current_raw() :
+                                                        // During VM initialization there is
+                                                        // no GC cycle that this task can be
+                                                        // associated with.
+                                                        GCId::undefined()),
+    _cur_addr(start_address),
+    _start_addr(start_address),
+    _end_addr(end_address),
+    _page_size(page_size) {
+  }
+
+  virtual void work(uint worker_id) {
+    size_t const actual_chunk_size = MAX2(chunk_size(), _page_size);
+    while (true) {
+      char* touch_addr = (char*)Atomic::add_ptr((intptr_t)actual_chunk_size, (volatile void*) &_cur_addr) - actual_chunk_size;
+      if (touch_addr < _start_addr || touch_addr >= _end_addr) {
+        break;
+      }
+      char* end_addr = touch_addr + MIN2(actual_chunk_size, pointer_delta(_end_addr, touch_addr, sizeof(char)));
+      os::pretouch_memory(touch_addr, end_addr, _page_size);
+    }
+  }
+
+  static size_t chunk_size() { return PreTouchParallelChunkSize; }
+};
+
+void G1PageBasedVirtualSpace::pretouch(size_t start_page, size_t size_in_pages, WorkGang* pretouch_gang) {
+  guarantee(pretouch_gang != NULL, "No pretouch gang specified.");
+
+  size_t num_chunks = MAX2((size_t)1, size_in_pages * _page_size / MAX2(G1PretouchTask::chunk_size(), _page_size));
+
+  uint num_workers = MIN2((uint)num_chunks, pretouch_gang->active_workers());
+  G1PretouchTask cl(page_start(start_page), bounded_end_addr(start_page + size_in_pages), _page_size);
+  log_debug(gc, heap)("Running %s with %u workers for " SIZE_FORMAT " work units pre-touching " SIZE_FORMAT "B.",
+                      cl.name(), num_workers, num_chunks, size_in_pages * _page_size);
+  pretouch_gang->run_task(&cl, num_workers);
+}
+
 bool G1PageBasedVirtualSpace::contains(const void* p) const {
   return _low_boundary <= (const char*) p && (const char*) p < _high_boundary;
 }
diff -r 3869072fc2e1 -r e567be097315 hotspot/src/share/vm/gc/g1/g1PageBasedVirtualSpace.hpp
--- a/hotspot/src/share/vm/gc/g1/g1PageBasedVirtualSpace.hpp	Thu Sep 15 12:10:43 2016 -0400
+++ b/hotspot/src/share/vm/gc/g1/g1PageBasedVirtualSpace.hpp	Fri Sep 16 11:33:47 2016 +0200
@@ -30,6 +30,8 @@
 #include "memory/virtualspace.hpp"
 #include "utilities/bitMap.hpp"
 
+class WorkGang;
+
 // Virtual space management helper for a virtual space with an OS page allocation
 // granularity.
 // (De-)Allocation requests are always OS page aligned by passing a page index
@@ -117,6 +119,8 @@
   // Uncommit the given area of pages starting at start being size_in_pages large.
   void uncommit(size_t start_page, size_t size_in_pages);
 
+  void pretouch(size_t start_page, size_t size_in_pages, WorkGang* pretouch_gang = NULL);
+
   // Initialize the given reserved space with the given base address and the size
   // actually used.
   // Prefer to commit in page_size chunks.
diff -r 3869072fc2e1 -r e567be097315 hotspot/src/share/vm/gc/g1/g1RegionToSpaceMapper.cpp
--- a/hotspot/src/share/vm/gc/g1/g1RegionToSpaceMapper.cpp	Thu Sep 15 12:10:43 2016 -0400
+++ b/hotspot/src/share/vm/gc/g1/g1RegionToSpaceMapper.cpp	Fri Sep 16 11:33:47 2016 +0200
@@ -66,8 +66,12 @@
     guarantee(alloc_granularity >= page_size, "allocation granularity smaller than commit granularity");
   }
 
-  virtual void commit_regions(uint start_idx, size_t num_regions) {
-    bool zero_filled = _storage.commit((size_t)start_idx * _pages_per_region, num_regions * _pages_per_region);
+  virtual void commit_regions(uint start_idx, size_t num_regions, WorkGang* pretouch_gang) {
+    size_t const start_page = (size_t)start_idx * _pages_per_region;
+    bool zero_filled = _storage.commit(start_page, num_regions * _pages_per_region);
+    if (AlwaysPreTouch) {
+      _storage.pretouch(start_page, num_regions * _pages_per_region, pretouch_gang);
+    }
     _commit_map.set_range(start_idx, start_idx + num_regions);
     fire_on_commit(start_idx, num_regions, zero_filled);
   }
@@ -110,19 +114,38 @@
     _refcounts.initialize((HeapWord*)rs.base(), (HeapWord*)(rs.base() + align_size_up(rs.size(), page_size)), page_size);
   }
 
-  virtual void commit_regions(uint start_idx, size_t num_regions) {
+  virtual void commit_regions(uint start_idx, size_t num_regions, WorkGang* pretouch_gang) {
+    size_t const NoPage = ~(size_t)0;
+
+    size_t first_committed = NoPage;
+    size_t num_committed = 0;
+
+    bool all_zero_filled = true;
+
     for (uint i = start_idx; i < start_idx + num_regions; i++) {
       assert(!_commit_map.at(i), "Trying to commit storage at region %u that is already committed", i);
       size_t idx = region_idx_to_page_idx(i);
       uint old_refcount = _refcounts.get_by_index(idx);
+
       bool zero_filled = false;
       if (old_refcount == 0) {
+        if (first_committed == NoPage) {
+          first_committed = idx;
+          num_committed = 1;
+        } else {
+          num_committed++;
+        }
         zero_filled = _storage.commit(idx, 1);
       }
+      all_zero_filled &= zero_filled;
+
       _refcounts.set_by_index(idx, old_refcount + 1);
       _commit_map.set_bit(i);
-      fire_on_commit(i, 1, zero_filled);
     }
+    if (AlwaysPreTouch && num_committed > 0) {
+      _storage.pretouch(first_committed, num_committed, pretouch_gang);
+    }
+    fire_on_commit(start_idx, num_regions, all_zero_filled);
   }
 
   virtual void uncommit_regions(uint start_idx, size_t num_regions) {
diff -r 3869072fc2e1 -r e567be097315 hotspot/src/share/vm/gc/g1/g1RegionToSpaceMapper.hpp
--- a/hotspot/src/share/vm/gc/g1/g1RegionToSpaceMapper.hpp	Thu Sep 15 12:10:43 2016 -0400
+++ b/hotspot/src/share/vm/gc/g1/g1RegionToSpaceMapper.hpp	Fri Sep 16 11:33:47 2016 +0200
@@ -29,6 +29,8 @@
 #include "memory/allocation.hpp"
 #include "utilities/debug.hpp"
 
+class WorkGang;
+
 class G1MappingChangedListener VALUE_OBJ_CLASS_SPEC {
  public:
   // Fired after commit of the memory, i.e. the memory this listener is registered
@@ -68,7 +70,7 @@
     return _commit_map.at(idx);
   }
 
-  virtual void commit_regions(uint start_idx, size_t num_regions = 1) = 0;
+  virtual void commit_regions(uint start_idx, size_t num_regions = 1, WorkGang* pretouch_workers = NULL) = 0;
   virtual void uncommit_regions(uint start_idx, size_t num_regions = 1) = 0;
 
   // Creates an appropriate G1RegionToSpaceMapper for the given parameters.
diff -r 3869072fc2e1 -r e567be097315 hotspot/src/share/vm/gc/g1/heapRegionManager.cpp
--- a/hotspot/src/share/vm/gc/g1/heapRegionManager.cpp	Thu Sep 15 12:10:43 2016 -0400
+++ b/hotspot/src/share/vm/gc/g1/heapRegionManager.cpp	Fri Sep 16 11:33:47 2016 +0200
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -72,22 +72,22 @@
   return g1h->new_heap_region(hrm_index, mr);
 }
 
-void HeapRegionManager::commit_regions(uint index, size_t num_regions) {
+void HeapRegionManager::commit_regions(uint index, size_t num_regions, WorkGang* pretouch_gang) {
   guarantee(num_regions > 0, "Must commit more than zero regions");
   guarantee(_num_committed + num_regions <= max_length(), "Cannot commit more than the maximum amount of regions");
 
   _num_committed += (uint)num_regions;
 
-  _heap_mapper->commit_regions(index, num_regions);
+  _heap_mapper->commit_regions(index, num_regions, pretouch_gang);
 
   // Also commit auxiliary data
-  _prev_bitmap_mapper->commit_regions(index, num_regions);
-  _next_bitmap_mapper->commit_regions(index, num_regions);
+  _prev_bitmap_mapper->commit_regions(index, num_regions, pretouch_gang);
+  _next_bitmap_mapper->commit_regions(index, num_regions, pretouch_gang);
 
-  _bot_mapper->commit_regions(index, num_regions);
-  _cardtable_mapper->commit_regions(index, num_regions);
+  _bot_mapper->commit_regions(index, num_regions, pretouch_gang);
+  _cardtable_mapper->commit_regions(index, num_regions, pretouch_gang);
 
-  _card_counts_mapper->commit_regions(index, num_regions);
+  _card_counts_mapper->commit_regions(index, num_regions, pretouch_gang);
 }
 
 void HeapRegionManager::uncommit_regions(uint start, size_t num_regions) {
@@ -117,9 +117,9 @@
   _card_counts_mapper->uncommit_regions(start, num_regions);
 }
 
-void HeapRegionManager::make_regions_available(uint start, uint num_regions) {
+void HeapRegionManager::make_regions_available(uint start, uint num_regions, WorkGang* pretouch_gang) {
   guarantee(num_regions > 0, "No point in calling this for zero regions");
-  commit_regions(start, num_regions);
+  commit_regions(start, num_regions, pretouch_gang);
   for (uint i = start; i < start + num_regions; i++) {
     if (_regions.get_by_index(i) == NULL) {
       HeapRegion* new_hr = new_heap_region(i);
@@ -163,11 +163,11 @@
   return MemoryUsage(0, used_sz, committed_sz, committed_sz);
 }
 
-uint HeapRegionManager::expand_by(uint num_regions) {
-  return expand_at(0, num_regions);
+uint HeapRegionManager::expand_by(uint num_regions, WorkGang* pretouch_workers) {
+  return expand_at(0, num_regions, pretouch_workers);
 }
 
-uint HeapRegionManager::expand_at(uint start, uint num_regions) {
+uint HeapRegionManager::expand_at(uint start, uint num_regions, WorkGang* pretouch_workers) {
   if (num_regions == 0) {
     return 0;
   }
@@ -181,7 +181,7 @@
   while (expanded < num_regions &&
          (num_last_found = find_unavailable_from_idx(cur, &idx_last_found)) > 0) {
     uint to_expand = MIN2(num_regions - expanded, num_last_found);
-    make_regions_available(idx_last_found, to_expand);
+    make_regions_available(idx_last_found, to_expand, pretouch_workers);
     expanded += to_expand;
     cur = idx_last_found + num_last_found + 1;
   }
diff -r 3869072fc2e1 -r e567be097315 hotspot/src/share/vm/gc/g1/heapRegionManager.hpp
--- a/hotspot/src/share/vm/gc/g1/heapRegionManager.hpp	Thu Sep 15 12:10:43 2016 -0400
+++ b/hotspot/src/share/vm/gc/g1/heapRegionManager.hpp	Fri Sep 16 11:33:47 2016 +0200
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -34,6 +34,7 @@
 class HeapRegionClosure;
 class HeapRegionClaimer;
 class FreeRegionList;
+class WorkGang;
 
 class G1HeapRegionTable : public G1BiasedMappedArray<HeapRegion*> {
  protected:
@@ -94,10 +95,10 @@
   HeapWord* heap_bottom() const { return _regions.bottom_address_mapped(); }
   HeapWord* heap_end() const {return _regions.end_address_mapped(); }
 
-  void make_regions_available(uint index, uint num_regions = 1);
+  void make_regions_available(uint index, uint num_regions = 1, WorkGang* pretouch_gang = NULL);
 
   // Pass down commit calls to the VirtualSpace.
-  void commit_regions(uint index, size_t num_regions = 1);
+  void commit_regions(uint index, size_t num_regions = 1, WorkGang* pretouch_gang = NULL);
   void uncommit_regions(uint index, size_t num_regions = 1);
 
   // Notify other data structures about change in the heap layout.
@@ -209,12 +210,12 @@
   // HeapRegions, or re-use existing ones. Returns the number of regions the
   // sequence was expanded by. If a HeapRegion allocation fails, the resulting
   // number of regions might be smaller than what's desired.
-  uint expand_by(uint num_regions);
+  uint expand_by(uint num_regions, WorkGang* pretouch_workers = NULL);
 
   // Makes sure that the regions from start to start+num_regions-1 are available
   // for allocation. Returns the number of regions that were committed to achieve
   // this.
-  uint expand_at(uint start, uint num_regions);
+  uint expand_at(uint start, uint num_regions, WorkGang* pretouch_workers = NULL);
 
   // Find a contiguous set of empty regions of length num. Returns the start index of
   // that set, or G1_NO_HRM_INDEX.
diff -r 3869072fc2e1 -r e567be097315 hotspot/src/share/vm/gc/shared/workgroup.hpp
--- a/hotspot/src/share/vm/gc/shared/workgroup.hpp	Thu Sep 15 12:10:43 2016 -0400
+++ b/hotspot/src/share/vm/gc/shared/workgroup.hpp	Fri Sep 16 11:33:47 2016 +0200
@@ -62,7 +62,12 @@
   AbstractGangTask(const char* name) :
     _name(name),
     _gc_id(GCId::current_raw())
-  {}
+  {}
+
+  AbstractGangTask(const char* name, const uint gc_id) :
+    _name(name),
+    _gc_id(gc_id)
+  {}
 
   // The abstract work method.
   // The argument tells you which member of the gang you are.
diff -r 3869072fc2e1 -r e567be097315 hotspot/src/share/vm/runtime/globals.hpp
--- a/hotspot/src/share/vm/runtime/globals.hpp	Thu Sep 15 12:10:43 2016 -0400
+++ b/hotspot/src/share/vm/runtime/globals.hpp	Fri Sep 16 11:33:47 2016 +0200
@@ -1596,6 +1596,10 @@
   product(bool, AlwaysPreTouch, false,                                      \
           "Force all freshly committed pages to be pre-touched")            \
                                                                             \
+  product(size_t, PreTouchParallelChunkSize, 1 * G,                         \
+          "Per-thread chunk size for parallel memory pre-touch.")           \
+          range(1, SIZE_MAX / 2)                                            \
+                                                                            \
   product_pd(size_t, CMSYoungGenPerWorker,                                  \
           "The maximum size of young gen chosen by default per GC worker "  \
           "thread available")                                               \
diff -r 3869072fc2e1 -r e567be097315 hotspot/src/share/vm/runtime/os.cpp
--- a/hotspot/src/share/vm/runtime/os.cpp	Thu Sep 15 12:10:43 2016 -0400
+++ b/hotspot/src/share/vm/runtime/os.cpp	Fri Sep 16 11:33:47 2016 +0200
@@ -1705,8 +1705,8 @@
   return res;
 }
 
-void os::pretouch_memory(void* start, void* end) {
-  for (volatile char *p = (char*)start; p < (char*)end; p += os::vm_page_size()) {
+void os::pretouch_memory(void* start, void* end, size_t page_size) {
+  for (volatile char *p = (char*)start; p < (char*)end; p += page_size) {
     *p = 0;
   }
 }
diff -r 3869072fc2e1 -r e567be097315 hotspot/src/share/vm/runtime/os.hpp
--- a/hotspot/src/share/vm/runtime/os.hpp	Thu Sep 15 12:10:43 2016 -0400
+++ b/hotspot/src/share/vm/runtime/os.hpp	Fri Sep 16 11:33:47 2016 +0200
@@ -324,7 +324,7 @@
   // to make the OS back the memory range with actual memory.
   // Current implementation may not touch the last page if unaligned addresses
   // are passed.
-  static void pretouch_memory(void* start, void* end);
+  static void pretouch_memory(void* start, void* end, size_t page_size = vm_page_size());
 
   enum ProtType { MEM_PROT_NONE, MEM_PROT_READ, MEM_PROT_RW, MEM_PROT_RWX };
   static bool protect_memory(char* addr, size_t bytes, ProtType prot,
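
Note on the work distribution in G1PretouchTask above: the range to be pre-touched is split into chunks of PreTouchParallelChunkSize (but at least one page), and workers claim chunks by atomically bumping the shared _cur_addr cursor. Atomic::add_ptr returns the advanced cursor, so subtracting actual_chunk_size yields the start of the claimed chunk; no further synchronization is needed and the workers load-balance automatically. The extra AbstractGangTask constructor exists because pre-touching now also runs during VM initialization, where no current GC id is available. The following standalone C++ sketch mirrors that claiming loop; it is not part of the changeset — it substitutes std::atomic for HotSpot's Atomic::add_ptr, and kPageSize, kChunkSize, pretouch_worker and parallel_pretouch are illustrative names.

  // Standalone sketch (not HotSpot code): parallel pre-touch via an
  // atomically advanced cursor, mirroring G1PretouchTask::work().
  #include <algorithm>
  #include <atomic>
  #include <cstddef>
  #include <thread>
  #include <vector>

  static const size_t kPageSize  = 4096;             // stand-in for _page_size
  static const size_t kChunkSize = size_t(1) << 30;  // 1G, the flag's default

  static void pretouch_worker(char* start, char* end, std::atomic<char*>* cur) {
    const size_t chunk = std::max(kChunkSize, kPageSize);
    while (true) {
      // fetch_add returns the previous cursor value, i.e. the start of the
      // chunk this thread just claimed.
      char* touch = cur->fetch_add((ptrdiff_t)chunk);
      if (touch < start || touch >= end) {
        break;  // the whole range has been claimed
      }
      char* chunk_end = touch + std::min(chunk, (size_t)(end - touch));
      for (volatile char* p = touch; p < chunk_end; p += kPageSize) {
        *p = 0;  // touch one byte per page, as os::pretouch_memory() does
      }
    }
  }

  // Fan the claiming loop out over several threads; this is the role
  // WorkGang::run_task() plays for G1PretouchTask in the patch.
  static void parallel_pretouch(char* start, char* end, unsigned num_threads) {
    std::atomic<char*> cur(start);
    std::vector<std::thread> threads;
    for (unsigned i = 0; i < num_threads; i++) {
      threads.emplace_back(pretouch_worker, start, end, &cur);
    }
    for (std::thread& t : threads) {
      t.join();
    }
  }

As in the patch, the thread count should be capped at the number of chunks (G1PageBasedVirtualSpace::pretouch caps num_workers at num_chunks). With the changeset applied, the parallel path is exercised by running with -XX:+AlwaysPreTouch; -XX:PreTouchParallelChunkSize (default 1G) tunes the per-claim work unit.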