8077144: Concurrent mark initialization takes too long
author tschatzl
Wed, 06 Apr 2016 13:32:48 +0200
changeset 37413 2f71679d06dd
parent 37412 d5f8d53af5ec
child 37414 2672ba9af0dc
8077144: Concurrent mark initialization takes too long
Summary: Remove per-marking thread liveness bitmaps and recreate liveness bitmap concurrently after the cleanup pause.
Reviewed-by: mgerdin, ehelin, kbarrett
hotspot/src/share/vm/gc/g1/concurrentMarkThread.cpp
hotspot/src/share/vm/gc/g1/g1ConcurrentMark.cpp
hotspot/src/share/vm/gc/g1/g1ConcurrentMark.hpp
hotspot/src/share/vm/gc/g1/g1ConcurrentMark.inline.hpp
hotspot/src/share/vm/gc/g1/g1EvacFailure.cpp
hotspot/src/share/vm/gc/g1/g1OopClosures.inline.hpp
hotspot/src/share/vm/gc/g1/g1_globals.hpp
hotspot/src/share/vm/utilities/bitMap.cpp
hotspot/src/share/vm/utilities/bitMap.hpp
hotspot/test/gc/g1/Test2GbHeap.java
--- a/hotspot/src/share/vm/gc/g1/concurrentMarkThread.cpp	Wed Apr 06 10:50:19 2016 +0300
+++ b/hotspot/src/share/vm/gc/g1/concurrentMarkThread.cpp	Wed Apr 06 13:32:48 2016 +0200
@@ -183,6 +183,11 @@
         }
       } while (cm()->restart_for_overflow());
 
+      if (!cm()->has_aborted()) {
+        G1ConcPhaseTimer t(_cm, "Concurrent Create Live Data");
+        cm()->create_live_data();
+      }
+
       double end_time = os::elapsedVTime();
       // Update the total virtual time before doing this, since it will try
       // to measure it to get the vtime for this marking.  We purposely
--- a/hotspot/src/share/vm/gc/g1/g1ConcurrentMark.cpp	Wed Apr 06 10:50:19 2016 +0300
+++ b/hotspot/src/share/vm/gc/g1/g1ConcurrentMark.cpp	Wed Apr 06 13:32:48 2016 +0200
@@ -48,6 +48,7 @@
 #include "gc/shared/taskqueue.inline.hpp"
 #include "gc/shared/vmGCOperations.hpp"
 #include "logging/log.hpp"
+#include "logging/logTag.hpp"
 #include "memory/allocation.hpp"
 #include "memory/resourceArea.hpp"
 #include "oops/oop.inline.hpp"
@@ -355,10 +356,8 @@
   _sleep_factor(0.0),
   _marking_task_overhead(1.0),
   _cleanup_list("Cleanup List"),
-  _region_bm((BitMap::idx_t)(g1h->max_regions()), false /* in_resource_area*/),
-  _card_bm((g1h->reserved_region().byte_size() + CardTableModRefBS::card_size - 1) >>
-            CardTableModRefBS::card_shift,
-            false /* in_resource_area*/),
+  _region_live_bm(),
+  _card_live_bm(),
 
   _prevMarkBitMap(&_markBitMap1),
   _nextMarkBitMap(&_markBitMap2),
@@ -390,8 +389,6 @@
 
   _parallel_workers(NULL),
 
-  _count_card_bitmaps(NULL),
-  _count_marked_bytes(NULL),
   _completed_initialization(false) {
 
   _markBitMap1.initialize(g1h->reserved_region(), prev_bitmap_storage);
@@ -502,43 +499,28 @@
     return;
   }
 
+  allocate_internal_bitmaps();
+
+  if (G1PretouchAuxiliaryMemory) {
+    pretouch_internal_bitmaps();
+  }
+
   _tasks = NEW_C_HEAP_ARRAY(G1CMTask*, _max_worker_id, mtGC);
   _accum_task_vtime = NEW_C_HEAP_ARRAY(double, _max_worker_id, mtGC);
 
-  _count_card_bitmaps = NEW_C_HEAP_ARRAY(BitMap,  _max_worker_id, mtGC);
-  _count_marked_bytes = NEW_C_HEAP_ARRAY(size_t*, _max_worker_id, mtGC);
-
-  BitMap::idx_t card_bm_size = _card_bm.size();
-
   // so that the assertion in MarkingTaskQueue::task_queue doesn't fail
   _active_tasks = _max_worker_id;
 
-  uint max_regions = _g1h->max_regions();
   for (uint i = 0; i < _max_worker_id; ++i) {
     G1CMTaskQueue* task_queue = new G1CMTaskQueue();
     task_queue->initialize();
     _task_queues->register_queue(i, task_queue);
 
-    _count_card_bitmaps[i] = BitMap(card_bm_size, false);
-    _count_marked_bytes[i] = NEW_C_HEAP_ARRAY(size_t, max_regions, mtGC);
-
-    _tasks[i] = new G1CMTask(i, this,
-                             _count_marked_bytes[i],
-                             &_count_card_bitmaps[i],
-                             task_queue, _task_queues);
+    _tasks[i] = new G1CMTask(i, this, task_queue, _task_queues);
 
     _accum_task_vtime[i] = 0.0;
   }
 
-  // Calculate the card number for the bottom of the heap. Used
-  // in biasing indexes into the accounting card bitmaps.
-  _heap_bottom_card_num =
-    intptr_t(uintptr_t(_g1h->reserved_region().start()) >>
-                                CardTableModRefBS::card_shift);
-
-  // Clear all the liveness counting data
-  clear_all_count_data();
-
   // so that the call below can read a sensible value
   _heap_start = g1h->reserved_region().start();
   set_non_marking_state();
@@ -716,10 +698,11 @@
 
   clear_bitmap(_nextMarkBitMap, _parallel_workers, true);
 
-  // Clear the liveness counting data. If the marking has been aborted, the abort()
+  // Clear the live count data. If the marking has been aborted, the abort()
   // call already did that.
   if (!has_aborted()) {
-    clear_all_count_data();
+    clear_all_live_data(_parallel_workers);
+    DEBUG_ONLY(verify_all_live_data());
   }
 
   // Repeat the asserts from above.
@@ -1107,14 +1090,6 @@
     // marking due to overflowing the global mark stack.
     reset_marking_state();
   } else {
-    {
-      GCTraceTime(Debug, gc, phases) trace("Aggregate Data", _gc_timer_cm);
-
-      // Aggregate the per-task counting data that we have accumulated
-      // while marking.
-      aggregate_count_data();
-    }
-
     SATBMarkQueueSet& satb_mq_set = JavaThread::satb_mark_queue_set();
     // We're done with marking.
     // This is the end of  the marking cycle, we're expected all
@@ -1150,17 +1125,81 @@
   _gc_tracer_cm->report_object_count_after_gc(&is_alive);
 }
 
-// Base class of the closures that finalize and verify the
-// liveness counting data.
-class G1CMCountDataClosureBase: public HeapRegionClosure {
-protected:
-  G1CollectedHeap* _g1h;
-  G1ConcurrentMark* _cm;
-  CardTableModRefBS* _ct_bs;
-
+// Helper class that provides functionality to generate the Live Data Count
+// information.
+class G1LiveDataHelper VALUE_OBJ_CLASS_SPEC {
+private:
   BitMap* _region_bm;
   BitMap* _card_bm;
 
+  // The card number of the bottom of the G1 heap. Used for converting addresses
+  // to bitmap indices quickly.
+  BitMap::idx_t _heap_card_bias;
+
+  // Utility routine to set an exclusive range of bits on the given
+  // bitmap, optimized for very small ranges.
+  // There must be at least one bit to set.
+  inline void set_card_bitmap_range(BitMap* bm,
+                                    BitMap::idx_t start_idx,
+                                    BitMap::idx_t end_idx) {
+
+    // Set the exclusive bit range [start_idx, end_idx).
+    assert((end_idx - start_idx) > 0, "at least one bit");
+    assert(end_idx <= bm->size(), "sanity");
+
+    // For small ranges use a simple loop; otherwise use set_range. The
+    // range is made up of the cards that are spanned by an object/mem
+    // region, so a limit of 8 cards allows object sizes of up to 4K to be
+    // handled using the loop.
+    if ((end_idx - start_idx) <= 8) {
+      for (BitMap::idx_t i = start_idx; i < end_idx; i += 1) {
+        bm->set_bit(i);
+      }
+    } else {
+      bm->set_range(start_idx, end_idx);
+    }
+  }
+
+  // We cache the last mark set. This avoids setting the same bit multiple times.
+  // This is particularly interesting for dense bitmaps, as this avoids doing
+  // lots of work most of the time.
+  BitMap::idx_t _last_marked_bit_idx;
+
+  // Mark the card liveness bitmap for the object spanning from start to end.
+  void mark_card_bitmap_range(HeapWord* start, HeapWord* end) {
+    BitMap::idx_t start_idx = card_live_bitmap_index_for(start);
+    BitMap::idx_t end_idx = card_live_bitmap_index_for((HeapWord*)align_ptr_up(end, CardTableModRefBS::card_size));
+
+    assert((end_idx - start_idx) > 0, "Trying to mark zero sized range.");
+
+    if (start_idx == _last_marked_bit_idx) {
+      start_idx++;
+    }
+    if (start_idx == end_idx) {
+      return;
+    }
+
+    // Set the bits in the card bitmap for the cards spanned by this object.
+    set_card_bitmap_range(_card_bm, start_idx, end_idx);
+    _last_marked_bit_idx = end_idx - 1;
+  }
+
+  void reset_mark_cache() {
+    _last_marked_bit_idx = (BitMap::idx_t)-1;
+  }
+
+public:
+  // Returns the index in the per-card liveness count bitmap
+  // for the given address
+  inline BitMap::idx_t card_live_bitmap_index_for(HeapWord* addr) {
+    // Below, the term "card num" means the result of shifting an address
+    // by the card shift -- address 0 corresponds to card number 0.  One
+    // must subtract the card num of the bottom of the heap to obtain a
+    // card table index.
+    BitMap::idx_t card_num = (BitMap::idx_t)(uintptr_t(addr) >> CardTableModRefBS::card_shift);
+    return card_num - _heap_card_bias;
+  }
+
   // Takes a region that's not empty (i.e., it has at least one
   // live object in it and sets its corresponding bit on the region
   // bitmap to 1.
@@ -1169,136 +1208,128 @@
     _region_bm->par_at_put(index, true);
   }
 
-public:
-  G1CMCountDataClosureBase(G1CollectedHeap* g1h,
-                           BitMap* region_bm, BitMap* card_bm):
-    _g1h(g1h), _cm(g1h->concurrent_mark()),
-    _ct_bs(barrier_set_cast<CardTableModRefBS>(g1h->barrier_set())),
-    _region_bm(region_bm), _card_bm(card_bm) { }
-};
-
-// Closure that calculates the # live objects per region. Used
-// for verification purposes during the cleanup pause.
-class CalcLiveObjectsClosure: public G1CMCountDataClosureBase {
-  G1CMBitMapRO* _bm;
-  size_t _region_marked_bytes;
-
-public:
-  CalcLiveObjectsClosure(G1CMBitMapRO *bm, G1CollectedHeap* g1h,
-                         BitMap* region_bm, BitMap* card_bm) :
-    G1CMCountDataClosureBase(g1h, region_bm, card_bm),
-    _bm(bm), _region_marked_bytes(0) { }
-
-  bool doHeapRegion(HeapRegion* hr) {
+  // Mark the range of bits covered by allocations done since the last marking
+  // in the given heap region, i.e. from NTAMS to top of the given region.
+  // Returns whether there has been some allocation in this region since the last marking.
+  bool mark_allocated_since_marking(HeapRegion* hr) {
+    reset_mark_cache();
+
+    HeapWord* ntams = hr->next_top_at_mark_start();
+    HeapWord* top   = hr->top();
+
+    assert(hr->bottom() <= ntams && ntams <= hr->end(), "Preconditions.");
+
+    // Mark the allocated-since-marking portion...
+    if (ntams < top) {
+      mark_card_bitmap_range(ntams, top);
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  // Mark the range of bits covered by live objects on the mark bitmap between
+  // bottom and NTAMS of the given region.
+  // Returns the number of live bytes marked within that area for the given
+  // heap region.
+  size_t mark_marked_during_marking(G1CMBitMap* mark_bitmap, HeapRegion* hr) {
+    reset_mark_cache();
+
+    size_t marked_bytes = 0;
+
     HeapWord* ntams = hr->next_top_at_mark_start();
     HeapWord* start = hr->bottom();
 
+    if (ntams <= start) {
+      // Skip empty regions.
+      return 0;
+    } else if (hr->is_humongous()) {
+      mark_card_bitmap_range(start, hr->top());
+      return pointer_delta(hr->top(), start, 1);
+    }
+
     assert(start <= hr->end() && start <= ntams && ntams <= hr->end(),
            "Preconditions not met - "
            "start: " PTR_FORMAT ", ntams: " PTR_FORMAT ", end: " PTR_FORMAT,
            p2i(start), p2i(ntams), p2i(hr->end()));
 
     // Find the first marked object at or after "start".
-    start = _bm->getNextMarkedWordAddress(start, ntams);
-
-    size_t marked_bytes = 0;
-
+    start = mark_bitmap->getNextMarkedWordAddress(start, ntams);
     while (start < ntams) {
       oop obj = oop(start);
       int obj_sz = obj->size();
       HeapWord* obj_end = start + obj_sz;
 
-      BitMap::idx_t start_idx = _cm->card_bitmap_index_for(start);
-      BitMap::idx_t end_idx = _cm->card_bitmap_index_for(obj_end);
-
-      // Note: if we're looking at the last region in heap - obj_end
-      // could be actually just beyond the end of the heap; end_idx
-      // will then correspond to a (non-existent) card that is also
-      // just beyond the heap.
-      if (_g1h->is_in_g1_reserved(obj_end) && !_ct_bs->is_card_aligned(obj_end)) {
-        // end of object is not card aligned - increment to cover
-        // all the cards spanned by the object
-        end_idx += 1;
-      }
-
-      // Set the bits in the card BM for the cards spanned by this object.
-      _cm->set_card_bitmap_range(_card_bm, start_idx, end_idx, true /* is_par */);
+      assert(obj_end <= hr->end(), "Humongous objects must have been handled elsewhere.");
+
+      mark_card_bitmap_range(start, obj_end);
 
       // Add the size of this object to the number of marked bytes.
       marked_bytes += (size_t)obj_sz * HeapWordSize;
 
-      // This will happen if we are handling a humongous object that spans
-      // several heap regions.
-      if (obj_end > hr->end()) {
-        break;
-      }
       // Find the next marked object after this one.
-      start = _bm->getNextMarkedWordAddress(obj_end, ntams);
+      start = mark_bitmap->getNextMarkedWordAddress(obj_end, ntams);
     }
 
-    // Mark the allocated-since-marking portion...
-    HeapWord* top = hr->top();
-    if (ntams < top) {
-      BitMap::idx_t start_idx = _cm->card_bitmap_index_for(ntams);
-      BitMap::idx_t end_idx = _cm->card_bitmap_index_for(top);
-
-      // Note: if we're looking at the last region in heap - top
-      // could be actually just beyond the end of the heap; end_idx
-      // will then correspond to a (non-existent) card that is also
-      // just beyond the heap.
-      if (_g1h->is_in_g1_reserved(top) && !_ct_bs->is_card_aligned(top)) {
-        // end of object is not card aligned - increment to cover
-        // all the cards spanned by the object
-        end_idx += 1;
-      }
-      _cm->set_card_bitmap_range(_card_bm, start_idx, end_idx, true /* is_par */);
-
-      // This definitely means the region has live objects.
-      set_bit_for_region(hr);
-    }
-
-    // Update the live region bitmap.
-    if (marked_bytes > 0) {
-      set_bit_for_region(hr);
-    }
-
-    // Set the marked bytes for the current region so that
-    // it can be queried by a calling verification routine
-    _region_marked_bytes = marked_bytes;
-
-    return false;
+    return marked_bytes;
   }
 
-  size_t region_marked_bytes() const { return _region_marked_bytes; }
+  G1LiveDataHelper(BitMap* region_bm,
+                   BitMap* card_bm):
+    _region_bm(region_bm),
+    _card_bm(card_bm) {
+    //assert(region_bm != NULL, "");
+    assert(card_bm != NULL, "");
+    // Calculate the card number for the bottom of the heap. Used
+    // in biasing indexes into the accounting card bitmaps.
+    _heap_card_bias =
+      (BitMap::idx_t)(uintptr_t(G1CollectedHeap::heap()->reserved_region().start()) >> CardTableModRefBS::card_shift);
+  }
 };
 
-// Heap region closure used for verifying the counting data
-// that was accumulated concurrently and aggregated during
+// Heap region closure used for verifying the live count data
+// that was created concurrently and finalized during
 // the remark pause. This closure is applied to the heap
 // regions during the STW cleanup pause.
-
-class VerifyLiveObjectDataHRClosure: public HeapRegionClosure {
+class G1VerifyLiveDataHRClosure: public HeapRegionClosure {
+private:
   G1CollectedHeap* _g1h;
-  G1ConcurrentMark* _cm;
-  CalcLiveObjectsClosure _calc_cl;
-  BitMap* _region_bm;   // Region BM to be verified
-  BitMap* _card_bm;     // Card BM to be verified
+  G1CMBitMap* _mark_bitmap;
+  G1LiveDataHelper _calc_helper;
+
+  BitMap* _act_region_bm; // Region BM to be verified
+  BitMap* _act_card_bm;   // Card BM to be verified
 
   BitMap* _exp_region_bm; // Expected Region BM values
   BitMap* _exp_card_bm;   // Expected card BM values
 
   int _failures;
 
+  // Updates the live data count for the given heap region and returns the number
+  // of bytes marked.
+  size_t create_live_data_count(HeapRegion* hr) {
+    size_t bytes_marked = _calc_helper.mark_marked_during_marking(_mark_bitmap, hr);
+    bool allocated_since_marking = _calc_helper.mark_allocated_since_marking(hr);
+    if (allocated_since_marking || bytes_marked > 0) {
+      _calc_helper.set_bit_for_region(hr);
+    }
+    return bytes_marked;
+  }
+
 public:
-  VerifyLiveObjectDataHRClosure(G1CollectedHeap* g1h,
-                                BitMap* region_bm,
-                                BitMap* card_bm,
-                                BitMap* exp_region_bm,
-                                BitMap* exp_card_bm) :
-    _g1h(g1h), _cm(g1h->concurrent_mark()),
-    _calc_cl(_cm->nextMarkBitMap(), g1h, exp_region_bm, exp_card_bm),
-    _region_bm(region_bm), _card_bm(card_bm),
-    _exp_region_bm(exp_region_bm), _exp_card_bm(exp_card_bm),
+  G1VerifyLiveDataHRClosure(G1CollectedHeap* g1h,
+                            G1CMBitMap* mark_bitmap,
+                            BitMap* act_region_bm,
+                            BitMap* act_card_bm,
+                            BitMap* exp_region_bm,
+                            BitMap* exp_card_bm) :
+    _g1h(g1h),
+    _mark_bitmap(mark_bitmap),
+    _calc_helper(exp_region_bm, exp_card_bm),
+    _act_region_bm(act_region_bm),
+    _act_card_bm(act_card_bm),
+    _exp_region_bm(exp_region_bm),
+    _exp_card_bm(exp_card_bm),
     _failures(0) { }
 
   int failures() const { return _failures; }
@@ -1306,35 +1337,16 @@
   bool doHeapRegion(HeapRegion* hr) {
     int failures = 0;
 
-    // Call the CalcLiveObjectsClosure to walk the marking bitmap for
-    // this region and set the corresponding bits in the expected region
-    // and card bitmaps.
-    bool res = _calc_cl.doHeapRegion(hr);
-    assert(res == false, "should be continuing");
-
-    // Verify the marked bytes for this region.
-    size_t exp_marked_bytes = _calc_cl.region_marked_bytes();
+    // Walk the marking bitmap for this region and set the corresponding bits
+    // in the expected region and card bitmaps.
+    size_t exp_marked_bytes = create_live_data_count(hr);
     size_t act_marked_bytes = hr->next_marked_bytes();
-
-    if (exp_marked_bytes > act_marked_bytes) {
-      if (hr->is_starts_humongous()) {
-        // For start_humongous regions, the size of the whole object will be
-        // in exp_marked_bytes.
-        HeapRegion* region = hr;
-        int num_regions;
-        for (num_regions = 0; region != NULL; num_regions++) {
-          region = _g1h->next_region_in_humongous(region);
-        }
-        if ((num_regions-1) * HeapRegion::GrainBytes >= exp_marked_bytes) {
-          failures += 1;
-        } else if (num_regions * HeapRegion::GrainBytes < exp_marked_bytes) {
-          failures += 1;
-        }
-      } else {
-        // We're not OK if expected marked bytes > actual marked bytes. It means
-        // we have missed accounting some objects during the actual marking.
-        failures += 1;
-      }
+    // Verify the marked bytes for this region.
+
+    if (exp_marked_bytes != act_marked_bytes) {
+      failures += 1;
+    } else if (exp_marked_bytes > HeapRegion::GrainBytes) {
+      failures += 1;
     }
 
     // Verify the bit, for this region, in the actual and expected
@@ -1344,7 +1356,7 @@
     BitMap::idx_t index = (BitMap::idx_t) hr->hrm_index();
 
     bool expected = _exp_region_bm->at(index);
-    bool actual = _region_bm->at(index);
+    bool actual = _act_region_bm->at(index);
     if (expected && !actual) {
       failures += 1;
     }
@@ -1353,12 +1365,12 @@
     // region match. We have an error if we have a set bit in the expected
     // bit map and the corresponding bit in the actual bitmap is not set.
 
-    BitMap::idx_t start_idx = _cm->card_bitmap_index_for(hr->bottom());
-    BitMap::idx_t end_idx = _cm->card_bitmap_index_for(hr->top());
+    BitMap::idx_t start_idx = _calc_helper.card_live_bitmap_index_for(hr->bottom());
+    BitMap::idx_t end_idx = _calc_helper.card_live_bitmap_index_for(hr->top());
 
     for (BitMap::idx_t i = start_idx; i < end_idx; i+=1) {
       expected = _exp_card_bm->at(i);
-      actual = _card_bm->at(i);
+      actual = _act_card_bm->at(i);
 
       if (expected && !actual) {
         failures += 1;
@@ -1373,137 +1385,100 @@
   }
 };
 
-class G1ParVerifyFinalCountTask: public AbstractGangTask {
+class G1VerifyLiveDataTask: public AbstractGangTask {
 protected:
   G1CollectedHeap* _g1h;
-  G1ConcurrentMark* _cm;
+  G1CMBitMap* _mark_bitmap;
   BitMap* _actual_region_bm;
   BitMap* _actual_card_bm;
 
-  uint    _n_workers;
-
-  BitMap* _expected_region_bm;
-  BitMap* _expected_card_bm;
+  BitMap _expected_region_bm;
+  BitMap _expected_card_bm;
 
   int  _failures;
 
-  HeapRegionClaimer _hrclaimer;
+  HeapRegionClaimer _hr_claimer;
 
 public:
-  G1ParVerifyFinalCountTask(G1CollectedHeap* g1h,
-                            BitMap* region_bm, BitMap* card_bm,
-                            BitMap* expected_region_bm, BitMap* expected_card_bm)
-    : AbstractGangTask("G1 verify final counting"),
-      _g1h(g1h), _cm(_g1h->concurrent_mark()),
-      _actual_region_bm(region_bm), _actual_card_bm(card_bm),
-      _expected_region_bm(expected_region_bm), _expected_card_bm(expected_card_bm),
-      _failures(0),
-      _n_workers(_g1h->workers()->active_workers()), _hrclaimer(_n_workers) {
+  G1VerifyLiveDataTask(G1CollectedHeap* g1h,
+                       G1CMBitMap* bitmap,
+                       BitMap* region_bm,
+                       BitMap* card_bm,
+                       uint n_workers)
+  : AbstractGangTask("G1 verify final counting"),
+    _g1h(g1h),
+    _mark_bitmap(bitmap),
+    _actual_region_bm(region_bm),
+    _actual_card_bm(card_bm),
+    _expected_region_bm(region_bm->size(), true /* in_resource_area */),
+    _expected_card_bm(card_bm->size(), true /* in_resource_area */),
+    _failures(0),
+    _hr_claimer(n_workers) {
     assert(VerifyDuringGC, "don't call this otherwise");
-    assert(_expected_card_bm->size() == _actual_card_bm->size(), "sanity");
-    assert(_expected_region_bm->size() == _actual_region_bm->size(), "sanity");
   }
 
   void work(uint worker_id) {
-    assert(worker_id < _n_workers, "invariant");
-
-    VerifyLiveObjectDataHRClosure verify_cl(_g1h,
-                                            _actual_region_bm, _actual_card_bm,
-                                            _expected_region_bm,
-                                            _expected_card_bm);
-
-    _g1h->heap_region_par_iterate(&verify_cl, worker_id, &_hrclaimer);
-
-    Atomic::add(verify_cl.failures(), &_failures);
+    G1VerifyLiveDataHRClosure cl(_g1h,
+                                 _mark_bitmap,
+                                 _actual_region_bm,
+                                 _actual_card_bm,
+                                 &_expected_region_bm,
+                                 &_expected_card_bm);
+    _g1h->heap_region_par_iterate(&cl, worker_id, &_hr_claimer);
+
+    Atomic::add(cl.failures(), &_failures);
   }
 
   int failures() const { return _failures; }
 };
 
-// Closure that finalizes the liveness counting data.
-// Used during the cleanup pause.
-// Sets the bits corresponding to the interval [NTAMS, top]
-// (which contains the implicitly live objects) in the
-// card liveness bitmap. Also sets the bit for each region,
-// containing live data, in the region liveness bitmap.
-
-class FinalCountDataUpdateClosure: public G1CMCountDataClosureBase {
- public:
-  FinalCountDataUpdateClosure(G1CollectedHeap* g1h,
-                              BitMap* region_bm,
-                              BitMap* card_bm) :
-    G1CMCountDataClosureBase(g1h, region_bm, card_bm) { }
-
-  bool doHeapRegion(HeapRegion* hr) {
-    HeapWord* ntams = hr->next_top_at_mark_start();
-    HeapWord* top   = hr->top();
-
-    assert(hr->bottom() <= ntams && ntams <= hr->end(), "Preconditions.");
-
-    // Mark the allocated-since-marking portion...
-    if (ntams < top) {
-      // This definitely means the region has live objects.
-      set_bit_for_region(hr);
-
-      // Now set the bits in the card bitmap for [ntams, top)
-      BitMap::idx_t start_idx = _cm->card_bitmap_index_for(ntams);
-      BitMap::idx_t end_idx = _cm->card_bitmap_index_for(top);
-
-      // Note: if we're looking at the last region in heap - top
-      // could be actually just beyond the end of the heap; end_idx
-      // will then correspond to a (non-existent) card that is also
-      // just beyond the heap.
-      if (_g1h->is_in_g1_reserved(top) && !_ct_bs->is_card_aligned(top)) {
-        // end of object is not card aligned - increment to cover
-        // all the cards spanned by the object
-        end_idx += 1;
+class G1FinalizeLiveDataTask: public AbstractGangTask {
+  // Finalizes the liveness counting data.
+  // Sets the bits corresponding to the interval [NTAMS, top)
+  // (which contains the implicitly live objects) in the
+  // card liveness bitmap. Also sets the bit for each region
+  // containing live data in the region liveness bitmap.
+  class G1FinalizeCountDataClosure: public HeapRegionClosure {
+  private:
+    G1LiveDataHelper _helper;
+  public:
+    G1FinalizeCountDataClosure(G1CMBitMap* bitmap,
+                               BitMap* region_bm,
+                               BitMap* card_bm) :
+      HeapRegionClosure(),
+      _helper(region_bm, card_bm) { }
+
+    bool doHeapRegion(HeapRegion* hr) {
+      bool allocated_since_marking = _helper.mark_allocated_since_marking(hr);
+      if (allocated_since_marking || hr->next_marked_bytes() > 0) {
+        _helper.set_bit_for_region(hr);
       }
-
-      assert(end_idx <= _card_bm->size(),
-             "oob: end_idx=  " SIZE_FORMAT ", bitmap size= " SIZE_FORMAT,
-             end_idx, _card_bm->size());
-      assert(start_idx < _card_bm->size(),
-             "oob: start_idx=  " SIZE_FORMAT ", bitmap size= " SIZE_FORMAT,
-             start_idx, _card_bm->size());
-
-      _cm->set_card_bitmap_range(_card_bm, start_idx, end_idx, true /* is_par */);
+      return false;
     }
-
-    // Set the bit for the region if it contains live data
-    if (hr->next_marked_bytes() > 0) {
-      set_bit_for_region(hr);
-    }
-
-    return false;
-  }
-};
-
-class G1ParFinalCountTask: public AbstractGangTask {
-protected:
-  G1CollectedHeap* _g1h;
-  G1ConcurrentMark* _cm;
+  };
+
+  G1CMBitMap* _bitmap;
+
   BitMap* _actual_region_bm;
   BitMap* _actual_card_bm;
 
-  uint    _n_workers;
-  HeapRegionClaimer _hrclaimer;
+  HeapRegionClaimer _hr_claimer;
 
 public:
-  G1ParFinalCountTask(G1CollectedHeap* g1h, BitMap* region_bm, BitMap* card_bm)
-    : AbstractGangTask("G1 final counting"),
-      _g1h(g1h), _cm(_g1h->concurrent_mark()),
-      _actual_region_bm(region_bm), _actual_card_bm(card_bm),
-      _n_workers(_g1h->workers()->active_workers()), _hrclaimer(_n_workers) {
+  G1FinalizeLiveDataTask(G1CMBitMap* bitmap, BitMap* region_bm, BitMap* card_bm, uint n_workers) :
+    AbstractGangTask("G1 final counting"),
+    _bitmap(bitmap),
+    _actual_region_bm(region_bm),
+    _actual_card_bm(card_bm),
+    _hr_claimer(n_workers) {
   }
 
   void work(uint worker_id) {
-    assert(worker_id < _n_workers, "invariant");
-
-    FinalCountDataUpdateClosure final_update_cl(_g1h,
-                                                _actual_region_bm,
-                                                _actual_card_bm);
-
-    _g1h->heap_region_par_iterate(&final_update_cl, worker_id, &_hrclaimer);
+    G1FinalizeCountDataClosure cl(_bitmap,
+                                  _actual_region_bm,
+                                  _actual_card_bm);
+
+    G1CollectedHeap::heap()->heap_region_par_iterate(&cl, worker_id, &_hr_claimer);
   }
 };
 
@@ -1637,31 +1612,29 @@
 
   HeapRegionRemSet::reset_for_cleanup_tasks();
 
-  // Do counting once more with the world stopped for good measure.
-  G1ParFinalCountTask g1_par_count_task(g1h, &_region_bm, &_card_bm);
-
-  g1h->workers()->run_task(&g1_par_count_task);
+  {
+    // Finalize the live data.
+    G1FinalizeLiveDataTask cl(_nextMarkBitMap,
+                              &_region_live_bm,
+                              &_card_live_bm,
+                              g1h->workers()->active_workers());
+    g1h->workers()->run_task(&cl);
+  }
 
   if (VerifyDuringGC) {
-    // Verify that the counting data accumulated during marking matches
-    // that calculated by walking the marking bitmap.
-
-    // Bitmaps to hold expected values
-    BitMap expected_region_bm(_region_bm.size(), true);
-    BitMap expected_card_bm(_card_bm.size(), true);
-
-    G1ParVerifyFinalCountTask g1_par_verify_task(g1h,
-                                                 &_region_bm,
-                                                 &_card_bm,
-                                                 &expected_region_bm,
-                                                 &expected_card_bm);
-
-    g1h->workers()->run_task(&g1_par_verify_task);
-
-    guarantee(g1_par_verify_task.failures() == 0, "Unexpected accounting failures");
+    // Verify that the liveness count data created concurrently matches one created
+    // during this safepoint.
+    ResourceMark rm;
+    G1VerifyLiveDataTask cl(G1CollectedHeap::heap(),
+                            _nextMarkBitMap,
+                            &_region_live_bm,
+                            &_card_live_bm,
+                            g1h->workers()->active_workers());
+    g1h->workers()->run_task(&cl);
+
+    guarantee(cl.failures() == 0, "Unexpected accounting failures");
   }
 
-  size_t start_used_bytes = g1h->used();
   g1h->collector_state()->set_mark_in_progress(false);
 
   double count_end = os::elapsedTime();
@@ -1696,7 +1669,7 @@
   // regions.
   if (G1ScrubRemSets) {
     double rs_scrub_start = os::elapsedTime();
-    g1h->scrub_rem_set(&_region_bm, &_card_bm);
+    g1h->scrub_rem_set(&_region_live_bm, &_card_live_bm);
     _total_rs_scrub_time += (os::elapsedTime() - rs_scrub_start);
   }
 
@@ -2142,6 +2115,35 @@
   _nextMarkBitMap    = (G1CMBitMap*)  temp;
 }
 
+BitMap G1ConcurrentMark::allocate_large_bitmap(BitMap::idx_t size_in_bits) {
+  size_t size_in_words = BitMap::size_in_words(size_in_bits);
+
+  BitMap::bm_word_t* map = MmapArrayAllocator<BitMap::bm_word_t, mtGC>::allocate(size_in_words);
+
+  return BitMap(map, size_in_bits);
+}
+
+void G1ConcurrentMark::allocate_internal_bitmaps() {
+  double start_time = os::elapsedTime();
+
+  _region_live_bm = allocate_large_bitmap(_g1h->max_regions());
+
+  guarantee(_g1h->max_capacity() % CardTableModRefBS::card_size == 0,
+            "Heap capacity must be aligned to card size.");
+  _card_live_bm = allocate_large_bitmap(_g1h->max_capacity() / CardTableModRefBS::card_size);
+
+  log_debug(gc, marking)("Allocating internal bitmaps took %1.2f seconds.", os::elapsedTime() - start_time);
+}
+
+void G1ConcurrentMark::pretouch_internal_bitmaps() {
+  double start_time = os::elapsedTime();
+
+  _region_live_bm.pretouch();
+  _card_live_bm.pretouch();
+
+  log_debug(gc, marking)("Pre-touching internal bitmaps took %1.2f seconds.", os::elapsedTime() - start_time);
+}
+
 // Closure for marking entries in SATB buffers.
 class G1CMSATBBufferClosure : public SATBBufferClosure {
 private:
@@ -2160,7 +2162,7 @@
       oop obj = static_cast<oop>(entry);
       assert(obj->is_oop(true /* ignore mark word */),
              "Invalid oop in SATB buffer: " PTR_FORMAT, p2i(obj));
-      _task->make_reference_grey(obj, hr);
+      _task->make_reference_grey(obj);
     }
   }
 
@@ -2402,165 +2404,117 @@
 }
 #endif // PRODUCT
 
-// Aggregate the counting data that was constructed concurrently
-// with marking.
-class AggregateCountDataHRClosure: public HeapRegionClosure {
-  G1CollectedHeap* _g1h;
-  G1ConcurrentMark* _cm;
-  CardTableModRefBS* _ct_bs;
-  BitMap* _cm_card_bm;
-  uint _max_worker_id;
-
- public:
-  AggregateCountDataHRClosure(G1CollectedHeap* g1h,
-                              BitMap* cm_card_bm,
-                              uint max_worker_id) :
-    _g1h(g1h), _cm(g1h->concurrent_mark()),
-    _ct_bs(barrier_set_cast<CardTableModRefBS>(g1h->barrier_set())),
-    _cm_card_bm(cm_card_bm), _max_worker_id(max_worker_id) { }
-
-  bool doHeapRegion(HeapRegion* hr) {
-    HeapWord* start = hr->bottom();
-    HeapWord* limit = hr->next_top_at_mark_start();
-    HeapWord* end = hr->end();
-
-    assert(start <= limit && limit <= hr->top() && hr->top() <= hr->end(),
-           "Preconditions not met - "
-           "start: " PTR_FORMAT ", limit: " PTR_FORMAT ", "
-           "top: " PTR_FORMAT ", end: " PTR_FORMAT,
-           p2i(start), p2i(limit), p2i(hr->top()), p2i(hr->end()));
-
-    assert(hr->next_marked_bytes() == 0, "Precondition");
-
-    if (start == limit) {
-      // NTAMS of this region has not been set so nothing to do.
+class G1CreateLiveDataTask: public AbstractGangTask {
+  // Creates the live data of a region from the mark bitmap after marking:
+  // the bytes found marked in the region are added to its marked bytes.
+  class G1CreateLiveDataHRClosure: public HeapRegionClosure {
+    G1LiveDataHelper _helper;
+
+    G1CMBitMap* _mark_bitmap;
+
+    G1ConcurrentMark* _cm;
+  public:
+    G1CreateLiveDataHRClosure(G1ConcurrentMark* cm,
+                              G1CMBitMap* mark_bitmap,
+                              BitMap* cm_card_bm) :
+      HeapRegionClosure(),
+      _helper(NULL, cm_card_bm),
+      _mark_bitmap(mark_bitmap),
+      _cm(cm) { }
+
+    bool doHeapRegion(HeapRegion* hr) {
+      size_t marked_bytes = _helper.mark_marked_during_marking(_mark_bitmap, hr);
+      if (marked_bytes > 0) {
+        hr->add_to_marked_bytes(marked_bytes);
+      }
+
+      if (_cm->do_yield_check() && _cm->has_aborted()) {
+        return true;
+      }
       return false;
     }
-
-    // 'start' should be in the heap.
-    assert(_g1h->is_in_g1_reserved(start) && _ct_bs->is_card_aligned(start), "sanity");
-    // 'end' *may* be just beyond the end of the heap (if hr is the last region)
-    assert(!_g1h->is_in_g1_reserved(end) || _ct_bs->is_card_aligned(end), "sanity");
-
-    BitMap::idx_t start_idx = _cm->card_bitmap_index_for(start);
-    BitMap::idx_t limit_idx = _cm->card_bitmap_index_for(limit);
-    BitMap::idx_t end_idx = _cm->card_bitmap_index_for(end);
-
-    // If ntams is not card aligned then we bump card bitmap index
-    // for limit so that we get the all the cards spanned by
-    // the object ending at ntams.
-    // Note: if this is the last region in the heap then ntams
-    // could be actually just beyond the end of the the heap;
-    // limit_idx will then  correspond to a (non-existent) card
-    // that is also outside the heap.
-    if (_g1h->is_in_g1_reserved(limit) && !_ct_bs->is_card_aligned(limit)) {
-      limit_idx += 1;
-    }
-
-    assert(limit_idx <= end_idx, "or else use atomics");
-
-    // Aggregate the "stripe" in the count data associated with hr.
-    uint hrm_index = hr->hrm_index();
-    size_t marked_bytes = 0;
-
-    for (uint i = 0; i < _max_worker_id; i += 1) {
-      size_t* marked_bytes_array = _cm->count_marked_bytes_array_for(i);
-      BitMap* task_card_bm = _cm->count_card_bitmap_for(i);
-
-      // Fetch the marked_bytes in this region for task i and
-      // add it to the running total for this region.
-      marked_bytes += marked_bytes_array[hrm_index];
-
-      // Now union the bitmaps[0,max_worker_id)[start_idx..limit_idx)
-      // into the global card bitmap.
-      BitMap::idx_t scan_idx = task_card_bm->get_next_one_offset(start_idx, limit_idx);
-
-      while (scan_idx < limit_idx) {
-        assert(task_card_bm->at(scan_idx) == true, "should be");
-        _cm_card_bm->set_bit(scan_idx);
-        assert(_cm_card_bm->at(scan_idx) == true, "should be");
-
-        // BitMap::get_next_one_offset() can handle the case when
-        // its left_offset parameter is greater than its right_offset
-        // parameter. It does, however, have an early exit if
-        // left_offset == right_offset. So let's limit the value
-        // passed in for left offset here.
-        BitMap::idx_t next_idx = MIN2(scan_idx + 1, limit_idx);
-        scan_idx = task_card_bm->get_next_one_offset(next_idx, limit_idx);
-      }
-    }
-
-    // Update the marked bytes for this region.
-    hr->add_to_marked_bytes(marked_bytes);
-
-    // Next heap region
-    return false;
-  }
-};
-
-class G1AggregateCountDataTask: public AbstractGangTask {
-protected:
+  };
+
   G1CollectedHeap* _g1h;
   G1ConcurrentMark* _cm;
   BitMap* _cm_card_bm;
-  uint _max_worker_id;
-  uint _active_workers;
-  HeapRegionClaimer _hrclaimer;
+  HeapRegionClaimer _hr_claimer;
 
 public:
-  G1AggregateCountDataTask(G1CollectedHeap* g1h,
-                           G1ConcurrentMark* cm,
-                           BitMap* cm_card_bm,
-                           uint max_worker_id,
-                           uint n_workers) :
-      AbstractGangTask("Count Aggregation"),
-      _g1h(g1h), _cm(cm), _cm_card_bm(cm_card_bm),
-      _max_worker_id(max_worker_id),
-      _active_workers(n_workers),
-      _hrclaimer(_active_workers) {
+  G1CreateLiveDataTask(G1CollectedHeap* g1h,
+                       BitMap* cm_card_bm,
+                       uint n_workers) :
+      AbstractGangTask("Create Live Data"),
+      _g1h(g1h), _cm(g1h->concurrent_mark()),  // _cm must not stay uninitialized
+      _cm_card_bm(cm_card_bm),
+      _hr_claimer(n_workers) {
   }
 
   void work(uint worker_id) {
-    AggregateCountDataHRClosure cl(_g1h, _cm_card_bm, _max_worker_id);
-
-    _g1h->heap_region_par_iterate(&cl, worker_id, &_hrclaimer);
+    SuspendibleThreadSetJoiner sts_join;
+
+    G1CreateLiveDataHRClosure cl(_g1h->concurrent_mark(), _g1h->concurrent_mark()->nextMarkBitMap(), _cm_card_bm);
+    _g1h->heap_region_par_iterate(&cl, worker_id, &_hr_claimer);
   }
 };
 
 
-void G1ConcurrentMark::aggregate_count_data() {
-  uint n_workers = _g1h->workers()->active_workers();
-
-  G1AggregateCountDataTask g1_par_agg_task(_g1h, this, &_card_bm,
-                                           _max_worker_id, n_workers);
-
-  _g1h->workers()->run_task(&g1_par_agg_task);
+void G1ConcurrentMark::create_live_data() {
+  uint n_workers = _parallel_workers->active_workers();
+
+  G1CreateLiveDataTask cl(_g1h,
+                          &_card_live_bm,
+                          n_workers);
+  _parallel_workers->run_task(&cl);
 }
 
-// Clear the per-worker arrays used to store the per-region counting data
-void G1ConcurrentMark::clear_all_count_data() {
-  // Clear the global card bitmap - it will be filled during
-  // liveness count aggregation (during remark) and the
-  // final counting task.
-  _card_bm.clear();
-
-  // Clear the global region bitmap - it will be filled as part
-  // of the final counting task.
-  _region_bm.clear();
-
-  uint max_regions = _g1h->max_regions();
-  assert(_max_worker_id > 0, "uninitialized");
-
-  for (uint i = 0; i < _max_worker_id; i += 1) {
-    BitMap* task_card_bm = count_card_bitmap_for(i);
-    size_t* marked_bytes_array = count_marked_bytes_array_for(i);
-
-    assert(task_card_bm->size() == _card_bm.size(), "size mismatch");
-    assert(marked_bytes_array != NULL, "uninitialized");
-
-    memset(marked_bytes_array, 0, (size_t) max_regions * sizeof(size_t));
-    task_card_bm->clear();
+class G1ClearAllLiveDataTask : public AbstractGangTask {
+  BitMap* _bitmap;
+  size_t _num_tasks;
+  size_t _cur_task;
+public:
+  G1ClearAllLiveDataTask(BitMap* bitmap, size_t num_tasks) :
+    AbstractGangTask("Clear All Live Data"),
+    _bitmap(bitmap),
+    _num_tasks(num_tasks),
+    _cur_task(0) {
+  }
+
+  virtual void work(uint worker_id) {
+    while (true) {
+      size_t to_process = Atomic::add(1, &_cur_task) - 1;
+      if (to_process >= _num_tasks) {
+        break;
+      }
+
+      BitMap::idx_t start = M * BitsPerByte * to_process;
+      BitMap::idx_t end = MIN2(start + M * BitsPerByte, _bitmap->size());
+      _bitmap->clear_range(start, end);
+    }
   }
+};
+
+void G1ConcurrentMark::clear_all_live_data(WorkGang* workers) {
+  double start_time = os::elapsedTime();
+
+  guarantee(Universe::is_fully_initialized(), "Should not call this during initialization.");
+
+  size_t const num_chunks = align_size_up(_card_live_bm.size_in_words() * HeapWordSize, M) / M;
+
+  G1ClearAllLiveDataTask cl(&_card_live_bm, num_chunks);
+  workers->run_task(&cl);
+
+  // The region live bitmap is always very small, even for huge heaps. Clear
+  // directly.
+  _region_live_bm.clear();
+
+
+  log_debug(gc, marking)("Clear Live Data took %.3fms", (os::elapsedTime() - start_time) * 1000.0);
+}
+
+void G1ConcurrentMark::verify_all_live_data() {
+  assert(_card_live_bm.count_one_bits() == 0, "Master card bitmap not clear");
+  assert(_region_live_bm.count_one_bits() == 0, "Master region bitmap not clear");
 }
 
 void G1ConcurrentMark::print_stats() {
@@ -2574,7 +2528,6 @@
   }
 }
 
-// abandon current marking iteration due to a Full GC
 void G1ConcurrentMark::abort() {
   if (!cmThread()->during_cycle() || _has_aborted) {
     // We haven't started a concurrent cycle or we have already aborted it. No need to do anything.
@@ -2589,8 +2542,8 @@
   // since VerifyDuringGC verifies the objects marked during
   // a full GC against the previous bitmap.
 
-  // Clear the liveness counting data
-  clear_all_count_data();
+  clear_all_live_data(_g1h->workers());
+  DEBUG_ONLY(verify_all_live_data());
   // Empty mark stack
   reset_marking_state();
   for (uint i = 0; i < _max_worker_id; ++i) {
@@ -2634,7 +2587,7 @@
 
   }
   print_ms_time_info("  ", "cleanups", _cleanup_times);
-  log.trace("    Final counting total time = %8.2f s (avg = %8.2f ms).",
+  log.trace("    Finalize live data total time = %8.2f s (avg = %8.2f ms).",
             _total_counting_time, (_cleanup_times.num() > 0 ? _total_counting_time * 1000.0 / (double)_cleanup_times.num() : 0.0));
   if (G1ScrubRemSets) {
     log.trace("    RS scrub total time = %8.2f s (avg = %8.2f ms).",
@@ -3473,8 +3426,6 @@
 
 G1CMTask::G1CMTask(uint worker_id,
                    G1ConcurrentMark* cm,
-                   size_t* marked_bytes,
-                   BitMap* card_bm,
                    G1CMTaskQueue* task_queue,
                    G1CMTaskQueueSet* task_queues)
   : _g1h(G1CollectedHeap::heap()),
@@ -3483,9 +3434,7 @@
     _nextMarkBitMap(NULL), _hash_seed(17),
     _task_queue(task_queue),
     _task_queues(task_queues),
-    _cm_oop_closure(NULL),
-    _marked_bytes_array(marked_bytes),
-    _card_bm(card_bm) {
+    _cm_oop_closure(NULL) {
   guarantee(task_queue != NULL, "invariant");
   guarantee(task_queues != NULL, "invariant");
 
--- a/hotspot/src/share/vm/gc/g1/g1ConcurrentMark.hpp	Wed Apr 06 10:50:19 2016 +0300
+++ b/hotspot/src/share/vm/gc/g1/g1ConcurrentMark.hpp	Wed Apr 06 13:32:48 2016 +0200
@@ -266,7 +266,7 @@
 class G1ConcurrentMark: public CHeapObj<mtGC> {
   friend class ConcurrentMarkThread;
   friend class G1ParNoteEndTask;
-  friend class CalcLiveObjectsClosure;
+  friend class G1VerifyLiveDataClosure;
   friend class G1CMRefProcTaskProxy;
   friend class G1CMRefProcTaskExecutor;
   friend class G1CMKeepAliveAndDrainClosure;
@@ -298,8 +298,14 @@
   G1CMBitMapRO*           _prevMarkBitMap; // Completed mark bitmap
   G1CMBitMap*             _nextMarkBitMap; // Under-construction mark bitmap
 
-  BitMap                  _region_bm;
-  BitMap                  _card_bm;
+  // Liveness count data. After marking G1 iterates over the recently gathered mark
+  // bitmap and records rough information about liveness on card and region basis.
+  // This information can be used for e.g. remembered set scrubbing.
+
+  // A set bit indicates that the given region contains at least one live object.
+  BitMap                  _region_live_bm;
+  // A set bit indicates that the given card contains a live object.
+  BitMap                  _card_live_bm;
 
   // Heap bounds
   HeapWord*               _heap_start;
@@ -373,6 +379,14 @@
 
   void swapMarkBitMaps();
 
+  // Allocates and returns a zero-ed out "large" bitmap of the given size in bits.
+  // It is always allocated using virtual memory.
+  BitMap allocate_large_bitmap(BitMap::idx_t size_in_bits);
+  // Allocates the memory for all bitmaps used by the concurrent marking.
+  void allocate_internal_bitmaps();
+  // Pre-touches the internal bitmaps.
+  void pretouch_internal_bitmaps();
+
   // It resets the global marking data structures, as well as the
   // task local ones; should be called during initial mark.
   void reset();
@@ -461,23 +475,6 @@
   void enter_first_sync_barrier(uint worker_id);
   void enter_second_sync_barrier(uint worker_id);
 
-  // Live Data Counting data structures...
-  // These data structures are initialized at the start of
-  // marking. They are written to while marking is active.
-  // They are aggregated during remark; the aggregated values
-  // are then used to populate the _region_bm, _card_bm, and
-  // the total live bytes, which are then subsequently updated
-  // during cleanup.
-
-  // An array of bitmaps (one bit map per task). Each bitmap
-  // is used to record the cards spanned by the live objects
-  // marked by that task/worker.
-  BitMap*  _count_card_bitmaps;
-
-  // Used to record the number of marked live bytes
-  // (for each region, by worker thread).
-  size_t** _count_marked_bytes;
-
   // Card index of the bottom of the G1 heap. Used for biasing indices into
   // the card bitmaps.
   intptr_t _heap_bottom_card_num;
@@ -563,18 +560,10 @@
   // G1CollectedHeap
 
   // This notifies CM that a root during initial-mark needs to be
-  // grayed. It is MT-safe. word_size is the size of the object in
-  // words. It is passed explicitly as sometimes we cannot calculate
-  // it from the given object because it might be in an inconsistent
-  // state (e.g., in to-space and being copied). So the caller is
-  // responsible for dealing with this issue (e.g., get the size from
-  // the from-space image when the to-space image might be
-  // inconsistent) and always passing the size. hr is the region that
+  // grayed. It is MT-safe. hr is the region that
   // contains the object and it's passed optionally from callers who
   // might already have it (no point in recalculating it).
   inline void grayRoot(oop obj,
-                       size_t word_size,
-                       uint worker_id,
                        HeapRegion* hr = NULL);
 
   // Prepare internal data structures for the next mark cycle. This includes clearing
@@ -641,7 +630,7 @@
 
   inline bool do_yield_check(uint worker_i = 0);
 
-  // Called to abort the marking cycle after a Full GC takes place.
+  // Abandon current marking iteration due to a Full GC.
   void abort();
 
   bool has_aborted()      { return _has_aborted; }
@@ -652,75 +641,8 @@
 
   void print_on_error(outputStream* st) const;
 
-  // Liveness counting
-
-  // Utility routine to set an exclusive range of cards on the given
-  // card liveness bitmap
-  inline void set_card_bitmap_range(BitMap* card_bm,
-                                    BitMap::idx_t start_idx,
-                                    BitMap::idx_t end_idx,
-                                    bool is_par);
-
-  // Returns the card number of the bottom of the G1 heap.
-  // Used in biasing indices into accounting card bitmaps.
-  intptr_t heap_bottom_card_num() const {
-    return _heap_bottom_card_num;
-  }
-
-  // Returns the card bitmap for a given task or worker id.
-  BitMap* count_card_bitmap_for(uint worker_id) {
-    assert(worker_id < _max_worker_id, "oob");
-    assert(_count_card_bitmaps != NULL, "uninitialized");
-    BitMap* task_card_bm = &_count_card_bitmaps[worker_id];
-    assert(task_card_bm->size() == _card_bm.size(), "size mismatch");
-    return task_card_bm;
-  }
-
-  // Returns the array containing the marked bytes for each region,
-  // for the given worker or task id.
-  size_t* count_marked_bytes_array_for(uint worker_id) {
-    assert(worker_id < _max_worker_id, "oob");
-    assert(_count_marked_bytes != NULL, "uninitialized");
-    size_t* marked_bytes_array = _count_marked_bytes[worker_id];
-    assert(marked_bytes_array != NULL, "uninitialized");
-    return marked_bytes_array;
-  }
-
-  // Returns the index in the liveness accounting card table bitmap
-  // for the given address
-  inline BitMap::idx_t card_bitmap_index_for(HeapWord* addr);
-
-  // Counts the size of the given memory region in the the given
-  // marked_bytes array slot for the given HeapRegion.
-  // Sets the bits in the given card bitmap that are associated with the
-  // cards that are spanned by the memory region.
-  inline void count_region(MemRegion mr,
-                           HeapRegion* hr,
-                           size_t* marked_bytes_array,
-                           BitMap* task_card_bm);
-
-  // Counts the given object in the given task/worker counting
-  // data structures.
-  inline void count_object(oop obj,
-                           HeapRegion* hr,
-                           size_t* marked_bytes_array,
-                           BitMap* task_card_bm,
-                           size_t word_size);
-
-  // Attempts to mark the given object and, if successful, counts
-  // the object in the given task/worker counting structures.
-  inline bool par_mark_and_count(oop obj,
-                                 HeapRegion* hr,
-                                 size_t* marked_bytes_array,
-                                 BitMap* task_card_bm);
-
-  // Attempts to mark the given object and, if successful, counts
-  // the object in the task/worker counting structures for the
-  // given worker id.
-  inline bool par_mark_and_count(oop obj,
-                                 size_t word_size,
-                                 HeapRegion* hr,
-                                 uint worker_id);
+  // Attempts to mark the given object on the next mark bitmap.
+  inline bool par_mark(oop obj);
 
   // Returns true if initialization was successfully completed.
   bool completed_initialization() const {
@@ -730,19 +652,19 @@
   ConcurrentGCTimer* gc_timer_cm() const { return _gc_timer_cm; }
   G1OldTracer* gc_tracer_cm() const { return _gc_tracer_cm; }
 
-protected:
-  // Clear all the per-task bitmaps and arrays used to store the
-  // counting data.
-  void clear_all_count_data();
+private:
+  // Clear (Reset) all liveness count data.
+  void clear_all_live_data(WorkGang* workers);
 
-  // Aggregates the counting data for each worker/task
-  // that was constructed while marking. Also sets
-  // the amount of marked bytes for each region and
-  // the top at concurrent mark count.
-  void aggregate_count_data();
+  // Verify that all liveness count data structures are in their initial (cleared) state.
+  void verify_all_live_data();
+
+  // Aggregates the per-card liveness data based on the current marking. Also sets
+  // the amount of marked bytes for each region.
+  void create_live_data();
 
   // Verification routine
-  void verify_count_data();
+  void verify_live_data();
 };
 
 // A class representing a marking task.
@@ -844,12 +766,6 @@
 
   TruncatedSeq                _marking_step_diffs_ms;
 
-  // Counting data structures. Embedding the task's marked_bytes_array
-  // and card bitmap into the actual task saves having to go through
-  // the ConcurrentMark object.
-  size_t*                     _marked_bytes_array;
-  BitMap*                     _card_bm;
-
   // it updates the local fields after this task has claimed
   // a new region to scan
   void setup_for_region(HeapRegion* hr);
@@ -936,9 +852,8 @@
 
   // Grey the object by marking it.  If not already marked, push it on
   // the local queue if below the finger.
-  // Precondition: obj is in region.
   // Precondition: obj is below region's NTAMS.
-  inline void make_reference_grey(oop obj, HeapRegion* region);
+  inline void make_reference_grey(oop obj);
 
   // Grey the object (by calling make_grey_reference) if required,
   // e.g. obj is below its containing region's NTAMS.
@@ -976,8 +891,6 @@
 
   G1CMTask(uint worker_id,
            G1ConcurrentMark *cm,
-           size_t* marked_bytes,
-           BitMap* card_bm,
            G1CMTaskQueue* task_queue,
            G1CMTaskQueueSet* task_queues);
 
--- a/hotspot/src/share/vm/gc/g1/g1ConcurrentMark.inline.hpp	Wed Apr 06 10:50:19 2016 +0300
+++ b/hotspot/src/share/vm/gc/g1/g1ConcurrentMark.inline.hpp	Wed Apr 06 13:32:48 2016 +0200
@@ -29,138 +29,8 @@
 #include "gc/g1/g1ConcurrentMark.hpp"
 #include "gc/shared/taskqueue.inline.hpp"
 
-// Utility routine to set an exclusive range of cards on the given
-// card liveness bitmap
-inline void G1ConcurrentMark::set_card_bitmap_range(BitMap* card_bm,
-                                                    BitMap::idx_t start_idx,
-                                                    BitMap::idx_t end_idx,
-                                                    bool is_par) {
-
-  // Set the exclusive bit range [start_idx, end_idx).
-  assert((end_idx - start_idx) > 0, "at least one card");
-  assert(end_idx <= card_bm->size(), "sanity");
-
-  // Silently clip the end index
-  end_idx = MIN2(end_idx, card_bm->size());
-
-  // For small ranges use a simple loop; otherwise use set_range or
-  // use par_at_put_range (if parallel). The range is made up of the
-  // cards that are spanned by an object/mem region so 8 cards will
-  // allow up to object sizes up to 4K to be handled using the loop.
-  if ((end_idx - start_idx) <= 8) {
-    for (BitMap::idx_t i = start_idx; i < end_idx; i += 1) {
-      if (is_par) {
-        card_bm->par_set_bit(i);
-      } else {
-        card_bm->set_bit(i);
-      }
-    }
-  } else {
-    // Note BitMap::par_at_put_range() and BitMap::set_range() are exclusive.
-    if (is_par) {
-      card_bm->par_at_put_range(start_idx, end_idx, true);
-    } else {
-      card_bm->set_range(start_idx, end_idx);
-    }
-  }
-}
-
-// Returns the index in the liveness accounting card bitmap
-// for the given address
-inline BitMap::idx_t G1ConcurrentMark::card_bitmap_index_for(HeapWord* addr) {
-  // Below, the term "card num" means the result of shifting an address
-  // by the card shift -- address 0 corresponds to card number 0.  One
-  // must subtract the card num of the bottom of the heap to obtain a
-  // card table index.
-  intptr_t card_num = intptr_t(uintptr_t(addr) >> CardTableModRefBS::card_shift);
-  return card_num - heap_bottom_card_num();
-}
-
-// Counts the given memory region in the given task/worker
-// counting data structures.
-inline void G1ConcurrentMark::count_region(MemRegion mr, HeapRegion* hr,
-                                           size_t* marked_bytes_array,
-                                           BitMap* task_card_bm) {
-  G1CollectedHeap* g1h = _g1h;
-  CardTableModRefBS* ct_bs = g1h->g1_barrier_set();
-
-  HeapWord* start = mr.start();
-  HeapWord* end = mr.end();
-  size_t region_size_bytes = mr.byte_size();
-  uint index = hr->hrm_index();
-
-  assert(hr == g1h->heap_region_containing(start), "sanity");
-  assert(marked_bytes_array != NULL, "pre-condition");
-  assert(task_card_bm != NULL, "pre-condition");
-
-  // Add to the task local marked bytes for this region.
-  marked_bytes_array[index] += region_size_bytes;
-
-  BitMap::idx_t start_idx = card_bitmap_index_for(start);
-  BitMap::idx_t end_idx = card_bitmap_index_for(end);
-
-  // Note: if we're looking at the last region in heap - end
-  // could be actually just beyond the end of the heap; end_idx
-  // will then correspond to a (non-existent) card that is also
-  // just beyond the heap.
-  if (g1h->is_in_g1_reserved(end) && !ct_bs->is_card_aligned(end)) {
-    // end of region is not card aligned - increment to cover
-    // all the cards spanned by the region.
-    end_idx += 1;
-  }
-  // The card bitmap is task/worker specific => no need to use
-  // the 'par' BitMap routines.
-  // Set bits in the exclusive bit range [start_idx, end_idx).
-  set_card_bitmap_range(task_card_bm, start_idx, end_idx, false /* is_par */);
-}
-
-// Counts the given object in the given task/worker counting data structures.
-inline void G1ConcurrentMark::count_object(oop obj,
-                                           HeapRegion* hr,
-                                           size_t* marked_bytes_array,
-                                           BitMap* task_card_bm,
-                                           size_t word_size) {
-  assert(!hr->is_continues_humongous(), "Cannot enter count_object with continues humongous");
-  if (!hr->is_starts_humongous()) {
-    MemRegion mr((HeapWord*)obj, word_size);
-    count_region(mr, hr, marked_bytes_array, task_card_bm);
-  } else {
-    do {
-      MemRegion mr(hr->bottom(), hr->top());
-      count_region(mr, hr, marked_bytes_array, task_card_bm);
-      hr = _g1h->next_region_in_humongous(hr);
-    } while (hr != NULL);
-  }
-}
-
-// Attempts to mark the given object and, if successful, counts
-// the object in the given task/worker counting structures.
-inline bool G1ConcurrentMark::par_mark_and_count(oop obj,
-                                                 HeapRegion* hr,
-                                                 size_t* marked_bytes_array,
-                                                 BitMap* task_card_bm) {
-  if (_nextMarkBitMap->parMark((HeapWord*)obj)) {
-    // Update the task specific count data for the object.
-    count_object(obj, hr, marked_bytes_array, task_card_bm, obj->size());
-    return true;
-  }
-  return false;
-}
-
-// Attempts to mark the given object and, if successful, counts
-// the object in the task/worker counting structures for the
-// given worker id.
-inline bool G1ConcurrentMark::par_mark_and_count(oop obj,
-                                                 size_t word_size,
-                                                 HeapRegion* hr,
-                                                 uint worker_id) {
-  if (_nextMarkBitMap->parMark((HeapWord*)obj)) {
-    size_t* marked_bytes_array = count_marked_bytes_array_for(worker_id);
-    BitMap* task_card_bm = count_card_bitmap_for(worker_id);
-    count_object(obj, hr, marked_bytes_array, task_card_bm, word_size);
-    return true;
-  }
-  return false;
+inline bool G1ConcurrentMark::par_mark(oop obj) {
+  return _nextMarkBitMap->parMark((HeapWord*)obj);
 }
 
 inline bool G1CMBitMapRO::iterate(BitMapClosure* cl, MemRegion mr) {
@@ -294,10 +164,8 @@
   check_limits();
 }
 
-
-
-inline void G1CMTask::make_reference_grey(oop obj, HeapRegion* hr) {
-  if (_cm->par_mark_and_count(obj, hr, _marked_bytes_array, _card_bm)) {
+inline void G1CMTask::make_reference_grey(oop obj) {
+  if (_cm->par_mark(obj)) {
     // No OrderAccess:store_load() is needed. It is implicit in the
     // CAS done in G1CMBitMap::parMark() call in the routine above.
     HeapWord* global_finger = _cm->finger();
@@ -348,7 +216,7 @@
       // anything with it).
       HeapRegion* hr = _g1h->heap_region_containing(obj);
       if (!hr->obj_allocated_since_next_marking(obj)) {
-        make_reference_grey(obj, hr);
+        make_reference_grey(obj);
       }
     }
   }
@@ -370,8 +238,7 @@
   return _prevMarkBitMap->isMarked(addr);
 }
 
-inline void G1ConcurrentMark::grayRoot(oop obj, size_t word_size,
-                                       uint worker_id, HeapRegion* hr) {
+inline void G1ConcurrentMark::grayRoot(oop obj, HeapRegion* hr) {
   assert(obj != NULL, "pre-condition");
   HeapWord* addr = (HeapWord*) obj;
   if (hr == NULL) {
@@ -386,7 +253,7 @@
 
   if (addr < hr->next_top_at_mark_start()) {
     if (!_nextMarkBitMap->isMarked(addr)) {
-      par_mark_and_count(obj, word_size, hr, worker_id);
+      par_mark(obj);
     }
   }
 }
--- a/hotspot/src/share/vm/gc/g1/g1EvacFailure.cpp	Wed Apr 06 10:50:19 2016 +0300
+++ b/hotspot/src/share/vm/gc/g1/g1EvacFailure.cpp	Wed Apr 06 13:32:48 2016 +0200
@@ -95,8 +95,6 @@
   void do_object(oop obj) {
     HeapWord* obj_addr = (HeapWord*) obj;
     assert(_hr->is_in(obj_addr), "sanity");
-    size_t obj_size = obj->size();
-    HeapWord* obj_end = obj_addr + obj_size;
 
     if (obj->is_forwarded() && obj->forwardee() == obj) {
       // The object failed to move.
@@ -119,8 +117,10 @@
         // explicitly and all objects in the CSet are considered
         // (implicitly) live. So, we won't mark them explicitly and
         // we'll leave them over NTAMS.
-        _cm->grayRoot(obj, obj_size, _worker_id, _hr);
+        _cm->grayRoot(obj, _hr);
       }
+      size_t obj_size = obj->size();
+
       _marked_bytes += (obj_size * HeapWordSize);
       obj->set_mark(markOopDesc::prototype());
 
@@ -138,6 +138,7 @@
       // the collection set. So, we'll recreate such entries now.
       obj->oop_iterate(_update_rset_cl);
 
+      HeapWord* obj_end = obj_addr + obj_size;
       _last_forwarded_object_end = obj_end;
       _hr->cross_threshold(obj_addr, obj_end);
     }
--- a/hotspot/src/share/vm/gc/g1/g1OopClosures.inline.hpp	Wed Apr 06 10:50:19 2016 +0300
+++ b/hotspot/src/share/vm/gc/g1/g1OopClosures.inline.hpp	Wed Apr 06 13:32:48 2016 +0200
@@ -131,7 +131,7 @@
   if (!oopDesc::is_null(heap_oop)) {
     oop obj = oopDesc::decode_heap_oop_not_null(heap_oop);
     HeapRegion* hr = _g1h->heap_region_containing((HeapWord*) obj);
-    _cm->grayRoot(obj, obj->size(), _worker_id, hr);
+    _cm->grayRoot(obj, hr);
   }
 }
 
@@ -246,7 +246,7 @@
   assert(!_g1->heap_region_containing(obj)->in_collection_set(), "should not mark objects in the CSet");
 
   // We know that the object is not moving so it's safe to read its size.
-  _cm->grayRoot(obj, (size_t) obj->size(), _worker_id);
+  _cm->grayRoot(obj);
 }
 
 void G1ParCopyHelper::mark_forwarded_object(oop from_obj, oop to_obj) {
@@ -261,7 +261,7 @@
   // worker so we cannot trust that its to-space image is
   // well-formed. So we have to read its size from its from-space
   // image which we know should not be changing.
-  _cm->grayRoot(to_obj, (size_t) from_obj->size(), _worker_id);
+  _cm->grayRoot(to_obj);
 }
 
 template <G1Barrier barrier, G1Mark do_mark_object, bool use_ext>
--- a/hotspot/src/share/vm/gc/g1/g1_globals.hpp	Wed Apr 06 10:50:19 2016 +0300
+++ b/hotspot/src/share/vm/gc/g1/g1_globals.hpp	Wed Apr 06 13:32:48 2016 +0200
@@ -260,6 +260,9 @@
           "The target number of mixed GCs after a marking cycle.")          \
           range(0, max_uintx)                                               \
                                                                             \
+  experimental(bool, G1PretouchAuxiliaryMemory, false,                      \
+          "Pre-touch large auxiliary data structures used by the GC.")      \
+                                                                            \
   experimental(bool, G1EagerReclaimHumongousObjects, true,                  \
           "Try to reclaim dead large objects at every young GC.")           \
                                                                             \
--- a/hotspot/src/share/vm/utilities/bitMap.cpp	Wed Apr 06 10:50:19 2016 +0300
+++ b/hotspot/src/share/vm/utilities/bitMap.cpp	Wed Apr 06 13:32:48 2016 +0200
@@ -68,6 +68,10 @@
   }
 }
 
+void BitMap::pretouch() {
+  os::pretouch_memory((char*)word_addr(0), (char*)word_addr(size()));
+}
+
 void BitMap::set_range_within_word(idx_t beg, idx_t end) {
   // With a valid range (beg <= end), this test ensures that end != 0, as
   // required by inverted_bit_mask_for_range.  Also avoids an unnecessary write.
--- a/hotspot/src/share/vm/utilities/bitMap.hpp	Wed Apr 06 10:50:19 2016 +0300
+++ b/hotspot/src/share/vm/utilities/bitMap.hpp	Wed Apr 06 13:32:48 2016 +0200
@@ -135,12 +135,19 @@
   // use the same value for "in_resource_area".)
   void resize(idx_t size_in_bits, bool in_resource_area = true);
 
+  // Pretouch the entire range of memory this BitMap covers.
+  void pretouch();
+
   // Accessing
   idx_t size() const                    { return _size; }
   idx_t size_in_words() const           {
     return word_index(size() + BitsPerWord - 1);
   }
 
+  static idx_t size_in_words(size_t size_in_bits) {  // Same computation as the instance size_in_words(), for an explicit bit count.
+    return word_index(size_in_bits + BitsPerWord - 1);
+  }
+
   bool at(idx_t index) const {
     verify_index(index);
     return (*word_addr(index) & bit_mask(index)) != 0;
--- a/hotspot/test/gc/g1/Test2GbHeap.java	Wed Apr 06 10:50:19 2016 +0300
+++ b/hotspot/test/gc/g1/Test2GbHeap.java	Wed Apr 06 13:32:48 2016 +0200
@@ -25,6 +25,9 @@
  * @test Test2GbHeap
  * @bug 8031686
  * @summary Regression test to ensure we can start G1 with 2gb heap.
+ * Skip test on 32-bit Windows: it typically does not support the many large virtual memory reservations needed.
+ * @requires (vm.gc == "G1" | vm.gc == "null")
+ * @requires !((sun.arch.data.model == "32") & (os.family == "windows"))
  * @key gc
  * @key regression
  * @library /testlibrary
@@ -48,17 +51,6 @@
     ProcessBuilder pb = ProcessTools.createJavaProcessBuilder(testArguments.toArray(new String[0]));
 
     OutputAnalyzer output = new OutputAnalyzer(pb.start());
-
-    // Avoid failing test for setups not supported.
-    if (output.getOutput().contains("Could not reserve enough space for 2097152KB object heap")) {
-      // Will fail on machines with too little memory (and Windows 32-bit VM), ignore such failures.
-      output.shouldHaveExitValue(1);
-    } else if (output.getOutput().contains("-XX:+UseG1GC not supported in this VM")) {
-      // G1 is not supported on embedded, ignore such failures.
-      output.shouldHaveExitValue(1);
-    } else {
-      // Normally everything should be fine.
-      output.shouldHaveExitValue(0);
-    }
+    output.shouldHaveExitValue(0);
   }
 }