# HG changeset patch
# User manc
# Date 1574471035 28800
# Node ID bd9dba789919954236f2ebf961e60afe49fc3645
# Parent d4ddf19c2624f464df3bd160835ac646548fdb96
8087198: G1 card refinement: batching, sorting
Reviewed-by: tschatzl, kbarrett

diff -r d4ddf19c2624 -r bd9dba789919 src/hotspot/share/gc/g1/g1DirtyCardQueue.cpp
--- a/src/hotspot/share/gc/g1/g1DirtyCardQueue.cpp	Fri Nov 22 16:26:35 2019 -0800
+++ b/src/hotspot/share/gc/g1/g1DirtyCardQueue.cpp	Fri Nov 22 17:03:55 2019 -0800
@@ -41,6 +41,7 @@
 #include "runtime/safepoint.hpp"
 #include "runtime/thread.inline.hpp"
 #include "runtime/threadSMR.hpp"
+#include "utilities/quickSort.hpp"

 G1DirtyCardQueue::G1DirtyCardQueue(G1DirtyCardQueueSet* qset) :
   // Dirty card queues are always active, so we create them with their
@@ -226,21 +227,127 @@
   return result;
 }

+class G1RefineBufferedCards : public StackObj {
+  BufferNode* const _node;
+  CardTable::CardValue** const _node_buffer;
+  const size_t _node_buffer_size;
+  const uint _worker_id;
+  size_t* _total_refined_cards;
+  G1RemSet* const _g1rs;
+
+  static inline int compare_card(const CardTable::CardValue* p1,
+                                 const CardTable::CardValue* p2) {
+    return p2 - p1;
+  }
+
+  // Sorts the cards from start_index to _node_buffer_size in *decreasing*
+  // address order. Tests showed that this order is preferable to not sorting
+  // or increasing address order.
+  void sort_cards(size_t start_index) {
+    QuickSort::sort(&_node_buffer[start_index],
+                    _node_buffer_size - start_index,
+                    compare_card,
+                    false);
+  }
+
+  // Returns the index to the first clean card in the buffer.
+  size_t clean_cards() {
+    const size_t start = _node->index();
+    assert(start <= _node_buffer_size, "invariant");
+
+    // Two-fingered compaction algorithm similar to the filtering mechanism in
+    // SATBMarkQueue. The main difference is that clean_card_before_refine()
+    // could change the buffer element in-place.
+    // We don't check for SuspendibleThreadSet::should_yield(), because
+    // cleaning and redirtying the cards is fast.
+    CardTable::CardValue** src = &_node_buffer[start];
+    CardTable::CardValue** dst = &_node_buffer[_node_buffer_size];
+    assert(src <= dst, "invariant");
+    for ( ; src < dst; ++src) {
+      // Search low to high for a card to keep.
+      if (_g1rs->clean_card_before_refine(src)) {
+        // Found keeper. Search high to low for a card to discard.
+        while (src < --dst) {
+          if (!_g1rs->clean_card_before_refine(dst)) {
+            *dst = *src; // Replace discard with keeper.
+            break;
+          }
+        }
+        // If discard search failed (src == dst), the outer loop will also end.
+      }
+    }
+
+    // dst points to the first retained clean card, or the end of the buffer
+    // if all the cards were discarded.
+    const size_t first_clean = dst - _node_buffer;
+    assert(first_clean >= start && first_clean <= _node_buffer_size, "invariant");
+    // Discarded cards are considered as refined.
+    *_total_refined_cards += first_clean - start;
+    return first_clean;
+  }
+
+  bool refine_cleaned_cards(size_t start_index) {
+    bool result = true;
+    size_t i = start_index;
+    for ( ; i < _node_buffer_size; ++i) {
+      if (SuspendibleThreadSet::should_yield()) {
+        redirty_unrefined_cards(i);
+        result = false;
+        break;
+      }
+      _g1rs->refine_card_concurrently(_node_buffer[i], _worker_id);
+    }
+    _node->set_index(i);
+    *_total_refined_cards += i - start_index;
+    return result;
+  }
+
+  void redirty_unrefined_cards(size_t start) {
+    for ( ; start < _node_buffer_size; ++start) {
+      *_node_buffer[start] = G1CardTable::dirty_card_val();
+    }
+  }
+
+public:
+  G1RefineBufferedCards(BufferNode* node,
+                        size_t node_buffer_size,
+                        uint worker_id,
+                        size_t* total_refined_cards) :
+    _node(node),
+    _node_buffer(reinterpret_cast<CardTable::CardValue**>(BufferNode::make_buffer_from_node(node))),
+    _node_buffer_size(node_buffer_size),
+    _worker_id(worker_id),
+    _total_refined_cards(total_refined_cards),
+    _g1rs(G1CollectedHeap::heap()->rem_set()) {}
+
+  bool refine() {
+    size_t first_clean_index = clean_cards();
+    if (first_clean_index == _node_buffer_size) {
+      _node->set_index(first_clean_index);
+      return true;
+    }
+    // This fence serves two purposes. First, the cards must be cleaned
+    // before processing the contents. Second, we can't proceed with
+    // processing a region until after the read of the region's top in
+    // collect_and_clean_cards(), for synchronization with possibly concurrent
+    // humongous object allocation (see comment at the StoreStore fence before
+    // setting the regions' tops in humongous allocation path).
+    // It's okay that reading region's top and reading region's type were racy
+    // wrto each other. We need both set, in any order, to proceed.
+    OrderAccess::fence();
+    sort_cards(first_clean_index);
+    return refine_cleaned_cards(first_clean_index);
+  }
+};
+
 bool G1DirtyCardQueueSet::refine_buffer(BufferNode* node,
                                         uint worker_id,
                                         size_t* total_refined_cards) {
-  G1RemSet* rem_set = G1CollectedHeap::heap()->rem_set();
-  size_t size = buffer_size();
-  void** buffer = BufferNode::make_buffer_from_node(node);
-  size_t i = node->index();
-  assert(i <= size, "invariant");
-  for ( ; (i < size) && !SuspendibleThreadSet::should_yield(); ++i) {
-    CardTable::CardValue* cp = static_cast<CardTable::CardValue*>(buffer[i]);
-    rem_set->refine_card_concurrently(cp, worker_id);
-  }
-  *total_refined_cards += (i - node->index());
-  node->set_index(i);
-  return i == size;
+  G1RefineBufferedCards buffered_cards(node,
+                                       buffer_size(),
+                                       worker_id,
+                                       total_refined_cards);
+  return buffered_cards.refine();
 }

 #ifndef ASSERT
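
A minimal standalone sketch of the partition-and-sort scheme used by clean_cards() and sort_cards() above. Here keep_card() stands in for G1RemSet::clean_card_before_refine(), and std::sort with std::greater stands in for QuickSort::sort with compare_card; the names below are illustrative only, not HotSpot code. Everything before the returned index can be ignored, and the cards to refine end up in [first_clean, size), which is then sorted in decreasing address order.

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <functional>

typedef unsigned char CardValue;  // stand-in for CardTable::CardValue

// Stand-in for clean_card_before_refine(): returns true if the card still
// needs refinement (a hypothetical "still dirty" test).
static bool keep_card(CardValue** card) {
  return **card != 0;
}

// Two-fingered compaction over buffer[start, size): cards to keep are moved
// toward the back of the buffer. Returns the index of the first kept card;
// entries before it are discards (or stale copies) and can be ignored.
static size_t partition_cards(CardValue** buffer, size_t start, size_t size) {
  CardValue** src = buffer + start;
  CardValue** dst = buffer + size;
  for ( ; src < dst; ++src) {
    if (keep_card(src)) {            // keeper found at the low end
      while (src < --dst) {
        if (!keep_card(dst)) {       // discard found at the high end
          *dst = *src;               // overwrite the discard with the keeper
          break;
        }
      }
    }
  }
  return static_cast<size_t>(dst - buffer);
}

int main() {
  CardValue cards[6] = {1, 0, 1, 0, 0, 1};  // 1 = still dirty, 0 = discard
  CardValue* buffer[6];
  for (size_t i = 0; i < 6; ++i) buffer[i] = &cards[i];

  size_t first_clean = partition_cards(buffer, 0, 6);
  // Decreasing address order, mirroring compare_card's "p2 - p1" convention.
  std::sort(buffer + first_clean, buffer + 6, std::greater<CardValue*>());

  for (size_t i = first_clean; i < 6; ++i) {
    std::printf("%td ", buffer[i] - cards);  // prints: 5 2 0
  }
  std::printf("\n");
  return 0;
}
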
diff -r d4ddf19c2624 -r bd9dba789919 src/hotspot/share/gc/g1/g1RemSet.cpp
--- a/src/hotspot/share/gc/g1/g1RemSet.cpp	Fri Nov 22 16:26:35 2019 -0800
+++ b/src/hotspot/share/gc/g1/g1RemSet.cpp	Fri Nov 22 17:03:55 2019 -0800
@@ -1261,25 +1261,27 @@
 #endif
 }

-void G1RemSet::refine_card_concurrently(CardValue* card_ptr,
-                                        uint worker_id) {
+bool G1RemSet::clean_card_before_refine(CardValue** const card_ptr_addr) {
   assert(!_g1h->is_gc_active(), "Only call concurrently");

-  // Construct the region representing the card.
+  CardValue* card_ptr = *card_ptr_addr;
+  // Find the start address represented by the card.
   HeapWord* start = _ct->addr_for(card_ptr);
   // And find the region containing it.
   HeapRegion* r = _g1h->heap_region_containing_or_null(start);

   // If this is a (stale) card into an uncommitted region, exit.
   if (r == NULL) {
-    return;
+    return false;
   }

   check_card_ptr(card_ptr, _ct);

   // If the card is no longer dirty, nothing to do.
+  // We cannot load the card value before the "r == NULL" check, because G1
+  // could uncommit parts of the card table covering uncommitted regions.
   if (*card_ptr != G1CardTable::dirty_card_val()) {
-    return;
+    return false;
   }

   // This check is needed for some uncommon cases where we should
@@ -1302,7 +1304,7 @@
   // enqueueing of the card and processing it here will have ensured
   // we see the up-to-date region type here.
   if (!r->is_old_or_humongous_or_archive()) {
-    return;
+    return false;
   }

   // The result from the hot card cache insert call is either:
@@ -1321,7 +1323,7 @@
     card_ptr = _hot_card_cache->insert(card_ptr);
     if (card_ptr == NULL) {
       // There was no eviction. Nothing to do.
-      return;
+      return false;
     } else if (card_ptr != orig_card_ptr) {
       // Original card was inserted and an old card was evicted.
       start = _ct->addr_for(card_ptr);
@@ -1331,8 +1333,9 @@
       // ignored, as discussed earlier for the original card. The
       // region could have been freed while in the cache.
       if (!r->is_old_or_humongous_or_archive()) {
-        return;
+        return false;
       }
+      *card_ptr_addr = card_ptr;
     } // Else we still have the original card.
   }

@@ -1341,18 +1344,19 @@
   // (part of) an object at the end of the allocated space and extend
   // beyond the end of allocation.

-  // Non-humongous objects are only allocated in the old-gen during
-  // GC, so if region is old then top is stable. Humongous object
-  // allocation sets top last; if top has not yet been set, this is
-  // a stale card and we'll end up with an empty intersection. If
-  // this is not a stale card, the synchronization between the
+  // Non-humongous objects are either allocated in the old regions during GC,
+  // or mapped in archive regions during startup. So if region is old or
+  // archive then top is stable.
+  // Humongous object allocation sets top last; if top has not yet been set,
+  // this is a stale card and we'll end up with an empty intersection.
+  // If this is not a stale card, the synchronization between the
   // enqueuing of the card and processing it here will have ensured
   // we see the up-to-date top here.
   HeapWord* scan_limit = r->top();

   if (scan_limit <= start) {
     // If the trimmed region is empty, the card must be stale.
-    return;
+    return false;
   }

   // Okay to clean and process the card now. There are still some
@@ -1360,13 +1364,26 @@
   // as iteration failure.
   *const_cast<CardValue*>(card_ptr) = G1CardTable::clean_card_val();

-  // This fence serves two purposes. First, the card must be cleaned
-  // before processing the contents. Second, we can't proceed with
-  // processing until after the read of top, for synchronization with
-  // possibly concurrent humongous object allocation. It's okay that
-  // reading top and reading type were racy wrto each other. We need
-  // both set, in any order, to proceed.
-  OrderAccess::fence();
+  return true;
+}
+
+void G1RemSet::refine_card_concurrently(CardValue* const card_ptr,
+                                        const uint worker_id) {
+  assert(!_g1h->is_gc_active(), "Only call concurrently");
+  check_card_ptr(card_ptr, _ct);
+
+  // Construct the MemRegion representing the card.
+  HeapWord* start = _ct->addr_for(card_ptr);
+  // And find the region containing it.
+  HeapRegion* r = _g1h->heap_region_containing(start);
+  // This reload of the top is safe even though it happens after the full
+  // fence, because top is stable for old, archive and unfiltered humongous
+  // regions, so it must return the same value as the previous load when
+  // cleaning the card. Also cleaning the card and refinement of the card
+  // cannot span across safepoint, so we don't need to worry about top being
+  // changed during safepoint.
+  HeapWord* scan_limit = r->top();
+  assert(scan_limit > start, "sanity");

   // Don't use addr_for(card_ptr + 1) which can ask for
   // a card beyond the heap.
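
The point of splitting the old refine_card_concurrently() into clean_card_before_refine() and refine_card_concurrently() is that the full fence can be issued once per buffer instead of once per card, with the surviving cards batched and sorted in between. A rough standalone sketch of that calling protocol, with clean_card() and refine_card() as simplified placeholders for the two G1RemSet methods and std::atomic_thread_fence() in place of OrderAccess::fence(); none of this is HotSpot code.

#include <atomic>
#include <cstddef>
#include <vector>

typedef unsigned char CardValue;

// Placeholder for clean_card_before_refine(): filter and clean one card,
// possibly redirecting it; return true if it still needs refinement.
static bool clean_card(CardValue** card_ptr_addr) {
  CardValue* card = *card_ptr_addr;
  if (*card == 0) {
    return false;      // stale or already clean: drop it
  }
  *card = 0;           // clean it now, refine it after the fence
  return true;
}

// Placeholder for refine_card_concurrently(): scanning of the heap range
// covered by the card is omitted here.
static void refine_card(CardValue* /*card_ptr*/) {}

// Per-buffer protocol: clean every card first, fence once, then refine the
// survivors. In the code before this change the fence sat in the per-card
// path and was executed for every card that reached it.
static void refine_batch(std::vector<CardValue*>& cards) {
  std::vector<CardValue*> to_refine;
  for (std::size_t i = 0; i < cards.size(); ++i) {
    CardValue* c = cards[i];
    if (clean_card(&c)) {
      to_refine.push_back(c);   // c may differ from cards[i] after cleaning
    }
  }
  // One full fence for the whole batch, mirroring OrderAccess::fence() in
  // G1RefineBufferedCards::refine(): cleaning must be visible before the
  // cards' contents are processed.
  std::atomic_thread_fence(std::memory_order_seq_cst);
  for (std::size_t i = 0; i < to_refine.size(); ++i) {
    refine_card(to_refine[i]);
  }
}

int main() {
  CardValue table[4] = {1, 0, 1, 1};
  std::vector<CardValue*> cards;
  for (int i = 0; i < 4; ++i) cards.push_back(&table[i]);
  refine_batch(cards);   // refines the three dirty cards, skips the clean one
  return 0;
}
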
diff -r d4ddf19c2624 -r bd9dba789919 src/hotspot/share/gc/g1/g1RemSet.hpp
--- a/src/hotspot/share/gc/g1/g1RemSet.hpp	Fri Nov 22 16:26:35 2019 -0800
+++ b/src/hotspot/share/gc/g1/g1RemSet.hpp	Fri Nov 22 17:03:55 2019 -0800
@@ -113,10 +113,17 @@
                                    G1GCPhaseTimes::GCParPhases coderoots_phase,
                                    G1GCPhaseTimes::GCParPhases objcopy_phase);

-  // Refine the card corresponding to "card_ptr". Safe to be called concurrently
-  // to the mutator.
-  void refine_card_concurrently(CardValue* card_ptr,
-                                uint worker_id);
+  // Two methods for concurrent refinement support, executed concurrently to
+  // the mutator:
+  // Cleans the card at "*card_ptr_addr" before refinement, returns true iff the
+  // card needs later refinement. Note that "*card_ptr_addr" could be updated to
+  // a different card due to use of hot card cache.
+  bool clean_card_before_refine(CardValue** const card_ptr_addr);
+  // Refine the region corresponding to "card_ptr". Must be called after
+  // being filtered by clean_card_before_refine(), and after proper
+  // fence/synchronization.
+  void refine_card_concurrently(CardValue* const card_ptr,
+                                const uint worker_id);

   // Print accumulated summary info from the start of the VM.
   void print_summary_info();
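
clean_card_before_refine() takes a CardValue** because the hot card cache can swap the card being processed: the submitted card may be absorbed by the cache (no eviction, nothing to refine now), or a previously cached card may be handed back and must be refined instead. A small self-contained sketch of that caller-side contract, with cache_insert() as a hypothetical stand-in for G1HotCardCache::insert(); again not HotSpot code.

#include <cstddef>
#include <cstdio>

typedef unsigned char CardValue;

static CardValue g_cached_card = 1;             // a previously deferred card
static CardValue* g_cache_slot = &g_cached_card;

// Hypothetical stand-in for G1HotCardCache::insert(): cache the new card and
// return the card it evicts (NULL would mean "cached, nothing evicted").
static CardValue* cache_insert(CardValue* card_ptr) {
  CardValue* evicted = g_cache_slot;
  g_cache_slot = card_ptr;
  return evicted;
}

// Stand-in for the hot-card-cache portion of clean_card_before_refine():
// the card the caller ends up refining may not be the card it submitted.
static bool clean_card(CardValue** card_ptr_addr) {
  CardValue* card_ptr = cache_insert(*card_ptr_addr);
  if (card_ptr == NULL) {
    return false;              // absorbed by the cache; refined later, if ever
  }
  *card_ptr_addr = card_ptr;   // continue with the evicted card
  return true;
}

int main() {
  CardValue submitted = 1;
  CardValue* card = &submitted;
  if (clean_card(&card)) {
    // Prints "refining an evicted card": the caller's slot was updated in
    // place, which is what the two-fingered loop in clean_cards() relies on.
    std::printf("refining %s card\n",
                card == &submitted ? "the submitted" : "an evicted");
  }
  return 0;
}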