8225255: Make SATB qset lock-free
Summary: Refactor PtrQueueSet, use lock-free stack for SATB completed buffers
Reviewed-by: tschatzl, shade
/*
* Copyright (c) 2001, 2019, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#include "precompiled.hpp"
#include "gc/shared/ptrQueue.hpp"
#include "logging/log.hpp"
#include "memory/allocation.hpp"
#include "memory/allocation.inline.hpp"
#include "runtime/atomic.hpp"
#include "runtime/mutex.hpp"
#include "runtime/mutexLocker.hpp"
#include "runtime/orderAccess.hpp"
#include "runtime/thread.inline.hpp"
#include "utilities/globalCounter.inline.hpp"
#include <new>
PtrQueue::PtrQueue(PtrQueueSet* qset, bool active) :
_qset(qset),
_active(active),
_index(0),
_capacity_in_bytes(0),
_buf(NULL)
{}
PtrQueue::~PtrQueue() {
assert(_buf == NULL, "queue must be flushed before delete");
}
void PtrQueue::flush_impl() {
if (_buf != NULL) {
BufferNode* node = BufferNode::make_node_from_buffer(_buf, index());
if (is_empty()) {
// No work to do.
qset()->deallocate_buffer(node);
} else {
qset()->enqueue_completed_buffer(node);
}
_buf = NULL;
set_index(0);
}
}
void PtrQueue::enqueue_known_active(void* ptr) {
while (_index == 0) {
handle_zero_index();
}
assert(_buf != NULL, "postcondition");
assert(index() > 0, "postcondition");
assert(index() <= capacity(), "invariant");
_index -= _element_size;
_buf[index()] = ptr;
}
void PtrQueue::handle_zero_index() {
assert(index() == 0, "precondition");
if (_buf != NULL) {
handle_completed_buffer();
} else {
// Bootstrapping kludge; lazily initialize capacity. The initial
// thread's queues are constructed before the second phase of the
// two-phase initialization of the associated qsets. As a result,
// we can't initialize _capacity_in_bytes in the queue constructor.
if (_capacity_in_bytes == 0) {
_capacity_in_bytes = index_to_byte_index(qset()->buffer_size());
}
allocate_buffer();
}
}
void PtrQueue::allocate_buffer() {
_buf = qset()->allocate_buffer();
reset();
}
void PtrQueue::enqueue_completed_buffer() {
assert(_buf != NULL, "precondition");
BufferNode* node = BufferNode::make_node_from_buffer(_buf, index());
qset()->enqueue_completed_buffer(node);
allocate_buffer();
}
BufferNode* BufferNode::allocate(size_t size) {
size_t byte_size = size * sizeof(void*);
void* data = NEW_C_HEAP_ARRAY(char, buffer_offset() + byte_size, mtGC);
return new (data) BufferNode;
}
void BufferNode::deallocate(BufferNode* node) {
node->~BufferNode();
FREE_C_HEAP_ARRAY(char, node);
}
BufferNode::Allocator::Allocator(const char* name, size_t buffer_size) :
_buffer_size(buffer_size),
_pending_list(),
_free_list(),
_pending_count(0),
_free_count(0),
_transfer_lock(false)
{
strncpy(_name, name, sizeof(_name) - 1);
_name[sizeof(_name) - 1] = '\0';
}
BufferNode::Allocator::~Allocator() {
delete_list(_free_list.pop_all());
delete_list(_pending_list.pop_all());
}
void BufferNode::Allocator::delete_list(BufferNode* list) {
while (list != NULL) {
BufferNode* next = list->next();
DEBUG_ONLY(list->set_next(NULL);)
BufferNode::deallocate(list);
list = next;
}
}
size_t BufferNode::Allocator::free_count() const {
return Atomic::load(&_free_count);
}
BufferNode* BufferNode::Allocator::allocate() {
BufferNode* node;
{
// Protect against ABA; see release().
GlobalCounter::CriticalSection cs(Thread::current());
node = _free_list.pop();
}
if (node == NULL) {
node = BufferNode::allocate(_buffer_size);
} else {
// Decrement count after getting buffer from free list. This, along
// with incrementing count before adding to free list, ensures count
// never underflows.
size_t count = Atomic::sub(1u, &_free_count);
assert((count + 1) != 0, "_free_count underflow");
}
return node;
}
// To solve the ABA problem for lock-free stack pop, allocate does the
// pop inside a critical section, and release synchronizes on the
// critical sections before adding to the _free_list. But we don't
// want to make every release have to do a synchronize. Instead, we
// initially place released nodes on the _pending_list, and transfer
// them to the _free_list in batches. Only one transfer at a time is
// permitted, with a lock bit to control access to that phase. A
// transfer takes all the nodes from the _pending_list, synchronizes on
// the _free_list pops, and then adds the former pending nodes to the
// _free_list. While that's happening, other threads might be adding
// other nodes to the _pending_list, to be dealt with by some later
// transfer.
void BufferNode::Allocator::release(BufferNode* node) {
assert(node != NULL, "precondition");
assert(node->next() == NULL, "precondition");
// Desired minimum transfer batch size. There is relatively little
// importance to the specific number. It shouldn't be too big, else
// we're wasting space when the release rate is low. If the release
// rate is high, we might accumulate more than this before being
// able to start a new transfer, but that's okay. Also note that
// the allocation rate and the release rate are going to be fairly
// similar, due to how the buffers are used.
const size_t trigger_transfer = 10;
// Add to pending list. Update count first so no underflow in transfer.
size_t pending_count = Atomic::add(1u, &_pending_count);
_pending_list.push(*node);
if (pending_count > trigger_transfer) {
try_transfer_pending();
}
}
// Try to transfer nodes from _pending_list to _free_list, with a
// synchronization delay for any in-progress pops from the _free_list,
// to solve ABA there. Return true if performed a (possibly empty)
// transfer, false if blocked from doing so by some other thread's
// in-progress transfer.
bool BufferNode::Allocator::try_transfer_pending() {
// Attempt to claim the lock.
if (Atomic::load(&_transfer_lock) || // Skip CAS if likely to fail.
Atomic::cmpxchg(true, &_transfer_lock, false)) {
return false;
}
// Have the lock; perform the transfer.
// Claim all the pending nodes.
BufferNode* first = _pending_list.pop_all();
if (first != NULL) {
// Prepare to add the claimed nodes, and update _pending_count.
BufferNode* last = first;
size_t count = 1;
for (BufferNode* next = first->next(); next != NULL; next = next->next()) {
last = next;
++count;
}
Atomic::sub(count, &_pending_count);
// Wait for any in-progress pops, to avoid ABA for them.
GlobalCounter::write_synchronize();
// Add synchronized nodes to _free_list.
// Update count first so no underflow in allocate().
Atomic::add(count, &_free_count);
_free_list.prepend(*first, *last);
log_trace(gc, ptrqueue, freelist)
("Transferred %s pending to free: " SIZE_FORMAT, name(), count);
}
OrderAccess::release_store(&_transfer_lock, false);
return true;
}
size_t BufferNode::Allocator::reduce_free_list(size_t remove_goal) {
try_transfer_pending();
size_t removed = 0;
for ( ; removed < remove_goal; ++removed) {
BufferNode* node = _free_list.pop();
if (node == NULL) break;
BufferNode::deallocate(node);
}
size_t new_count = Atomic::sub(removed, &_free_count);
log_debug(gc, ptrqueue, freelist)
("Reduced %s free list by " SIZE_FORMAT " to " SIZE_FORMAT,
name(), removed, new_count);
return removed;
}
PtrQueueSet::PtrQueueSet() :
_allocator(NULL),
_all_active(false)
{}
PtrQueueSet::~PtrQueueSet() {}
void PtrQueueSet::initialize(BufferNode::Allocator* allocator) {
assert(allocator != NULL, "Init order issue?");
_allocator = allocator;
}
void** PtrQueueSet::allocate_buffer() {
BufferNode* node = _allocator->allocate();
return BufferNode::make_buffer_from_node(node);
}
void PtrQueueSet::deallocate_buffer(BufferNode* node) {
_allocator->release(node);
}