8168467: Use TaskEntry as task mark queue elements
author tschatzl
date Wed, 15 Mar 2017 11:44:46 +0100
changeset 46328 6061df52d610
parent 46327 91576389a517
child 46329 53ccc37bda19
8168467: Use TaskEntry as task mark queue elements
Summary: Change the mark stack to use TaskEntry queue elements to improve type safety instead of casting around raw pointers.
Reviewed-by: kbarrett, sangheki
hotspot/src/share/vm/gc/g1/g1ConcurrentMark.cpp
hotspot/src/share/vm/gc/g1/g1ConcurrentMark.hpp
hotspot/src/share/vm/gc/g1/g1ConcurrentMark.inline.hpp
hotspot/src/share/vm/gc/g1/g1ConcurrentMarkObjArrayProcessor.cpp
hotspot/src/share/vm/gc/g1/g1ConcurrentMarkObjArrayProcessor.hpp
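
Before the diff itself, a minimal standalone sketch of the tagging scheme the patch introduces. G1TaskQueueEntry keeps either an oop or an array-slice continuation address in a single pointer-sized field; both are at least word-aligned, so the low bit is free to tag slices. The names below (TaggedEntry, the void* stand-ins for oop and HeapWord*) are illustrative only, not the HotSpot types:

// Minimal model of the G1TaskQueueEntry tagging scheme (illustrative only;
// the real class is in g1ConcurrentMark.hpp and uses oop/HeapWord*).
#include <cassert>
#include <cstddef>
#include <cstdint>

class TaggedEntry {
private:
  void* _holder;                                // oop, or slice address | 1
  static const uintptr_t ArraySliceBit = 1;
public:
  TaggedEntry() : _holder(NULL) { }

  static TaggedEntry from_oop(void* obj) {      // obj models an oop
    TaggedEntry e;
    e._holder = obj;
    return e;
  }
  static TaggedEntry from_slice(void* addr) {   // addr models a HeapWord*
    TaggedEntry e;
    e._holder = (void*)((uintptr_t)addr | ArraySliceBit);
    return e;
  }

  bool is_array_slice() const { return ((uintptr_t)_holder & ArraySliceBit) != 0; }
  bool is_null() const        { return _holder == NULL; }

  void* obj() const {                           // only valid for oop entries
    assert(!is_array_slice());
    return _holder;
  }
  void* slice() const {                         // strips the tag bit again
    assert(is_array_slice());
    return (void*)((uintptr_t)_holder & ~ArraySliceBit);
  }
};

Where the old scheme cast slice addresses to oop and relied on every consumer calling G1CMObjArrayProcessor::is_array_slice() first, the asserts in obj() and slice() make a mixed-up read fail fast in debug builds.
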
--- a/hotspot/src/share/vm/gc/g1/g1ConcurrentMark.cpp	Tue Mar 14 14:07:24 2017 -0400
+++ b/hotspot/src/share/vm/gc/g1/g1ConcurrentMark.cpp	Wed Mar 15 11:44:46 2017 +0100
@@ -145,15 +145,15 @@
   assert(new_capacity <= _max_chunk_capacity,
          "Trying to resize stack to " SIZE_FORMAT " chunks when the maximum is " SIZE_FORMAT, new_capacity, _max_chunk_capacity);
 
-  OopChunk* new_base = MmapArrayAllocator<OopChunk, mtGC>::allocate_or_null(new_capacity);
+  TaskQueueEntryChunk* new_base = MmapArrayAllocator<TaskQueueEntryChunk, mtGC>::allocate_or_null(new_capacity);
 
   if (new_base == NULL) {
-    log_warning(gc)("Failed to reserve memory for new overflow mark stack with " SIZE_FORMAT " chunks and size " SIZE_FORMAT "B.", new_capacity, new_capacity * sizeof(OopChunk));
+    log_warning(gc)("Failed to reserve memory for new overflow mark stack with " SIZE_FORMAT " chunks and size " SIZE_FORMAT "B.", new_capacity, new_capacity * sizeof(TaskQueueEntryChunk));
     return false;
   }
   // Release old mapping.
   if (_base != NULL) {
-    MmapArrayAllocator<OopChunk, mtGC>::free(_base, _chunk_capacity);
+    MmapArrayAllocator<TaskQueueEntryChunk, mtGC>::free(_base, _chunk_capacity);
   }
 
   _base = new_base;
@@ -165,16 +165,16 @@
 }
 
 size_t G1CMMarkStack::capacity_alignment() {
-  return (size_t)lcm(os::vm_allocation_granularity(), sizeof(OopChunk)) / sizeof(void*);
+  return (size_t)lcm(os::vm_allocation_granularity(), sizeof(TaskQueueEntryChunk)) / sizeof(G1TaskQueueEntry);
 }
 
 bool G1CMMarkStack::initialize(size_t initial_capacity, size_t max_capacity) {
   guarantee(_max_chunk_capacity == 0, "G1CMMarkStack already initialized.");
 
-  size_t const OopChunkSizeInVoidStar = sizeof(OopChunk) / sizeof(void*);
-
-  _max_chunk_capacity = (size_t)align_size_up(max_capacity, capacity_alignment()) / OopChunkSizeInVoidStar;
-  size_t initial_chunk_capacity = (size_t)align_size_up(initial_capacity, capacity_alignment()) / OopChunkSizeInVoidStar;
+  size_t const TaskEntryChunkSizeInVoidStar = sizeof(TaskQueueEntryChunk) / sizeof(G1TaskQueueEntry);
+
+  _max_chunk_capacity = (size_t)align_size_up(max_capacity, capacity_alignment()) / TaskEntryChunkSizeInVoidStar;
+  size_t initial_chunk_capacity = (size_t)align_size_up(initial_capacity, capacity_alignment()) / TaskEntryChunkSizeInVoidStar;
 
   guarantee(initial_chunk_capacity <= _max_chunk_capacity,
             "Maximum chunk capacity " SIZE_FORMAT " smaller than initial capacity " SIZE_FORMAT,
@@ -210,49 +210,49 @@
 
 G1CMMarkStack::~G1CMMarkStack() {
   if (_base != NULL) {
-    MmapArrayAllocator<OopChunk, mtGC>::free(_base, _chunk_capacity);
+    MmapArrayAllocator<TaskQueueEntryChunk, mtGC>::free(_base, _chunk_capacity);
   }
 }
 
-void G1CMMarkStack::add_chunk_to_list(OopChunk* volatile* list, OopChunk* elem) {
+void G1CMMarkStack::add_chunk_to_list(TaskQueueEntryChunk* volatile* list, TaskQueueEntryChunk* elem) {
   elem->next = *list;
   *list = elem;
 }
 
-void G1CMMarkStack::add_chunk_to_chunk_list(OopChunk* elem) {
+void G1CMMarkStack::add_chunk_to_chunk_list(TaskQueueEntryChunk* elem) {
   MutexLockerEx x(MarkStackChunkList_lock, Mutex::_no_safepoint_check_flag);
   add_chunk_to_list(&_chunk_list, elem);
   _chunks_in_chunk_list++;
 }
 
-void G1CMMarkStack::add_chunk_to_free_list(OopChunk* elem) {
+void G1CMMarkStack::add_chunk_to_free_list(TaskQueueEntryChunk* elem) {
   MutexLockerEx x(MarkStackFreeList_lock, Mutex::_no_safepoint_check_flag);
   add_chunk_to_list(&_free_list, elem);
 }
 
-G1CMMarkStack::OopChunk* G1CMMarkStack::remove_chunk_from_list(OopChunk* volatile* list) {
-  OopChunk* result = *list;
+G1CMMarkStack::TaskQueueEntryChunk* G1CMMarkStack::remove_chunk_from_list(TaskQueueEntryChunk* volatile* list) {
+  TaskQueueEntryChunk* result = *list;
   if (result != NULL) {
     *list = (*list)->next;
   }
   return result;
 }
 
-G1CMMarkStack::OopChunk* G1CMMarkStack::remove_chunk_from_chunk_list() {
+G1CMMarkStack::TaskQueueEntryChunk* G1CMMarkStack::remove_chunk_from_chunk_list() {
   MutexLockerEx x(MarkStackChunkList_lock, Mutex::_no_safepoint_check_flag);
-  OopChunk* result = remove_chunk_from_list(&_chunk_list);
+  TaskQueueEntryChunk* result = remove_chunk_from_list(&_chunk_list);
   if (result != NULL) {
     _chunks_in_chunk_list--;
   }
   return result;
 }
 
-G1CMMarkStack::OopChunk* G1CMMarkStack::remove_chunk_from_free_list() {
+G1CMMarkStack::TaskQueueEntryChunk* G1CMMarkStack::remove_chunk_from_free_list() {
   MutexLockerEx x(MarkStackFreeList_lock, Mutex::_no_safepoint_check_flag);
   return remove_chunk_from_list(&_free_list);
 }
 
-G1CMMarkStack::OopChunk* G1CMMarkStack::allocate_new_chunk() {
+G1CMMarkStack::TaskQueueEntryChunk* G1CMMarkStack::allocate_new_chunk() {
   // This dirty read of _hwm is okay because we only ever increase the _hwm in parallel code.
   // Further this limits _hwm to a value of _chunk_capacity + #threads, avoiding
   // wraparound of _hwm.
@@ -265,14 +265,14 @@
     return NULL;
   }
 
-  OopChunk* result = ::new (&_base[cur_idx]) OopChunk;
+  TaskQueueEntryChunk* result = ::new (&_base[cur_idx]) TaskQueueEntryChunk;
   result->next = NULL;
   return result;
 }
 
-bool G1CMMarkStack::par_push_chunk(oop* ptr_arr) {
+bool G1CMMarkStack::par_push_chunk(G1TaskQueueEntry* ptr_arr) {
   // Get a new chunk.
-  OopChunk* new_chunk = remove_chunk_from_free_list();
+  TaskQueueEntryChunk* new_chunk = remove_chunk_from_free_list();
 
   if (new_chunk == NULL) {
     // Did not get a chunk from the free list. Allocate from backing memory.
@@ -283,21 +283,21 @@
     }
   }
 
-  Copy::conjoint_memory_atomic(ptr_arr, new_chunk->data, OopsPerChunk * sizeof(oop));
+  Copy::conjoint_memory_atomic(ptr_arr, new_chunk->data, EntriesPerChunk * sizeof(G1TaskQueueEntry));
 
   add_chunk_to_chunk_list(new_chunk);
 
   return true;
 }
 
-bool G1CMMarkStack::par_pop_chunk(oop* ptr_arr) {
-  OopChunk* cur = remove_chunk_from_chunk_list();
+bool G1CMMarkStack::par_pop_chunk(G1TaskQueueEntry* ptr_arr) {
+  TaskQueueEntryChunk* cur = remove_chunk_from_chunk_list();
 
   if (cur == NULL) {
     return false;
   }
 
-  Copy::conjoint_memory_atomic(cur->data, ptr_arr, OopsPerChunk * sizeof(oop));
+  Copy::conjoint_memory_atomic(cur->data, ptr_arr, EntriesPerChunk * sizeof(G1TaskQueueEntry));
 
   add_chunk_to_free_list(cur);
   return true;
@@ -1995,13 +1995,17 @@
     _info(info)
   { }
 
-  void operator()(oop obj) const {
-    guarantee(G1CMObjArrayProcessor::is_array_slice(obj) || obj->is_oop(),
+  void operator()(G1TaskQueueEntry task_entry) const {
+    if (task_entry.is_array_slice()) {
+      guarantee(_g1h->is_in_reserved(task_entry.slice()), "Slice " PTR_FORMAT " must be in heap.", p2i(task_entry.slice()));
+      return;
+    }
+    guarantee(task_entry.obj()->is_oop(),
               "Non-oop " PTR_FORMAT ", phase: %s, info: %d",
-              p2i(obj), _phase, _info);
-    guarantee(G1CMObjArrayProcessor::is_array_slice(obj) || !_g1h->is_in_cset(obj),
+              p2i(task_entry.obj()), _phase, _info);
+    guarantee(!_g1h->is_in_cset(task_entry.obj()),
               "obj: " PTR_FORMAT " in CSet, phase: %s, info: %d",
-              p2i(obj), _phase, _info);
+              p2i(task_entry.obj()), _phase, _info);
   }
 };
 
@@ -2195,7 +2199,7 @@
     // We move that task's local finger along.
     _task->move_finger_to(addr);
 
-    _task->scan_object(oop(addr));
+    _task->scan_task_entry(G1TaskQueueEntry::from_oop(oop(addr)));
     // we only partially drain the local queue and global stack
     _task->drain_local_queue(true);
     _task->drain_global_stack(true);
@@ -2386,16 +2390,16 @@
 void G1CMTask::move_entries_to_global_stack() {
   // Local array where we'll store the entries that will be popped
   // from the local queue.
-  oop buffer[G1CMMarkStack::OopsPerChunk];
+  G1TaskQueueEntry buffer[G1CMMarkStack::EntriesPerChunk];
 
   size_t n = 0;
-  oop obj;
-  while (n < G1CMMarkStack::OopsPerChunk && _task_queue->pop_local(obj)) {
-    buffer[n] = obj;
+  G1TaskQueueEntry task_entry;
+  while (n < G1CMMarkStack::EntriesPerChunk && _task_queue->pop_local(task_entry)) {
+    buffer[n] = task_entry;
     ++n;
   }
-  if (n < G1CMMarkStack::OopsPerChunk) {
-    buffer[n] = NULL;
+  if (n < G1CMMarkStack::EntriesPerChunk) {
+    buffer[n] = G1TaskQueueEntry();
   }
 
   if (n > 0) {
@@ -2411,20 +2415,20 @@
 bool G1CMTask::get_entries_from_global_stack() {
   // Local array where we'll store the entries that will be popped
   // from the global stack.
-  oop buffer[G1CMMarkStack::OopsPerChunk];
+  G1TaskQueueEntry buffer[G1CMMarkStack::EntriesPerChunk];
 
   if (!_cm->mark_stack_pop(buffer)) {
     return false;
   }
 
   // We did actually pop at least one entry.
-  for (size_t i = 0; i < G1CMMarkStack::OopsPerChunk; ++i) {
-    oop elem = buffer[i];
-    if (elem == NULL) {
+  for (size_t i = 0; i < G1CMMarkStack::EntriesPerChunk; ++i) {
+    G1TaskQueueEntry task_entry = buffer[i];
+    if (task_entry.is_null()) {
       break;
     }
-    assert(G1CMObjArrayProcessor::is_array_slice(elem) || elem->is_oop(), "Element " PTR_FORMAT " must be an array slice or oop", p2i(elem));
-    bool success = _task_queue->push(elem);
+    assert(task_entry.is_array_slice() || task_entry.obj()->is_oop(), "Element " PTR_FORMAT " must be an array slice or oop", p2i(task_entry.obj()));
+    bool success = _task_queue->push(task_entry);
     // We only call this when the local queue is empty or under a
     // given target limit. So, we do not expect this push to fail.
     assert(success, "invariant");
@@ -2451,14 +2455,14 @@
   }
 
   if (_task_queue->size() > target_size) {
-    oop obj;
-    bool ret = _task_queue->pop_local(obj);
+    G1TaskQueueEntry entry;
+    bool ret = _task_queue->pop_local(entry);
     while (ret) {
-      scan_object(obj);
+      scan_task_entry(entry);
       if (_task_queue->size() <= target_size || has_aborted()) {
         ret = false;
       } else {
-        ret = _task_queue->pop_local(obj);
+        ret = _task_queue->pop_local(entry);
       }
     }
   }
@@ -2539,8 +2543,8 @@
                        _step_times_ms.maximum(), _step_times_ms.sum());
 }
 
-bool G1ConcurrentMark::try_stealing(uint worker_id, int* hash_seed, oop& obj) {
-  return _task_queues->steal(worker_id, hash_seed, obj);
+bool G1ConcurrentMark::try_stealing(uint worker_id, int* hash_seed, G1TaskQueueEntry& task_entry) {
+  return _task_queues->steal(worker_id, hash_seed, task_entry);
 }
 
 /*****************************************************************************
@@ -2863,9 +2867,9 @@
     assert(_cm->out_of_regions() && _task_queue->size() == 0,
            "only way to reach here");
     while (!has_aborted()) {
-      oop obj;
-      if (_cm->try_stealing(_worker_id, &_hash_seed, obj)) {
-        scan_object(obj);
+      G1TaskQueueEntry entry;
+      if (_cm->try_stealing(_worker_id, &_hash_seed, entry)) {
+        scan_task_entry(entry);
 
         // And since we're towards the end, let's totally drain the
         // local queue and global stack.
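
A detail of the .cpp hunks above worth spelling out: buffers moved between a task-local queue and the global stack always span EntriesPerChunk slots, and a partially filled buffer is terminated by a null entry (formerly a NULL oop, now a default-constructed G1TaskQueueEntry). Here is a schematic of that drain protocol; LocalQueue, GlobalStack and Entry are illustrative stand-ins, not the HotSpot types:

// Schematic of G1CMTask::move_entries_to_global_stack's buffer protocol.
#include <cstddef>
#include <deque>
#include <vector>

struct Entry {                                   // stand-in for G1TaskQueueEntry
  void* _holder;
  Entry() : _holder(0) { }
  bool is_null() const { return _holder == 0; }
};

static const size_t EntriesPerChunk = 1024 - 1;  // one slot pays for the next pointer

struct LocalQueue {                              // stand-in for G1CMTaskQueue
  std::deque<Entry> q;
  bool pop_local(Entry& e) {
    if (q.empty()) return false;
    e = q.back();
    q.pop_back();
    return true;
  }
};

struct GlobalStack {                             // stand-in for G1CMMarkStack
  std::vector<std::vector<Entry> > chunks;
  bool push_chunk(const Entry* buf) {            // whole-chunk copy, like par_push_chunk
    chunks.push_back(std::vector<Entry>(buf, buf + EntriesPerChunk));
    return true;
  }
};

void move_entries_to_global_stack(LocalQueue& q, GlobalStack& gs) {
  Entry buffer[EntriesPerChunk];                 // default-constructed, i.e. all null
  size_t n = 0;
  Entry e;
  while (n < EntriesPerChunk && q.pop_local(e)) {
    buffer[n++] = e;
  }
  if (n < EntriesPerChunk) {
    buffer[n] = Entry();                         // null entry ends a partial chunk
  }
  if (n > 0) {
    gs.push_chunk(buffer);
  }
}

Since a G1TaskQueueEntry is one pointer wide, each TaskQueueEntryChunk is a next link plus 1023 entries, i.e. exactly 1024 pointers or 8 KiB on a 64-bit VM, which is where the 1024 - 1 in EntriesPerChunk comes from.
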
--- a/hotspot/src/share/vm/gc/g1/g1ConcurrentMark.hpp	Tue Mar 14 14:07:24 2017 -0400
+++ b/hotspot/src/share/vm/gc/g1/g1ConcurrentMark.hpp	Wed Mar 15 11:44:46 2017 +0100
@@ -38,7 +38,62 @@
 class ConcurrentGCTimer;
 class G1OldTracer;
 class G1SurvivorRegions;
-typedef GenericTaskQueue<oop, mtGC>              G1CMTaskQueue;
+
+#ifdef _MSC_VER
+#pragma warning(push)
+// warning C4522: multiple assignment operators specified
+#pragma warning(disable:4522)
+#endif
+
+// This is a container class for either an oop or a continuation address for
+// mark stack entries. Both are pushed onto the mark stack.
+class G1TaskQueueEntry VALUE_OBJ_CLASS_SPEC {
+private:
+  void* _holder;
+
+  static const uintptr_t ArraySliceBit = 1;
+
+  G1TaskQueueEntry(oop obj) : _holder(obj) {
+    assert(_holder != NULL, "Not allowed to set NULL task queue element");
+  }
+  G1TaskQueueEntry(HeapWord* addr) : _holder((void*)((uintptr_t)addr | ArraySliceBit)) { }
+public:
+  G1TaskQueueEntry(const G1TaskQueueEntry& other) { _holder = other._holder; }
+  G1TaskQueueEntry() : _holder(NULL) { }
+
+  static G1TaskQueueEntry from_slice(HeapWord* what) { return G1TaskQueueEntry(what); }
+  static G1TaskQueueEntry from_oop(oop obj) { return G1TaskQueueEntry(obj); }
+
+  G1TaskQueueEntry& operator=(const G1TaskQueueEntry& t) {
+    _holder = t._holder;
+    return *this;
+  }
+
+  volatile G1TaskQueueEntry& operator=(const volatile G1TaskQueueEntry& t) volatile {
+    _holder = t._holder;
+    return *this;
+  }
+
+  oop obj() const {
+    assert(!is_array_slice(), "Trying to read array slice " PTR_FORMAT " as oop", p2i(_holder));
+    return (oop)_holder;
+  }
+
+  HeapWord* slice() const {
+    assert(is_array_slice(), "Trying to read oop " PTR_FORMAT " as array slice", p2i(_holder));
+    return (HeapWord*)((uintptr_t)_holder & ~ArraySliceBit);
+  }
+
+  bool is_oop() const { return !is_array_slice(); }
+  bool is_array_slice() const { return ((uintptr_t)_holder & ArraySliceBit) != 0; }
+  bool is_null() const { return _holder == NULL; }
+};
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+typedef GenericTaskQueue<G1TaskQueueEntry, mtGC> G1CMTaskQueue;
 typedef GenericTaskQueueSet<G1CMTaskQueue, mtGC> G1CMTaskQueueSet;
 
 // Closure used by CM during concurrent reference discovery
@@ -165,44 +220,44 @@
 // list connecting all empty chunks.
 class G1CMMarkStack VALUE_OBJ_CLASS_SPEC {
 public:
-  // Number of oops that can fit in a single chunk.
-  static const size_t OopsPerChunk = 1024 - 1 /* One reference for the next pointer */;
+  // Number of TaskQueueEntries that can fit in a single chunk.
+  static const size_t EntriesPerChunk = 1024 - 1 /* One reference for the next pointer */;
 private:
-  struct OopChunk {
-    OopChunk* next;
-    oop data[OopsPerChunk];
+  struct TaskQueueEntryChunk {
+    TaskQueueEntryChunk* next;
+    G1TaskQueueEntry data[EntriesPerChunk];
   };
 
-  size_t _max_chunk_capacity;    // Maximum number of OopChunk elements on the stack.
+  size_t _max_chunk_capacity;    // Maximum number of TaskQueueEntryChunk elements on the stack.
 
-  OopChunk* _base;               // Bottom address of allocated memory area.
-  size_t _chunk_capacity;        // Current maximum number of OopChunk elements.
+  TaskQueueEntryChunk* _base;    // Bottom address of allocated memory area.
+  size_t _chunk_capacity;        // Current maximum number of TaskQueueEntryChunk elements.
 
   char _pad0[DEFAULT_CACHE_LINE_SIZE];
-  OopChunk* volatile _free_list;  // Linked list of free chunks that can be allocated by users.
-  char _pad1[DEFAULT_CACHE_LINE_SIZE - sizeof(OopChunk*)];
-  OopChunk* volatile _chunk_list; // List of chunks currently containing data.
+  TaskQueueEntryChunk* volatile _free_list;  // Linked list of free chunks that can be allocated by users.
+  char _pad1[DEFAULT_CACHE_LINE_SIZE - sizeof(TaskQueueEntryChunk*)];
+  TaskQueueEntryChunk* volatile _chunk_list; // List of chunks currently containing data.
   volatile size_t _chunks_in_chunk_list;
-  char _pad2[DEFAULT_CACHE_LINE_SIZE - sizeof(OopChunk*) - sizeof(size_t)];
+  char _pad2[DEFAULT_CACHE_LINE_SIZE - sizeof(TaskQueueEntryChunk*) - sizeof(size_t)];
 
   volatile size_t _hwm;          // High water mark within the reserved space.
   char _pad4[DEFAULT_CACHE_LINE_SIZE - sizeof(size_t)];
 
   // Allocate a new chunk from the reserved memory, using the high water mark. Returns
   // NULL if out of memory.
-  OopChunk* allocate_new_chunk();
+  TaskQueueEntryChunk* allocate_new_chunk();
 
   // Atomically add the given chunk to the list.
-  void add_chunk_to_list(OopChunk* volatile* list, OopChunk* elem);
+  void add_chunk_to_list(TaskQueueEntryChunk* volatile* list, TaskQueueEntryChunk* elem);
   // Atomically remove and return a chunk from the given list. Returns NULL if the
   // list is empty.
-  OopChunk* remove_chunk_from_list(OopChunk* volatile* list);
+  TaskQueueEntryChunk* remove_chunk_from_list(TaskQueueEntryChunk* volatile* list);
 
-  void add_chunk_to_chunk_list(OopChunk* elem);
-  void add_chunk_to_free_list(OopChunk* elem);
+  void add_chunk_to_chunk_list(TaskQueueEntryChunk* elem);
+  void add_chunk_to_free_list(TaskQueueEntryChunk* elem);
 
-  OopChunk* remove_chunk_from_chunk_list();
-  OopChunk* remove_chunk_from_free_list();
+  TaskQueueEntryChunk* remove_chunk_from_chunk_list();
+  TaskQueueEntryChunk* remove_chunk_from_free_list();
 
   bool  _should_expand;
 
@@ -220,17 +275,17 @@
   // Allocate and initialize the mark stack with the given number of oops.
   bool initialize(size_t initial_capacity, size_t max_capacity);
 
-  // Pushes the given buffer containing at most OopsPerChunk elements on the mark
-  // stack. If less than OopsPerChunk elements are to be pushed, the array must
+  // Pushes the given buffer containing at most EntriesPerChunk elements on the mark
+  // stack. If less than EntriesPerChunk elements are to be pushed, the array must
   // be terminated with a NULL.
   // Returns whether the buffer contents were successfully pushed to the global mark
   // stack.
-  bool par_push_chunk(oop* buffer);
+  bool par_push_chunk(G1TaskQueueEntry* buffer);
 
   // Pops a chunk from this mark stack, copying them into the given buffer. This
-  // chunk may contain up to OopsPerChunk elements. If there are less, the last
+  // chunk may contain up to EntriesPerChunk elements. If there are less, the last
   // element in the array is a NULL pointer.
-  bool par_pop_chunk(oop* buffer);
+  bool par_pop_chunk(G1TaskQueueEntry* buffer);
 
   // Return whether the chunk list is empty. Racy due to unsynchronized access to
   // _chunk_list.
@@ -246,7 +301,7 @@
 
   // Return the approximate number of oops on this mark stack. Racy due to
   // unsynchronized access to _chunks_in_chunk_list.
-  size_t size() const { return _chunks_in_chunk_list * OopsPerChunk; }
+  size_t size() const { return _chunks_in_chunk_list * EntriesPerChunk; }
 
   void set_empty();
 
@@ -526,14 +581,14 @@
   // Manipulation of the global mark stack.
   // The push and pop operations are used by tasks for transfers
   // between task-local queues and the global mark stack.
-  bool mark_stack_push(oop* arr) {
+  bool mark_stack_push(G1TaskQueueEntry* arr) {
     if (!_global_mark_stack.par_push_chunk(arr)) {
       set_has_overflown();
       return false;
     }
     return true;
   }
-  bool mark_stack_pop(oop* arr) {
+  bool mark_stack_pop(G1TaskQueueEntry* arr) {
     return _global_mark_stack.par_pop_chunk(arr);
   }
   size_t mark_stack_size()                { return _global_mark_stack.size(); }
@@ -567,7 +622,7 @@
   }
 
   // Attempts to steal an object from the task queues of other tasks
-  bool try_stealing(uint worker_id, int* hash_seed, oop& obj);
+  bool try_stealing(uint worker_id, int* hash_seed, G1TaskQueueEntry& task_entry);
 
   G1ConcurrentMark(G1CollectedHeap* g1h,
                    G1RegionToSpaceMapper* prev_bitmap_storage,
@@ -822,7 +877,7 @@
   // mark bitmap scan, and so needs to be pushed onto the mark stack.
   bool is_below_finger(oop obj, HeapWord* global_finger) const;
 
-  template<bool scan> void process_grey_object(oop obj);
+  template<bool scan> void process_grey_task_entry(G1TaskQueueEntry task_entry);
 public:
   // Apply the closure on the given area of the objArray. Return the number of words
   // scanned.
@@ -887,10 +942,10 @@
   inline void deal_with_reference(oop obj);
 
   // It scans an object and visits its children.
-  inline void scan_object(oop obj);
+  inline void scan_task_entry(G1TaskQueueEntry task_entry);
 
   // It pushes an object on the local queue.
-  inline void push(oop obj);
+  inline void push(G1TaskQueueEntry task_entry);
 
   // Move entries to the global stack.
   void move_entries_to_global_stack();
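
A note on the G1TaskQueueEntry class added above: the second, volatile-qualified assignment operator is needed because the task queues access their element slots through volatile lvalues in places, and a plain operator= cannot be invoked on a volatile object. Declaring both overloads is also what triggers MSVC warning C4522 ("multiple assignment operators specified"), hence the pragma block around the class. A minimal sketch of the mechanism, assuming a simplified slot type rather than the real queue:

#include <cstddef>

struct Elem {                        // models G1TaskQueueEntry's assignment pair
  void* _holder;
  Elem() : _holder(NULL) { }

  Elem& operator=(const Elem& t) {
    _holder = t._holder;
    return *this;
  }
  // Without this overload, assigning into a volatile queue slot fails to
  // compile: a non-volatile operator= is not callable on a volatile object.
  volatile Elem& operator=(const volatile Elem& t) volatile {
    _holder = t._holder;
    return *this;
  }
};

int main() {
  volatile Elem slot;                // models one slot of the queue's element array
  Elem e;
  slot = e;                          // resolves to the volatile overload
  return 0;
}

Delete the volatile overload in the sketch and the slot = e; line stops compiling, which is exactly the situation the real class has to avoid.
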
--- a/hotspot/src/share/vm/gc/g1/g1ConcurrentMark.inline.hpp	Tue Mar 14 14:07:24 2017 -0400
+++ b/hotspot/src/share/vm/gc/g1/g1ConcurrentMark.inline.hpp	Wed Mar 15 11:44:46 2017 +0100
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001, 2016, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2017, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -97,12 +97,12 @@
 
   size_t num_chunks = 0;
 
-  OopChunk* cur = _chunk_list;
+  TaskQueueEntryChunk* cur = _chunk_list;
   while (cur != NULL) {
     guarantee(num_chunks <= _chunks_in_chunk_list, "Found " SIZE_FORMAT " oop chunks which is more than there should be", num_chunks);
 
-    for (size_t i = 0; i < OopsPerChunk; ++i) {
-      if (cur->data[i] == NULL) {
+    for (size_t i = 0; i < EntriesPerChunk; ++i) {
+      if (cur->data[i].is_null()) {
         break;
       }
       fn(cur->data[i]);
@@ -114,17 +114,16 @@
 #endif
 
 // It scans an object and visits its children.
-inline void G1CMTask::scan_object(oop obj) { process_grey_object<true>(obj); }
+inline void G1CMTask::scan_task_entry(G1TaskQueueEntry task_entry) { process_grey_task_entry<true>(task_entry); }
 
-inline void G1CMTask::push(oop obj) {
-  HeapWord* objAddr = (HeapWord*) obj;
-  assert(G1CMObjArrayProcessor::is_array_slice(obj) || _g1h->is_in_g1_reserved(objAddr), "invariant");
-  assert(G1CMObjArrayProcessor::is_array_slice(obj) || !_g1h->is_on_master_free_list(
-              _g1h->heap_region_containing((HeapWord*) objAddr)), "invariant");
-  assert(G1CMObjArrayProcessor::is_array_slice(obj) || !_g1h->is_obj_ill(obj), "invariant");
-  assert(G1CMObjArrayProcessor::is_array_slice(obj) || _nextMarkBitMap->isMarked(objAddr), "invariant");
+inline void G1CMTask::push(G1TaskQueueEntry task_entry) {
+  assert(task_entry.is_array_slice() || _g1h->is_in_g1_reserved(task_entry.obj()), "invariant");
+  assert(task_entry.is_array_slice() || !_g1h->is_on_master_free_list(
+              _g1h->heap_region_containing(task_entry.obj())), "invariant");
+  assert(task_entry.is_array_slice() || !_g1h->is_obj_ill(task_entry.obj()), "invariant");  // FIXME!!!
+  assert(task_entry.is_array_slice() || _nextMarkBitMap->isMarked((HeapWord*)task_entry.obj()), "invariant");
 
-  if (!_task_queue->push(obj)) {
+  if (!_task_queue->push(task_entry)) {
     // The local task queue looks full. We need to push some entries
     // to the global stack.
     move_entries_to_global_stack();
@@ -132,7 +131,7 @@
     // this should succeed since, even if we overflow the global
     // stack, we should have definitely removed some entries from the
     // local queue. So, there must be space on it.
-    bool success = _task_queue->push(obj);
+    bool success = _task_queue->push(task_entry);
     assert(success, "invariant");
   }
 }
@@ -168,18 +167,21 @@
 }
 
 template<bool scan>
-inline void G1CMTask::process_grey_object(oop obj) {
-  assert(scan || obj->is_typeArray(), "Skipping scan of grey non-typeArray");
-  assert(G1CMObjArrayProcessor::is_array_slice(obj) || _nextMarkBitMap->isMarked((HeapWord*) obj),
+inline void G1CMTask::process_grey_task_entry(G1TaskQueueEntry task_entry) {
+  assert(scan || (task_entry.is_oop() && task_entry.obj()->is_typeArray()), "Skipping scan of grey non-typeArray");
+  assert(task_entry.is_array_slice() || _nextMarkBitMap->isMarked((HeapWord*)task_entry.obj()),
          "Any stolen object should be a slice or marked");
 
   if (scan) {
-    if (G1CMObjArrayProcessor::is_array_slice(obj)) {
-      _words_scanned += _objArray_processor.process_slice(obj);
-    } else if (G1CMObjArrayProcessor::should_be_sliced(obj)) {
-      _words_scanned += _objArray_processor.process_obj(obj);
+    if (task_entry.is_array_slice()) {
+      _words_scanned += _objArray_processor.process_slice(task_entry.slice());
     } else {
-      _words_scanned += obj->oop_iterate_size(_cm_oop_closure);;
+      oop obj = task_entry.obj();
+      if (G1CMObjArrayProcessor::should_be_sliced(obj)) {
+        _words_scanned += _objArray_processor.process_obj(obj);
+      } else {
+        _words_scanned += obj->oop_iterate_size(_cm_oop_closure);;
+      }
     }
   }
   check_limits();
@@ -210,6 +212,7 @@
     // be pushed on the stack. So, some duplicate work, but no
     // correctness problems.
     if (is_below_finger(obj, global_finger)) {
+      G1TaskQueueEntry entry = G1TaskQueueEntry::from_oop(obj);
       if (obj->is_typeArray()) {
         // Immediately process arrays of primitive types, rather
         // than pushing on the mark stack.  This keeps us from
@@ -221,9 +224,9 @@
         // by only doing a bookkeeping update and avoiding the
         // actual scan of the object - a typeArray contains no
         // references, and the metadata is built-in.
-        process_grey_object<false>(obj);
+        process_grey_task_entry<false>(entry);
       } else {
-        push(obj);
+        push(entry);
       }
     }
   }
--- a/hotspot/src/share/vm/gc/g1/g1ConcurrentMarkObjArrayProcessor.cpp	Tue Mar 14 14:07:24 2017 -0400
+++ b/hotspot/src/share/vm/gc/g1/g1ConcurrentMarkObjArrayProcessor.cpp	Wed Mar 15 11:44:46 2017 +0100
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2016, 2017, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -26,18 +26,8 @@
 #include "gc/g1/g1ConcurrentMark.inline.hpp"
 #include "gc/g1/g1ConcurrentMarkObjArrayProcessor.inline.hpp"
 
-oop G1CMObjArrayProcessor::encode_array_slice(HeapWord* addr) {
-  return oop((void*)((uintptr_t)addr | ArraySliceBit));
-}
-
-HeapWord* G1CMObjArrayProcessor::decode_array_slice(oop value) {
-  assert(is_array_slice(value), "Given value " PTR_FORMAT " is not an array slice", p2i(value));
-  return (HeapWord*)((uintptr_t)(void*)value & ~ArraySliceBit);
-}
-
 void G1CMObjArrayProcessor::push_array_slice(HeapWord* what) {
-  oop obj = encode_array_slice(what);
-  _task->push(obj);
+  _task->push(G1TaskQueueEntry::from_slice(what));
 }
 
 size_t G1CMObjArrayProcessor::process_array_slice(objArrayOop obj, HeapWord* start_from, size_t remaining) {
@@ -58,30 +48,29 @@
   return process_array_slice(objArrayOop(obj), (HeapWord*)obj, (size_t)objArrayOop(obj)->size());
 }
 
-size_t G1CMObjArrayProcessor::process_slice(oop obj) {
-  HeapWord* const decoded_address = decode_array_slice(obj);
+size_t G1CMObjArrayProcessor::process_slice(HeapWord* slice) {
 
   // Find the start address of the objArrayOop.
   // Shortcut the BOT access if the given address is from a humongous object. The BOT
   // slide is fast enough for "smaller" objects in non-humongous regions, but is slower
   // than directly using heap region table.
   G1CollectedHeap* g1h = G1CollectedHeap::heap();
-  HeapRegion* r = g1h->heap_region_containing(decoded_address);
+  HeapRegion* r = g1h->heap_region_containing(slice);
 
   HeapWord* const start_address = r->is_humongous() ?
                                   r->humongous_start_region()->bottom() :
-                                  g1h->block_start(decoded_address);
+                                  g1h->block_start(slice);
 
   assert(oop(start_address)->is_objArray(), "Address " PTR_FORMAT " does not refer to an object array ", p2i(start_address));
-  assert(start_address < decoded_address,
+  assert(start_address < slice,
          "Object start address " PTR_FORMAT " must be smaller than decoded address " PTR_FORMAT,
          p2i(start_address),
-         p2i(decoded_address));
+         p2i(slice));
 
   objArrayOop objArray = objArrayOop(start_address);
 
-  size_t already_scanned = decoded_address - start_address;
+  size_t already_scanned = slice - start_address;
   size_t remaining = objArray->size() - already_scanned;
 
-  return process_array_slice(objArray, decoded_address, remaining);
+  return process_array_slice(objArray, slice, remaining);
 }
--- a/hotspot/src/share/vm/gc/g1/g1ConcurrentMarkObjArrayProcessor.hpp	Tue Mar 14 14:07:24 2017 -0400
+++ b/hotspot/src/share/vm/gc/g1/g1ConcurrentMarkObjArrayProcessor.hpp	Wed Mar 15 11:44:46 2017 +0100
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2016, 2017, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -36,32 +36,22 @@
 // This allows incremental processing of large objects.
 class G1CMObjArrayProcessor VALUE_OBJ_CLASS_SPEC {
 private:
-  // The bit mask for the continuation indicator of elements on the mark stack.
-  static const size_t ArraySliceBit = 1;
-
   // Reference to the task for doing the actual work.
   G1CMTask* _task;
 
-  // Encodes the given address as a continuation "oop".
-  oop encode_array_slice(HeapWord* addr);
-  // Remove the continuation marker from the given oop from the mark stack.
-  HeapWord* decode_array_slice(oop value);
-
   // Push the continuation at the given address onto the mark stack.
   void push_array_slice(HeapWord* addr);
 
   // Process (apply the closure) on the given continuation of the given objArray.
   size_t process_array_slice(objArrayOop const obj, HeapWord* start_from, size_t remaining);
 public:
-  static bool is_array_slice(void* obj) { return ((uintptr_t)obj & ArraySliceBit) != 0; }
-
   static bool should_be_sliced(oop obj);
 
   G1CMObjArrayProcessor(G1CMTask* task) : _task(task) {
   }
 
-  // Process the given continuation "oop". Returns the number of words scanned.
-  size_t process_slice(oop obj);
+  // Process the given continuation. Returns the number of words scanned.
+  size_t process_slice(HeapWord* slice);
   // Start processing the given objArrayOop by scanning the header and pushing its
   // continuation.
   size_t process_obj(oop obj);
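
Finally, the address arithmetic in the reworked process_slice, which now receives the HeapWord* directly instead of decoding an oop: the slice pointer minus the array's start address gives the number of words already scanned, and the array size minus that gives what remains. A worked example with invented sizes:

#include <cassert>
#include <cstddef>

// Worked model of process_slice's arithmetic; the array size and slice
// offset below are made up for illustration.
typedef unsigned long HeapWordModel;             // stand-in for a HeapWord

int main() {
  const size_t array_size_words = 100000;        // objArray->size(), assumed
  HeapWordModel* start_address = new HeapWordModel[array_size_words];
  HeapWordModel* slice = start_address + 65536;  // continuation pushed mid-array

  size_t already_scanned = (size_t)(slice - start_address);
  size_t remaining = array_size_words - already_scanned;

  assert(already_scanned == 65536);
  assert(remaining == 34464);                    // words left to apply the closure to

  delete[] start_address;
  return 0;
}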