8224584: Shenandoah: Eliminate forwarding pointer word
authorrkennke
Wed, 29 May 2019 12:01:21 +0200
changeset 55076 785a12e0f89b
parent 55075 044f2ca6ce22
child 55077 806b1c8e24b8
8224584: Shenandoah: Eliminate forwarding pointer word Reviewed-by: shade, roland
src/hotspot/cpu/aarch64/gc/shenandoah/shenandoahBarrierSetAssembler_aarch64.cpp
src/hotspot/cpu/aarch64/gc/shenandoah/shenandoahBarrierSetAssembler_aarch64.hpp
src/hotspot/cpu/x86/gc/shenandoah/shenandoahBarrierSetAssembler_x86.cpp
src/hotspot/cpu/x86/gc/shenandoah/shenandoahBarrierSetAssembler_x86.hpp
src/hotspot/share/gc/shenandoah/c2/shenandoahBarrierSetC2.cpp
src/hotspot/share/gc/shenandoah/c2/shenandoahBarrierSetC2.hpp
src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp
src/hotspot/share/gc/shenandoah/shenandoahAsserts.cpp
src/hotspot/share/gc/shenandoah/shenandoahBarrierSet.cpp
src/hotspot/share/gc/shenandoah/shenandoahConcurrentMark.inline.hpp
src/hotspot/share/gc/shenandoah/shenandoahForwarding.hpp
src/hotspot/share/gc/shenandoah/shenandoahForwarding.inline.hpp
src/hotspot/share/gc/shenandoah/shenandoahHeap.cpp
src/hotspot/share/gc/shenandoah/shenandoahHeap.hpp
src/hotspot/share/gc/shenandoah/shenandoahHeap.inline.hpp
src/hotspot/share/gc/shenandoah/shenandoahHeapRegion.cpp
src/hotspot/share/gc/shenandoah/shenandoahHeuristics.cpp
src/hotspot/share/gc/shenandoah/shenandoahMarkCompact.cpp
src/hotspot/share/gc/shenandoah/shenandoahMarkCompact.hpp
src/hotspot/share/gc/shenandoah/shenandoahTraversalGC.cpp
src/hotspot/share/gc/shenandoah/shenandoahVerifier.cpp
--- a/src/hotspot/cpu/aarch64/gc/shenandoah/shenandoahBarrierSetAssembler_aarch64.cpp	Wed May 29 09:25:20 2019 +0200
+++ b/src/hotspot/cpu/aarch64/gc/shenandoah/shenandoahBarrierSetAssembler_aarch64.cpp	Wed May 29 12:01:21 2019 +0200
@@ -211,18 +211,46 @@
   __ bind(done);
 }
 
-void ShenandoahBarrierSetAssembler::resolve_forward_pointer(MacroAssembler* masm, Register dst) {
+void ShenandoahBarrierSetAssembler::resolve_forward_pointer(MacroAssembler* masm, Register dst, Register tmp) {
   assert(ShenandoahLoadRefBarrier || ShenandoahCASBarrier, "Should be enabled");
   Label is_null;
   __ cbz(dst, is_null);
-  resolve_forward_pointer_not_null(masm, dst);
+  resolve_forward_pointer_not_null(masm, dst, tmp);
   __ bind(is_null);
 }
 
-// IMPORTANT: This must preserve all registers, even rscratch1 and rscratch2.
-void ShenandoahBarrierSetAssembler::resolve_forward_pointer_not_null(MacroAssembler* masm, Register dst) {
+// IMPORTANT: This must preserve all registers, even rscratch1 and rscratch2, except those explicitely
+// passed in.
+void ShenandoahBarrierSetAssembler::resolve_forward_pointer_not_null(MacroAssembler* masm, Register dst, Register tmp) {
   assert(ShenandoahLoadRefBarrier || ShenandoahCASBarrier, "Should be enabled");
-  __ ldr(dst, Address(dst, ShenandoahForwarding::byte_offset()));
+  // The below loads the mark word, checks if the lowest two bits are
+  // set, and if so, clear the lowest two bits and copy the result
+  // to dst. Otherwise it leaves dst alone.
+  // Implementing this is surprisingly awkward. I do it here by:
+  // - Inverting the mark word
+  // - Test lowest two bits == 0
+  // - If so, set the lowest two bits
+  // - Invert the result back, and copy to dst
+
+  bool borrow_reg = (tmp == noreg);
+  if (borrow_reg) {
+    // No free registers available. Make one useful.
+    tmp = rscratch1;
+    __ push(RegSet::of(tmp), sp);
+  }
+
+  Label done;
+  __ ldr(tmp, Address(dst, oopDesc::mark_offset_in_bytes()));
+  __ eon(tmp, tmp, zr);
+  __ ands(zr, tmp, markOopDesc::lock_mask_in_place);
+  __ br(Assembler::NE, done);
+  __ orr(tmp, tmp, markOopDesc::marked_value);
+  __ eon(dst, tmp, zr);
+  __ bind(done);
+
+  if (borrow_reg) {
+    __ pop(RegSet::of(tmp), sp);
+  }
 }
 
 void ShenandoahBarrierSetAssembler::load_reference_barrier_not_null(MacroAssembler* masm, Register dst, Register tmp) {
@@ -343,40 +371,6 @@
 
 }
 
-void ShenandoahBarrierSetAssembler::tlab_allocate(MacroAssembler* masm, Register obj,
-                                                  Register var_size_in_bytes,
-                                                  int con_size_in_bytes,
-                                                  Register t1,
-                                                  Register t2,
-                                                  Label& slow_case) {
-
-  assert_different_registers(obj, t2);
-  assert_different_registers(obj, var_size_in_bytes);
-  Register end = t2;
-
-  __ ldr(obj, Address(rthread, JavaThread::tlab_top_offset()));
-  if (var_size_in_bytes == noreg) {
-    __ lea(end, Address(obj, (int) (con_size_in_bytes + ShenandoahForwarding::byte_size())));
-  } else {
-    __ add(var_size_in_bytes, var_size_in_bytes, ShenandoahForwarding::byte_size());
-    __ lea(end, Address(obj, var_size_in_bytes));
-  }
-  __ ldr(rscratch1, Address(rthread, JavaThread::tlab_end_offset()));
-  __ cmp(end, rscratch1);
-  __ br(Assembler::HI, slow_case);
-
-  // update the tlab top pointer
-  __ str(end, Address(rthread, JavaThread::tlab_top_offset()));
-
-  __ add(obj, obj, ShenandoahForwarding::byte_size());
-  __ str(obj, Address(obj, ShenandoahForwarding::byte_offset()));
-
-  // recover var_size_in_bytes if necessary
-  if (var_size_in_bytes == end) {
-    __ sub(var_size_in_bytes, var_size_in_bytes, obj);
-  }
-}
-
 void ShenandoahBarrierSetAssembler::cmpxchg_oop(MacroAssembler* masm, Register addr, Register expected, Register new_val,
                                                 bool acquire, bool release, bool weak, bool is_cae,
                                                 Register result) {
@@ -570,7 +564,7 @@
   __ bind(work);
 
   __ mov(rscratch2, r0);
-  resolve_forward_pointer_not_null(cgen->assembler(), r0);
+  resolve_forward_pointer_not_null(cgen->assembler(), r0, rscratch1);
   __ cmp(rscratch2, r0);
   __ br(Assembler::NE, done);
 
--- a/src/hotspot/cpu/aarch64/gc/shenandoah/shenandoahBarrierSetAssembler_aarch64.hpp	Wed May 29 09:25:20 2019 +0200
+++ b/src/hotspot/cpu/aarch64/gc/shenandoah/shenandoahBarrierSetAssembler_aarch64.hpp	Wed May 29 12:01:21 2019 +0200
@@ -54,8 +54,8 @@
                                     bool tosca_live,
                                     bool expand_call);
 
-  void resolve_forward_pointer(MacroAssembler* masm, Register dst);
-  void resolve_forward_pointer_not_null(MacroAssembler* masm, Register dst);
+  void resolve_forward_pointer(MacroAssembler* masm, Register dst, Register tmp = noreg);
+  void resolve_forward_pointer_not_null(MacroAssembler* masm, Register dst, Register tmp = noreg);
   void load_reference_barrier(MacroAssembler* masm, Register dst, Register tmp);
   void load_reference_barrier_not_null(MacroAssembler* masm, Register dst, Register tmp);
 
@@ -80,13 +80,6 @@
                        Register dst, Address src, Register tmp1, Register tmp_thread);
   virtual void store_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type,
                         Address dst, Register val, Register tmp1, Register tmp2);
-  virtual void tlab_allocate(MacroAssembler* masm, Register obj,
-                             Register var_size_in_bytes,
-                             int con_size_in_bytes,
-                             Register t1,
-                             Register t2,
-                             Label& slow_case);
-
   void cmpxchg_oop(MacroAssembler* masm, Register addr, Register expected, Register new_val,
                    bool acquire, bool release, bool weak, bool is_cae, Register result);
 
--- a/src/hotspot/cpu/x86/gc/shenandoah/shenandoahBarrierSetAssembler_x86.cpp	Wed May 29 09:25:20 2019 +0200
+++ b/src/hotspot/cpu/x86/gc/shenandoah/shenandoahBarrierSetAssembler_x86.cpp	Wed May 29 12:01:21 2019 +0200
@@ -317,18 +317,46 @@
   __ bind(done);
 }
 
-void ShenandoahBarrierSetAssembler::resolve_forward_pointer(MacroAssembler* masm, Register dst) {
+void ShenandoahBarrierSetAssembler::resolve_forward_pointer(MacroAssembler* masm, Register dst, Register tmp) {
   assert(ShenandoahCASBarrier, "should be enabled");
   Label is_null;
   __ testptr(dst, dst);
   __ jcc(Assembler::zero, is_null);
-  resolve_forward_pointer_not_null(masm, dst);
+  resolve_forward_pointer_not_null(masm, dst, tmp);
   __ bind(is_null);
 }
 
-void ShenandoahBarrierSetAssembler::resolve_forward_pointer_not_null(MacroAssembler* masm, Register dst) {
+void ShenandoahBarrierSetAssembler::resolve_forward_pointer_not_null(MacroAssembler* masm, Register dst, Register tmp) {
   assert(ShenandoahCASBarrier || ShenandoahLoadRefBarrier, "should be enabled");
-  __ movptr(dst, Address(dst, ShenandoahForwarding::byte_offset()));
+  // The below loads the mark word, checks if the lowest two bits are
+  // set, and if so, clear the lowest two bits and copy the result
+  // to dst. Otherwise it leaves dst alone.
+  // Implementing this is surprisingly awkward. I do it here by:
+  // - Inverting the mark word
+  // - Test lowest two bits == 0
+  // - If so, set the lowest two bits
+  // - Invert the result back, and copy to dst
+
+  bool borrow_reg = (tmp == noreg);
+  if (borrow_reg) {
+    // No free registers available. Make one useful.
+    tmp = rscratch1;
+    __ push(tmp);
+  }
+
+  Label done;
+  __ movptr(tmp, Address(dst, oopDesc::mark_offset_in_bytes()));
+  __ notptr(tmp);
+  __ testb(tmp, markOopDesc::marked_value);
+  __ jccb(Assembler::notZero, done);
+  __ orptr(tmp, markOopDesc::marked_value);
+  __ notptr(tmp);
+  __ mov(dst, tmp);
+  __ bind(done);
+
+  if (borrow_reg) {
+    __ pop(tmp);
+  }
 }
 
 
@@ -338,13 +366,7 @@
   Label done;
 
   Address gc_state(r15_thread, in_bytes(ShenandoahThreadLocalData::gc_state_offset()));
-  __ testb(gc_state, ShenandoahHeap::HAS_FORWARDED | ShenandoahHeap::EVACUATION | ShenandoahHeap::TRAVERSAL);
-  __ jccb(Assembler::zero, done);
-
-  // Heap is unstable, need to perform the resolve even if LRB is inactive
-  resolve_forward_pointer_not_null(masm, dst);
-
-  __ testb(gc_state, ShenandoahHeap::EVACUATION | ShenandoahHeap::TRAVERSAL);
+  __ testb(gc_state, ShenandoahHeap::HAS_FORWARDED);
   __ jccb(Assembler::zero, done);
 
    if (dst != rax) {
@@ -479,55 +501,6 @@
   }
 }
 
-void ShenandoahBarrierSetAssembler::tlab_allocate(MacroAssembler* masm,
-                                                  Register thread, Register obj,
-                                                  Register var_size_in_bytes,
-                                                  int con_size_in_bytes,
-                                                  Register t1, Register t2,
-                                                  Label& slow_case) {
-  assert_different_registers(obj, t1, t2);
-  assert_different_registers(obj, var_size_in_bytes, t1);
-  Register end = t2;
-  if (!thread->is_valid()) {
-#ifdef _LP64
-    thread = r15_thread;
-#else
-    assert(t1->is_valid(), "need temp reg");
-    thread = t1;
-    __ get_thread(thread);
-#endif
-  }
-
-  __ verify_tlab();
-
-  __ movptr(obj, Address(thread, JavaThread::tlab_top_offset()));
-  if (var_size_in_bytes == noreg) {
-    __ lea(end, Address(obj, con_size_in_bytes + ShenandoahForwarding::byte_size()));
-  } else {
-    __ addptr(var_size_in_bytes, ShenandoahForwarding::byte_size());
-    __ lea(end, Address(obj, var_size_in_bytes, Address::times_1));
-  }
-  __ cmpptr(end, Address(thread, JavaThread::tlab_end_offset()));
-  __ jcc(Assembler::above, slow_case);
-
-  // update the tlab top pointer
-  __ movptr(Address(thread, JavaThread::tlab_top_offset()), end);
-
-  // Initialize brooks pointer
-#ifdef _LP64
-  __ incrementq(obj, ShenandoahForwarding::byte_size());
-#else
-  __ incrementl(obj, ShenandoahForwarding::byte_size());
-#endif
-  __ movptr(Address(obj, ShenandoahForwarding::byte_offset()), obj);
-
-  // recover var_size_in_bytes if necessary
-  if (var_size_in_bytes == end) {
-    __ subptr(var_size_in_bytes, obj);
-  }
-  __ verify_tlab();
-}
-
 // Special Shenandoah CAS implementation that handles false negatives
 // due to concurrent evacuation.
 #ifndef _LP64
@@ -856,7 +829,7 @@
   address start = __ pc();
 
 #ifdef _LP64
-  Label not_done;
+  Label resolve_oop, slow_path;
 
   // We use RDI, which also serves as argument register for slow call.
   // RAX always holds the src object ptr, except after the slow call and
@@ -878,13 +851,31 @@
   // unlive: rdi
   __ testbool(r8);
   // unlive: r8
-  __ jccb(Assembler::notZero, not_done);
+  __ jccb(Assembler::notZero, resolve_oop);
 
   __ pop(r8);
   __ pop(rdi);
   __ ret(0);
 
-  __ bind(not_done);
+  __ bind(resolve_oop);
+
+  __ movptr(r8, Address(rax, oopDesc::mark_offset_in_bytes()));
+  // Test if both lowest bits are set. We trick it by negating the bits
+  // then test for both bits clear.
+  __ notptr(r8);
+  __ testb(r8, markOopDesc::marked_value);
+  __ jccb(Assembler::notZero, slow_path);
+  // Clear both lower bits. It's still inverted, so set them, and then invert back.
+  __ orptr(r8, markOopDesc::marked_value);
+  __ notptr(r8);
+  // At this point, r8 contains the decoded forwarding pointer.
+  __ mov(rax, r8);
+
+  __ pop(r8);
+  __ pop(rdi);
+  __ ret(0);
+
+  __ bind(slow_path);
 
   __ push(rcx);
   __ push(rdx);
--- a/src/hotspot/cpu/x86/gc/shenandoah/shenandoahBarrierSetAssembler_x86.hpp	Wed May 29 09:25:20 2019 +0200
+++ b/src/hotspot/cpu/x86/gc/shenandoah/shenandoahBarrierSetAssembler_x86.hpp	Wed May 29 12:01:21 2019 +0200
@@ -55,8 +55,8 @@
                                     bool tosca_live,
                                     bool expand_call);
 
-  void resolve_forward_pointer(MacroAssembler* masm, Register dst);
-  void resolve_forward_pointer_not_null(MacroAssembler* masm, Register dst);
+  void resolve_forward_pointer(MacroAssembler* masm, Register dst, Register tmp = noreg);
+  void resolve_forward_pointer_not_null(MacroAssembler* masm, Register dst, Register tmp = noreg);
 
   void load_reference_barrier_not_null(MacroAssembler* masm, Register dst);
 
@@ -91,13 +91,6 @@
   virtual void store_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type,
                         Address dst, Register val, Register tmp1, Register tmp2);
 
-  virtual void tlab_allocate(MacroAssembler* masm,
-                             Register thread, Register obj,
-                             Register var_size_in_bytes,
-                             int con_size_in_bytes,
-                             Register t1, Register t2,
-                             Label& slow_case);
-
   virtual void barrier_stubs_init();
 
 };
--- a/src/hotspot/share/gc/shenandoah/c2/shenandoahBarrierSetC2.cpp	Wed May 29 09:25:20 2019 +0200
+++ b/src/hotspot/share/gc/shenandoah/c2/shenandoahBarrierSetC2.cpp	Wed May 29 12:01:21 2019 +0200
@@ -710,30 +710,6 @@
   BarrierSetC2::clone(kit, src, dst, size, is_array);
 }
 
-Node* ShenandoahBarrierSetC2::obj_allocate(PhaseMacroExpand* macro, Node* ctrl, Node* mem, Node* toobig_false, Node* size_in_bytes,
-                                           Node*& i_o, Node*& needgc_ctrl,
-                                           Node*& fast_oop_ctrl, Node*& fast_oop_rawmem,
-                                           intx prefetch_lines) const {
-  PhaseIterGVN& igvn = macro->igvn();
-
-  // Allocate several words more for the Shenandoah brooks pointer.
-  size_in_bytes = new AddXNode(size_in_bytes, igvn.MakeConX(ShenandoahForwarding::byte_size()));
-  macro->transform_later(size_in_bytes);
-
-  Node* fast_oop = BarrierSetC2::obj_allocate(macro, ctrl, mem, toobig_false, size_in_bytes,
-                                              i_o, needgc_ctrl, fast_oop_ctrl, fast_oop_rawmem,
-                                              prefetch_lines);
-
-  // Bump up object for Shenandoah brooks pointer.
-  fast_oop = new AddPNode(macro->top(), fast_oop, igvn.MakeConX(ShenandoahForwarding::byte_size()));
-  macro->transform_later(fast_oop);
-
-  // Initialize Shenandoah brooks pointer to point to the object itself.
-  fast_oop_rawmem = macro->make_store(fast_oop_ctrl, fast_oop_rawmem, fast_oop, ShenandoahForwarding::byte_offset(), fast_oop, T_OBJECT);
-
-  return fast_oop;
-}
-
 // Support for GC barriers emitted during parsing
 bool ShenandoahBarrierSetC2::is_gc_barrier_node(Node* node) const {
   if (node->Opcode() == Op_ShenandoahLoadReferenceBarrier) return true;
--- a/src/hotspot/share/gc/shenandoah/c2/shenandoahBarrierSetC2.hpp	Wed May 29 09:25:20 2019 +0200
+++ b/src/hotspot/share/gc/shenandoah/c2/shenandoahBarrierSetC2.hpp	Wed May 29 12:01:21 2019 +0200
@@ -108,11 +108,6 @@
   // This is the entry-point for the backend to perform accesses through the Access API.
   virtual void clone(GraphKit* kit, Node* src, Node* dst, Node* size, bool is_array) const;
 
-  virtual Node* obj_allocate(PhaseMacroExpand* macro, Node* ctrl, Node* mem, Node* toobig_false, Node* size_in_bytes,
-                             Node*& i_o, Node*& needgc_ctrl,
-                             Node*& fast_oop_ctrl, Node*& fast_oop_rawmem,
-                             intx prefetch_lines) const;
-
   // These are general helper methods used by C2
   virtual bool array_copy_requires_gc_barriers(bool tightly_coupled_alloc, BasicType type, bool is_clone, ArrayCopyPhase phase) const;
   virtual void clone_barrier_at_expansion(ArrayCopyNode* ac, Node* call, PhaseIterGVN& igvn) const;
--- a/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp	Wed May 29 09:25:20 2019 +0200
+++ b/src/hotspot/share/gc/shenandoah/c2/shenandoahSupport.cpp	Wed May 29 12:01:21 2019 +0200
@@ -1455,7 +1455,7 @@
     assert(val->bottom_type()->make_oopptr(), "need oop");
     assert(val->bottom_type()->make_oopptr()->const_oop() == NULL, "expect non-constant");
 
-    enum { _heap_stable = 1, _not_cset, _not_equal, _evac_path, _null_path, PATH_LIMIT };
+    enum { _heap_stable = 1, _not_cset, _fwded, _evac_path, _null_path, PATH_LIMIT };
     Node* region = new RegionNode(PATH_LIMIT);
     Node* val_phi = new PhiNode(region, uncasted_val->bottom_type()->is_oopptr());
     Node* raw_mem_phi = PhiNode::make(region, raw_mem, Type::MEMORY, TypeRawPtr::BOTTOM);
@@ -1505,36 +1505,48 @@
       IfNode* iff = unc_ctrl->in(0)->as_If();
       phase->igvn().replace_input_of(iff, 1, phase->igvn().intcon(1));
     }
-    Node* addr = new AddPNode(new_val, uncasted_val, phase->igvn().MakeConX(ShenandoahForwarding::byte_offset()));
+    Node* addr = new AddPNode(new_val, uncasted_val, phase->igvn().MakeConX(oopDesc::mark_offset_in_bytes()));
     phase->register_new_node(addr, ctrl);
-    assert(val->bottom_type()->isa_oopptr(), "what else?");
-    const TypePtr* obj_type =  val->bottom_type()->is_oopptr();
-    const TypePtr* adr_type = TypeRawPtr::BOTTOM;
-    Node* fwd = new LoadPNode(ctrl, raw_mem, addr, adr_type, obj_type, MemNode::unordered);
-    phase->register_new_node(fwd, ctrl);
+    assert(new_val->bottom_type()->isa_oopptr(), "what else?");
+    Node* markword = new LoadXNode(ctrl, raw_mem, addr, TypeRawPtr::BOTTOM, TypeX_X, MemNode::unordered);
+    phase->register_new_node(markword, ctrl);
+
+    // Test if object is forwarded. This is the case if lowest two bits are set.
+    Node* masked = new AndXNode(markword, phase->igvn().MakeConX(markOopDesc::lock_mask_in_place));
+    phase->register_new_node(masked, ctrl);
+    Node* cmp = new CmpXNode(masked, phase->igvn().MakeConX(markOopDesc::marked_value));
+    phase->register_new_node(cmp, ctrl);
 
     // Only branch to LRB stub if object is not forwarded; otherwise reply with fwd ptr
-    Node* cmp = new CmpPNode(fwd, new_val);
-    phase->register_new_node(cmp, ctrl);
-    Node* bol = new BoolNode(cmp, BoolTest::eq);
+    Node* bol = new BoolNode(cmp, BoolTest::eq); // Equals 3 means it's forwarded
     phase->register_new_node(bol, ctrl);
 
-    IfNode* iff = new IfNode(ctrl, bol, PROB_UNLIKELY(0.999), COUNT_UNKNOWN);
-    if (reg2_ctrl == NULL) reg2_ctrl = iff;
+    IfNode* iff = new IfNode(ctrl, bol, PROB_LIKELY(0.999), COUNT_UNKNOWN);
     phase->register_control(iff, loop, ctrl);
-    Node* if_not_eq = new IfFalseNode(iff);
-    phase->register_control(if_not_eq, loop, iff);
-    Node* if_eq = new IfTrueNode(iff);
-    phase->register_control(if_eq, loop, iff);
+    Node* if_fwd = new IfTrueNode(iff);
+    phase->register_control(if_fwd, loop, iff);
+    Node* if_not_fwd = new IfFalseNode(iff);
+    phase->register_control(if_not_fwd, loop, iff);
+
+    // Decode forward pointer: since we already have the lowest bits, we can just subtract them
+    // from the mark word without the need for large immediate mask.
+    Node* masked2 = new SubXNode(markword, masked);
+    phase->register_new_node(masked2, if_fwd);
+    Node* fwdraw = new CastX2PNode(masked2);
+    fwdraw->init_req(0, if_fwd);
+    phase->register_new_node(fwdraw, if_fwd);
+    Node* fwd = new CheckCastPPNode(NULL, fwdraw, val->bottom_type());
+    phase->register_new_node(fwd, if_fwd);
 
     // Wire up not-equal-path in slots 3.
-    region->init_req(_not_equal, if_not_eq);
-    val_phi->init_req(_not_equal, fwd);
-    raw_mem_phi->init_req(_not_equal, raw_mem);
+    region->init_req(_fwded, if_fwd);
+    val_phi->init_req(_fwded, fwd);
+    raw_mem_phi->init_req(_fwded, raw_mem);
 
     // Call lrb-stub and wire up that path in slots 4
     Node* result_mem = NULL;
-    ctrl = if_eq;
+    ctrl = if_not_fwd;
+    fwd = new_val;
     call_lrb_stub(ctrl, fwd, result_mem, raw_mem, phase);
     region->init_req(_evac_path, ctrl);
     val_phi->init_req(_evac_path, fwd);
--- a/src/hotspot/share/gc/shenandoah/shenandoahAsserts.cpp	Wed May 29 09:25:20 2019 +0200
+++ b/src/hotspot/share/gc/shenandoah/shenandoahAsserts.cpp	Wed May 29 12:01:21 2019 +0200
@@ -60,6 +60,9 @@
   stringStream ss;
   r->print_on(&ss);
 
+  stringStream mw_ss;
+  obj->mark()->print_on(&mw_ss);
+
   ShenandoahMarkingContext* const ctx = heap->marking_context();
 
   msg.append("  " PTR_FORMAT " - klass " PTR_FORMAT " %s\n", p2i(obj), p2i(obj->klass()), obj->klass()->external_name());
@@ -69,6 +72,7 @@
   if (heap->traversal_gc() != NULL) {
     msg.append("    %3s in traversal set\n",         heap->traversal_gc()->traversal_set()->is_in((HeapWord*) obj) ? "" : "not");
   }
+  msg.append("  mark:%s\n", mw_ss.as_string());
   msg.append("  region: %s", ss.as_string());
 }
 
@@ -250,7 +254,7 @@
                   file, line);
   }
 
-  size_t alloc_size = obj->size() + ShenandoahForwarding::word_size();
+  size_t alloc_size = obj->size();
   if (alloc_size > ShenandoahHeapRegion::humongous_threshold_words()) {
     size_t idx = r->region_number();
     size_t num_regions = ShenandoahHeapRegion::required_regions(alloc_size * HeapWordSize);
--- a/src/hotspot/share/gc/shenandoah/shenandoahBarrierSet.cpp	Wed May 29 09:25:20 2019 +0200
+++ b/src/hotspot/share/gc/shenandoah/shenandoahBarrierSet.cpp	Wed May 29 12:01:21 2019 +0200
@@ -26,7 +26,6 @@
 #include "gc/shenandoah/shenandoahBarrierSet.hpp"
 #include "gc/shenandoah/shenandoahBarrierSetAssembler.hpp"
 #include "gc/shenandoah/shenandoahCollectorPolicy.hpp"
-#include "gc/shenandoah/shenandoahForwarding.hpp"
 #include "gc/shenandoah/shenandoahHeap.inline.hpp"
 #include "gc/shenandoah/shenandoahHeuristics.hpp"
 #include "gc/shenandoah/shenandoahTraversalGC.hpp"
@@ -262,7 +261,7 @@
       ShenandoahHeapRegion* r = _heap->heap_region_containing(obj);
       assert(r->is_cset(), "sanity");
 
-      HeapWord* cur = (HeapWord*)obj + obj->size() + ShenandoahForwarding::word_size();
+      HeapWord* cur = (HeapWord*)obj + obj->size();
 
       size_t count = 0;
       while ((cur < r->top()) && ctx->is_marked(oop(cur)) && (count++ < max)) {
@@ -270,7 +269,7 @@
         if (oopDesc::equals_raw(cur_oop, resolve_forwarded_not_null(cur_oop))) {
           _heap->evacuate_object(cur_oop, thread);
         }
-        cur = cur + cur_oop->size() + ShenandoahForwarding::word_size();
+        cur = cur + cur_oop->size();
       }
     }
 
--- a/src/hotspot/share/gc/shenandoah/shenandoahConcurrentMark.inline.hpp	Wed May 29 09:25:20 2019 +0200
+++ b/src/hotspot/share/gc/shenandoah/shenandoahConcurrentMark.inline.hpp	Wed May 29 12:01:21 2019 +0200
@@ -25,7 +25,6 @@
 #define SHARE_GC_SHENANDOAH_SHENANDOAHCONCURRENTMARK_INLINE_HPP
 
 #include "gc/shenandoah/shenandoahAsserts.hpp"
-#include "gc/shenandoah/shenandoahForwarding.hpp"
 #include "gc/shenandoah/shenandoahBarrierSet.inline.hpp"
 #include "gc/shenandoah/shenandoahConcurrentMark.hpp"
 #include "gc/shenandoah/shenandoahMarkingContext.inline.hpp"
@@ -70,7 +69,7 @@
 inline void ShenandoahConcurrentMark::count_liveness(jushort* live_data, oop obj) {
   size_t region_idx = _heap->heap_region_index_containing(obj);
   ShenandoahHeapRegion* region = _heap->get_region(region_idx);
-  size_t size = obj->size() + ShenandoahForwarding::word_size();
+  size_t size = obj->size();
 
   if (!region->is_humongous_start()) {
     assert(!region->is_humongous(), "Cannot have continuations here");
--- a/src/hotspot/share/gc/shenandoah/shenandoahForwarding.hpp	Wed May 29 09:25:20 2019 +0200
+++ b/src/hotspot/share/gc/shenandoah/shenandoahForwarding.hpp	Wed May 29 12:01:21 2019 +0200
@@ -28,68 +28,11 @@
 #include "utilities/globalDefinitions.hpp"
 
 class ShenandoahForwarding {
-  /*
-   * Notes:
-   *
-   *  a. It is important to have byte_offset and word_offset return constant
-   *     expressions, because that will allow to constant-fold forwarding ptr
-   *     accesses. This is not a problem in JIT compilers that would generate
-   *     the code once, but it is problematic in GC hotpath code.
-   *
-   *  b. With filler object mechanics, we may need to allocate more space for
-   *     the forwarding ptr to meet alignment requirements for objects. This
-   *     means *_offset and *_size calls are NOT interchangeable. The accesses
-   *     to forwarding ptrs should always be via *_offset. Storage size
-   *     calculations should always be via *_size.
-   */
-
 public:
-  /* Offset from the object start, in HeapWords. */
-  static inline int word_offset() {
-    return -1; // exactly one HeapWord
-  }
-
-  /* Offset from the object start, in bytes. */
-  static inline int byte_offset() {
-    return -HeapWordSize; // exactly one HeapWord
-  }
-
-  /* Allocated size, in HeapWords. */
-  static inline uint word_size() {
-    return (uint) MinObjAlignment;
-  }
-
-  /* Allocated size, in bytes */
-  static inline uint byte_size() {
-    return (uint) MinObjAlignmentInBytes;
-  }
-
-  /* Assert basic stuff once at startup. */
-  static void initial_checks() {
-    guarantee (MinObjAlignment > 0, "sanity, word_size is correct");
-    guarantee (MinObjAlignmentInBytes > 0, "sanity, byte_size is correct");
-  }
-
-  /* Initializes forwarding pointer (to self).
-   */
-  static inline void initialize(oop obj);
-
   /* Gets forwardee from the given object.
    */
   static inline oop get_forwardee(oop obj);
 
-  /* Tries to atomically update forwardee in $holder object to $update.
-   * Assumes $holder points at itself.
-   * Asserts $holder is in from-space.
-   * Asserts $update is in to-space.
-   */
-  static inline oop try_update_forwardee(oop obj, oop update);
-
-  /* Sets raw value for forwardee slot.
-   * THIS IS DANGEROUS: USERS HAVE TO INITIALIZE/SET FORWARDEE BACK AFTER THEY ARE DONE.
-   */
-  static inline void set_forwardee_raw(oop obj, HeapWord* update);
-
   /* Returns the raw value from forwardee slot.
    */
   static inline HeapWord* get_forwardee_raw(oop obj);
@@ -99,8 +42,21 @@
    */
   static inline HeapWord* get_forwardee_raw_unchecked(oop obj);
 
-private:
-  static inline HeapWord** forward_ptr_addr(oop obj);
+  /**
+   * Returns true if the object is forwarded, false otherwise.
+   */
+  static inline bool is_forwarded(oop obj);
+
+  /* Tries to atomically update forwardee in $holder object to $update.
+   * Assumes $holder points at itself.
+   * Asserts $holder is in from-space.
+   * Asserts $update is in to-space.
+   *
+   * Returns the new object 'update' upon success, or
+   * the new forwardee that a competing thread installed.
+   */
+  static inline oop try_update_forwardee(oop obj, oop update);
+
 };
 
 #endif // SHARE_GC_SHENANDOAH_SHENANDOAHFORWARDING_HPP
--- a/src/hotspot/share/gc/shenandoah/shenandoahForwarding.inline.hpp	Wed May 29 09:25:20 2019 +0200
+++ b/src/hotspot/share/gc/shenandoah/shenandoahForwarding.inline.hpp	Wed May 29 12:01:21 2019 +0200
@@ -26,40 +26,45 @@
 
 #include "gc/shenandoah/shenandoahAsserts.hpp"
 #include "gc/shenandoah/shenandoahForwarding.hpp"
+#include "oops/markOop.inline.hpp"
 #include "runtime/atomic.hpp"
 
-inline HeapWord** ShenandoahForwarding::forward_ptr_addr(oop obj) {
-  return (HeapWord**)((HeapWord*) obj + word_offset());
-}
-
-inline void ShenandoahForwarding::initialize(oop obj) {
-  shenandoah_assert_in_heap(NULL, obj);
-  *forward_ptr_addr(obj) = (HeapWord*) obj;
-}
-
-inline void ShenandoahForwarding::set_forwardee_raw(oop obj, HeapWord* update) {
-  shenandoah_assert_in_heap(NULL, obj);
-  *forward_ptr_addr(obj) = update;
-}
-
 inline HeapWord* ShenandoahForwarding::get_forwardee_raw(oop obj) {
   shenandoah_assert_in_heap(NULL, obj);
-  return *forward_ptr_addr(obj);
+  return get_forwardee_raw_unchecked(obj);
 }
 
 inline HeapWord* ShenandoahForwarding::get_forwardee_raw_unchecked(oop obj) {
-  return *forward_ptr_addr(obj);
+  markOop mark = obj->mark_raw();
+  if (mark->is_marked()) {
+    return (HeapWord*) mark->clear_lock_bits();
+  } else {
+    return (HeapWord*) obj;
+  }
 }
 
 inline oop ShenandoahForwarding::get_forwardee(oop obj) {
   shenandoah_assert_correct(NULL, obj);
-  return oop(*forward_ptr_addr(obj));
+  return oop(get_forwardee_raw_unchecked(obj));
+}
+
+inline bool ShenandoahForwarding::is_forwarded(oop obj) {
+  return obj->mark_raw()->is_marked();
 }
 
 inline oop ShenandoahForwarding::try_update_forwardee(oop obj, oop update) {
-  oop result = (oop) Atomic::cmpxchg(update, (oop*)forward_ptr_addr(obj), obj);
-  shenandoah_assert_correct_except(NULL, obj, !oopDesc::equals_raw(result, obj));
-  return result;
+  markOop old_mark = obj->mark_raw();
+  if (old_mark->is_marked()) {
+    return (oop) old_mark->clear_lock_bits();
+  }
+
+  markOop new_mark = markOopDesc::encode_pointer_as_mark(update);
+  markOop prev_mark = obj->cas_set_mark_raw(new_mark, old_mark);
+  if (prev_mark == old_mark) {
+    return update;
+  } else {
+    return (oop) prev_mark->clear_lock_bits();
+  }
 }
 
 #endif // SHARE_GC_SHENANDOAH_SHENANDOAHFORWARDING_INLINE_HPP
--- a/src/hotspot/share/gc/shenandoah/shenandoahHeap.cpp	Wed May 29 09:25:20 2019 +0200
+++ b/src/hotspot/share/gc/shenandoah/shenandoahHeap.cpp	Wed May 29 12:01:21 2019 +0200
@@ -34,7 +34,6 @@
 
 #include "gc/shenandoah/shenandoahAllocTracker.hpp"
 #include "gc/shenandoah/shenandoahBarrierSet.hpp"
-#include "gc/shenandoah/shenandoahForwarding.hpp"
 #include "gc/shenandoah/shenandoahClosures.inline.hpp"
 #include "gc/shenandoah/shenandoahCollectionSet.hpp"
 #include "gc/shenandoah/shenandoahCollectorPolicy.hpp"
@@ -139,8 +138,6 @@
 };
 
 jint ShenandoahHeap::initialize() {
-  ShenandoahForwarding::initial_checks();
-
   initialize_heuristics();
 
   //
@@ -876,49 +873,6 @@
   return _free_set->allocate(req, in_new_region);
 }
 
-class ShenandoahMemAllocator : public MemAllocator {
-private:
-  MemAllocator& _initializer;
-public:
-  ShenandoahMemAllocator(MemAllocator& initializer, Klass* klass, size_t word_size, Thread* thread) :
-  MemAllocator(klass, word_size + ShenandoahForwarding::word_size(), thread),
-    _initializer(initializer) {}
-
-protected:
-  virtual HeapWord* mem_allocate(Allocation& allocation) const {
-    HeapWord* result = MemAllocator::mem_allocate(allocation);
-    // Initialize brooks-pointer
-    if (result != NULL) {
-      result += ShenandoahForwarding::word_size();
-      ShenandoahForwarding::initialize(oop(result));
-      assert(! ShenandoahHeap::heap()->in_collection_set(result), "never allocate in targetted region");
-    }
-    return result;
-  }
-
-  virtual oop initialize(HeapWord* mem) const {
-     return _initializer.initialize(mem);
-  }
-};
-
-oop ShenandoahHeap::obj_allocate(Klass* klass, int size, TRAPS) {
-  ObjAllocator initializer(klass, size, THREAD);
-  ShenandoahMemAllocator allocator(initializer, klass, size, THREAD);
-  return allocator.allocate();
-}
-
-oop ShenandoahHeap::array_allocate(Klass* klass, int size, int length, bool do_zero, TRAPS) {
-  ObjArrayAllocator initializer(klass, size, length, do_zero, THREAD);
-  ShenandoahMemAllocator allocator(initializer, klass, size, THREAD);
-  return allocator.allocate();
-}
-
-oop ShenandoahHeap::class_allocate(Klass* klass, int size, TRAPS) {
-  ClassAllocator initializer(klass, size, THREAD);
-  ShenandoahMemAllocator allocator(initializer, klass, size, THREAD);
-  return allocator.allocate();
-}
-
 HeapWord* ShenandoahHeap::mem_allocate(size_t size,
                                         bool*  gc_overhead_limit_was_exceeded) {
   ShenandoahAllocRequest req = ShenandoahAllocRequest::for_shared(size);
@@ -961,15 +915,6 @@
   return NULL;
 }
 
-void ShenandoahHeap::fill_with_dummy_object(HeapWord* start, HeapWord* end, bool zap) {
-  HeapWord* obj = tlab_post_allocation_setup(start);
-  CollectedHeap::fill_with_object(obj, end);
-}
-
-size_t ShenandoahHeap::min_dummy_object_size() const {
-  return CollectedHeap::min_dummy_object_size() + ShenandoahForwarding::word_size();
-}
-
 class ShenandoahConcurrentEvacuateRegionObjectClosure : public ObjectClosure {
 private:
   ShenandoahHeap* const _heap;
@@ -980,7 +925,7 @@
 
   void do_object(oop p) {
     shenandoah_assert_marked(NULL, p);
-    if (oopDesc::equals_raw(p, ShenandoahBarrierSet::resolve_forwarded_not_null(p))) {
+    if (!p->is_forwarded()) {
       _heap->evacuate_object(p, _thread);
     }
   }
@@ -1060,8 +1005,8 @@
 void ShenandoahHeap::trash_humongous_region_at(ShenandoahHeapRegion* start) {
   assert(start->is_humongous_start(), "reclaim regions starting with the first one");
 
-  oop humongous_obj = oop(start->bottom() + ShenandoahForwarding::word_size());
-  size_t size = humongous_obj->size() + ShenandoahForwarding::word_size();
+  oop humongous_obj = oop(start->bottom());
+  size_t size = humongous_obj->size();
   size_t required_regions = ShenandoahHeapRegion::required_regions(size * HeapWordSize);
   size_t index = start->region_number() + required_regions - 1;
 
@@ -1874,13 +1819,6 @@
   set_gc_state_mask(EVACUATION, in_progress);
 }
 
-HeapWord* ShenandoahHeap::tlab_post_allocation_setup(HeapWord* obj) {
-  // Initialize Brooks pointer for the next object
-  HeapWord* result = obj + ShenandoahForwarding::word_size();
-  ShenandoahForwarding::initialize(oop(result));
-  return result;
-}
-
 void ShenandoahHeap::ref_processing_init() {
   assert(_max_workers > 0, "Sanity");
 
@@ -2853,11 +2791,3 @@
     }
   }
 }
-
-size_t ShenandoahHeap::obj_size(oop obj) const {
-  return CollectedHeap::obj_size(obj) + ShenandoahForwarding::word_size();
-}
-
-ptrdiff_t ShenandoahHeap::cell_header_size() const {
-  return ShenandoahForwarding::byte_size();
-}
--- a/src/hotspot/share/gc/shenandoah/shenandoahHeap.hpp	Wed May 29 09:25:20 2019 +0200
+++ b/src/hotspot/share/gc/shenandoah/shenandoahHeap.hpp	Wed May 29 12:01:21 2019 +0200
@@ -520,9 +520,6 @@
 
   bool is_in(const void* p) const;
 
-  size_t obj_size(oop obj) const;
-  virtual ptrdiff_t cell_header_size() const;
-
   void collect(GCCause::Cause cause);
   void do_full_collection(bool clear_all_soft_refs);
 
@@ -576,10 +573,6 @@
                                                size_t size,
                                                Metaspace::MetadataType mdtype);
 
-  oop obj_allocate(Klass* klass, int size, TRAPS);
-  oop array_allocate(Klass* klass, int size, int length, bool do_zero, TRAPS);
-  oop class_allocate(Klass* klass, int size, TRAPS);
-
   void notify_mutator_alloc_words(size_t words, bool waste);
 
   // Shenandoah supports TLAB allocation
@@ -591,10 +584,6 @@
   size_t max_tlab_size() const;
   size_t tlab_used(Thread* ignored) const;
 
-  HeapWord* tlab_post_allocation_setup(HeapWord* obj);
-  void fill_with_dummy_object(HeapWord* start, HeapWord* end, bool zap);
-  size_t min_dummy_object_size() const;
-
   void resize_tlabs();
 
   void ensure_parsability(bool retire_tlabs);
--- a/src/hotspot/share/gc/shenandoah/shenandoahHeap.inline.hpp	Wed May 29 09:25:20 2019 +0200
+++ b/src/hotspot/share/gc/shenandoah/shenandoahHeap.inline.hpp	Wed May 29 12:01:21 2019 +0200
@@ -235,51 +235,46 @@
 
   assert(ShenandoahThreadLocalData::is_evac_allowed(thread), "must be enclosed in oom-evac scope");
 
-  size_t size_no_fwdptr = (size_t) p->size();
-  size_t size_with_fwdptr = size_no_fwdptr + ShenandoahForwarding::word_size();
+  size_t size = p->size();
 
   assert(!heap_region_containing(p)->is_humongous(), "never evacuate humongous objects");
 
   bool alloc_from_gclab = true;
-  HeapWord* filler = NULL;
+  HeapWord* copy = NULL;
 
 #ifdef ASSERT
   if (ShenandoahOOMDuringEvacALot &&
       (os::random() & 1) == 0) { // Simulate OOM every ~2nd slow-path call
-        filler = NULL;
+        copy = NULL;
   } else {
 #endif
     if (UseTLAB) {
-      filler = allocate_from_gclab(thread, size_with_fwdptr);
+      copy = allocate_from_gclab(thread, size);
     }
-    if (filler == NULL) {
-      ShenandoahAllocRequest req = ShenandoahAllocRequest::for_shared_gc(size_with_fwdptr);
-      filler = allocate_memory(req);
+    if (copy == NULL) {
+      ShenandoahAllocRequest req = ShenandoahAllocRequest::for_shared_gc(size);
+      copy = allocate_memory(req);
       alloc_from_gclab = false;
     }
 #ifdef ASSERT
   }
 #endif
 
-  if (filler == NULL) {
-    control_thread()->handle_alloc_failure_evac(size_with_fwdptr);
+  if (copy == NULL) {
+    control_thread()->handle_alloc_failure_evac(size);
 
     _oom_evac_handler.handle_out_of_memory_during_evacuation();
 
     return ShenandoahBarrierSet::resolve_forwarded(p);
   }
 
-  // Copy the object and initialize its forwarding ptr:
-  HeapWord* copy = filler + ShenandoahForwarding::word_size();
-  oop copy_val = oop(copy);
-
-  Copy::aligned_disjoint_words((HeapWord*) p, copy, size_no_fwdptr);
-  ShenandoahForwarding::initialize(oop(copy));
+  // Copy the object:
+  Copy::aligned_disjoint_words((HeapWord*) p, copy, size);
 
   // Try to install the new forwarding pointer.
+  oop copy_val = oop(copy);
   oop result = ShenandoahForwarding::try_update_forwardee(p, copy_val);
-
-  if (oopDesc::equals_raw(result, p)) {
+  if (oopDesc::equals_raw(result, copy_val)) {
     // Successfully evacuated. Our copy is now the public one!
     shenandoah_assert_correct(NULL, copy_val);
     return copy_val;
@@ -296,11 +291,11 @@
     // have to explicitly overwrite the copy with the filler object. With that overwrite,
     // we have to keep the fwdptr initialized and pointing to our (stale) copy.
     if (alloc_from_gclab) {
-      ShenandoahThreadLocalData::gclab(thread)->undo_allocation(filler, size_with_fwdptr);
+      ShenandoahThreadLocalData::gclab(thread)->undo_allocation(copy, size);
     } else {
-      fill_with_object(copy, size_no_fwdptr);
+      fill_with_object(copy, size);
+      shenandoah_assert_correct(NULL, copy_val);
     }
-    shenandoah_assert_correct(NULL, copy_val);
     shenandoah_assert_correct(NULL, result);
     return result;
   }
@@ -371,7 +366,6 @@
 
 template<class T>
 inline void ShenandoahHeap::marked_object_iterate(ShenandoahHeapRegion* region, T* cl, HeapWord* limit) {
-  assert(ShenandoahForwarding::word_offset() < 0, "skip_delta calculation below assumes the forwarding ptr is before obj");
   assert(! region->is_humongous_continuation(), "no humongous continuation regions here");
 
   ShenandoahMarkingContext* const ctx = complete_marking_context();
@@ -380,10 +374,9 @@
   MarkBitMap* mark_bit_map = ctx->mark_bit_map();
   HeapWord* tams = ctx->top_at_mark_start(region);
 
-  size_t skip_bitmap_delta = ShenandoahForwarding::word_size() + 1;
-  size_t skip_objsize_delta = ShenandoahForwarding::word_size() /* + actual obj.size() below */;
-  HeapWord* start = region->bottom() + ShenandoahForwarding::word_size();
-  HeapWord* end = MIN2(tams + ShenandoahForwarding::word_size(), region->end());
+  size_t skip_bitmap_delta = 1;
+  HeapWord* start = region->bottom();
+  HeapWord* end = MIN2(tams, region->end());
 
   // Step 1. Scan below the TAMS based on bitmap data.
   HeapWord* limit_bitmap = MIN2(limit, tams);
@@ -413,7 +406,7 @@
     do {
       avail = 0;
       for (int c = 0; (c < dist) && (cb < limit_bitmap); c++) {
-        Prefetch::read(cb, ShenandoahForwarding::byte_offset());
+        Prefetch::read(cb, oopDesc::mark_offset_in_bytes());
         slots[avail++] = cb;
         cb += skip_bitmap_delta;
         if (cb < limit_bitmap) {
@@ -448,16 +441,16 @@
   // Step 2. Accurate size-based traversal, happens past the TAMS.
   // This restarts the scan at TAMS, which makes sure we traverse all objects,
   // regardless of what happened at Step 1.
-  HeapWord* cs = tams + ShenandoahForwarding::word_size();
+  HeapWord* cs = tams;
   while (cs < limit) {
-    assert (cs > tams,  "only objects past TAMS here: "   PTR_FORMAT " (" PTR_FORMAT ")", p2i(cs), p2i(tams));
+    assert (cs >= tams, "only objects past TAMS here: "   PTR_FORMAT " (" PTR_FORMAT ")", p2i(cs), p2i(tams));
     assert (cs < limit, "only objects below limit here: " PTR_FORMAT " (" PTR_FORMAT ")", p2i(cs), p2i(limit));
     oop obj = oop(cs);
     assert(oopDesc::is_oop(obj), "sanity");
     assert(ctx->is_marked(obj), "object expected to be marked");
     int size = obj->size();
     cl->do_object(obj);
-    cs += size + skip_objsize_delta;
+    cs += size;
   }
 }
 
--- a/src/hotspot/share/gc/shenandoah/shenandoahHeapRegion.cpp	Wed May 29 09:25:20 2019 +0200
+++ b/src/hotspot/share/gc/shenandoah/shenandoahHeapRegion.cpp	Wed May 29 12:01:21 2019 +0200
@@ -23,7 +23,6 @@
 
 #include "precompiled.hpp"
 #include "memory/allocation.hpp"
-#include "gc/shenandoah/shenandoahForwarding.hpp"
 #include "gc/shenandoah/shenandoahHeapRegionSet.inline.hpp"
 #include "gc/shenandoah/shenandoahHeap.inline.hpp"
 #include "gc/shenandoah/shenandoahHeapRegion.hpp"
@@ -453,12 +452,12 @@
 
 void ShenandoahHeapRegion::oop_iterate_objects(OopIterateClosure* blk) {
   assert(! is_humongous(), "no humongous region here");
-  HeapWord* obj_addr = bottom() + ShenandoahForwarding::word_size();
+  HeapWord* obj_addr = bottom();
   HeapWord* t = top();
   // Could call objects iterate, but this is easier.
   while (obj_addr < t) {
     oop obj = oop(obj_addr);
-    obj_addr += obj->oop_iterate_size(blk) + ShenandoahForwarding::word_size();
+    obj_addr += obj->oop_iterate_size(blk);
   }
 }
 
@@ -467,7 +466,7 @@
   // Find head.
   ShenandoahHeapRegion* r = humongous_start_region();
   assert(r->is_humongous_start(), "need humongous head here");
-  oop obj = oop(r->bottom() + ShenandoahForwarding::word_size());
+  oop obj = oop(r->bottom());
   obj->oop_iterate(blk, MemRegion(bottom(), top()));
 }
 
@@ -506,11 +505,11 @@
   if (p >= top()) {
     return top();
   } else {
-    HeapWord* last = bottom() + ShenandoahForwarding::word_size();
+    HeapWord* last = bottom();
     HeapWord* cur = last;
     while (cur <= p) {
       last = cur;
-      cur += oop(cur)->size() + ShenandoahForwarding::word_size();
+      cur += oop(cur)->size();
     }
     shenandoah_assert_correct(NULL, oop(last));
     return last;
--- a/src/hotspot/share/gc/shenandoah/shenandoahHeuristics.cpp	Wed May 29 09:25:20 2019 +0200
+++ b/src/hotspot/share/gc/shenandoah/shenandoahHeuristics.cpp	Wed May 29 12:01:21 2019 +0200
@@ -24,7 +24,6 @@
 #include "precompiled.hpp"
 
 #include "gc/shared/gcCause.hpp"
-#include "gc/shenandoah/shenandoahForwarding.hpp"
 #include "gc/shenandoah/shenandoahCollectionSet.inline.hpp"
 #include "gc/shenandoah/shenandoahCollectorPolicy.hpp"
 #include "gc/shenandoah/shenandoahHeap.inline.hpp"
@@ -164,7 +163,7 @@
       // Reclaim humongous regions here, and count them as the immediate garbage
 #ifdef ASSERT
       bool reg_live = region->has_live();
-      bool bm_live = ctx->is_marked(oop(region->bottom() + ShenandoahForwarding::word_size()));
+      bool bm_live = ctx->is_marked(oop(region->bottom()));
       assert(reg_live == bm_live,
              "Humongous liveness and marks should agree. Region live: %s; Bitmap live: %s; Region Live Words: " SIZE_FORMAT,
              BOOL_TO_STR(reg_live), BOOL_TO_STR(bm_live), region->get_live_data_words());
--- a/src/hotspot/share/gc/shenandoah/shenandoahMarkCompact.cpp	Wed May 29 09:25:20 2019 +0200
+++ b/src/hotspot/share/gc/shenandoah/shenandoahMarkCompact.cpp	Wed May 29 12:01:21 2019 +0200
@@ -25,6 +25,7 @@
 
 #include "code/codeCache.hpp"
 #include "gc/shared/gcTraceTime.inline.hpp"
+#include "gc/shared/preservedMarks.inline.hpp"
 #include "gc/shenandoah/shenandoahForwarding.inline.hpp"
 #include "gc/shenandoah/shenandoahConcurrentMark.inline.hpp"
 #include "gc/shenandoah/shenandoahCollectionSet.hpp"
@@ -46,11 +47,16 @@
 #include "memory/universe.hpp"
 #include "oops/compressedOops.inline.hpp"
 #include "oops/oop.inline.hpp"
+#include "runtime/biasedLocking.hpp"
 #include "runtime/thread.hpp"
 #include "utilities/copy.hpp"
 #include "utilities/growableArray.hpp"
 #include "gc/shared/workgroup.hpp"
 
+ShenandoahMarkCompact::ShenandoahMarkCompact() :
+  _gc_timer(NULL),
+  _preserved_marks(new PreservedMarksSet(true)) {}
+
 void ShenandoahMarkCompact::initialize(GCTimer* gc_timer) {
   _gc_timer = gc_timer;
 }
@@ -121,6 +127,10 @@
 
     // e. Set back forwarded objects bit back, in case some steps above dropped it.
     heap->set_has_forwarded_objects(has_forwarded_objects);
+
+    // The rest of prologue:
+    BiasedLocking::preserve_marks();
+    _preserved_marks->init(heap->workers()->active_workers());
   }
 
   heap->make_parsable(true);
@@ -159,6 +169,16 @@
     phase4_compact_objects(worker_slices);
   }
 
+  {
+    // Epilogue
+    SharedRestorePreservedMarksTaskExecutor exec(heap->workers());
+    _preserved_marks->restore(&exec);
+    BiasedLocking::restore_marks();
+    _preserved_marks->reclaim();
+
+    JvmtiExport::gc_epilogue();
+  }
+
   // Resize metaspace
   MetaspaceGC::compute_new_size();
 
@@ -168,8 +188,6 @@
   }
   FREE_C_HEAP_ARRAY(ShenandoahHeapRegionSet*, worker_slices);
 
-  JvmtiExport::gc_epilogue();
-
   heap->set_full_gc_move_in_progress(false);
   heap->set_full_gc_in_progress(false);
 
@@ -230,6 +248,7 @@
 
 class ShenandoahPrepareForCompactionObjectClosure : public ObjectClosure {
 private:
+  PreservedMarks*          const _preserved_marks;
   ShenandoahHeap*          const _heap;
   GrowableArray<ShenandoahHeapRegion*>& _empty_regions;
   int _empty_regions_pos;
@@ -238,7 +257,10 @@
   HeapWord* _compact_point;
 
 public:
-  ShenandoahPrepareForCompactionObjectClosure(GrowableArray<ShenandoahHeapRegion*>& empty_regions, ShenandoahHeapRegion* to_region) :
+  ShenandoahPrepareForCompactionObjectClosure(PreservedMarks* preserved_marks,
+                                              GrowableArray<ShenandoahHeapRegion*>& empty_regions,
+                                              ShenandoahHeapRegion* to_region) :
+    _preserved_marks(preserved_marks),
     _heap(ShenandoahHeap::heap()),
     _empty_regions(empty_regions),
     _empty_regions_pos(0),
@@ -268,7 +290,7 @@
     assert(_heap->complete_marking_context()->is_marked(p), "must be marked");
     assert(!_heap->complete_marking_context()->allocated_after_mark_start((HeapWord*) p), "must be truly marked");
 
-    size_t obj_size = p->size() + ShenandoahForwarding::word_size();
+    size_t obj_size = p->size();
     if (_compact_point + obj_size > _to_region->end()) {
       finish_region();
 
@@ -291,13 +313,15 @@
     // Object fits into current region, record new location:
     assert(_compact_point + obj_size <= _to_region->end(), "must fit");
     shenandoah_assert_not_forwarded(NULL, p);
-    ShenandoahForwarding::set_forwardee_raw(p, _compact_point + ShenandoahForwarding::word_size());
+    _preserved_marks->push_if_necessary(p, p->mark_raw());
+    p->forward_to(oop(_compact_point));
     _compact_point += obj_size;
   }
 };
 
 class ShenandoahPrepareForCompactionTask : public AbstractGangTask {
 private:
+  PreservedMarksSet*        const _preserved_marks;
   ShenandoahHeap*           const _heap;
   ShenandoahHeapRegionSet** const _worker_slices;
   ShenandoahRegionIterator        _heap_regions;
@@ -320,8 +344,9 @@
   }
 
 public:
-  ShenandoahPrepareForCompactionTask(ShenandoahHeapRegionSet** worker_slices) :
+  ShenandoahPrepareForCompactionTask(PreservedMarksSet* preserved_marks, ShenandoahHeapRegionSet** worker_slices) :
     AbstractGangTask("Shenandoah Prepare For Compaction Task"),
+    _preserved_marks(preserved_marks),
     _heap(ShenandoahHeap::heap()), _worker_slices(worker_slices) {
   }
 
@@ -337,7 +362,7 @@
     // Remember empty regions and reuse them as needed.
     ResourceMark rm;
     GrowableArray<ShenandoahHeapRegion*> empty_regions((int)_heap->num_regions());
-    ShenandoahPrepareForCompactionObjectClosure cl(empty_regions, from_region);
+    ShenandoahPrepareForCompactionObjectClosure cl(_preserved_marks->get(worker_id), empty_regions, from_region);
     while (from_region != NULL) {
       cl.set_from_region(from_region);
       if (from_region->has_live()) {
@@ -377,8 +402,8 @@
   size_t to_begin = heap->num_regions();
   size_t to_end = heap->num_regions();
 
-  for (size_t c = heap->num_regions() - 1; c > 0; c--) {
-    ShenandoahHeapRegion *r = heap->get_region(c);
+  for (size_t c = heap->num_regions(); c > 0; c--) {
+    ShenandoahHeapRegion *r = heap->get_region(c - 1);
     if (r->is_humongous_continuation() || (r->new_top() == r->bottom())) {
       // To-region candidate: record this, and continue scan
       to_begin = r->region_number();
@@ -387,15 +412,16 @@
 
     if (r->is_humongous_start() && r->is_move_allowed()) {
       // From-region candidate: movable humongous region
-      oop old_obj = oop(r->bottom() + ShenandoahForwarding::word_size());
-      size_t words_size = old_obj->size() + ShenandoahForwarding::word_size();
+      oop old_obj = oop(r->bottom());
+      size_t words_size = old_obj->size();
       size_t num_regions = ShenandoahHeapRegion::required_regions(words_size * HeapWordSize);
 
       size_t start = to_end - num_regions;
 
       if (start >= to_begin && start != r->region_number()) {
         // Fits into current window, and the move is non-trivial. Record the move then, and continue scan.
-        ShenandoahForwarding::set_forwardee_raw(old_obj, heap->get_region(start)->bottom() + ShenandoahForwarding::word_size());
+        _preserved_marks->get(0)->push_if_necessary(old_obj, old_obj->mark_raw());
+        old_obj->forward_to(oop(heap->get_region(start)->bottom()));
         to_end = start;
         continue;
       }
@@ -443,7 +469,7 @@
 
   void heap_region_do(ShenandoahHeapRegion* r) {
     if (r->is_humongous_start()) {
-      oop humongous_obj = oop(r->bottom() + ShenandoahForwarding::word_size());
+      oop humongous_obj = oop(r->bottom());
       if (!_ctx->is_marked(humongous_obj)) {
         assert(!r->has_live(),
                "Region " SIZE_FORMAT " is not marked, should not have live", r->region_number());
@@ -484,7 +510,7 @@
   // Compute the new addresses for regular objects
   {
     ShenandoahGCPhase phase(ShenandoahPhaseTimings::full_gc_calculate_addresses_regular);
-    ShenandoahPrepareForCompactionTask prepare_task(worker_slices);
+    ShenandoahPrepareForCompactionTask prepare_task(_preserved_marks, worker_slices);
     heap->workers()->run_task(&prepare_task);
   }
 
@@ -506,8 +532,10 @@
     if (!CompressedOops::is_null(o)) {
       oop obj = CompressedOops::decode_not_null(o);
       assert(_ctx->is_marked(obj), "must be marked");
-      oop forw = oop(ShenandoahForwarding::get_forwardee_raw(obj));
-      RawAccess<IS_NOT_NULL>::oop_store(p, forw);
+      if (obj->is_forwarded()) {
+        oop forw = obj->forwardee();
+        RawAccess<IS_NOT_NULL>::oop_store(p, forw);
+      }
     }
   }
 
@@ -531,7 +559,6 @@
   }
   void do_object(oop p) {
     assert(_heap->complete_marking_context()->is_marked(p), "must be marked");
-    HeapWord* forw = ShenandoahForwarding::get_forwardee_raw(p);
     p->oop_iterate(&_cl);
   }
 };
@@ -562,15 +589,17 @@
 class ShenandoahAdjustRootPointersTask : public AbstractGangTask {
 private:
   ShenandoahRootAdjuster* _rp;
-
+  PreservedMarksSet* _preserved_marks;
 public:
-  ShenandoahAdjustRootPointersTask(ShenandoahRootAdjuster* rp) :
+  ShenandoahAdjustRootPointersTask(ShenandoahRootAdjuster* rp, PreservedMarksSet* preserved_marks) :
     AbstractGangTask("Shenandoah Adjust Root Pointers Task"),
-    _rp(rp) {}
+    _rp(rp),
+    _preserved_marks(preserved_marks) {}
 
   void work(uint worker_id) {
     ShenandoahAdjustPointersClosure cl;
     _rp->roots_do(worker_id, &cl);
+    _preserved_marks->get(worker_id)->adjust_during_full_gc();
   }
 };
 
@@ -587,7 +616,7 @@
     DerivedPointerTable::clear();
 #endif
     ShenandoahRootAdjuster rp(nworkers, ShenandoahPhaseTimings::full_gc_roots);
-    ShenandoahAdjustRootPointersTask task(&rp);
+    ShenandoahAdjustRootPointersTask task(&rp, _preserved_marks);
     workers->run_task(&task);
 #if COMPILER2_OR_JVMCI
     DerivedPointerTable::update_pointers();
@@ -610,13 +639,13 @@
   void do_object(oop p) {
     assert(_heap->complete_marking_context()->is_marked(p), "must be marked");
     size_t size = (size_t)p->size();
-    HeapWord* compact_to = ShenandoahForwarding::get_forwardee_raw(p);
-    HeapWord* compact_from = (HeapWord*) p;
-    if (compact_from != compact_to) {
+    if (p->is_forwarded()) {
+      HeapWord* compact_from = (HeapWord*) p;
+      HeapWord* compact_to = (HeapWord*) p->forwardee();
       Copy::aligned_conjoint_words(compact_from, compact_to, size);
+      oop new_obj = oop(compact_to);
+      new_obj->init_mark_raw();
     }
-    oop new_obj = oop(compact_to);
-    ShenandoahForwarding::initialize(new_obj);
   }
 };
 
@@ -707,31 +736,30 @@
 
   ShenandoahHeap* heap = ShenandoahHeap::heap();
 
-  for (size_t c = heap->num_regions() - 1; c > 0; c--) {
-    ShenandoahHeapRegion* r = heap->get_region(c);
+  for (size_t c = heap->num_regions(); c > 0; c--) {
+    ShenandoahHeapRegion* r = heap->get_region(c - 1);
     if (r->is_humongous_start()) {
-      oop old_obj = oop(r->bottom() + ShenandoahForwarding::word_size());
-      size_t words_size = old_obj->size() + ShenandoahForwarding::word_size();
+      oop old_obj = oop(r->bottom());
+      if (!old_obj->is_forwarded()) {
+        // No need to move the object, it stays at the same slot
+        continue;
+      }
+      size_t words_size = old_obj->size();
       size_t num_regions = ShenandoahHeapRegion::required_regions(words_size * HeapWordSize);
 
       size_t old_start = r->region_number();
       size_t old_end   = old_start + num_regions - 1;
-      size_t new_start = heap->heap_region_index_containing(ShenandoahForwarding::get_forwardee_raw(old_obj));
+      size_t new_start = heap->heap_region_index_containing(old_obj->forwardee());
       size_t new_end   = new_start + num_regions - 1;
-
-      if (old_start == new_start) {
-        // No need to move the object, it stays at the same slot
-        continue;
-      }
-
+      assert(old_start != new_start, "must be real move");
       assert (r->is_move_allowed(), "should be movable");
 
       Copy::aligned_conjoint_words(heap->get_region(old_start)->bottom(),
                                    heap->get_region(new_start)->bottom(),
                                    ShenandoahHeapRegion::region_size_words()*num_regions);
 
-      oop new_obj = oop(heap->get_region(new_start)->bottom() + ShenandoahForwarding::word_size());
-      ShenandoahForwarding::initialize(new_obj);
+      oop new_obj = oop(heap->get_region(new_start)->bottom());
+      new_obj->init_mark_raw();
 
       {
         for (size_t c = old_start; c <= old_end; c++) {
--- a/src/hotspot/share/gc/shenandoah/shenandoahMarkCompact.hpp	Wed May 29 09:25:20 2019 +0200
+++ b/src/hotspot/share/gc/shenandoah/shenandoahMarkCompact.hpp	Wed May 29 12:01:21 2019 +0200
@@ -49,12 +49,19 @@
  * where it does sliding compaction, without interfering with other threads.
  */
 
+class PreservedMarksSet;
+
 class ShenandoahMarkCompact : public CHeapObj<mtGC> {
+  friend class ShenandoahPrepareForCompactionObjectClosure;
 private:
   GCTimer* _gc_timer;
 
+  PreservedMarksSet* _preserved_marks;
+
 public:
+  ShenandoahMarkCompact();
   void initialize(GCTimer* gc_timer);
+
   void do_it(GCCause::Cause gc_cause);
 
 private:
@@ -65,7 +72,6 @@
 
   void calculate_target_humongous_objects();
   void compact_humongous_objects();
-
 };
 
 #endif // SHARE_GC_SHENANDOAH_SHENANDOAHMARKCOMPACT_HPP
--- a/src/hotspot/share/gc/shenandoah/shenandoahTraversalGC.cpp	Wed May 29 09:25:20 2019 +0200
+++ b/src/hotspot/share/gc/shenandoah/shenandoahTraversalGC.cpp	Wed May 29 12:01:21 2019 +0200
@@ -35,7 +35,6 @@
 #include "gc/shenandoah/shenandoahCollectionSet.hpp"
 #include "gc/shenandoah/shenandoahCollectorPolicy.hpp"
 #include "gc/shenandoah/shenandoahFreeSet.hpp"
-#include "gc/shenandoah/shenandoahForwarding.hpp"
 #include "gc/shenandoah/shenandoahPhaseTimings.hpp"
 #include "gc/shenandoah/shenandoahHeap.inline.hpp"
 #include "gc/shenandoah/shenandoahHeapRegionSet.inline.hpp"
@@ -633,7 +632,7 @@
         bool candidate = traversal_regions->is_in(r) && !r->has_live() && not_allocated;
         if (r->is_humongous_start() && candidate) {
           // Trash humongous.
-          HeapWord* humongous_obj = r->bottom() + ShenandoahForwarding::word_size();
+          HeapWord* humongous_obj = r->bottom();
           assert(!ctx->is_marked(oop(humongous_obj)), "must not be marked");
           r->make_trash_immediate();
           while (i + 1 < num_regions && _heap->get_region(i + 1)->is_humongous_continuation()) {
--- a/src/hotspot/share/gc/shenandoah/shenandoahVerifier.cpp	Wed May 29 09:25:20 2019 +0200
+++ b/src/hotspot/share/gc/shenandoah/shenandoahVerifier.cpp	Wed May 29 12:01:21 2019 +0200
@@ -138,7 +138,7 @@
           // skip
           break;
         case ShenandoahVerifier::_verify_liveness_complete:
-          Atomic::add(obj->size() + ShenandoahForwarding::word_size(), &_ld[obj_reg->region_number()]);
+          Atomic::add((uint) obj->size(), &_ld[obj_reg->region_number()]);
           // fallthrough for fast failure for un-live regions:
         case ShenandoahVerifier::_verify_liveness_conservative:
           check(ShenandoahAsserts::_safe_oop, obj, obj_reg->has_live(),
@@ -533,7 +533,7 @@
 
   virtual void work_humongous(ShenandoahHeapRegion *r, ShenandoahVerifierStack& stack, ShenandoahVerifyOopClosure& cl) {
     size_t processed = 0;
-    HeapWord* obj = r->bottom() + ShenandoahForwarding::word_size();
+    HeapWord* obj = r->bottom();
     if (_heap->complete_marking_context()->is_marked((oop)obj)) {
       verify_and_follow(obj, stack, cl, &processed);
     }
@@ -547,12 +547,12 @@
 
     // Bitmaps, before TAMS
     if (tams > r->bottom()) {
-      HeapWord* start = r->bottom() + ShenandoahForwarding::word_size();
+      HeapWord* start = r->bottom();
       HeapWord* addr = mark_bit_map->get_next_marked_addr(start, tams);
 
       while (addr < tams) {
         verify_and_follow(addr, stack, cl, &processed);
-        addr += ShenandoahForwarding::word_size();
+        addr += 1;
         if (addr < tams) {
           addr = mark_bit_map->get_next_marked_addr(addr, tams);
         }
@@ -562,11 +562,11 @@
     // Size-based, after TAMS
     {
       HeapWord* limit = r->top();
-      HeapWord* addr = tams + ShenandoahForwarding::word_size();
+      HeapWord* addr = tams;
 
       while (addr < limit) {
         verify_and_follow(addr, stack, cl, &processed);
-        addr += oop(addr)->size() + ShenandoahForwarding::word_size();
+        addr += oop(addr)->size();
       }
     }