6954029: Improve implicit null check generation with compressed oops
Summary: Hoist DecodeN instruction above null check
Reviewed-by: never, twisti
--- a/hotspot/src/cpu/sparc/vm/sparc.ad Sat May 29 19:22:32 2010 -0700
+++ b/hotspot/src/cpu/sparc/vm/sparc.ad Wed Jun 02 09:49:32 2010 -0700
@@ -1760,6 +1760,12 @@
// registers? True for Intel but false for most RISCs
const bool Matcher::clone_shift_expressions = false;
+bool Matcher::narrow_oop_use_complex_address() { // SPARC answer to: can a narrow oop decode be folded into an address expression?
+ NOT_LP64(ShouldNotCallThis()); // presumably expands only in non-LP64 builds, where this query is not expected -- confirm macro semantics
+ assert(UseCompressedOops, "only for compressed oops code");
+ return false; // always false here: the shared matcher code treats Sparc as a platform where DecodeN cannot fold into the address
+}
+
// Is it better to copy float constants, or load them directly from memory?
// Intel can load a float constant from a direct address, requiring no
// extra registers. Most RISCs will have to materialize an address into a
--- a/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp Sat May 29 19:22:32 2010 -0700
+++ b/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp Wed Jun 02 09:49:32 2010 -0700
@@ -65,13 +65,6 @@
FLAG_SET_DEFAULT(UseInlineCaches, false);
}
#ifdef _LP64
- // Single issue niagara1 is slower for CompressedOops
- // but niagaras after that it's fine.
- if (!is_niagara1_plus()) {
- if (FLAG_IS_DEFAULT(UseCompressedOops)) {
- FLAG_SET_ERGO(bool, UseCompressedOops, false);
- }
- }
// 32-bit oops don't make sense for the 64-bit VM on sparc
// since the 32-bit VM has the same registers and smaller objects.
Universe::set_narrow_oop_shift(LogMinObjAlignmentInBytes);
--- a/hotspot/src/cpu/x86/vm/x86_32.ad Sat May 29 19:22:32 2010 -0700
+++ b/hotspot/src/cpu/x86/vm/x86_32.ad Wed Jun 02 09:49:32 2010 -0700
@@ -1377,6 +1377,12 @@
// registers? True for Intel but false for most RISCs
const bool Matcher::clone_shift_expressions = true;
+bool Matcher::narrow_oop_use_complex_address() { // 32-bit x86 definition; the shared callers shown in this change only reach it for DecodeN nodes or under _LP64
+ ShouldNotCallThis(); // presumably reports an error if this is ever reached -- confirm helper semantics
+ return true; // value returned if the guard above returns; NOTE(review): looks like a placeholder to satisfy the return type
+}
+
+
// Is it better to copy float constants, or load them directly from memory?
// Intel can load a float constant from a direct address, requiring no
// extra registers. Most RISCs will have to materialize an address into a
--- a/hotspot/src/cpu/x86/vm/x86_64.ad Sat May 29 19:22:32 2010 -0700
+++ b/hotspot/src/cpu/x86/vm/x86_64.ad Wed Jun 02 09:49:32 2010 -0700
@@ -2037,6 +2037,11 @@
// into registers? True for Intel but false for most RISCs
const bool Matcher::clone_shift_expressions = true;
+bool Matcher::narrow_oop_use_complex_address() { // x86_64 answer to: can a narrow oop decode be folded into an address expression?
+ assert(UseCompressedOops, "only for compressed oops code");
+ return (LogMinObjAlignmentInBytes <= 3); // true only while the alignment shift is at most 3; presumably the largest scale (x8) an x86 address can encode -- confirm
+}
+
// Is it better to copy float constants, or load them directly from
// memory? Intel can load a float constant from a direct address,
// requiring no extra registers. Most RISCs will have to materialize
--- a/hotspot/src/share/vm/opto/compile.cpp Sat May 29 19:22:32 2010 -0700
+++ b/hotspot/src/share/vm/opto/compile.cpp Wed Jun 02 09:49:32 2010 -0700
@@ -2176,14 +2176,14 @@
#ifdef _LP64
case Op_CastPP:
- if (n->in(1)->is_DecodeN() && Universe::narrow_oop_use_implicit_null_checks()) {
+ if (n->in(1)->is_DecodeN() && Matcher::gen_narrow_oop_implicit_null_checks()) {
Compile* C = Compile::current();
Node* in1 = n->in(1);
const Type* t = n->bottom_type();
Node* new_in1 = in1->clone();
new_in1->as_DecodeN()->set_type(t);
- if (!Matcher::clone_shift_expressions) {
+ if (!Matcher::narrow_oop_use_complex_address()) {
//
// x86, ARM and friends can handle 2 adds in addressing mode
// and Matcher can fold a DecodeN node into address by using
@@ -2231,8 +2231,12 @@
new_in2 = in2->in(1);
} else if (in2->Opcode() == Op_ConP) {
const Type* t = in2->bottom_type();
- if (t == TypePtr::NULL_PTR && Universe::narrow_oop_use_implicit_null_checks()) {
- new_in2 = ConNode::make(C, TypeNarrowOop::NULL_PTR);
+ if (t == TypePtr::NULL_PTR) {
+ // Don't convert CmpP null check into CmpN if compressed
+ // oops implicit null check is not generated.
+ // This allows a normal oop implicit null check to be generated instead.
+ if (Matcher::gen_narrow_oop_implicit_null_checks())
+ new_in2 = ConNode::make(C, TypeNarrowOop::NULL_PTR);
//
// This transformation together with CastPP transformation above
// will generated code for implicit NULL checks for compressed oops.
@@ -2289,9 +2293,9 @@
case Op_DecodeN:
assert(!n->in(1)->is_EncodeP(), "should be optimized out");
- // DecodeN could be pinned on Sparc where it can't be fold into
+ // DecodeN could be pinned when it can't be folded into
// an address expression, see the code for Op_CastPP above.
- assert(n->in(0) == NULL || !Matcher::clone_shift_expressions, "no control except on sparc");
+ assert(n->in(0) == NULL || !Matcher::narrow_oop_use_complex_address(), "no control");
break;
case Op_EncodeP: {
@@ -2496,6 +2500,10 @@
}
}
+ // Skip next transformation if compressed oops are not used or narrow oop implicit null checks are not generated.
+ if (!UseCompressedOops || !Matcher::gen_narrow_oop_implicit_null_checks())
+ return;
+
// Go over safepoints nodes to skip DecodeN nodes for debug edges.
// It could be done for an uncommon traps or any safepoints/calls
// if the DecodeN node is referenced only in a debug info.
--- a/hotspot/src/share/vm/opto/connode.cpp Sat May 29 19:22:32 2010 -0700
+++ b/hotspot/src/share/vm/opto/connode.cpp Wed Jun 02 09:49:32 2010 -0700
@@ -437,7 +437,7 @@
// If not converting int->oop, throw away cast after constant propagation
Node *CastPPNode::Ideal_DU_postCCP( PhaseCCP *ccp ) {
const Type *t = ccp->type(in(1));
- if (!t->isa_oop_ptr() || (in(1)->is_DecodeN() && Universe::narrow_oop_use_implicit_null_checks())) {
+ if (!t->isa_oop_ptr() || (in(1)->is_DecodeN() && Matcher::gen_narrow_oop_implicit_null_checks())) {
return NULL; // do not transform raw pointers or narrow oops
}
return ConstraintCastNode::Ideal_DU_postCCP(ccp);
--- a/hotspot/src/share/vm/opto/lcm.cpp Sat May 29 19:22:32 2010 -0700
+++ b/hotspot/src/share/vm/opto/lcm.cpp Wed Jun 02 09:49:32 2010 -0700
@@ -32,7 +32,8 @@
// with suitable memory ops nearby. Use the memory op to do the NULL check.
// I can generate a memory op if there is not one nearby.
// The proj is the control projection for the not-null case.
-// The val is the pointer being checked for nullness.
+// The val is the pointer being checked for nullness, or a
+// decodeHeapOop_not_null node (tagged via the low pointer bit) if it did not fold into address.
void Block::implicit_null_check(PhaseCFG *cfg, Node *proj, Node *val, int allowed_reasons) {
// Assume if null check need for 0 offset then always needed
// Intel solaris doesn't support any null checks yet and no
@@ -96,6 +97,13 @@
}
}
+ // Check for decodeHeapOop_not_null node which did not fold into address
+ bool is_decoden = ((intptr_t)val) & 1;
+ val = (Node*)(((intptr_t)val) & ~1);
+
+ assert(!is_decoden || (val->in(0) == NULL) && val->is_Mach() &&
+ (val->as_Mach()->ideal_Opcode() == Op_DecodeN), "sanity");
+
// Search the successor block for a load or store who's base value is also
// the tested value. There may be several.
Node_List *out = new Node_List(Thread::current()->resource_area());
@@ -148,7 +156,8 @@
if( !mach->needs_anti_dependence_check() )
continue; // Not an memory op; skip it
{
- // Check that value is used in memory address.
+ // Check that value is used in memory address in
+ // instructions with embedded load (CmpP val1,(val2+off)).
Node* base;
Node* index;
const MachOper* oper = mach->memory_inputs(base, index);
@@ -213,7 +222,11 @@
uint vidx = 0; // Capture index of value into memop
uint j;
for( j = mach->req()-1; j > 0; j-- ) {
- if( mach->in(j) == val ) vidx = j;
+ if( mach->in(j) == val ) {
+ vidx = j;
+ // Ignore DecodeN val which could be hoisted to where needed.
+ if( is_decoden ) continue;
+ }
// Block of memory-op input
Block *inb = cfg->_bbs[mach->in(j)->_idx];
Block *b = this; // Start from nul check
@@ -270,6 +283,26 @@
extern int implicit_null_checks;
implicit_null_checks++;
+ if( is_decoden ) {
+ // Check if we need to hoist decodeHeapOop_not_null first.
+ Block *valb = cfg->_bbs[val->_idx];
+ if( this != valb && this->_dom_depth < valb->_dom_depth ) {
+ // Hoist it up to the end of the test block.
+ valb->find_remove(val);
+ this->add_inst(val);
+ cfg->_bbs.map(val->_idx,this);
+ // DecodeN on x86 may kill flags. Check for flag-killing projections
+ // that also need to be hoisted.
+ for (DUIterator_Fast jmax, j = val->fast_outs(jmax); j < jmax; j++) {
+ Node* n = val->fast_out(j);
+ if( n->Opcode() == Op_MachProj ) {
+ cfg->_bbs[n->_idx]->find_remove(n);
+ this->add_inst(n);
+ cfg->_bbs.map(n->_idx,this);
+ }
+ }
+ }
+ }
// Hoist the memory candidate up to the end of the test block.
Block *old_block = cfg->_bbs[best->_idx];
old_block->find_remove(best);
--- a/hotspot/src/share/vm/opto/matcher.cpp Sat May 29 19:22:32 2010 -0700
+++ b/hotspot/src/share/vm/opto/matcher.cpp Wed Jun 02 09:49:32 2010 -0700
@@ -1334,7 +1334,7 @@
if( j == max_scan ) // No post-domination before scan end?
return true; // Then break the match tree up
}
- if (m->is_DecodeN() && Matcher::clone_shift_expressions) {
+ if (m->is_DecodeN() && Matcher::narrow_oop_use_complex_address()) {
// These are commonly used in address expressions and can
// efficiently fold into them on X64 in some cases.
return false;
@@ -2110,8 +2110,8 @@
_null_check_tests.push(proj);
Node* val = cmp->in(1);
#ifdef _LP64
- if (UseCompressedOops && !Matcher::clone_shift_expressions &&
- val->bottom_type()->isa_narrowoop()) {
+ if (val->bottom_type()->isa_narrowoop() &&
+ !Matcher::narrow_oop_use_complex_address()) {
//
// Look for DecodeN node which should be pinned to orig_proj.
// On platforms (Sparc) which can not handle 2 adds
@@ -2127,6 +2127,9 @@
if (d->is_DecodeN() && d->in(1) == val) {
val = d;
val->set_req(0, NULL); // Unpin now.
+ // Mark this as special case to distinguish from
+ // a regular case: CmpP(DecodeN, NULL).
+ val = (Node*)(((intptr_t)val) | 1);
break;
}
}
@@ -2146,9 +2149,21 @@
for( uint i=0; i < cnt; i+=2 ) {
Node *test = _null_check_tests[i];
Node *val = _null_check_tests[i+1];
+ bool is_decoden = ((intptr_t)val) & 1;
+ val = (Node*)(((intptr_t)val) & ~1);
if (has_new_node(val)) {
+ Node* new_val = new_node(val);
+ if (is_decoden) {
+ assert(val->is_DecodeN() && val->in(0) == NULL, "sanity");
+ // Note: new_val may have a control edge if
+ // the original ideal node DecodeN was matched before
+ // it was unpinned in Matcher::collect_null_checks().
+ // Unpin the mach node and mark it.
+ new_val->set_req(0, NULL);
+ new_val = (Node*)(((intptr_t)new_val) | 1);
+ }
// Is a match-tree root, so replace with the matched value
- _null_check_tests.map(i+1, new_node(val));
+ _null_check_tests.map(i+1, new_val);
} else {
// Yank from candidate list
_null_check_tests.map(i+1,_null_check_tests[--cnt]);
--- a/hotspot/src/share/vm/opto/matcher.hpp Sat May 29 19:22:32 2010 -0700
+++ b/hotspot/src/share/vm/opto/matcher.hpp Wed Jun 02 09:49:32 2010 -0700
@@ -352,6 +352,38 @@
// registers? True for Intel but false for most RISCs
static const bool clone_shift_expressions;
+ static bool narrow_oop_use_complex_address(); // per-platform: true if a narrow oop decode (DecodeN) can be folded into an address expression
+
+ // Generate an implicit null check on the narrow oop itself if its
+ // decode can fold into the address expression (x64).
+ //
+ // [R12 + narrow_oop_reg<<3 + offset] // fold into address expression
+ // NullCheck narrow_oop_reg
+ //
+ // When narrow oops can't fold into address expression (Sparc) and
+ // base is not null, use decode_not_null and a normal implicit null check.
+ // Note, the decode_not_null node can be used here since it is referenced
+ // only on the non-null path, but it requires special handling, see
+ // collect_null_checks():
+ //
+ // decode_not_null narrow_oop_reg, oop_reg // 'shift' and 'add base'
+ // [oop_reg + offset]
+ // NullCheck oop_reg
+ //
+ // With zero base and when narrow oops cannot fold into address
+ // expression, use a normal implicit null check since only a shift
+ // is needed to decode the narrow oop (this function returns false).
+ //
+ // decode narrow_oop_reg, oop_reg // only 'shift'
+ // [oop_reg + offset]
+ // NullCheck oop_reg
+ //
+ inline static bool gen_narrow_oop_implicit_null_checks() {
+ return Universe::narrow_oop_use_implicit_null_checks() && // must be enabled by Universe, and additionally either
+ (narrow_oop_use_complex_address() || // the decode folds into the address (first case above), or
+ Universe::narrow_oop_base() != NULL); // the heap base is non-null (second case); zero base without folding yields false
+ }
+
// Is it better to copy float constants, or load them directly from memory?
// Intel can load a float constant from a direct address, requiring no
// extra registers. Most RISCs will have to materialize an address into a