8151573: Multiversioning for range check elimination
authormcberg
Tue, 05 Apr 2016 10:34:05 -0700
changeset 37292 64f6ae06310e
parent 37291 f13632a2a389
child 37293 c010188d360f
child 37294 eda408d4f253
8151573: Multiversioning for range check elimination Summary: range check elimination in post loops Reviewed-by: kvn
hotspot/src/cpu/aarch64/vm/c2_globals_aarch64.hpp
hotspot/src/cpu/ppc/vm/c2_globals_ppc.hpp
hotspot/src/cpu/sparc/vm/c2_globals_sparc.hpp
hotspot/src/cpu/x86/vm/c2_globals_x86.hpp
hotspot/src/share/vm/opto/c2_globals.hpp
hotspot/src/share/vm/opto/loopTransform.cpp
hotspot/src/share/vm/opto/loopnode.cpp
hotspot/src/share/vm/opto/loopnode.hpp
hotspot/src/share/vm/opto/superword.cpp
--- a/hotspot/src/cpu/aarch64/vm/c2_globals_aarch64.hpp	Tue Apr 05 17:51:44 2016 +0200
+++ b/hotspot/src/cpu/aarch64/vm/c2_globals_aarch64.hpp	Tue Apr 05 10:34:05 2016 -0700
@@ -55,6 +55,7 @@
 define_pd_global(intx, NewSizeThreadIncrease, ScaleForWordSize(4*K));
 define_pd_global(intx, LoopUnrollLimit,              60);
 define_pd_global(intx, LoopPercentProfileLimit,      10);
+define_pd_global(intx, PostLoopMultiversioning,      false);
 // InitialCodeCacheSize derived from specjbb2000 run.
 define_pd_global(intx, InitialCodeCacheSize,         2496*K); // Integral multiple of CodeCacheExpansionSize
 define_pd_global(intx, CodeCacheExpansionSize,       64*K);
--- a/hotspot/src/cpu/ppc/vm/c2_globals_ppc.hpp	Tue Apr 05 17:51:44 2016 +0200
+++ b/hotspot/src/cpu/ppc/vm/c2_globals_ppc.hpp	Tue Apr 05 10:34:05 2016 -0700
@@ -55,6 +55,7 @@
 define_pd_global(bool, ResizeTLAB,                   true);
 define_pd_global(intx, LoopUnrollLimit,              60);
 define_pd_global(intx, LoopPercentProfileLimit,      10);
+define_pd_global(intx, PostLoopMultiversioning,      false);
 
 // Peephole and CISC spilling both break the graph, and so make the
 // scheduler sick.
--- a/hotspot/src/cpu/sparc/vm/c2_globals_sparc.hpp	Tue Apr 05 17:51:44 2016 +0200
+++ b/hotspot/src/cpu/sparc/vm/c2_globals_sparc.hpp	Tue Apr 05 10:34:05 2016 -0700
@@ -53,6 +53,7 @@
 define_pd_global(bool, ResizeTLAB,                   true);
 define_pd_global(intx, LoopUnrollLimit,              60); // Design center runs on 1.3.1
 define_pd_global(intx, LoopPercentProfileLimit,      10);
+define_pd_global(intx, PostLoopMultiversioning,      false);
 define_pd_global(intx, MinJumpTableSize,             5);
 
 // Peephole and CISC spilling both break the graph, and so makes the
--- a/hotspot/src/cpu/x86/vm/c2_globals_x86.hpp	Tue Apr 05 17:51:44 2016 +0200
+++ b/hotspot/src/cpu/x86/vm/c2_globals_x86.hpp	Tue Apr 05 10:34:05 2016 -0700
@@ -47,6 +47,7 @@
 define_pd_global(intx, FreqInlineSize,               325);
 define_pd_global(intx, MinJumpTableSize,             10);
 define_pd_global(intx, LoopPercentProfileLimit,      30);
+define_pd_global(intx, PostLoopMultiversioning,      true);
 #ifdef AMD64
 define_pd_global(intx, INTPRESSURE,                  13);
 define_pd_global(intx, FLOATPRESSURE,                14);
--- a/hotspot/src/share/vm/opto/c2_globals.hpp	Tue Apr 05 17:51:44 2016 +0200
+++ b/hotspot/src/share/vm/opto/c2_globals.hpp	Tue Apr 05 10:34:05 2016 -0700
@@ -194,6 +194,9 @@
            "Map number of unrolls for main loop via "                       \
            "Superword Level Parallelism analysis")                          \
                                                                             \
+  product_pd(bool, PostLoopMultiversioning,                                 \
+           "Multi versioned post loops to eliminate range checks")          \
+                                                                            \
   notproduct(bool, TraceSuperWordLoopUnrollAnalysis, false,                 \
           "Trace what Superword Level Parallelism analysis applies")        \
                                                                             \
--- a/hotspot/src/share/vm/opto/loopTransform.cpp	Tue Apr 05 17:51:44 2016 +0200
+++ b/hotspot/src/share/vm/opto/loopTransform.cpp	Tue Apr 05 10:34:05 2016 -0700
@@ -1027,82 +1027,9 @@
     _igvn.replace_input_of(bol, 1, cmp);
   }
 
-  //------------------------------
-  // Step A: Create Post-Loop.
-  Node* main_exit = main_end->proj_out(false);
-  assert( main_exit->Opcode() == Op_IfFalse, "" );
-  int dd_main_exit = dom_depth(main_exit);
-
-  // Step A1: Clone the loop body.  The clone becomes the post-loop.  The main
-  // loop pre-header illegally has 2 control users (old & new loops).
-  clone_loop( loop, old_new, dd_main_exit );
-  assert( old_new[main_end ->_idx]->Opcode() == Op_CountedLoopEnd, "" );
-  CountedLoopNode *post_head = old_new[main_head->_idx]->as_CountedLoop();
-  post_head->set_post_loop(main_head);
-
-  // Reduce the post-loop trip count.
-  CountedLoopEndNode* post_end = old_new[main_end ->_idx]->as_CountedLoopEnd();
-  post_end->_prob = PROB_FAIR;
-
-  // Build the main-loop normal exit.
-  IfFalseNode *new_main_exit = new IfFalseNode(main_end);
-  _igvn.register_new_node_with_optimizer( new_main_exit );
-  set_idom(new_main_exit, main_end, dd_main_exit );
-  set_loop(new_main_exit, loop->_parent);
-
-  // Step A2: Build a zero-trip guard for the post-loop.  After leaving the
-  // main-loop, the post-loop may not execute at all.  We 'opaque' the incr
-  // (the main-loop trip-counter exit value) because we will be changing
-  // the exit value (via unrolling) so we cannot constant-fold away the zero
-  // trip guard until all unrolling is done.
-  Node *zer_opaq = new Opaque1Node(C, incr);
-  Node *zer_cmp  = new CmpINode( zer_opaq, limit );
-  Node *zer_bol  = new BoolNode( zer_cmp, b_test );
-  register_new_node( zer_opaq, new_main_exit );
-  register_new_node( zer_cmp , new_main_exit );
-  register_new_node( zer_bol , new_main_exit );
-
-  // Build the IfNode
-  IfNode *zer_iff = new IfNode( new_main_exit, zer_bol, PROB_FAIR, COUNT_UNKNOWN );
-  _igvn.register_new_node_with_optimizer( zer_iff );
-  set_idom(zer_iff, new_main_exit, dd_main_exit);
-  set_loop(zer_iff, loop->_parent);
-
-  // Plug in the false-path, taken if we need to skip post-loop
-  _igvn.replace_input_of(main_exit, 0, zer_iff);
-  set_idom(main_exit, zer_iff, dd_main_exit);
-  set_idom(main_exit->unique_out(), zer_iff, dd_main_exit);
-  // Make the true-path, must enter the post loop
-  Node *zer_taken = new IfTrueNode( zer_iff );
-  _igvn.register_new_node_with_optimizer( zer_taken );
-  set_idom(zer_taken, zer_iff, dd_main_exit);
-  set_loop(zer_taken, loop->_parent);
-  // Plug in the true path
-  _igvn.hash_delete( post_head );
-  post_head->set_req(LoopNode::EntryControl, zer_taken);
-  set_idom(post_head, zer_taken, dd_main_exit);
-
-  Arena *a = Thread::current()->resource_area();
-  VectorSet visited(a);
-  Node_Stack clones(a, main_head->back_control()->outcnt());
-  // Step A3: Make the fall-in values to the post-loop come from the
-  // fall-out values of the main-loop.
-  for (DUIterator_Fast imax, i = main_head->fast_outs(imax); i < imax; i++) {
-    Node* main_phi = main_head->fast_out(i);
-    if( main_phi->is_Phi() && main_phi->in(0) == main_head && main_phi->outcnt() >0 ) {
-      Node *post_phi = old_new[main_phi->_idx];
-      Node *fallmain  = clone_up_backedge_goo(main_head->back_control(),
-                                              post_head->init_control(),
-                                              main_phi->in(LoopNode::LoopBackControl),
-                                              visited, clones);
-      _igvn.hash_delete(post_phi);
-      post_phi->set_req( LoopNode::EntryControl, fallmain );
-    }
-  }
-
-  // Update local caches for next stanza
-  main_exit = new_main_exit;
-
+  // Add the post loop
+  CountedLoopNode *post_head = NULL;
+  Node *main_exit = insert_post_loop(loop, old_new, main_head, main_end, incr, limit, post_head);
 
   //------------------------------
   // Step B: Create Pre-Loop.
@@ -1158,8 +1085,9 @@
   main_head->set_req(LoopNode::EntryControl, min_taken);
   set_idom(main_head, min_taken, dd_main_head);
 
-  visited.Clear();
-  clones.clear();
+  Arena *a = Thread::current()->resource_area();
+  VectorSet visited(a);
+  Node_Stack clones(a, main_head->back_control()->outcnt());
   // Step B3: Make the fall-in values to the main-loop come from the
   // fall-out values of the pre-loop.
   for (DUIterator_Fast i2max, i2 = main_head->fast_outs(i2max); i2 < i2max; i2++) {
@@ -1185,12 +1113,8 @@
   // variable value and the induction variable Phi to preserve correct
   // dependencies.
 
-  // CastII for the post loop:
-  bool inserted = cast_incr_before_loop(zer_opaq->in(1), zer_taken, post_head);
-  assert(inserted, "no castII inserted");
-
   // CastII for the main loop:
-  inserted = cast_incr_before_loop(pre_incr, min_taken, main_head);
+  bool inserted = cast_incr_before_loop( pre_incr, min_taken, main_head );
   assert(inserted, "no castII inserted");
 
   // Step B4: Shorten the pre-loop to run only 1 iteration (for now).
@@ -1298,19 +1222,82 @@
   guarantee(main_end != NULL, "no loop exit node");
   // diagnostic to show loop end is not properly formed
   assert(main_end->outcnt() == 2, "1 true, 1 false path only");
-  uint dd_main_head = dom_depth(main_head);
-  uint max = main_head->outcnt();
 
   // mark this loop as processed
   main_head->mark_has_atomic_post_loop();
 
-  Node *pre_header = main_head->in(LoopNode::EntryControl);
-  Node *init = main_head->init_trip();
   Node *incr = main_end->incr();
   Node *limit = main_end->limit();
-  Node *stride = main_end->stride();
-  Node *cmp = main_end->cmp_node();
-  BoolTest::mask b_test = main_end->test_trip();
+
+  // In this case we throw away the result as we are not using it to connect anything else.
+  CountedLoopNode *post_head = NULL;
+  insert_post_loop(loop, old_new, main_head, main_end, incr, limit, post_head);
+
+  // It's difficult to be precise about the trip-counts
+  // for post loops.  They are usually very short,
+  // so guess that unit vector trips is a reasonable value.
+  post_head->set_profile_trip_cnt(cur_unroll);
+
+  // Now force out all loop-invariant dominating tests.  The optimizer
+  // finds some, but we _know_ they are all useless.
+  peeled_dom_test_elim(loop, old_new);
+  loop->record_for_igvn();
+}
+
+
+//-------------------------insert_scalar_rced_post_loop------------------------
+// Insert a copy of the rce'd main loop as a post loop,
+// We have not unrolled the main loop, so this is the right time to inject this.
+// Later we will examine the partner of this post loop pair which still has range checks
+// to see inject code which tests at runtime if the range checks are applicable.
+void PhaseIdealLoop::insert_scalar_rced_post_loop(IdealLoopTree *loop, Node_List &old_new) {
+  if (!loop->_head->is_CountedLoop()) return;
+
+  CountedLoopNode *cl = loop->_head->as_CountedLoop();
+
+  // only process RCE'd main loops
+  if (!cl->is_main_loop() || cl->range_checks_present()) return;
+
+#ifndef PRODUCT
+  if (TraceLoopOpts) {
+    tty->print("PostScalarRce  ");
+    loop->dump_head();
+  }
+#endif
+  C->set_major_progress();
+
+  // Find common pieces of the loop being guarded with pre & post loops
+  CountedLoopNode *main_head = loop->_head->as_CountedLoop();
+  CountedLoopEndNode *main_end = main_head->loopexit();
+  guarantee(main_end != NULL, "no loop exit node");
+  // diagnostic to show loop end is not properly formed
+  assert(main_end->outcnt() == 2, "1 true, 1 false path only");
+
+  Node *incr = main_end->incr();
+  Node *limit = main_end->limit();
+
+  // In this case we throw away the result as we are not using it to connect anything else.
+  CountedLoopNode *post_head = NULL;
+  insert_post_loop(loop, old_new, main_head, main_end, incr, limit, post_head);
+
+  // It's difficult to be precise about the trip-counts
+  // for post loops.  They are usually very short,
+  // so guess that unit vector trips is a reasonable value.
+  post_head->set_profile_trip_cnt(4.0);
+  post_head->set_is_rce_post_loop();
+
+  // Now force out all loop-invariant dominating tests.  The optimizer
+  // finds some, but we _know_ they are all useless.
+  peeled_dom_test_elim(loop, old_new);
+  loop->record_for_igvn();
+}
+
+
+//------------------------------insert_post_loop-------------------------------
+// Insert post loops.  Add a post loop to the given loop passed.
+Node *PhaseIdealLoop::insert_post_loop(IdealLoopTree *loop, Node_List &old_new,
+                                       CountedLoopNode *main_head, CountedLoopEndNode *main_end,
+                                       Node *incr, Node *limit, CountedLoopNode *&post_head) {
 
   //------------------------------
   // Step A: Create a new post-Loop.
@@ -1322,7 +1309,7 @@
   // The main loop pre-header illegally has 2 control users (old & new loops).
   clone_loop(loop, old_new, dd_main_exit);
   assert(old_new[main_end->_idx]->Opcode() == Op_CountedLoopEnd, "");
-  CountedLoopNode *post_head = old_new[main_head->_idx]->as_CountedLoop();
+  post_head = old_new[main_head->_idx]->as_CountedLoop();
   post_head->set_normal_loop();
   post_head->set_post_loop(main_head);
 
@@ -1336,14 +1323,14 @@
   set_idom(new_main_exit, main_end, dd_main_exit);
   set_loop(new_main_exit, loop->_parent);
 
-  // Step A2: Build a zero-trip guard for the vector post-loop.  After leaving the
-  // main-loop, the vector post-loop may not execute at all.  We 'opaque' the incr
-  // (the vectorized main-loop trip-counter exit value) because we will be changing
+  // Step A2: Build a zero-trip guard for the post-loop.  After leaving the
+  // main-loop, the post-loop may not execute at all.  We 'opaque' the incr
+  // (the previous loop trip-counter exit value) because we will be changing
   // the exit value (via additional unrolling) so we cannot constant-fold away the zero
   // trip guard until all unrolling is done.
   Node *zer_opaq = new Opaque1Node(C, incr);
   Node *zer_cmp = new CmpINode(zer_opaq, limit);
-  Node *zer_bol = new BoolNode(zer_cmp, b_test);
+  Node *zer_bol = new BoolNode(zer_cmp, main_end->test_trip());
   register_new_node(zer_opaq, new_main_exit);
   register_new_node(zer_cmp, new_main_exit);
   register_new_node(zer_bol, new_main_exit);
@@ -1354,11 +1341,11 @@
   set_idom(zer_iff, new_main_exit, dd_main_exit);
   set_loop(zer_iff, loop->_parent);
 
-  // Plug in the false-path, taken if we need to skip vector post-loop
+  // Plug in the false-path, taken if we need to skip this post-loop
   _igvn.replace_input_of(main_exit, 0, zer_iff);
   set_idom(main_exit, zer_iff, dd_main_exit);
   set_idom(main_exit->unique_out(), zer_iff, dd_main_exit);
-  // Make the true-path, must enter the vector post loop
+  // Make the true-path, must enter this post loop
   Node *zer_taken = new IfTrueNode(zer_iff);
   _igvn.register_new_node_with_optimizer(zer_taken);
   set_idom(zer_taken, zer_iff, dd_main_exit);
@@ -1371,7 +1358,7 @@
   Arena *a = Thread::current()->resource_area();
   VectorSet visited(a);
   Node_Stack clones(a, main_head->back_control()->outcnt());
-  // Step A3: Make the fall-in values to the vector post-loop come from the
+  // Step A3: Make the fall-in values to the post-loop come from the
   // fall-out values of the main-loop.
   for (DUIterator_Fast imax, i = main_head->fast_outs(imax); i < imax; i++) {
     Node* main_phi = main_head->fast_out(i);
@@ -1390,15 +1377,7 @@
   bool inserted = cast_incr_before_loop(zer_opaq->in(1), zer_taken, post_head);
   assert(inserted, "no castII inserted");
 
-  // It's difficult to be precise about the trip-counts
-  // for post loops.  They are usually very short,
-  // so guess that unit vector trips is a reasonable value.
-  post_head->set_profile_trip_cnt((float)slp_max_unroll_factor);
-
-  // Now force out all loop-invariant dominating tests.  The optimizer
-  // finds some, but we _know_ they are all useless.
-  peeled_dom_test_elim(loop, old_new);
-  loop->record_for_igvn();
+  return new_main_exit;
 }
 
 //------------------------------is_invariant-----------------------------
@@ -1457,7 +1436,7 @@
     // Check the shape of the graph at the loop entry. If an inappropriate
     // graph shape is encountered, the compiler bails out loop unrolling;
     // compilation of the method will still succeed.
-    if (!is_canonical_main_loop_entry(loop_head)) {
+    if (!is_canonical_loop_entry(loop_head)) {
       return;
     }
     opaq = ctrl->in(0)->in(1)->in(1)->in(2);
@@ -2036,7 +2015,7 @@
 
 //------------------------------do_range_check---------------------------------
 // Eliminate range-checks and other trip-counter vs loop-invariant tests.
-void PhaseIdealLoop::do_range_check( IdealLoopTree *loop, Node_List &old_new ) {
+int PhaseIdealLoop::do_range_check( IdealLoopTree *loop, Node_List &old_new ) {
 #ifndef PRODUCT
   if (PrintOpto && VerifyLoopOptimizations) {
     tty->print("Range Check Elimination ");
@@ -2048,10 +2027,12 @@
 #endif
   assert(RangeCheckElimination, "");
   CountedLoopNode *cl = loop->_head->as_CountedLoop();
+  // If we fail before trying to eliminate range checks, set multiversion state
+  int closed_range_checks = 1;
 
   // protect against stride not being a constant
   if (!cl->stride_is_con())
-    return;
+    return closed_range_checks;
 
   // Find the trip counter; we are iteration splitting based on it
   Node *trip_counter = cl->phi();
@@ -2062,8 +2043,8 @@
   // Check graph shape. Cannot optimize a loop if zero-trip
   // Opaque1 node is optimized away and then another round
   // of loop opts attempted.
-  if (!is_canonical_main_loop_entry(cl)) {
-    return;
+  if (!is_canonical_loop_entry(cl)) {
+    return closed_range_checks;
   }
 
   // Need to find the main-loop zero-trip guard
@@ -2077,7 +2058,7 @@
   Node *p_f = iffm->in(0);
   // pre loop may have been optimized out
   if (p_f->Opcode() != Op_IfFalse) {
-    return;
+    return closed_range_checks;
   }
   CountedLoopEndNode *pre_end = p_f->in(0)->as_CountedLoopEnd();
   assert(pre_end->loopnode()->is_pre_loop(), "");
@@ -2086,7 +2067,7 @@
   // optimized away and then another round of loop opts attempted.
   // We can not optimize this particular loop in that case.
   if (pre_opaq1->Opcode() != Op_Opaque1)
-    return;
+    return closed_range_checks;
   Opaque1Node *pre_opaq = (Opaque1Node*)pre_opaq1;
   Node *pre_limit = pre_opaq->in(1);
 
@@ -2097,7 +2078,7 @@
   // pre-loop Opaque1 node.
   Node *orig_limit = pre_opaq->original_loop_limit();
   if (orig_limit == NULL || _igvn.type(orig_limit) == Type::TOP)
-    return;
+    return closed_range_checks;
 
   // Must know if its a count-up or count-down loop
 
@@ -2118,6 +2099,10 @@
   // executed.
   bool conditional_rc = false;
 
+  // Count number of range checks and reduce by load range limits, if zero,
+  // the loop is in canonical form to multiversion.
+  closed_range_checks = 0;
+
   // Check loop body for tests of trip-counter plus loop-invariant vs
   // loop-invariant.
   for( uint i = 0; i < loop->_body.size(); i++ ) {
@@ -2126,6 +2111,7 @@
         iff->Opcode() == Op_RangeCheck) { // Test?
       // Test is an IfNode, has 2 projections.  If BOTH are in the loop
       // we need loop unswitching instead of iteration splitting.
+      closed_range_checks++;
       Node *exit = loop->is_loop_exit(iff);
       if( !exit ) continue;
       int flip = (exit->Opcode() == Op_IfTrue) ? 1 : 0;
@@ -2220,7 +2206,7 @@
           scale_con = -scale_con;
           offset = new SubINode( zero, offset );
           register_new_node( offset, pre_ctrl );
-          limit  = new SubINode( zero, limit  );
+          limit  = new SubINode( zero, limit );
           register_new_node( limit, pre_ctrl );
           // Fall into LE case
         case BoolTest::le:
@@ -2269,6 +2255,9 @@
           --imax;
         }
       }
+      if (limit->Opcode() == Op_LoadRange) {
+        closed_range_checks--;
+      }
 
     } // End of is IF
 
@@ -2304,6 +2293,169 @@
   // The OpaqueNode is unshared by design
   assert( opqzm->outcnt() == 1, "cannot hack shared node" );
   _igvn.replace_input_of(opqzm, 1, main_limit);
+
+  return closed_range_checks;
+}
+
+//------------------------------has_range_checks-------------------------------
+// Check to see if RCE cleaned the current loop of range-checks.
+void PhaseIdealLoop::has_range_checks(IdealLoopTree *loop) {
+  assert(RangeCheckElimination, "");
+
+  // skip if not a counted loop
+  if (!loop->is_counted()) return;
+
+  CountedLoopNode *cl = loop->_head->as_CountedLoop();
+
+  // skip this loop if it is already checked
+  if (cl->has_been_range_checked()) return;
+
+  // Now check for existance of range checks
+  for (uint i = 0; i < loop->_body.size(); i++) {
+    Node *iff = loop->_body[i];
+    int iff_opc = iff->Opcode();
+    if (iff_opc == Op_If || iff_opc == Op_RangeCheck) {
+      cl->mark_has_range_checks();
+      break;
+    }
+  }
+  cl->set_has_been_range_checked();
+}
+
+//-------------------------multi_version_post_loops----------------------------
+// Check the range checks that remain, if simple, use the bounds to guard
+// which version to a post loop we execute, one with range checks or one without
+bool PhaseIdealLoop::multi_version_post_loops(IdealLoopTree *rce_loop, IdealLoopTree *legacy_loop) {
+  bool multi_version_succeeded = false;
+  assert(RangeCheckElimination, "");
+  CountedLoopNode *legacy_cl = legacy_loop->_head->as_CountedLoop();
+  assert(legacy_cl->is_post_loop(), "");
+
+  // Check for existance of range checks using the unique instance to make a guard with
+  Unique_Node_List worklist;
+  for (uint i = 0; i < legacy_loop->_body.size(); i++) {
+    Node *iff = legacy_loop->_body[i];
+    int iff_opc = iff->Opcode();
+    if (iff_opc == Op_If || iff_opc == Op_RangeCheck) {
+      worklist.push(iff);
+    }
+  }
+
+  // Find RCE'd post loop so that we can stage its guard.
+  if (!is_canonical_loop_entry(legacy_cl)) return multi_version_succeeded;
+  Node* ctrl = legacy_cl->in(LoopNode::EntryControl);
+  Node* iffm = ctrl->in(0);
+
+  // Now we test that both the post loops are connected
+  Node* post_loop_region = iffm->in(0);
+  if (post_loop_region == NULL) return multi_version_succeeded;
+  if (!post_loop_region->is_Region()) return multi_version_succeeded;
+  Node* covering_region = post_loop_region->in(RegionNode::Control+1);
+  if (covering_region == NULL) return multi_version_succeeded;
+  if (!covering_region->is_Region()) return multi_version_succeeded;
+  Node* p_f = covering_region->in(RegionNode::Control);
+  if (p_f == NULL) return multi_version_succeeded;
+  if (!p_f->is_IfFalse()) return multi_version_succeeded;
+  if (!p_f->in(0)->is_CountedLoopEnd()) return multi_version_succeeded;
+  CountedLoopEndNode* rce_loop_end = p_f->in(0)->as_CountedLoopEnd();
+  if (rce_loop_end == NULL) return multi_version_succeeded;
+  CountedLoopNode* rce_cl = rce_loop_end->loopnode();
+  if (rce_cl == NULL || !rce_cl->is_post_loop()) return multi_version_succeeded;
+  CountedLoopNode *known_rce_cl = rce_loop->_head->as_CountedLoop();
+  if (rce_cl != known_rce_cl) return multi_version_succeeded;
+
+  // Then we fetch the cover entry test
+  ctrl = rce_cl->in(LoopNode::EntryControl);
+  if (!ctrl->is_IfTrue() && !ctrl->is_IfFalse()) return multi_version_succeeded;
+
+#ifndef PRODUCT
+  if (TraceLoopOpts) {
+    tty->print("PostMultiVersion\n");
+    rce_loop->dump_head();
+    legacy_loop->dump_head();
+  }
+#endif
+
+  // Now fetch the limit we want to compare against
+  Node *limit = rce_cl->limit();
+  bool first_time = true;
+
+  // If we got this far, we identified the post loop which has been RCE'd and
+  // we have a work list.  Now we will try to transform the if guard to cause
+  // the loop pair to be multi version executed with the determination left to runtime
+  // or the optimizer if full information is known about the given arrays at compile time.
+  Node *last_min = NULL;
+  multi_version_succeeded = true;
+  while (worklist.size()) {
+    Node* rc_iffm = worklist.pop();
+    if (rc_iffm->is_If()) {
+      Node *rc_bolzm = rc_iffm->in(1);
+      if (rc_bolzm->is_Bool()) {
+        Node *rc_cmpzm = rc_bolzm->in(1);
+        if (rc_cmpzm->is_Cmp()) {
+          Node *rc_left = rc_cmpzm->in(2);
+          if (rc_left->Opcode() != Op_LoadRange) {
+            multi_version_succeeded = false;
+            break;
+          }
+          if (first_time) {
+            last_min = rc_left;
+            first_time = false;
+          } else {
+            Node *cur_min = new MinINode(last_min, rc_left);
+            last_min = cur_min;
+            _igvn.register_new_node_with_optimizer(last_min);
+          }
+        }
+      }
+    }
+  }
+
+  // All we have to do is update the limit of the rce loop
+  // with the min of our expression and the current limit.
+  // We will use this expression to replace the current limit.
+  if (last_min && multi_version_succeeded) {
+    Node *cur_min = new MinINode(last_min, limit);
+    _igvn.register_new_node_with_optimizer(cur_min);
+    Node *cmp_node = rce_loop_end->cmp_node();
+    _igvn.replace_input_of(cmp_node, 2, cur_min);
+    set_idom(cmp_node, cur_min, dom_depth(ctrl));
+    set_ctrl(cur_min, ctrl);
+    set_loop(cur_min, rce_loop->_parent);
+
+    legacy_cl->mark_is_multiversioned();
+    rce_cl->mark_is_multiversioned();
+    multi_version_succeeded = true;
+
+    C->set_major_progress();
+  }
+
+  return multi_version_succeeded;
+}
+
+//-------------------------poison_rce_post_loop--------------------------------
+// Causes the rce'd post loop to be optimized away if multiverioning fails
+void PhaseIdealLoop::poison_rce_post_loop(IdealLoopTree *rce_loop) {
+  CountedLoopNode *rce_cl = rce_loop->_head->as_CountedLoop();
+  Node* ctrl = rce_cl->in(LoopNode::EntryControl);
+  if (ctrl->is_IfTrue() || ctrl->is_IfFalse()) {
+    Node* iffm = ctrl->in(0);
+    if (iffm->is_If()) {
+      Node* cur_bool = iffm->in(1);
+      if (cur_bool->is_Bool()) {
+        Node* cur_cmp = cur_bool->in(1);
+        if (cur_cmp->is_Cmp()) {
+          BoolTest::mask new_test = BoolTest::gt;
+          BoolNode *new_bool = new BoolNode(cur_cmp, new_test);
+          _igvn.replace_node(cur_bool, new_bool);
+          _igvn._worklist.push(new_bool);
+          Node* left_op = cur_cmp->in(1);
+          _igvn.replace_input_of(cur_cmp, 2, left_op);
+          C->set_major_progress();
+        }
+      }
+    }
+  }
 }
 
 //------------------------------DCE_loop_body----------------------------------
@@ -2663,8 +2815,20 @@
     // Adjust the pre- and main-loop limits to let the pre and post loops run
     // with full checks, but the main-loop with no checks.  Remove said
     // checks from the main body.
-    if (should_rce)
-      phase->do_range_check(this,old_new);
+    if (should_rce) {
+      if (phase->do_range_check(this, old_new) != 0) {
+        cl->mark_has_range_checks();
+      }
+    } else {
+      phase->has_range_checks(this);
+    }
+
+    if (should_unroll && !should_peel && PostLoopMultiversioning) {
+      // Try to setup multiversioning on main loops before they are unrolled
+      if (cl->is_main_loop() && (cl->unrolled_count() == 1)) {
+        phase->insert_scalar_rced_post_loop(this, old_new);
+      }
+    }
 
     // Double loop body for unrolling.  Adjust the minimum-trip test (will do
     // twice as many iterations as before) and the main body limit (only do
--- a/hotspot/src/share/vm/opto/loopnode.cpp	Tue Apr 05 17:51:44 2016 +0200
+++ b/hotspot/src/share/vm/opto/loopnode.cpp	Tue Apr 05 10:34:05 2016 -0700
@@ -1832,6 +1832,9 @@
     if (cl->is_pre_loop ()) tty->print(" pre" );
     if (cl->is_main_loop()) tty->print(" main");
     if (cl->is_post_loop()) tty->print(" post");
+    if (cl->is_vectorized_loop()) tty->print(" vector");
+    if (cl->range_checks_present()) tty->print(" rc ");
+    if (cl->is_multiversioned()) tty->print(" multi ");
   }
   if (_has_call) tty->print(" has_call");
   if (_has_sfpt) tty->print(" has_sfpt");
@@ -2350,7 +2353,30 @@
     for (LoopTreeIterator iter(_ltree_root); !iter.done(); iter.next()) {
       IdealLoopTree* lpt = iter.current();
       if (lpt->is_counted()) {
-        sw.transform_loop(lpt, true);
+        CountedLoopNode *cl = lpt->_head->as_CountedLoop();
+
+        if (PostLoopMultiversioning && cl->is_rce_post_loop() && !cl->is_vectorized_loop()) {
+          // Check that the rce'd post loop is encountered first, multiversion after all
+          // major main loop optimization are concluded
+          if (!C->major_progress()) {
+            IdealLoopTree *lpt_next = lpt->_next;
+            if (lpt_next && lpt_next->is_counted()) {
+              CountedLoopNode *cl = lpt_next->_head->as_CountedLoop();
+              has_range_checks(lpt_next);
+              if (cl->is_post_loop() && cl->range_checks_present()) {
+                if (!cl->is_multiversioned()) {
+                  if (multi_version_post_loops(lpt, lpt_next) == false) {
+                    // Cause the rce loop to be optimized away if we fail
+                    cl->mark_is_multiversioned();
+                    poison_rce_post_loop(lpt);
+                  }
+                }
+              }
+            }
+          }
+        } else if (cl->is_main_loop()) {
+          sw.transform_loop(lpt, true);
+        }
       }
     }
   }
@@ -3184,8 +3210,10 @@
 // loop unswitching, and IGVN, or a combination of them) can freely change
 // the graph's shape. As a result, the graph shape outlined below cannot
 // be guaranteed anymore.
-bool PhaseIdealLoop::is_canonical_main_loop_entry(CountedLoopNode* cl) {
-  assert(cl->is_main_loop(), "check should be applied to main loops");
+bool PhaseIdealLoop::is_canonical_loop_entry(CountedLoopNode* cl) {
+  if (!cl->is_main_loop() && !cl->is_post_loop()) {
+    return false;
+  }
   Node* ctrl = cl->in(LoopNode::EntryControl);
   if (ctrl == NULL || (!ctrl->is_IfTrue() && !ctrl->is_IfFalse())) {
     return false;
@@ -3202,8 +3230,16 @@
   if (cmpzm == NULL || !cmpzm->is_Cmp()) {
     return false;
   }
-  Node* opqzm = cmpzm->in(2);
-  if (opqzm == NULL || opqzm->Opcode() != Op_Opaque1) {
+  // compares can get conditionally flipped
+  bool found_opaque = false;
+  for (uint i = 1; i < cmpzm->req(); i++) {
+    Node* opnd = cmpzm->in(i);
+    if (opnd && opnd->Opcode() == Op_Opaque1) {
+      found_opaque = true;
+      break;
+    }
+  }
+  if (!found_opaque) {
     return false;
   }
   return true;
--- a/hotspot/src/share/vm/opto/loopnode.hpp	Tue Apr 05 17:51:44 2016 +0200
+++ b/hotspot/src/share/vm/opto/loopnode.hpp	Tue Apr 05 10:34:05 2016 -0700
@@ -69,9 +69,13 @@
          PassedSlpAnalysis=512,
          DoUnrollOnly=1024,
          VectorizedLoop=2048,
-         HasAtomicPostLoop=4096 };
+         HasAtomicPostLoop=4096,
+         HasRangeChecks=8192,
+         IsMultiversioned=16384};
   char _unswitch_count;
   enum { _unswitch_max=3 };
+  char _postloop_flags;
+  enum { LoopNotRCEChecked = 0, LoopRCEChecked = 1, RCEPostLoop = 2 };
 
 public:
   // Names for edge indices
@@ -80,9 +84,13 @@
   int is_inner_loop() const { return _loop_flags & InnerLoop; }
   void set_inner_loop() { _loop_flags |= InnerLoop; }
 
+  int range_checks_present() const { return _loop_flags & HasRangeChecks; }
+  int is_multiversioned() const { return _loop_flags & IsMultiversioned; }
+  int is_vectorized_loop() const { return _loop_flags & VectorizedLoop; }
   int is_partial_peel_loop() const { return _loop_flags & PartialPeelLoop; }
   void set_partial_peel_loop() { _loop_flags |= PartialPeelLoop; }
   int partial_peel_has_failed() const { return _loop_flags & PartialPeelFailed; }
+
   void mark_partial_peel_failed() { _loop_flags |= PartialPeelFailed; }
   void mark_has_reductions() { _loop_flags |= HasReductions; }
   void mark_was_slp() { _loop_flags |= WasSlpAnalyzed; }
@@ -90,15 +98,23 @@
   void mark_do_unroll_only() { _loop_flags |= DoUnrollOnly; }
   void mark_loop_vectorized() { _loop_flags |= VectorizedLoop; }
   void mark_has_atomic_post_loop() { _loop_flags |= HasAtomicPostLoop; }
+  void mark_has_range_checks() { _loop_flags |=  HasRangeChecks; }
+  void mark_is_multiversioned() { _loop_flags |= IsMultiversioned; }
 
   int unswitch_max() { return _unswitch_max; }
   int unswitch_count() { return _unswitch_count; }
+
+  int has_been_range_checked() const { return _postloop_flags & LoopRCEChecked; }
+  void set_has_been_range_checked() { _postloop_flags |= LoopRCEChecked; }
+  int is_rce_post_loop() const { return _postloop_flags & RCEPostLoop; }
+  void set_is_rce_post_loop() { _postloop_flags |= RCEPostLoop; }
+
   void set_unswitch_count(int val) {
     assert (val <= unswitch_max(), "too many unswitches");
     _unswitch_count = val;
   }
 
-  LoopNode( Node *entry, Node *backedge ) : RegionNode(3), _loop_flags(0), _unswitch_count(0) {
+  LoopNode(Node *entry, Node *backedge) : RegionNode(3), _loop_flags(0), _unswitch_count(0), _postloop_flags(0) {
     init_class_id(Class_Loop);
     init_req(EntryControl, entry);
     init_req(LoopBackControl, backedge);
@@ -225,7 +241,6 @@
   int has_passed_slp   () const { return (_loop_flags&PassedSlpAnalysis) == PassedSlpAnalysis; }
   int do_unroll_only      () const { return (_loop_flags&DoUnrollOnly) == DoUnrollOnly; }
   int is_main_no_pre_loop() const { return _loop_flags & MainHasNoPreLoop; }
-  int is_vectorized_loop    () const { return (_loop_flags & VectorizedLoop) == VectorizedLoop; }
   int has_atomic_post_loop  () const { return (_loop_flags & HasAtomicPostLoop) == HasAtomicPostLoop; }
   void set_main_no_pre_loop() { _loop_flags |= MainHasNoPreLoop; }
 
@@ -657,7 +672,7 @@
 
 public:
 
-  static bool is_canonical_main_loop_entry(CountedLoopNode* cl);
+  static bool is_canonical_loop_entry(CountedLoopNode* cl);
 
   bool has_node( Node* n ) const {
     guarantee(n != NULL, "No Node.");
@@ -911,6 +926,15 @@
   // Add pre and post loops around the given loop.  These loops are used
   // during RCE, unrolling and aligning loops.
   void insert_pre_post_loops( IdealLoopTree *loop, Node_List &old_new, bool peel_only );
+
+  // Add post loop after the given loop.
+  Node *insert_post_loop(IdealLoopTree *loop, Node_List &old_new,
+                         CountedLoopNode *main_head, CountedLoopEndNode *main_end,
+                         Node *incr, Node *limit, CountedLoopNode *&post_head);
+
+  // Add an RCE'd post loop which we will multi-version adapt for run time test path usage
+  void insert_scalar_rced_post_loop( IdealLoopTree *loop, Node_List &old_new );
+
   // Add a vector post loop between a vector main loop and the current post loop
   void insert_vector_post_loop(IdealLoopTree *loop, Node_List &old_new);
   // If Node n lives in the back_ctrl block, we clone a private version of n
@@ -983,7 +1007,17 @@
   }
 
   // Eliminate range-checks and other trip-counter vs loop-invariant tests.
-  void do_range_check( IdealLoopTree *loop, Node_List &old_new );
+  int do_range_check( IdealLoopTree *loop, Node_List &old_new );
+
+  // Check to see if do_range_check(...) cleaned the main loop of range-checks
+  void has_range_checks(IdealLoopTree *loop);
+
+  // Process post loops which have range checks and try to build a multi-version
+  // guard to safely determine if we can execute the post loop which was RCE'd.
+  bool multi_version_post_loops(IdealLoopTree *rce_loop, IdealLoopTree *legacy_loop);
+
+  // Cause the rce'd post loop to optimized away, this happens if we cannot complete multiverioning
+  void poison_rce_post_loop(IdealLoopTree *rce_loop);
 
   // Create a slow version of the loop by cloning the loop
   // and inserting an if to select fast-slow versions.
--- a/hotspot/src/share/vm/opto/superword.cpp	Tue Apr 05 17:51:44 2016 +0200
+++ b/hotspot/src/share/vm/opto/superword.cpp	Tue Apr 05 10:34:05 2016 -0700
@@ -3076,7 +3076,7 @@
 CountedLoopEndNode* SuperWord::get_pre_loop_end(CountedLoopNode* cl) {
   // The loop cannot be optimized if the graph shape at
   // the loop entry is inappropriate.
-  if (!PhaseIdealLoop::is_canonical_main_loop_entry(cl)) {
+  if (!PhaseIdealLoop::is_canonical_loop_entry(cl)) {
     return NULL;
   }