hotspot/src/share/vm/opto/loopTransform.cpp
changeset 36066 60ce66ce3c76
parent 35838 a6e58eeeaa2e
child 36316 7a83de7aabca
--- a/hotspot/src/share/vm/opto/loopTransform.cpp	Wed Feb 03 12:36:18 2016 +0100
+++ b/hotspot/src/share/vm/opto/loopTransform.cpp	Fri Feb 12 16:12:15 2016 -0800
@@ -666,7 +666,8 @@
     if (future_unroll_ct > LoopMaxUnroll) return false;
   } else {
     // obey user constraints on vector mapped loops with additional unrolling applied
-    if ((future_unroll_ct / cl->slp_max_unroll()) > LoopMaxUnroll) return false;
+    int unroll_constraint = (cl->slp_max_unroll()) ? cl->slp_max_unroll() : 1;
+    if ((future_unroll_ct / unroll_constraint) > LoopMaxUnroll) return false;
   }
 
   // Check for initial stride being a small enough constant
@@ -689,7 +690,7 @@
   //   Progress defined as current size less than 20% larger than previous size.
   if (UseSuperWord && cl->node_count_before_unroll() > 0 &&
       future_unroll_ct > LoopUnrollMin &&
-      (future_unroll_ct - 1) * 10.0 > cl->profile_trip_cnt() &&
+      (future_unroll_ct - 1) * (100 / LoopPercentProfileLimit) > cl->profile_trip_cnt() &&
       1.2 * cl->node_count_before_unroll() < (double)_body.size()) {
     return false;
   }
@@ -1260,6 +1261,146 @@
   loop->record_for_igvn();
 }
 
+//------------------------------insert_vector_post_loop------------------------
+// Insert a copy of the atomically unrolled, vectorized main loop as a post loop.
+// The caller has already decided that more unrolling is about to be applied to
+// the main loop.  The resultant post loop will serve as a vectorized drain loop.
+void PhaseIdealLoop::insert_vector_post_loop(IdealLoopTree *loop, Node_List &old_new) {
+  if (!loop->_head->is_CountedLoop()) return;
+
+  CountedLoopNode *cl = loop->_head->as_CountedLoop();
+
+  // only process vectorized main loops
+  if (!cl->is_vectorized_loop() || !cl->is_main_loop()) return;
+
+  int slp_max_unroll_factor = cl->slp_max_unroll();
+  int cur_unroll = cl->unrolled_count();
+
+  if (slp_max_unroll_factor == 0) return;
+
+  // only process atomic unroll vector loops, i.e. unrolled_count() == slp_max_unroll() (not super unrolled after vectorization)
+  if (cur_unroll != slp_max_unroll_factor) return;
+
+  // we only ever process this one time (the flag is set via mark_has_atomic_post_loop() below)
+  if (cl->has_atomic_post_loop()) return;
+
+#ifndef PRODUCT
+  if (TraceLoopOpts) {
+    tty->print("PostVector  ");
+    loop->dump_head();
+  }
+#endif
+  C->set_major_progress();
+
+  // Gather the pieces of the main loop needed to build the post loop (main_head is the same node as cl)
+  CountedLoopNode *main_head = loop->_head->as_CountedLoop();
+  CountedLoopEndNode *main_end = main_head->loopexit();
+  guarantee(main_end != NULL, "no loop exit node");
+  // diagnostic: the loop end must have exactly two outputs (1 true, 1 false path)
+  assert(main_end->outcnt() == 2, "1 true, 1 false path only");
+  uint dd_main_head = dom_depth(main_head);  // NOTE(review): unused below
+  uint max = main_head->outcnt();  // NOTE(review): unused below
+
+  // mark this loop as processed so the has_atomic_post_loop() check above rejects it next time
+  main_head->mark_has_atomic_post_loop();
+
+  Node *pre_header = main_head->in(LoopNode::EntryControl);  // NOTE(review): unused below
+  Node *init = main_head->init_trip();  // NOTE(review): unused below
+  Node *incr = main_end->incr();
+  Node *limit = main_end->limit();
+  Node *stride = main_end->stride();  // NOTE(review): unused below
+  Node *cmp = main_end->cmp_node();  // NOTE(review): unused below
+  BoolTest::mask b_test = main_end->test_trip();
+
+  //------------------------------
+  // Step A: Create a new post-Loop.
+  Node* main_exit = main_end->proj_out(false);
+  assert(main_exit->Opcode() == Op_IfFalse, "");
+  int dd_main_exit = dom_depth(main_exit);
+
+  // Step A1: Clone the loop body of main.  The clone becomes the vector post-loop.
+  // The main loop pre-header illegally has 2 control users (old & new loops) until the clone's entry is rewired below.
+  clone_loop(loop, old_new, dd_main_exit);
+  assert(old_new[main_end->_idx]->Opcode() == Op_CountedLoopEnd, "");
+  CountedLoopNode *post_head = old_new[main_head->_idx]->as_CountedLoop();
+  post_head->set_normal_loop();
+  post_head->set_post_loop(main_head);
+
+  // Set the cloned loop-end's branch probability to PROB_FAIR.
+  CountedLoopEndNode* post_end = old_new[main_end->_idx]->as_CountedLoopEnd();
+  post_end->_prob = PROB_FAIR;
+
+  // Build the main-loop normal exit: a fresh false projection of main_end, placed in the parent loop.
+  IfFalseNode *new_main_exit = new IfFalseNode(main_end);
+  _igvn.register_new_node_with_optimizer(new_main_exit);
+  set_idom(new_main_exit, main_end, dd_main_exit);
+  set_loop(new_main_exit, loop->_parent);
+
+  // Step A2: Build a zero-trip guard for the vector post-loop.  After leaving the
+  // main-loop, the vector post-loop may not execute at all.  We 'opaque' the incr
+  // (the vectorized main-loop trip-counter exit value) because we will be changing
+  // the exit value (via additional unrolling) so we cannot constant-fold away the zero
+  // trip guard until all unrolling is done.
+  Node *zer_opaq = new Opaque1Node(C, incr);
+  Node *zer_cmp = new CmpINode(zer_opaq, limit);
+  Node *zer_bol = new BoolNode(zer_cmp, b_test);
+  register_new_node(zer_opaq, new_main_exit);
+  register_new_node(zer_cmp, new_main_exit);
+  register_new_node(zer_bol, new_main_exit);
+
+  // Build the zero-trip guard IfNode, controlled by the new main-loop exit
+  IfNode *zer_iff = new IfNode(new_main_exit, zer_bol, PROB_FAIR, COUNT_UNKNOWN);
+  _igvn.register_new_node_with_optimizer(zer_iff);
+  set_idom(zer_iff, new_main_exit, dd_main_exit);
+  set_loop(zer_iff, loop->_parent);
+
+  // Plug in the false-path (the original main_exit, re-parented onto zer_iff), taken if we need to skip vector post-loop
+  _igvn.replace_input_of(main_exit, 0, zer_iff);
+  set_idom(main_exit, zer_iff, dd_main_exit);
+  set_idom(main_exit->unique_out(), zer_iff, dd_main_exit);
+  // Make the true-path, must enter the vector post loop
+  Node *zer_taken = new IfTrueNode(zer_iff);
+  _igvn.register_new_node_with_optimizer(zer_taken);
+  set_idom(zer_taken, zer_iff, dd_main_exit);
+  set_loop(zer_taken, loop->_parent);
+  // Plug in the true path: it becomes the post loop's entry control
+  _igvn.hash_delete(post_head);
+  post_head->set_req(LoopNode::EntryControl, zer_taken);
+  set_idom(post_head, zer_taken, dd_main_exit);
+
+  Arena *a = Thread::current()->resource_area();
+  VectorSet visited(a);
+  Node_Stack clones(a, main_head->back_control()->outcnt());
+  // Step A3: Make the fall-in values to the vector post-loop come from the
+  // fall-out values of the main-loop (only for main-loop Phis that still have users).
+  for (DUIterator_Fast imax, i = main_head->fast_outs(imax); i < imax; i++) {
+    Node* main_phi = main_head->fast_out(i);
+    if (main_phi->is_Phi() && main_phi->in(0) == main_head && main_phi->outcnt() >0) {
+      Node *cur_phi = old_new[main_phi->_idx];
+      Node *fallnew = clone_up_backedge_goo(main_head->back_control(),
+                                            post_head->init_control(),
+                                            main_phi->in(LoopNode::LoopBackControl),
+                                            visited, clones);
+      _igvn.hash_delete(cur_phi);
+      cur_phi->set_req(LoopNode::EntryControl, fallnew);
+    }
+  }
+
+  // CastII for the new post loop; the insertion is expected to succeed (asserted below):
+  bool inserted = cast_incr_before_loop(zer_opaq->in(1), zer_taken, post_head);
+  assert(inserted, "no castII inserted");
+
+  // It's difficult to be precise about the trip-counts
+  // for post loops.  They are usually very short,
+  // so guess that unit vector trips (the SLP max unroll factor) is a reasonable value.
+  post_head->set_profile_trip_cnt((float)slp_max_unroll_factor);
+
+  // Now force out all loop-invariant dominating tests.  The optimizer
+  // finds some, but we _know_ they are all useless.
+  peeled_dom_test_elim(loop, old_new);
+  loop->record_for_igvn();
+}
+
 //------------------------------is_invariant-----------------------------
 // Return true if n is invariant
 bool IdealLoopTree::is_invariant(Node* n) const {
@@ -2608,6 +2749,9 @@
     // and we'd rather unroll the post-RCE'd loop SO... do not unroll if
     // peeling.
     if (should_unroll && !should_peel) {
+      if (SuperWordLoopUnrollAnalysis) {
+        phase->insert_vector_post_loop(this, old_new);
+      }
       phase->do_unroll(this, old_new, true);
     }