8076284: Improve vectorization of parallel streams
authorkvn
Tue, 05 May 2015 12:33:57 -0700
changeset 30593 69f942690128
parent 30592 fa0ae17a543e
child 30594 e1b420c520a0
8076284: Improve vectorization of parallel streams Summary: Improve vectorization of java/util/stream/Streams$RangeIntSpliterator::forEachRemaining() method and enable loop vectorization in a given method on demand. Reviewed-by: kvn Contributed-by: jan.civlin@intel.com
hotspot/src/share/vm/classfile/vmSymbols.hpp
hotspot/src/share/vm/opto/c2_globals.hpp
hotspot/src/share/vm/opto/compile.cpp
hotspot/src/share/vm/opto/compile.hpp
hotspot/src/share/vm/opto/loopTransform.cpp
hotspot/src/share/vm/opto/loopnode.cpp
hotspot/src/share/vm/opto/loopopts.cpp
hotspot/src/share/vm/opto/node.cpp
hotspot/src/share/vm/opto/superword.cpp
hotspot/src/share/vm/opto/superword.hpp
hotspot/src/share/vm/utilities/hashtable.cpp
--- a/hotspot/src/share/vm/classfile/vmSymbols.hpp	Tue May 05 19:27:08 2015 +0200
+++ b/hotspot/src/share/vm/classfile/vmSymbols.hpp	Tue May 05 12:33:57 2015 -0700
@@ -591,6 +591,9 @@
   template(classLoader_name,                           "classLoader")                                             \
   template(componentType_name,                         "componentType")                                           \
                                                                                                                   \
+  /* forEachRemaining support */                                                                                  \
+  template(java_util_stream_StreamsRangeIntSpliterator,          "java/util/stream/Streams$RangeIntSpliterator")  \
+                                                                                                                  \
   /* trace signatures */                                                                                          \
   TRACE_TEMPLATES(template)                                                                                       \
                                                                                                                   \
@@ -1121,6 +1124,11 @@
   do_intrinsic(_Double_valueOf,           java_lang_Double,       valueOf_name, Double_valueOf_signature, F_S)          \
    do_name(     Double_valueOf_signature,                        "(D)Ljava/lang/Double;")                               \
                                                                                                                         \
+  /* forEachRemaining */                                                                             \
+  do_intrinsic(_forEachRemaining, java_util_stream_StreamsRangeIntSpliterator, forEachRemaining_name, forEachRemaining_signature, F_R) \
+   do_name(     forEachRemaining_name,    "forEachRemaining")                                                           \
+   do_name(     forEachRemaining_signature,                      "(Ljava/util/function/IntConsumer;)V")                 \
+
     /*end*/
 
 
--- a/hotspot/src/share/vm/opto/c2_globals.hpp	Tue May 05 19:27:08 2015 +0200
+++ b/hotspot/src/share/vm/opto/c2_globals.hpp	Tue May 05 12:33:57 2015 -0700
@@ -318,6 +318,9 @@
   notproduct(bool, TraceLoopUnswitching, false,                             \
           "Trace loop unswitching")                                         \
                                                                             \
+  product(bool, AllowVectorizeOnDemand, true,                               \
+          "Globally supress vectorization set in VectorizeMethod")          \
+                                                                            \
   product(bool, UseSuperWord, true,                                         \
           "Transform scalar operations into superword operations")          \
                                                                             \
--- a/hotspot/src/share/vm/opto/compile.cpp	Tue May 05 19:27:08 2015 +0200
+++ b/hotspot/src/share/vm/opto/compile.cpp	Tue May 05 12:33:57 2015 -0700
@@ -453,6 +453,8 @@
   assert(compile == Compile::current(), "sanity");
 
   compile->set_type_dict(NULL);
+  compile->set_clone_map(new Dict(cmpkey, hashkey, _compile->comp_arena()));
+  compile->clone_map().set_clone_idx(0);
   compile->set_type_hwm(NULL);
   compile->set_type_last_size(0);
   compile->set_last_tf(NULL, NULL);
@@ -462,6 +464,7 @@
   Type::Initialize(compile);
   _compile->set_scratch_buffer_blob(NULL);
   _compile->begin_method();
+  _compile->clone_map().set_debug(_compile->has_method() && _compile->method_has_option(_compile->clone_map().debug_option_name));
 }
 CompileWrapper::~CompileWrapper() {
   _compile->end_method();
@@ -1091,6 +1094,24 @@
   set_do_scheduling(OptoScheduling);
   set_do_count_invocations(false);
   set_do_method_data_update(false);
+
+  set_do_vector_loop(false);
+
+  bool do_vector = false;
+  if (AllowVectorizeOnDemand) {
+    if (has_method() && (method()->has_option("Vectorize") || method()->has_option("VectorizeDebug"))) {
+      set_do_vector_loop(true);
+    } else if (has_method() && method()->name() != 0 &&
+               method()->intrinsic_id() == vmIntrinsics::_forEachRemaining) {
+      set_do_vector_loop(true);
+    }
+#ifndef PRODUCT
+    if (do_vector_loop() && Verbose) {
+      tty->print("Compile::Init: do vectorized loops (SIMD like) for method %s\n",  method()->name()->as_quoted_ascii());
+    }
+#endif
+  }
+
   set_age_code(has_method() && method()->profile_aging());
   set_rtm_state(NoRTM); // No RTM lock eliding by default
   method_has_option_value("MaxNodeLimit", _max_node_limit);
@@ -4334,3 +4355,63 @@
   assert(count > 0, "only positive");
   return (os::random() & RANDOMIZED_DOMAIN_MASK) < (RANDOMIZED_DOMAIN / count);
 }
+
+const char*   CloneMap::debug_option_name = "CloneMapDebug";
+CloneMap&     Compile::clone_map()                 { return _clone_map; }
+void          Compile::set_clone_map(Dict* d)      { _clone_map._dict = d; }
+
+void NodeCloneInfo::dump() const {
+  tty->print(" {%d:%d} ", idx(), gen());
+}
+
+void CloneMap::clone(Node* old, Node* nnn, int gen) {
+  uint64_t val = value(old->_idx);
+  NodeCloneInfo cio(val);
+  assert(val != 0, "old node should be in the map");
+  NodeCloneInfo cin(cio.idx(), gen + cio.gen());
+  insert(nnn->_idx, cin.get());
+#ifndef PRODUCT
+  if (is_debug()) {
+    tty->print_cr("CloneMap::clone inserted node %d info {%d:%d} into CloneMap", nnn->_idx, cin.idx(), cin.gen());
+  }
+#endif
+}
+
+void CloneMap::verify_insert_and_clone(Node* old, Node* nnn, int gen) {
+  NodeCloneInfo cio(value(old->_idx));
+  if (cio.get() == 0) {
+    cio.set(old->_idx, 0);
+    insert(old->_idx, cio.get());
+#ifndef PRODUCT
+    if (is_debug()) {
+      tty->print_cr("CloneMap::verify_insert_and_clone inserted node %d info {%d:%d} into CloneMap", old->_idx, cio.idx(), cio.gen());
+    }
+#endif
+  }
+  clone(old, nnn, gen);
+}
+
+int CloneMap::max_gen() const {
+  int g = 0;
+  DictI di(_dict);
+  for(; di.test(); ++di) {
+    int t = gen(di._key);
+    if (g < t) {
+      g = t;
+#ifndef PRODUCT
+      if (is_debug()) {
+        tty->print_cr("CloneMap::max_gen() update max=%d from %d", g, _2_node_idx_t(di._key));
+      }
+#endif
+    }
+  }
+  return g;
+}
+
+void CloneMap::dump(node_idx_t key) const {
+  uint64_t val = value(key);
+  if (val != 0) {
+    NodeCloneInfo ni(val);
+    ni.dump();
+  }
+}
--- a/hotspot/src/share/vm/opto/compile.hpp	Tue May 05 19:27:08 2015 +0200
+++ b/hotspot/src/share/vm/opto/compile.hpp	Tue May 05 12:33:57 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -47,6 +47,7 @@
 class Bundle;
 class C2Compiler;
 class CallGenerator;
+class CloneMap;
 class ConnectionGraph;
 class InlineTree;
 class Int_Array;
@@ -59,6 +60,7 @@
 class Node;
 class Node_Array;
 class Node_Notes;
+class NodeCloneInfo;
 class OptoReg;
 class PhaseCFG;
 class PhaseGVN;
@@ -84,6 +86,62 @@
 class Node_Stack;
 struct Final_Reshape_Counts;
 
+typedef unsigned int node_idx_t;
+class NodeCloneInfo {
+ private:
+  uint64_t  _idx_clone_orig;
+ public:
+
+  void set_idx(node_idx_t idx) {
+    _idx_clone_orig = _idx_clone_orig & 0xFFFFFFFF00000000 | idx;
+  }
+  node_idx_t idx() const { return (node_idx_t)(_idx_clone_orig & 0xFFFFFFFF); }
+
+  void set_gen(int generation) {
+    uint64_t  g = (uint64_t)generation << 32;
+    _idx_clone_orig = _idx_clone_orig & 0xFFFFFFFF | g;
+  }
+  int gen() const { return (int)(_idx_clone_orig >> 32); }
+
+  void set(uint64_t x) {  _idx_clone_orig = x; }
+  void set(node_idx_t x, int g) {  set_idx(x); set_gen(g); }
+  uint64_t get() const { return _idx_clone_orig; }
+
+  NodeCloneInfo(uint64_t idx_clone_orig) : _idx_clone_orig(idx_clone_orig) {}
+  NodeCloneInfo(node_idx_t x, int g) {set(x, g);}
+
+  void dump() const;
+};
+
+class CloneMap {
+  friend class Compile;
+ private:
+  bool      _debug;
+  Dict*     _dict;
+  int       _clone_idx;   // current cloning iteration/generation in loop unroll
+ public:
+  void*     _2p(node_idx_t key)   const          { return (void*)(intptr_t)key; } // 2 conversion functions to make gcc happy
+  node_idx_t _2_node_idx_t(const void* k) const  { return (node_idx_t)(intptr_t)k; }
+  Dict*     dict()                const          { return _dict; }
+  void insert(node_idx_t key, uint64_t val)      { assert(_dict->operator[](_2p(key)) == NULL, "key existed"); _dict->Insert(_2p(key), (void*)val); }
+  void insert(node_idx_t key, NodeCloneInfo& ci) { insert(key, ci.get()); }
+  void remove(node_idx_t key)                    { _dict->Delete(_2p(key)); }
+  uint64_t value(node_idx_t key)  const          { return (uint64_t)_dict->operator[](_2p(key)); }
+  node_idx_t idx(node_idx_t key)  const          { return NodeCloneInfo(value(key)).idx(); }
+  int gen(node_idx_t key)         const          { return NodeCloneInfo(value(key)).gen(); }
+  int gen(const void* k)          const          { return gen(_2_node_idx_t(k)); }
+  int max_gen()                   const;
+  void clone(Node* old, Node* nnn, int gen);
+  void verify_insert_and_clone(Node* old, Node* nnn, int gen);
+  void dump(node_idx_t key)       const;
+
+  int  clone_idx() const                         { return _clone_idx; }
+  void set_clone_idx(int x)                      { _clone_idx = x; }
+  bool is_debug()                 const          { return _debug; }
+  void set_debug(bool debug)                     { _debug = debug; }
+  static const char* debug_option_name;
+};
+
 //------------------------------Compile----------------------------------------
 // This class defines a top-level Compiler invocation.
 
@@ -312,6 +370,7 @@
   bool                  _do_freq_based_layout;  // True if we intend to do frequency based block layout
   bool                  _do_count_invocations;  // True if we generate code to count invocations
   bool                  _do_method_data_update; // True if we generate code to update MethodData*s
+  bool                  _do_vector_loop;        // True if allowed to execute loop in parallel iterations
   bool                  _age_code;              // True if we need to profile code age (decrement the aging counter)
   int                   _AliasLevel;            // Locally-adjusted version of AliasLevel flag.
   bool                  _print_assembly;        // True if we should dump assembly code for this compilation
@@ -380,6 +439,7 @@
   Arena                 _Compile_types;         // Arena for all types
   Arena*                _type_arena;            // Alias for _Compile_types except in Initialize_shared()
   Dict*                 _type_dict;             // Intern table
+  CloneMap              _clone_map;             // used for recording history of cloned nodes
   void*                 _type_hwm;              // Last allocation (see Type::operator new/delete)
   size_t                _type_last_size;        // Last allocation size (see Type::operator new/delete)
   ciMethod*             _last_tf_m;             // Cache for
@@ -586,6 +646,8 @@
   void          set_do_count_invocations(bool z){ _do_count_invocations = z; }
   bool              do_method_data_update() const { return _do_method_data_update; }
   void          set_do_method_data_update(bool z) { _do_method_data_update = z; }
+  bool              do_vector_loop() const      { return _do_vector_loop; }
+  void          set_do_vector_loop(bool z)      { _do_vector_loop = z; }
   bool              age_code() const             { return _age_code; }
   void          set_age_code(bool z)             { _age_code = z; }
   int               AliasLevel() const           { return _AliasLevel; }
@@ -1228,6 +1290,11 @@
 
   // Auxiliary method for randomized fuzzing/stressing
   static bool randomized_select(int count);
+
+  // supporting clone_map
+  CloneMap&     clone_map();
+  void          set_clone_map(Dict* d);
+
 };
 
 #endif // SHARE_VM_OPTO_COMPILE_HPP
--- a/hotspot/src/share/vm/opto/loopTransform.cpp	Tue May 05 19:27:08 2015 +0200
+++ b/hotspot/src/share/vm/opto/loopTransform.cpp	Tue May 05 12:33:57 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2000, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -1210,6 +1210,16 @@
     }
     loop->dump_head();
   }
+
+  if (C->do_vector_loop() && (PrintOpto && VerifyLoopOptimizations || TraceLoopOpts)) {
+    Arena* arena = Thread::current()->resource_area();
+    Node_Stack stack(arena, C->unique() >> 2);
+    Node_List rpo_list;
+    VectorSet visited(arena);
+    visited.set(loop_head->_idx);
+    rpo( loop_head, stack, visited, rpo_list );
+    dump(loop, rpo_list.size(), rpo_list );
+  }
 #endif
 
   // Remember loop node count before unrolling to detect
@@ -1497,6 +1507,30 @@
   }
 
   loop->record_for_igvn();
+
+#ifndef PRODUCT
+  if (C->do_vector_loop() && (PrintOpto && VerifyLoopOptimizations || TraceLoopOpts)) {
+    tty->print("\nnew loop after unroll\n");       loop->dump_head();
+    for (uint i = 0; i < loop->_body.size(); i++) {
+      loop->_body.at(i)->dump();
+    }
+    if(C->clone_map().is_debug()) {
+      tty->print("\nCloneMap\n");
+      Dict* dict = C->clone_map().dict();
+      DictI i(dict);
+      tty->print_cr("Dict@%p[%d] = ", dict, dict->Size());
+      for (int ii = 0; i.test(); ++i, ++ii) {
+        NodeCloneInfo cl((uint64_t)dict->operator[]((void*)i._key));
+        tty->print("%d->%d:%d,", (int)(intptr_t)i._key, cl.idx(), cl.gen());
+        if (ii % 10 == 9) {
+          tty->print_cr(" ");
+        }
+      }
+      tty->print_cr(" ");
+    }
+  }
+#endif
+
 }
 
 //------------------------------do_maximally_unroll----------------------------
--- a/hotspot/src/share/vm/opto/loopnode.cpp	Tue May 05 19:27:08 2015 +0200
+++ b/hotspot/src/share/vm/opto/loopnode.cpp	Tue May 05 12:33:57 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1998, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -3678,6 +3678,7 @@
 }
 
 void PhaseIdealLoop::dump( IdealLoopTree *loop, uint idx, Node_List &rpo_list ) const {
+  CloneMap& cm = C->clone_map();
   loop->dump_head();
 
   // Now scan for CFG nodes in the same loop
@@ -3709,6 +3710,7 @@
         cached_idom = find_non_split_ctrl(cached_idom);
       }
       tty->print(" ID:%d",computed_idom->_idx);
+      cm.dump(n->_idx);
       n->dump();
       if( cached_idom != computed_idom ) {
         tty->print_cr("*** BROKEN IDOM!  Computed as: %d, cached as: %d",
@@ -3728,6 +3730,7 @@
           for( uint j = 0; j < loop->_nest; j++ )
             tty->print("  ");
           tty->print(" ");
+          cm.dump(m->_idx);
           m->dump();
         }
       }
--- a/hotspot/src/share/vm/opto/loopopts.cpp	Tue May 05 19:27:08 2015 +0200
+++ b/hotspot/src/share/vm/opto/loopopts.cpp	Tue May 05 12:33:57 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1999, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -1267,12 +1267,36 @@
 void PhaseIdealLoop::clone_loop( IdealLoopTree *loop, Node_List &old_new, int dd,
                                  Node* side_by_side_idom) {
 
+#ifndef PRODUCT
+  if (C->do_vector_loop() && PrintOpto) {
+    const char* mname = C->method()->name()->as_quoted_ascii();
+    if (mname != NULL) {
+      tty->print("PhaseIdealLoop::clone_loop: for vectorize method %s\n", mname);
+    }
+  }
+#endif
+
+  CloneMap& cm = C->clone_map();
+  Dict* dict = cm.dict();
+  if (C->do_vector_loop()) {
+    cm.set_clone_idx(cm.max_gen()+1);
+#ifndef PRODUCT
+    if (PrintOpto) {
+      tty->print_cr("PhaseIdealLoop::clone_loop: _clone_idx %d", cm.clone_idx());
+      loop->dump_head();
+    }
+#endif
+  }
+
   // Step 1: Clone the loop body.  Make the old->new mapping.
   uint i;
   for( i = 0; i < loop->_body.size(); i++ ) {
     Node *old = loop->_body.at(i);
     Node *nnn = old->clone();
     old_new.map( old->_idx, nnn );
+    if (C->do_vector_loop()) {
+      cm.verify_insert_and_clone(old, nnn, cm.clone_idx());
+    }
     _igvn.register_new_node_with_optimizer(nnn);
   }
 
@@ -1335,6 +1359,9 @@
 
         // Clone the loop exit control projection
         Node *newuse = use->clone();
+        if (C->do_vector_loop()) {
+          cm.verify_insert_and_clone(use, newuse, cm.clone_idx());
+        }
         newuse->set_req(0,nnn);
         _igvn.register_new_node_with_optimizer(newuse);
         set_loop(newuse, use_loop);
--- a/hotspot/src/share/vm/opto/node.cpp	Tue May 05 19:27:08 2015 +0200
+++ b/hotspot/src/share/vm/opto/node.cpp	Tue May 05 12:33:57 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -1658,6 +1658,9 @@
     return;                     // don't process dead nodes
   }
 
+  if (C->clone_map().value(_idx) != 0) {
+    C->clone_map().dump(_idx);
+  }
   // Dump node-specific info
   dump_spec(st);
 #ifdef ASSERT
--- a/hotspot/src/share/vm/opto/superword.cpp	Tue May 05 19:27:08 2015 +0200
+++ b/hotspot/src/share/vm/opto/superword.cpp	Tue May 05 12:33:57 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2007, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -54,6 +54,7 @@
   _mem_slice_head(arena(), 8,  0, NULL),  // memory slice heads
   _mem_slice_tail(arena(), 8,  0, NULL),  // memory slice tails
   _node_info(arena(), 8,  0, SWNodeInfo::initial), // info needed per node
+  _clone_map(phase->C->clone_map()),      // map of nodes created in cloning
   _align_to_ref(NULL),                    // memory reference to align vectors to
   _disjoint_ptrs(arena(), 8,  0, OrderedPair::initial), // runtime disambiguated pointer pairs
   _dg(_arena),                            // dependence graph
@@ -68,7 +69,12 @@
   _iv(NULL),                              // induction var
   _race_possible(false),                  // cases where SDMU is true
   _num_work_vecs(0),                      // amount of vector work we have
-  _num_reductions(0)                      // amount of reduction work we have
+  _num_reductions(0),                     // amount of reduction work we have
+  _do_vector_loop(phase->C->do_vector_loop()),  // whether to do vectorization/simd style
+  _ii_first(-1),                          // first loop generation index - only if do_vector_loop()
+  _ii_last(-1),                           // last loop generation index - only if do_vector_loop()
+  _ii_order(arena(), 8, 0, 0),
+  _vector_loop_debug(phase->C->has_method() && phase->C->method_has_option("VectorizeDebug"))
 {}
 
 //------------------------------transform_loop---------------------------
@@ -147,14 +153,53 @@
 //
 void SuperWord::SLP_extract() {
 
+#ifndef PRODUCT
+  if (_do_vector_loop && TraceSuperWord) {
+    tty->print("SuperWord::SLP_extract\n");
+    tty->print("input loop\n");
+    _lpt->dump_head();
+    _lpt->dump();
+    for (uint i = 0; i < _lpt->_body.size(); i++) {
+      _lpt->_body.at(i)->dump();
+    }
+  }
+#endif
   // Ready the block
-  if (!construct_bb())
+  if (!construct_bb()) {
     return; // Exit if no interesting nodes or complex graph.
-
+  }
+  // build    _dg, _disjoint_ptrs
   dependence_graph();
 
+  // compute function depth(Node*)
   compute_max_depth();
 
+  if (_do_vector_loop) {
+    if (mark_generations() != -1) {
+      hoist_loads_in_graph(); // this only rebuild the graph; all basic structs need rebuild explicitly
+
+      if (!construct_bb()) {
+        return; // Exit if no interesting nodes or complex graph.
+      }
+      dependence_graph();
+      compute_max_depth();
+    }
+
+#ifndef PRODUCT
+    if (TraceSuperWord) {
+      tty->print_cr("\nSuperWord::_do_vector_loop: graph after hoist_loads_in_graph");
+      _lpt->dump_head();
+      for (int j = 0; j < _block.length(); j++) {
+        Node* n = _block.at(j);
+        int d = depth(n);
+        for (int i = 0;  i < d; i++) tty->print("%s", "  ");
+        tty->print("%d :", d);
+        n->dump();
+      }
+    }
+#endif
+  }
+
   compute_vector_element_type();
 
   // Attempt vectorization
@@ -163,6 +208,17 @@
 
   extend_packlist();
 
+  if (_do_vector_loop) {
+    if (_packset.length() == 0) {
+#ifndef PRODUCT
+      if (TraceSuperWord) {
+        tty->print_cr("\nSuperWord::_do_vector_loop DFA could not build packset, now trying to build anyway");
+      }
+#endif
+      pack_parallel();
+    }
+  }
+
   combine_packs();
 
   construct_my_pack_map();
@@ -228,7 +284,7 @@
     // Create initial pack pairs of memory operations for which
     // alignment is set and vectors will be aligned.
     bool create_pack = true;
-    if (memory_alignment(mem_ref, best_iv_adjustment) == 0) {
+    if (memory_alignment(mem_ref, best_iv_adjustment) == 0 || _do_vector_loop) {
       if (!Matcher::misaligned_vectors_ok()) {
         int vw = vector_width(mem_ref);
         int vw_best = vector_width(best_align_to_mem_ref);
@@ -274,7 +330,9 @@
               Node_List* pair = new Node_List();
               pair->push(s1);
               pair->push(s2);
-              _packset.append(pair);
+              if (!_do_vector_loop || _clone_map.idx(s1->_idx) == _clone_map.idx(s2->_idx)) {
+                _packset.append(pair);
+              }
             }
           }
         }
@@ -419,7 +477,7 @@
 
 #ifdef ASSERT
   if (TraceSuperWord && Verbose) {
-    tty->print_cr("\nVector memops after find_align_to_refs");
+    tty->print_cr("\nVector memops after find_align_to_ref");
     for (uint i = 0; i < memops.size(); i++) {
       MemNode* s = memops.at(i)->as_Mem();
       s->dump();
@@ -528,6 +586,14 @@
     // Get slice in predecessor order (last is first)
     mem_slice_preds(n_tail, n, _nlist);
 
+#ifndef PRODUCT
+    if(TraceSuperWord && Verbose) {
+      tty->print_cr("SuperWord::dependence_graph: built a new mem slice");
+      for (int j = _nlist.length() - 1; j >= 0 ; j--) {
+        _nlist.at(j)->dump();
+      }
+    }
+#endif
     // Make the slice dependent on the root
     DepMem* slice = _dg.dep(n);
     _dg.make_edge(_dg.root(), slice);
@@ -2362,6 +2428,8 @@
   _data_entry.clear();
   _mem_slice_head.clear();
   _mem_slice_tail.clear();
+  _iteration_first.clear();
+  _iteration_last.clear();
   _node_info.clear();
   _align_to_ref = NULL;
   _lpt = NULL;
@@ -2373,6 +2441,18 @@
   _num_reductions = 0;
 }
 
+//------------------------------restart---------------------------
+void SuperWord::restart() {
+  _dg.init();
+  _packset.clear();
+  _disjoint_ptrs.clear();
+  _block.clear();
+  _data_entry.clear();
+  _mem_slice_head.clear();
+  _mem_slice_tail.clear();
+  _node_info.clear();
+}
+
 //------------------------------print_packset---------------------------
 void SuperWord::print_packset() {
 #ifndef PRODUCT
@@ -2750,3 +2830,401 @@
     _done = true;
   }
 }
+
+//
+// --------------------------------- vectorization/simd -----------------------------------
+//
+Node*  SuperWord::find_phi_for_mem_dep(LoadNode* ld) {
+  assert(in_bb(ld), "must be in block");
+  if (_clone_map.gen(ld->_idx) == _ii_first) {
+#ifndef PRODUCT
+    if (_vector_loop_debug) {
+      tty->print_cr("SuperWord::find_phi_for_mem_dep _clone_map.gen(ld->_idx)=%d",
+                    _clone_map.gen(ld->_idx));
+    }
+#endif
+    return NULL; //we think that any ld in the first gen being vectorizable
+  }
+
+  Node* mem = ld->in(MemNode::Memory);
+  if (mem->outcnt() <= 1) {
+    // we don't want to remove the only edge from mem node to load
+#ifndef PRODUCT
+    if (_vector_loop_debug) {
+      tty->print_cr("SuperWord::find_phi_for_mem_dep input node %d to load %d has no other outputs and edge mem->load cannot be removed",
+                    mem->_idx, ld->_idx);
+      ld->dump();
+      mem->dump();
+    }
+#endif
+    return NULL;
+  }
+  if (!in_bb(mem) || _clone_map.gen(mem->_idx) == _clone_map.gen(ld->_idx)) {
+#ifndef PRODUCT
+    if (_vector_loop_debug) {
+      tty->print_cr("SuperWord::find_phi_for_mem_dep _clone_map.gen(mem->_idx)=%d",
+                    _clone_map.gen(mem->_idx));
+    }
+#endif
+    return NULL; // does not depend on loop volatile node or depends on the same generation
+  }
+
+  //otherwise first node should depend on mem-phi
+  Node* first = first_node(ld);
+  assert(first->is_Load(), "must be Load");
+  Node* phi = first->as_Load()->in(MemNode::Memory);
+  if (!phi->is_Phi() || phi->bottom_type() != Type::MEMORY) {
+#ifndef PRODUCT
+    if (_vector_loop_debug) {
+      tty->print_cr("SuperWord::find_phi_for_mem_dep load is not vectorizable node, since it's `first` does not take input from mem phi");
+      ld->dump();
+      first->dump();
+    }
+#endif
+    return NULL;
+  }
+
+  Node* tail = 0;
+  for (int m = 0; m < _mem_slice_head.length(); m++) {
+    if (_mem_slice_head.at(m) == phi) {
+      tail = _mem_slice_tail.at(m);
+    }
+  }
+  if (tail == 0) { //test that found phi is in the list  _mem_slice_head
+#ifndef PRODUCT
+    if (_vector_loop_debug) {
+      tty->print_cr("SuperWord::find_phi_for_mem_dep load %d is not vectorizable node, its phi %d is not _mem_slice_head",
+                    ld->_idx, phi->_idx);
+      ld->dump();
+      phi->dump();
+    }
+#endif
+    return NULL;
+  }
+
+  // now all conditions are met
+  return phi;
+}
+
+Node* SuperWord::first_node(Node* nd) {
+  for (int ii = 0; ii < _iteration_first.length(); ii++) {
+    Node* nnn = _iteration_first.at(ii);
+    if (_clone_map.idx(nnn->_idx) == _clone_map.idx(nd->_idx)) {
+#ifndef PRODUCT
+      if (_vector_loop_debug) {
+        tty->print_cr("SuperWord::first_node: %d is the first iteration node for %d (_clone_map.idx(nnn->_idx) = %d)",
+                      nnn->_idx, nd->_idx, _clone_map.idx(nnn->_idx));
+      }
+#endif
+      return nnn;
+    }
+  }
+
+#ifndef PRODUCT
+  if (_vector_loop_debug) {
+    tty->print_cr("SuperWord::first_node: did not find first iteration node for %d (_clone_map.idx(nd->_idx)=%d)",
+                  nd->_idx, _clone_map.idx(nd->_idx));
+  }
+#endif
+  return 0;
+}
+
+Node* SuperWord::last_node(Node* nd) {
+  for (int ii = 0; ii < _iteration_last.length(); ii++) {
+    Node* nnn = _iteration_last.at(ii);
+    if (_clone_map.idx(nnn->_idx) == _clone_map.idx(nd->_idx)) {
+#ifndef PRODUCT
+      if (_vector_loop_debug) {
+        tty->print_cr("SuperWord::last_node _clone_map.idx(nnn->_idx)=%d, _clone_map.idx(nd->_idx)=%d",
+                      _clone_map.idx(nnn->_idx), _clone_map.idx(nd->_idx));
+      }
+#endif
+      return nnn;
+    }
+  }
+  return 0;
+}
+
+int SuperWord::mark_generations() {
+  Node *ii_err = 0, *tail_err;
+  for (int i = 0; i < _mem_slice_head.length(); i++) {
+    Node* phi  = _mem_slice_head.at(i);
+    assert(phi->is_Phi(), "must be phi");
+
+    Node* tail = _mem_slice_tail.at(i);
+    if (_ii_last == -1) {
+      tail_err = tail;
+      _ii_last = _clone_map.gen(tail->_idx);
+    }
+    else if (_ii_last != _clone_map.gen(tail->_idx)) {
+#ifndef PRODUCT
+      if (TraceSuperWord && Verbose) {
+        tty->print_cr("SuperWord::mark_generations _ii_last error - found different generations in two tail nodes ");
+        tail->dump();
+        tail_err->dump();
+      }
+#endif
+      return -1;
+    }
+
+    // find first iteration in the loop
+    for (DUIterator_Fast imax, i = phi->fast_outs(imax); i < imax; i++) {
+      Node* ii = phi->fast_out(i);
+      if (in_bb(ii) && ii->is_Store()) { // we speculate that normally Stores of one and one only generation have deps from mem phi
+        if (_ii_first == -1) {
+          ii_err = ii;
+          _ii_first = _clone_map.gen(ii->_idx);
+        } else if (_ii_first != _clone_map.gen(ii->_idx)) {
+#ifndef PRODUCT
+          if (TraceSuperWord && Verbose) {
+            tty->print_cr("SuperWord::mark_generations _ii_first error - found different generations in two nodes ");
+            ii->dump();
+            ii_err->dump();
+          }
+#endif
+          return -1; // this phi has Stores from different generations of unroll and cannot be simd/vectorized
+        }
+      }
+    }//for (DUIterator_Fast imax,
+  }//for (int i...
+
+  if (_ii_first == -1 || _ii_last == -1) {
+#ifndef PRODUCT
+    if (TraceSuperWord && Verbose) {
+      tty->print_cr("SuperWord::mark_generations unknown error, something vent wrong");
+    }
+#endif
+    return -1; // something vent wrong
+  }
+  // collect nodes in the first and last generations
+  assert(_iteration_first.length() == 0, "_iteration_first must be empty");
+  assert(_iteration_last.length() == 0, "_iteration_last must be empty");
+  for (int j = 0; j < _block.length(); j++) {
+    Node* n = _block.at(j);
+    node_idx_t gen = _clone_map.gen(n->_idx);
+    if ((signed)gen == _ii_first) {
+      _iteration_first.push(n);
+    } else if ((signed)gen == _ii_last) {
+      _iteration_last.push(n);
+    }
+  }
+
+  // building order of iterations
+  assert(_ii_order.length() == 0, "should be empty");
+  if (ii_err != 0) {
+    assert(in_bb(ii_err) && ii_err->is_Store(), "should be Store in bb");
+    Node* nd = ii_err;
+    while(_clone_map.gen(nd->_idx) != _ii_last) {
+      _ii_order.push(_clone_map.gen(nd->_idx));
+      bool found = false;
+      for (DUIterator_Fast imax, i = nd->fast_outs(imax); i < imax; i++) {
+        Node* use = nd->fast_out(i);
+        if (_clone_map.idx(use->_idx) == _clone_map.idx(nd->_idx) && use->as_Store()->in(MemNode::Memory) == nd) {
+          found = true;
+          nd = use;
+          break;
+        }
+      }//for
+
+      if (found == false) {
+#ifndef PRODUCT
+        if (TraceSuperWord && Verbose) {
+          tty->print_cr("SuperWord::mark_generations: Cannot build order of iterations - no dependent Store for %d", nd->_idx);
+        }
+#endif
+        _ii_order.clear();
+        return -1;
+      }
+    } //while
+    _ii_order.push(_clone_map.gen(nd->_idx));
+  }
+
+#ifndef PRODUCT
+  if (_vector_loop_debug) {
+    tty->print_cr("SuperWord::mark_generations");
+    tty->print_cr("First generation (%d) nodes:", _ii_first);
+    for (int ii = 0; ii < _iteration_first.length(); ii++)  _iteration_first.at(ii)->dump();
+    tty->print_cr("Last generation (%d) nodes:", _ii_last);
+    for (int ii = 0; ii < _iteration_last.length(); ii++)  _iteration_last.at(ii)->dump();
+    tty->print_cr(" ");
+
+    tty->print("SuperWord::List of generations: ");
+    for (int jj = 0; jj < _ii_order.length(); ++jj) {
+      tty->print("%d:%d ", jj, _ii_order.at(jj));
+    }
+    tty->print_cr(" ");
+  }
+#endif
+
+  return _ii_first;
+}
+
+bool SuperWord::fix_commutative_inputs(Node* gold, Node* fix) {
+  assert(gold->is_Add() && fix->is_Add() || gold->is_Mul() && fix->is_Mul(), "should be only Add or Mul nodes");
+  assert(_clone_map.idx(gold->_idx) == _clone_map.idx(fix->_idx), "should be clones of the same node");
+  Node* gin1 = gold->in(1);
+  Node* gin2 = gold->in(2);
+  Node* fin1 = fix->in(1);
+  Node* fin2 = fix->in(2);
+  bool swapped = false;
+
+  if (in_bb(gin1) && in_bb(gin2) && in_bb(fin1) && in_bb(fin1)) {
+    if (_clone_map.idx(gin1->_idx) == _clone_map.idx(fin1->_idx) &&
+        _clone_map.idx(gin2->_idx) == _clone_map.idx(fin2->_idx)) {
+      return true; // nothing to fix
+    }
+    if (_clone_map.idx(gin1->_idx) == _clone_map.idx(fin2->_idx) &&
+        _clone_map.idx(gin2->_idx) == _clone_map.idx(fin1->_idx)) {
+      fix->swap_edges(1, 2);
+      swapped = true;
+    }
+  }
+  // at least one input comes from outside of bb
+  if (gin1->_idx == fin1->_idx)  {
+    return true; // nothing to fix
+  }
+  if (!swapped && (gin1->_idx == fin2->_idx || gin2->_idx == fin1->_idx))  { //swapping is expensive, check condition first
+    fix->swap_edges(1, 2);
+    swapped = true;
+  }
+
+  if (swapped) {
+#ifndef PRODUCT
+    if (_vector_loop_debug) {
+      tty->print_cr("SuperWord::fix_commutative_inputs: fixed node %d", fix->_idx);
+    }
+#endif
+    return true;
+  }
+
+#ifndef PRODUCT
+  if (TraceSuperWord && Verbose) {
+    tty->print_cr("SuperWord::fix_commutative_inputs: cannot fix node %d", fix->_idx);
+  }
+#endif
+  return false;
+}
+
+bool SuperWord::pack_parallel() {
+#ifndef PRODUCT
+  if (_vector_loop_debug) {
+    tty->print_cr("SuperWord::pack_parallel: START");
+  }
+#endif
+
+  _packset.clear();
+
+  for (int ii = 0; ii < _iteration_first.length(); ii++) {
+    Node* nd = _iteration_first.at(ii);
+    if (in_bb(nd) && (nd->is_Load() || nd->is_Store() || nd->is_Add() || nd->is_Mul())) {
+      Node_List* pk = new Node_List();
+      pk->push(nd);
+      for (int gen = 1; gen < _ii_order.length(); ++gen) {
+        for (int kk = 0; kk < _block.length(); kk++) {
+          Node* clone = _block.at(kk);
+          if (_clone_map.idx(clone->_idx) == _clone_map.idx(nd->_idx) &&
+              _clone_map.gen(clone->_idx) == _ii_order.at(gen)) {
+            if (nd->is_Add() || nd->is_Mul()) {
+              fix_commutative_inputs(nd, clone);
+            }
+            pk->push(clone);
+            if (pk->size() == 4) {
+              _packset.append(pk);
+#ifndef PRODUCT
+              if (_vector_loop_debug) {
+                tty->print_cr("SuperWord::pack_parallel: added pack ");
+                pk->dump();
+              }
+#endif
+              if (_clone_map.gen(clone->_idx) != _ii_last) {
+                pk = new Node_List();
+              }
+            }
+            break;
+          }
+        }
+      }//for
+    }//if
+  }//for
+
+#ifndef PRODUCT
+  if (_vector_loop_debug) {
+    tty->print_cr("SuperWord::pack_parallel: END");
+  }
+#endif
+
+  return true;
+}
+
+bool SuperWord::hoist_loads_in_graph() {
+  GrowableArray<Node*> loads;
+
+#ifndef PRODUCT
+  if (_vector_loop_debug) {
+    tty->print_cr("SuperWord::hoist_loads_in_graph: total number _mem_slice_head.length() = %d", _mem_slice_head.length());
+  }
+#endif
+
+  for (int i = 0; i < _mem_slice_head.length(); i++) {
+    Node* n = _mem_slice_head.at(i);
+    if ( !in_bb(n) || !n->is_Phi() || n->bottom_type() != Type::MEMORY) {
+#ifndef PRODUCT
+      if (TraceSuperWord && Verbose) {
+        tty->print_cr("SuperWord::hoist_loads_in_graph: skipping unexpected node n=%d", n->_idx);
+      }
+#endif
+      continue;
+    }
+
+#ifndef PRODUCT
+    if (_vector_loop_debug) {
+      tty->print_cr("SuperWord::hoist_loads_in_graph: processing phi %d  = _mem_slice_head.at(%d);", n->_idx, i);
+    }
+#endif
+
+    for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
+      Node* ld = n->fast_out(i);
+      if (ld->is_Load() && ld->as_Load()->in(MemNode::Memory) == n && in_bb(ld)) {
+        for (int i = 0; i < _block.length(); i++) {
+          Node* ld2 = _block.at(i);
+          if (ld2->is_Load() &&
+              _clone_map.idx(ld->_idx) == _clone_map.idx(ld2->_idx) &&
+              _clone_map.gen(ld->_idx) != _clone_map.gen(ld2->_idx)) { // <= do not collect the first generation ld
+#ifndef PRODUCT
+            if (_vector_loop_debug) {
+              tty->print_cr("SuperWord::hoist_loads_in_graph: will try to hoist load ld2->_idx=%d, cloned from %d (ld->_idx=%d)",
+                            ld2->_idx, _clone_map.idx(ld->_idx), ld->_idx);
+            }
+#endif
+            // could not do on-the-fly, since iterator is immutable
+            loads.push(ld2);
+          }
+        }// for
+      }//if
+    }//for (DUIterator_Fast imax,
+  }//for (int i = 0; i
+
+  for (int i = 0; i < loads.length(); i++) {
+    LoadNode* ld = loads.at(i)->as_Load();
+    Node* phi = find_phi_for_mem_dep(ld);
+    if (phi != NULL) {
+#ifndef PRODUCT
+      if (_vector_loop_debug) {
+        tty->print_cr("SuperWord::hoist_loads_in_graph replacing MemNode::Memory(%d) edge in %d with one from %d",
+                      MemNode::Memory, ld->_idx, phi->_idx);
+      }
+#endif
+      _igvn.replace_input_of(ld, MemNode::Memory, phi);
+    }
+  }//for
+
+  restart(); // invalidate all basic structures, since we rebuilt the graph
+
+#ifndef PRODUCT
+  if (TraceSuperWord && Verbose) {
+    tty->print_cr("\nSuperWord::hoist_loads_in_graph() the graph was rebuilt, all structures invalidated and need rebuild");
+  }
+#endif
+  return true;
+}
+
--- a/hotspot/src/share/vm/opto/superword.hpp	Tue May 05 19:27:08 2015 +0200
+++ b/hotspot/src/share/vm/opto/superword.hpp	Tue May 05 12:33:57 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2007, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -218,8 +218,10 @@
   GrowableArray<Node*> _data_entry;      // Nodes with all inputs from outside
   GrowableArray<Node*> _mem_slice_head;  // Memory slice head nodes
   GrowableArray<Node*> _mem_slice_tail;  // Memory slice tail nodes
-
+  GrowableArray<Node*> _iteration_first; // nodes in the generation that has deps from phi
+  GrowableArray<Node*> _iteration_last;  // nodes in the generation that has deps to   phi
   GrowableArray<SWNodeInfo> _node_info;  // Info needed per node
+  CloneMap&                 _clone_map;  // map of nodes created in cloning
 
   MemNode* _align_to_ref;                // Memory reference that pre-loop will align to
 
@@ -250,8 +252,13 @@
   Node*          _bb;              // Current basic block
   PhiNode*       _iv;              // Induction var
   bool           _race_possible;   // In cases where SDMU is true
+  bool           _do_vector_loop;  // whether to do vectorization/simd style
+  bool           _vector_loop_debug; // provide more printing in debug mode
   int            _num_work_vecs;   // Number of non memory vector operations
   int            _num_reductions;  // Number of reduction expressions applied
+  int            _ii_first;        // generation with direct deps from mem phi
+  int            _ii_last;         // generation with direct deps to   mem phi
+  GrowableArray<int> _ii_order;
 
   // Accessors
   Arena* arena()                   { return _arena; }
@@ -326,6 +333,22 @@
   int get_iv_adjustment(MemNode* mem);
   // Can the preloop align the reference to position zero in the vector?
   bool ref_is_alignable(SWPointer& p);
+  // rebuild the graph so all loads in different iterations of cloned loop become dependant on phi node (in _do_vector_loop only)
+  bool hoist_loads_in_graph();
+  // Test whether MemNode::Memory dependency to the same load but in the first iteration of this loop is coming from memory phi
+  // Return false if failed.
+  Node* find_phi_for_mem_dep(LoadNode* ld);
+  // Return same node but from the first generation. Return 0, if not found
+  Node* first_node(Node* nd);
+  // Return same node as this but from the last generation. Return 0, if not found
+  Node* last_node(Node* n);
+  // Mark nodes belonging to first and last generation,
+  // returns first generation index or -1 if vectorization/simd is impossible
+  int mark_generations();
+  // swapping inputs of commutative instruction (Add or Mul)
+  bool fix_commutative_inputs(Node* gold, Node* fix);
+  // make packs forcefully (in _do_vector_loop only)
+  bool pack_parallel();
   // Construct dependency graph.
   void dependence_graph();
   // Return a memory slice (node list) in predecessor order starting at "start"
@@ -419,6 +442,8 @@
   // Is the use of d1 in u1 at the same operand position as d2 in u2?
   bool opnd_positions_match(Node* d1, Node* u1, Node* d2, Node* u2);
   void init();
+  // clean up some basic structures - used if the ideal graph was rebuilt
+  void restart();
 
   // print methods
   void print_packset();
--- a/hotspot/src/share/vm/utilities/hashtable.cpp	Tue May 05 19:27:08 2015 +0200
+++ b/hotspot/src/share/vm/utilities/hashtable.cpp	Tue May 05 12:33:57 2015 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2003, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -381,3 +381,4 @@
 template class BasicHashtable<mtSymbol>;
 template class BasicHashtable<mtCode>;
 template class BasicHashtable<mtInternal>;
+template class BasicHashtable<mtCompiler>;