8149421: Vectorized Post Loops
authormcberg
Fri, 12 Feb 2016 16:12:15 -0800
changeset 36066 60ce66ce3c76
parent 36065 4f0e0cb7b016
child 36067 63eabfc8429b
8149421: Vectorized Post Loops Summary: add vectorised post loop for counted loops with vectors. Reviewed-by: kvn
hotspot/src/cpu/aarch64/vm/c2_globals_aarch64.hpp
hotspot/src/cpu/ppc/vm/c2_globals_ppc.hpp
hotspot/src/cpu/sparc/vm/c2_globals_sparc.hpp
hotspot/src/cpu/x86/vm/assembler_x86.cpp
hotspot/src/cpu/x86/vm/assembler_x86.hpp
hotspot/src/cpu/x86/vm/c2_globals_x86.hpp
hotspot/src/share/vm/opto/c2_globals.hpp
hotspot/src/share/vm/opto/loopTransform.cpp
hotspot/src/share/vm/opto/loopnode.hpp
hotspot/src/share/vm/opto/superword.cpp
--- a/hotspot/src/cpu/aarch64/vm/c2_globals_aarch64.hpp	Wed Feb 03 12:36:18 2016 +0100
+++ b/hotspot/src/cpu/aarch64/vm/c2_globals_aarch64.hpp	Fri Feb 12 16:12:15 2016 -0800
@@ -54,6 +54,7 @@
 define_pd_global(intx, InteriorEntryAlignment,       16);
 define_pd_global(intx, NewSizeThreadIncrease, ScaleForWordSize(4*K));
 define_pd_global(intx, LoopUnrollLimit,              60);
+define_pd_global(intx, LoopPercentProfileLimit,      10);
 // InitialCodeCacheSize derived from specjbb2000 run.
 define_pd_global(intx, InitialCodeCacheSize,         2496*K); // Integral multiple of CodeCacheExpansionSize
 define_pd_global(intx, CodeCacheExpansionSize,       64*K);
--- a/hotspot/src/cpu/ppc/vm/c2_globals_ppc.hpp	Wed Feb 03 12:36:18 2016 +0100
+++ b/hotspot/src/cpu/ppc/vm/c2_globals_ppc.hpp	Fri Feb 12 16:12:15 2016 -0800
@@ -54,6 +54,7 @@
 define_pd_global(bool, UseTLAB,                      true);
 define_pd_global(bool, ResizeTLAB,                   true);
 define_pd_global(intx, LoopUnrollLimit,              60);
+define_pd_global(intx, LoopPercentProfileLimit,      10);
 
 // Peephole and CISC spilling both break the graph, and so make the
 // scheduler sick.
--- a/hotspot/src/cpu/sparc/vm/c2_globals_sparc.hpp	Wed Feb 03 12:36:18 2016 +0100
+++ b/hotspot/src/cpu/sparc/vm/c2_globals_sparc.hpp	Fri Feb 12 16:12:15 2016 -0800
@@ -52,6 +52,7 @@
 define_pd_global(bool, UseTLAB,                      true);
 define_pd_global(bool, ResizeTLAB,                   true);
 define_pd_global(intx, LoopUnrollLimit,              60); // Design center runs on 1.3.1
+define_pd_global(intx, LoopPercentProfileLimit,      10);
 define_pd_global(intx, MinJumpTableSize,             5);
 
 // Peephole and CISC spilling both break the graph, and so makes the
--- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp	Wed Feb 03 12:36:18 2016 +0100
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp	Fri Feb 12 16:12:15 2016 -0800
@@ -2361,7 +2361,7 @@
 void Assembler::movdqu(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   simd_prefix(dst, xnoreg, src, VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x6F);
@@ -2398,7 +2398,7 @@
 void Assembler::vmovdqu(XMMRegister dst, Address src) {
   assert(UseAVX > 0, "");
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_256bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
+  InstructionAttr attributes(AVX_256bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   vex_prefix(src, 0, dst->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x6F);
@@ -2486,7 +2486,7 @@
 void Assembler::evmovdqul(XMMRegister dst, Address src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false , /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   vex_prefix(src, 0, dst->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x6F);
@@ -2515,7 +2515,7 @@
 void Assembler::evmovdquq(XMMRegister dst, Address src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   vex_prefix(src, 0, dst->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x6F);
@@ -2640,7 +2640,7 @@
 
 void Assembler::movsd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x10);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -2649,7 +2649,7 @@
 void Assembler::movsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
   simd_prefix(dst, xnoreg, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x10);
@@ -2668,7 +2668,7 @@
 
 void Assembler::movss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x10);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -2677,7 +2677,7 @@
 void Assembler::movss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit);
   simd_prefix(dst, xnoreg, src, VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x10);
@@ -2782,7 +2782,7 @@
 void Assembler::mulsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
   simd_prefix(dst, dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x59);
@@ -2791,7 +2791,7 @@
 
 void Assembler::mulsd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x59);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -2800,7 +2800,7 @@
 void Assembler::mulss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit);
   simd_prefix(dst, dst, src, VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x59);
@@ -2809,7 +2809,7 @@
 
 void Assembler::mulss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x59);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -3993,7 +3993,7 @@
 
 void Assembler::sqrtsd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x51);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4002,7 +4002,7 @@
 void Assembler::sqrtsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
   simd_prefix(dst, dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x51);
@@ -4011,7 +4011,7 @@
 
 void Assembler::sqrtss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x51);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4024,7 +4024,7 @@
 void Assembler::sqrtss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit);
   simd_prefix(dst, dst, src, VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x51);
@@ -4078,7 +4078,7 @@
 
 void Assembler::subsd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5C);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4087,7 +4087,7 @@
 void Assembler::subsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
   simd_prefix(dst, dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5C);
@@ -4096,7 +4096,7 @@
 
 void Assembler::subss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false , /* uses_vl */ false);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5C);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4105,7 +4105,7 @@
 void Assembler::subss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit);
   simd_prefix(dst, dst, src, VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5C);
@@ -4293,7 +4293,7 @@
 void Assembler::vaddsd(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
@@ -4303,7 +4303,7 @@
 
 void Assembler::vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x58);
@@ -4313,7 +4313,7 @@
 void Assembler::vaddss(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
@@ -4323,7 +4323,7 @@
 
 void Assembler::vaddss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x58);
@@ -4333,7 +4333,7 @@
 void Assembler::vdivsd(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
@@ -4343,7 +4343,7 @@
 
 void Assembler::vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5E);
@@ -4353,7 +4353,7 @@
 void Assembler::vdivss(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
@@ -4363,7 +4363,7 @@
 
 void Assembler::vdivss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5E);
@@ -4373,7 +4373,7 @@
 void Assembler::vmulsd(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
@@ -4383,7 +4383,7 @@
 
 void Assembler::vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x59);
@@ -4393,7 +4393,7 @@
 void Assembler::vmulss(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
@@ -4403,7 +4403,7 @@
 
 void Assembler::vmulss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x59);
@@ -4413,7 +4413,7 @@
 void Assembler::vsubsd(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
@@ -4423,7 +4423,7 @@
 
 void Assembler::vsubsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5C);
@@ -4433,7 +4433,7 @@
 void Assembler::vsubss(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
@@ -4443,7 +4443,7 @@
 
 void Assembler::vsubss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5C);
@@ -5901,7 +5901,7 @@
 // duplicate 1-byte integer data from src into 16||32|64 locations in dest : requires AVX512BW and AVX512VL
 void Assembler::evpbroadcastb(XMMRegister dst, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x78);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -5911,7 +5911,7 @@
   assert(VM_Version::supports_evex(), "");
   assert(dst != xnoreg, "sanity");
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_8bit);
   // swap src<->dst for encoding
   vex_prefix(src, dst->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
@@ -5922,7 +5922,7 @@
 // duplicate 2-byte integer data from src into 8|16||32 locations in dest : requires AVX512BW and AVX512VL
 void Assembler::evpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x79);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -5932,7 +5932,7 @@
   assert(VM_Version::supports_evex(), "");
   assert(dst != xnoreg, "sanity");
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_16bit);
   // swap src<->dst for encoding
   vex_prefix(src, dst->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
@@ -6027,7 +6027,7 @@
 // duplicate 1-byte integer data from src into 16||32|64 locations in dest : requires AVX512BW and AVX512VL
 void Assembler::evpbroadcastb(XMMRegister dst, Register src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x7A);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -6036,7 +6036,7 @@
 // duplicate 2-byte integer data from src into 8|16||32 locations in dest : requires AVX512BW and AVX512VL
 void Assembler::evpbroadcastw(XMMRegister dst, Register src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x7B);
   emit_int8((unsigned char)(0xC0 | encode));
--- a/hotspot/src/cpu/x86/vm/assembler_x86.hpp	Wed Feb 03 12:36:18 2016 +0100
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.hpp	Fri Feb 12 16:12:15 2016 -0800
@@ -2044,11 +2044,11 @@
 class InstructionAttr {
 public:
   InstructionAttr(
-    int vector_len,
-    bool rex_vex_w,
-    bool legacy_mode,
-    bool no_reg_mask,
-    bool uses_vl)
+    int vector_len,     // The length of vector to be applied in encoding - for both AVX and EVEX
+    bool rex_vex_w,     // Width of data: if 32-bits or less, false, else if 64-bit or specially defined, true
+    bool legacy_mode,   // Details if either this instruction is conditionally encoded to AVX or earlier if true else possibly EVEX
+    bool no_reg_mask,   // when true, k0 is used when EVEX encoding is chosen, else k1 is used under the same condition
+    bool uses_vl)       // This instruction may have legacy constraints based on vector length for EVEX
     :
       _avx_vector_len(vector_len),
       _rex_vex_w(rex_vex_w),
--- a/hotspot/src/cpu/x86/vm/c2_globals_x86.hpp	Wed Feb 03 12:36:18 2016 +0100
+++ b/hotspot/src/cpu/x86/vm/c2_globals_x86.hpp	Fri Feb 12 16:12:15 2016 -0800
@@ -46,6 +46,7 @@
 define_pd_global(intx, ConditionalMoveLimit,         3);
 define_pd_global(intx, FreqInlineSize,               325);
 define_pd_global(intx, MinJumpTableSize,             10);
+define_pd_global(intx, LoopPercentProfileLimit,      30);
 #ifdef AMD64
 define_pd_global(intx, INTPRESSURE,                  13);
 define_pd_global(intx, FLOATPRESSURE,                14);
--- a/hotspot/src/share/vm/opto/c2_globals.hpp	Wed Feb 03 12:36:18 2016 +0100
+++ b/hotspot/src/share/vm/opto/c2_globals.hpp	Fri Feb 12 16:12:15 2016 -0800
@@ -182,6 +182,10 @@
           "Unroll loop bodies with node count less than this")              \
           range(0, max_jint / 4)                                            \
                                                                             \
+  product_pd(intx, LoopPercentProfileLimit,                                 \
+             "Unroll loop bodies with % node count of profile limit")       \
+             range(10, 100)                                                 \
+                                                                            \
   product(intx,  LoopMaxUnroll, 16,                                         \
           "Maximum number of unrolls for main loop")                        \
           range(0, max_jint)                                                \
--- a/hotspot/src/share/vm/opto/loopTransform.cpp	Wed Feb 03 12:36:18 2016 +0100
+++ b/hotspot/src/share/vm/opto/loopTransform.cpp	Fri Feb 12 16:12:15 2016 -0800
@@ -666,7 +666,8 @@
     if (future_unroll_ct > LoopMaxUnroll) return false;
   } else {
     // obey user constraints on vector mapped loops with additional unrolling applied
-    if ((future_unroll_ct / cl->slp_max_unroll()) > LoopMaxUnroll) return false;
+    int unroll_constraint = (cl->slp_max_unroll()) ? cl->slp_max_unroll() : 1;
+    if ((future_unroll_ct / unroll_constraint) > LoopMaxUnroll) return false;
   }
 
   // Check for initial stride being a small enough constant
@@ -689,7 +690,7 @@
   //   Progress defined as current size less than 20% larger than previous size.
   if (UseSuperWord && cl->node_count_before_unroll() > 0 &&
       future_unroll_ct > LoopUnrollMin &&
-      (future_unroll_ct - 1) * 10.0 > cl->profile_trip_cnt() &&
+      (future_unroll_ct - 1) * (100 / LoopPercentProfileLimit) > cl->profile_trip_cnt() &&
       1.2 * cl->node_count_before_unroll() < (double)_body.size()) {
     return false;
   }
@@ -1260,6 +1261,146 @@
   loop->record_for_igvn();
 }
 
+//------------------------------insert_vector_post_loop------------------------
+// Insert a copy of the atomic unrolled vectorized main loop as a post loop,
+// unroll_policy has already informed us that more unrolling is about to happen to
+// the main loop.  The resultant post loop will serve as a vectorized drain loop.
+void PhaseIdealLoop::insert_vector_post_loop(IdealLoopTree *loop, Node_List &old_new) {
+  if (!loop->_head->is_CountedLoop()) return;
+
+  CountedLoopNode *cl = loop->_head->as_CountedLoop();
+
+  // only process vectorized main loops
+  if (!cl->is_vectorized_loop() || !cl->is_main_loop()) return;
+
+  int slp_max_unroll_factor = cl->slp_max_unroll();
+  int cur_unroll = cl->unrolled_count();
+
+  if (slp_max_unroll_factor == 0) return;
+
+  // only process atomic unroll vector loops (not super unrolled after vectorization)
+  if (cur_unroll != slp_max_unroll_factor) return;
+
+  // we only ever process this one time
+  if (cl->has_atomic_post_loop()) return;
+
+#ifndef PRODUCT
+  if (TraceLoopOpts) {
+    tty->print("PostVector  ");
+    loop->dump_head();
+  }
+#endif
+  C->set_major_progress();
+
+  // Find common pieces of the loop being guarded with pre & post loops
+  CountedLoopNode *main_head = loop->_head->as_CountedLoop();
+  CountedLoopEndNode *main_end = main_head->loopexit();
+  guarantee(main_end != NULL, "no loop exit node");
+  // diagnostic to show loop end is not properly formed
+  assert(main_end->outcnt() == 2, "1 true, 1 false path only");
+  uint dd_main_head = dom_depth(main_head);
+  uint max = main_head->outcnt();
+
+  // mark this loop as processed
+  main_head->mark_has_atomic_post_loop();
+
+  Node *pre_header = main_head->in(LoopNode::EntryControl);
+  Node *init = main_head->init_trip();
+  Node *incr = main_end->incr();
+  Node *limit = main_end->limit();
+  Node *stride = main_end->stride();
+  Node *cmp = main_end->cmp_node();
+  BoolTest::mask b_test = main_end->test_trip();
+
+  //------------------------------
+  // Step A: Create a new post-Loop.
+  Node* main_exit = main_end->proj_out(false);
+  assert(main_exit->Opcode() == Op_IfFalse, "");
+  int dd_main_exit = dom_depth(main_exit);
+
+  // Step A1: Clone the loop body of main.  The clone becomes the vector post-loop.
+  // The main loop pre-header illegally has 2 control users (old & new loops).
+  clone_loop(loop, old_new, dd_main_exit);
+  assert(old_new[main_end->_idx]->Opcode() == Op_CountedLoopEnd, "");
+  CountedLoopNode *post_head = old_new[main_head->_idx]->as_CountedLoop();
+  post_head->set_normal_loop();
+  post_head->set_post_loop(main_head);
+
+  // Reduce the post-loop trip count.
+  CountedLoopEndNode* post_end = old_new[main_end->_idx]->as_CountedLoopEnd();
+  post_end->_prob = PROB_FAIR;
+
+  // Build the main-loop normal exit.
+  IfFalseNode *new_main_exit = new IfFalseNode(main_end);
+  _igvn.register_new_node_with_optimizer(new_main_exit);
+  set_idom(new_main_exit, main_end, dd_main_exit);
+  set_loop(new_main_exit, loop->_parent);
+
+  // Step A2: Build a zero-trip guard for the vector post-loop.  After leaving the
+  // main-loop, the vector post-loop may not execute at all.  We 'opaque' the incr
+  // (the vectorized main-loop trip-counter exit value) because we will be changing
+  // the exit value (via additional unrolling) so we cannot constant-fold away the zero
+  // trip guard until all unrolling is done.
+  Node *zer_opaq = new Opaque1Node(C, incr);
+  Node *zer_cmp = new CmpINode(zer_opaq, limit);
+  Node *zer_bol = new BoolNode(zer_cmp, b_test);
+  register_new_node(zer_opaq, new_main_exit);
+  register_new_node(zer_cmp, new_main_exit);
+  register_new_node(zer_bol, new_main_exit);
+
+  // Build the IfNode
+  IfNode *zer_iff = new IfNode(new_main_exit, zer_bol, PROB_FAIR, COUNT_UNKNOWN);
+  _igvn.register_new_node_with_optimizer(zer_iff);
+  set_idom(zer_iff, new_main_exit, dd_main_exit);
+  set_loop(zer_iff, loop->_parent);
+
+  // Plug in the false-path, taken if we need to skip vector post-loop
+  _igvn.replace_input_of(main_exit, 0, zer_iff);
+  set_idom(main_exit, zer_iff, dd_main_exit);
+  set_idom(main_exit->unique_out(), zer_iff, dd_main_exit);
+  // Make the true-path, must enter the vector post loop
+  Node *zer_taken = new IfTrueNode(zer_iff);
+  _igvn.register_new_node_with_optimizer(zer_taken);
+  set_idom(zer_taken, zer_iff, dd_main_exit);
+  set_loop(zer_taken, loop->_parent);
+  // Plug in the true path
+  _igvn.hash_delete(post_head);
+  post_head->set_req(LoopNode::EntryControl, zer_taken);
+  set_idom(post_head, zer_taken, dd_main_exit);
+
+  Arena *a = Thread::current()->resource_area();
+  VectorSet visited(a);
+  Node_Stack clones(a, main_head->back_control()->outcnt());
+  // Step A3: Make the fall-in values to the vector post-loop come from the
+  // fall-out values of the main-loop.
+  for (DUIterator_Fast imax, i = main_head->fast_outs(imax); i < imax; i++) {
+    Node* main_phi = main_head->fast_out(i);
+    if (main_phi->is_Phi() && main_phi->in(0) == main_head && main_phi->outcnt() >0) {
+      Node *cur_phi = old_new[main_phi->_idx];
+      Node *fallnew = clone_up_backedge_goo(main_head->back_control(),
+                                            post_head->init_control(),
+                                            main_phi->in(LoopNode::LoopBackControl),
+                                            visited, clones);
+      _igvn.hash_delete(cur_phi);
+      cur_phi->set_req(LoopNode::EntryControl, fallnew);
+    }
+  }
+
+  // CastII for the new post loop:
+  bool inserted = cast_incr_before_loop(zer_opaq->in(1), zer_taken, post_head);
+  assert(inserted, "no castII inserted");
+
+  // It's difficult to be precise about the trip-counts
+  // for post loops.  They are usually very short,
+  // so guess that unit vector trips is a reasonable value.
+  post_head->set_profile_trip_cnt((float)slp_max_unroll_factor);
+
+  // Now force out all loop-invariant dominating tests.  The optimizer
+  // finds some, but we _know_ they are all useless.
+  peeled_dom_test_elim(loop, old_new);
+  loop->record_for_igvn();
+}
+
 //------------------------------is_invariant-----------------------------
 // Return true if n is invariant
 bool IdealLoopTree::is_invariant(Node* n) const {
@@ -2608,6 +2749,9 @@
     // and we'd rather unroll the post-RCE'd loop SO... do not unroll if
     // peeling.
     if (should_unroll && !should_peel) {
+      if (SuperWordLoopUnrollAnalysis) {
+        phase->insert_vector_post_loop(this, old_new);
+      }
       phase->do_unroll(this, old_new, true);
     }
 
--- a/hotspot/src/share/vm/opto/loopnode.hpp	Wed Feb 03 12:36:18 2016 +0100
+++ b/hotspot/src/share/vm/opto/loopnode.hpp	Fri Feb 12 16:12:15 2016 -0800
@@ -67,7 +67,9 @@
          HasReductions=128,
          WasSlpAnalyzed=256,
          PassedSlpAnalysis=512,
-         DoUnrollOnly=1024 };
+         DoUnrollOnly=1024,
+         VectorizedLoop=2048,
+         HasAtomicPostLoop=4096 };
   char _unswitch_count;
   enum { _unswitch_max=3 };
 
@@ -86,6 +88,8 @@
   void mark_was_slp() { _loop_flags |= WasSlpAnalyzed; }
   void mark_passed_slp() { _loop_flags |= PassedSlpAnalysis; }
   void mark_do_unroll_only() { _loop_flags |= DoUnrollOnly; }
+  void mark_loop_vectorized() { _loop_flags |= VectorizedLoop; }
+  void mark_has_atomic_post_loop() { _loop_flags |= HasAtomicPostLoop; }
 
   int unswitch_max() { return _unswitch_max; }
   int unswitch_count() { return _unswitch_count; }
@@ -221,6 +225,8 @@
   int has_passed_slp   () const { return (_loop_flags&PassedSlpAnalysis) == PassedSlpAnalysis; }
   int do_unroll_only      () const { return (_loop_flags&DoUnrollOnly) == DoUnrollOnly; }
   int is_main_no_pre_loop() const { return _loop_flags & MainHasNoPreLoop; }
+  int is_vectorized_loop    () const { return (_loop_flags & VectorizedLoop) == VectorizedLoop; }
+  int has_atomic_post_loop  () const { return (_loop_flags & HasAtomicPostLoop) == HasAtomicPostLoop; }
   void set_main_no_pre_loop() { _loop_flags |= MainHasNoPreLoop; }
 
   int main_idx() const { return _main_idx; }
@@ -893,6 +899,8 @@
   // Add pre and post loops around the given loop.  These loops are used
   // during RCE, unrolling and aligning loops.
   void insert_pre_post_loops( IdealLoopTree *loop, Node_List &old_new, bool peel_only );
+  // Add a vector post loop between a vector main loop and the current post loop
+  void insert_vector_post_loop(IdealLoopTree *loop, Node_List &old_new);
   // If Node n lives in the back_ctrl block, we clone a private version of n
   // in preheader_ctrl block and return that, otherwise return n.
   Node *clone_up_backedge_goo( Node *back_ctrl, Node *preheader_ctrl, Node *n, VectorSet &visited, Node_Stack &clones );
--- a/hotspot/src/share/vm/opto/superword.cpp	Wed Feb 03 12:36:18 2016 +0100
+++ b/hotspot/src/share/vm/opto/superword.cpp	Fri Feb 12 16:12:15 2016 -0800
@@ -2253,6 +2253,9 @@
           C->set_major_progress();
         }
         cl->mark_do_unroll_only();
+        if (do_reserve_copy()) {
+          cl->mark_loop_vectorized();
+        }
       }
     }
   }