8074981: Integer/FP scalar reduction optimization
Summary: Add scalar reduction optimization to C2 to take advantage of vector instructions in modern x86 CPUs.
Reviewed-by: kvn, twisti
Contributed-by: michael.c.berg@intel.com
--- a/hotspot/make/build.sh Fri Mar 27 13:47:33 2015 +0100
+++ b/hotspot/make/build.sh Wed Apr 01 18:07:50 2015 -0700
@@ -40,7 +40,7 @@
exit 1
fi
-if [ "${JAVA_HOME-}" = "" -o ! -d "${JAVA_HOME-}" -o ! -d ${JAVA_HOME-}/jre/lib/ ]; then
+if [ "${JAVA_HOME-}" = "" -o ! -d "${JAVA_HOME-}" ]; then
echo "JAVA_HOME needs to be set to a valid JDK path"
echo "JAVA_HOME: ${JAVA_HOME-}"
exit 1
--- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp Fri Mar 27 13:47:33 2015 +0100
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp Wed Apr 01 18:07:50 2015 -0700
@@ -3359,6 +3359,20 @@
// Integer vector arithmetic
+void Assembler::vphaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+ assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+ int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_38); // VEX form: 66 prefix, 0F 38 opcode map
+ emit_int8(0x01); // opcode byte of the horizontal packed 16-bit add
+ emit_int8((unsigned char)(0xC0 | encode)); // 0xC0 sets the ModRM mod field to register-direct
+}
+
+void Assembler::vphaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+ assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+ int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_38); // VEX form: 66 prefix, 0F 38 opcode map
+ emit_int8(0x02); // opcode byte of the horizontal packed 32-bit add
+ emit_int8((unsigned char)(0xC0 | encode)); // 0xC0 sets the ModRM mod field to register-direct
+}
+
void Assembler::paddb(XMMRegister dst, XMMRegister src) {
NOT_LP64(assert(VM_Version::supports_sse2(), ""));
emit_simd_arith(0xFC, dst, src, VEX_SIMD_66);
@@ -3379,6 +3393,20 @@
emit_simd_arith(0xD4, dst, src, VEX_SIMD_66);
}
+void Assembler::phaddw(XMMRegister dst, XMMRegister src) {
+ NOT_LP64(assert(VM_Version::supports_ssse3(), "")); // PHADDW belongs to SSSE3; SSE3 alone does not provide it
+ int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38); // two-operand form: dst is also the first source
+ emit_int8(0x01); // opcode byte of the horizontal packed 16-bit add
+ emit_int8((unsigned char)(0xC0 | encode)); // 0xC0 sets the ModRM mod field to register-direct
+}
+
+void Assembler::phaddd(XMMRegister dst, XMMRegister src) {
+ NOT_LP64(assert(VM_Version::supports_ssse3(), "")); // PHADDD belongs to SSSE3; SSE3 alone does not provide it
+ int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38); // two-operand form: dst is also the first source
+ emit_int8(0x02); // opcode byte of the horizontal packed 32-bit add
+ emit_int8((unsigned char)(0xC0 | encode)); // 0xC0 sets the ModRM mod field to register-direct
+}
+
void Assembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
emit_vex_arith(0xFC, dst, nds, src, VEX_SIMD_66, vector256);
@@ -3804,6 +3832,17 @@
emit_int8(0x01);
}
+void Assembler::vextractf128h(XMMRegister dst, XMMRegister src) {
+ assert(VM_Version::supports_avx(), "");
+ bool vector256 = true;
+ int encode = vex_prefix_and_encode(src, xnoreg, dst, VEX_SIMD_66, vector256, VEX_OPCODE_0F_3A); // src goes in ModRM.reg and dst in ModRM.r/m; no second source (xnoreg)
+ emit_int8(0x19);
+ emit_int8((unsigned char)(0xC0 | encode));
+ // 0x00 - extract the lower 128 bits
+ // 0x01 - extract the upper 128 bits
+ emit_int8(0x01);
+}
+
void Assembler::vextractf128h(Address dst, XMMRegister src) {
assert(VM_Version::supports_avx(), "");
InstructionMark im(this);
--- a/hotspot/src/cpu/x86/vm/assembler_x86.hpp Fri Mar 27 13:47:33 2015 +0100
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.hpp Wed Apr 01 18:07:50 2015 -0700
@@ -1777,6 +1777,12 @@
void vxorpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
void vxorps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
+ // Add horizontal packed integers
+ void vphaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+ void vphaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+ void phaddw(XMMRegister dst, XMMRegister src);
+ void phaddd(XMMRegister dst, XMMRegister src);
+
// Add packed integers
void paddb(XMMRegister dst, XMMRegister src);
void paddw(XMMRegister dst, XMMRegister src);
@@ -1869,6 +1875,7 @@
// Copy low 128bit into high 128bit of YMM registers.
void vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src);
void vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src);
+ void vextractf128h(XMMRegister dst, XMMRegister src);
// Load/store high 128bit of YMM registers which does not destroy other half.
void vinsertf128h(XMMRegister dst, Address src);
--- a/hotspot/src/cpu/x86/vm/x86.ad Fri Mar 27 13:47:33 2015 +0100
+++ b/hotspot/src/cpu/x86/vm/x86.ad Wed Apr 01 18:07:50 2015 -0700
@@ -490,7 +490,7 @@
class NativeJump;
class CallStubImpl {
-
+
//--------------------------------------------------------------
//---< Used for optimization in Compile::shorten_branches >---
//--------------------------------------------------------------
@@ -500,9 +500,9 @@
static uint size_call_trampoline() {
return 0; // no call trampolines on this platform
}
-
+
// number of relocations needed by a call trampoline stub
- static uint reloc_call_trampoline() {
+ static uint reloc_call_trampoline() {
return 0; // no call trampolines on this platform
}
};
@@ -623,6 +623,22 @@
if ((UseSSE < 4) && (UseAVX < 1)) // only with SSE4_1 or AVX
return false;
break;
+ case Op_AddReductionVL:
+ if (UseAVX < 3) return false; // only EVEX : vector connectivity becomes an issue here
+ break; // each reduction kind is checked on its own; no fall-through into the next case
+ case Op_AddReductionVI:
+ if (UseSSE < 3 || !VM_Version::supports_ssse3()) return false; // the SSE rules need UseSSE > 2, and phaddd is an SSSE3 instruction
+ break; // an add reduction must not inherit the SSE4 requirement of the multiply below
+ case Op_MulReductionVI:
+ if (UseSSE < 4) return false; // requires at least SSE4
+ break;
+ case Op_AddReductionVF:
+ case Op_AddReductionVD:
+ case Op_MulReductionVF:
+ case Op_MulReductionVD:
+ if (UseSSE < 1) // requires at least SSE
+ return false;
+ break;
case Op_CompareAndSwapL:
#ifdef _LP64
case Op_CompareAndSwapP:
@@ -2532,6 +2548,574 @@
ins_pipe( fpu_reg_reg );
%}
+// ====================REDUCTION ARITHMETIC=======================================
+
+instruct rsadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
+ predicate(UseSSE > 2 && UseAVX == 0);
+ match(Set dst (AddReductionVI src1 src2));
+ effect(TEMP tmp2, TEMP tmp);
+ format %{ "movdqu $tmp2,$src2\n\t"
+ "phaddd $tmp2,$tmp2\n\t"
+ "movd $tmp,$src1\n\t"
+ "paddd $tmp,$tmp2\n\t"
+ "movd $dst,$tmp\t! add reduction2I" %}
+ ins_encode %{
+ __ movdqu($tmp2$$XMMRegister, $src2$$XMMRegister); // non-AVX path: dst = src1 + src2[0] + src2[1]; work on a copy of src2
+ __ phaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister); // lane 0 = src2[0] + src2[1]
+ __ movdl($tmp$$XMMRegister, $src1$$Register); // scalar input into lane 0 of tmp
+ __ paddd($tmp$$XMMRegister, $tmp2$$XMMRegister); // lane 0 = src1 + horizontal sum
+ __ movdl($dst$$Register, $tmp$$XMMRegister); // result is lane 0
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct rvadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
+ predicate(UseAVX > 0);
+ match(Set dst (AddReductionVI src1 src2));
+ effect(TEMP tmp, TEMP tmp2);
+ format %{ "vphaddd $tmp,$src2,$src2\n\t"
+ "movd $tmp2,$src1\n\t"
+ "vpaddd $tmp2,$tmp2,$tmp\n\t"
+ "movd $dst,$tmp2\t! add reduction2I" %}
+ ins_encode %{
+ __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, false); // AVX path: lane 0 = src2[0] + src2[1], src2 left intact
+ __ movdl($tmp2$$XMMRegister, $src1$$Register); // scalar input into lane 0 of tmp2
+ __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, false); // lane 0 = src1 + horizontal sum
+ __ movdl($dst$$Register, $tmp2$$XMMRegister); // result is lane 0
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct rsadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
+ predicate(UseSSE > 2 && UseAVX == 0);
+ match(Set dst (AddReductionVI src1 src2));
+ effect(TEMP tmp2, TEMP tmp);
+ format %{ "movdqu $tmp2,$src2\n\t"
+ "phaddd $tmp2,$tmp2\n\t"
+ "phaddd $tmp2,$tmp2\n\t"
+ "movd $tmp,$src1\n\t"
+ "paddd $tmp,$tmp2\n\t"
+ "movd $dst,$tmp\t! add reduction4I" %}
+ ins_encode %{
+ __ movdqu($tmp2$$XMMRegister, $src2$$XMMRegister); // non-AVX path: dst = src1 + sum of the four ints in src2; work on a copy
+ __ phaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister); // lane 0 = s0 + s1, lane 1 = s2 + s3
+ __ phaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister); // lane 0 = s0 + s1 + s2 + s3
+ __ movdl($tmp$$XMMRegister, $src1$$Register); // scalar input into lane 0 of tmp
+ __ paddd($tmp$$XMMRegister, $tmp2$$XMMRegister); // lane 0 = src1 + horizontal sum
+ __ movdl($dst$$Register, $tmp$$XMMRegister); // result is lane 0
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct rvadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
+ predicate(UseAVX > 0);
+ match(Set dst (AddReductionVI src1 src2));
+ effect(TEMP tmp, TEMP tmp2);
+ format %{ "vphaddd $tmp,$src2,$src2\n\t"
+ "vphaddd $tmp,$tmp,$tmp2\n\t"
+ "movd $tmp2,$src1\n\t"
+ "vpaddd $tmp2,$tmp2,$tmp\n\t"
+ "movd $dst,$tmp2\t! add reduction4I" %}
+ ins_encode %{
+ __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, false); // AVX path: lane 0 = s0 + s1, lane 1 = s2 + s3
+ __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false); // lane 0 = s0 + s1 + s2 + s3; tmp2 is read before it is written, but it only feeds lanes 2-3, which are never used
+ __ movdl($tmp2$$XMMRegister, $src1$$Register); // scalar input into lane 0 of tmp2
+ __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, false); // lane 0 = src1 + horizontal sum
+ __ movdl($dst$$Register, $tmp2$$XMMRegister); // result is lane 0
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct rvadd8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
+ predicate(UseAVX > 0);
+ match(Set dst (AddReductionVI src1 src2));
+ effect(TEMP tmp, TEMP tmp2);
+ format %{ "vphaddd $tmp,$src2,$src2\n\t"
+ "vphaddd $tmp,$tmp,$tmp2\n\t"
+ "vextractf128 $tmp2,$tmp\n\t"
+ "vpaddd $tmp,$tmp,$tmp2\n\t"
+ "movd $tmp2,$src1\n\t"
+ "vpaddd $tmp2,$tmp2,$tmp\n\t"
+ "movd $dst,$tmp2\t! add reduction8I" %}
+ ins_encode %{
+ __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, true); // 256-bit form (the assembler asserts AVX2); pairwise sums are formed within each 128-bit half
+ __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, true); // lane 0 of each half = sum of that half; tmp2 is read before it is written but only feeds unused lanes
+ __ vextractf128h($tmp2$$XMMRegister, $tmp$$XMMRegister); // tmp2 = upper 128 bits of tmp
+ __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false); // lane 0 = lower-half sum + upper-half sum
+ __ movdl($tmp2$$XMMRegister, $src1$$Register); // scalar input into lane 0 of tmp2
+ __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, false); // lane 0 = src1 + sum of all eight ints
+ __ movdl($dst$$Register, $tmp2$$XMMRegister); // result is lane 0
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct rsadd2F_reduction_reg(regF dst, regF src1, vecD src2, regF tmp, regF tmp2) %{
+ predicate(UseSSE >= 1 && UseAVX == 0);
+ match(Set dst (AddReductionVF src1 src2));
+ effect(TEMP tmp, TEMP tmp2);
+ format %{ "movdqu $tmp,$src1\n\t"
+ "addss $tmp,$src2\n\t"
+ "pshufd $tmp2,$src2,0x01\n\t"
+ "addss $tmp,$tmp2\n\t"
+ "movdqu $dst,$tmp\t! add reduction2F" %}
+ ins_encode %{
+ __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister); // non-AVX path: dst = (src1 + src2[0]) + src2[1], accumulated in tmp
+ __ addss($tmp$$XMMRegister, $src2$$XMMRegister); // tmp = src1 + src2[0]
+ __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x01); // bring src2[1] down to lane 0
+ __ addss($tmp$$XMMRegister, $tmp2$$XMMRegister); // tmp += src2[1]
+ __ movdqu($dst$$XMMRegister, $tmp$$XMMRegister); // publish the accumulated scalar
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct rvadd2F_reduction_reg(regF dst, regF src1, vecD src2, regF tmp, regF tmp2) %{
+ predicate(UseAVX > 0);
+ match(Set dst (AddReductionVF src1 src2));
+ effect(TEMP tmp2, TEMP tmp);
+ format %{ "vaddss $tmp2,$src1,$src2\n\t"
+ "pshufd $tmp,$src2,0x01\n\t"
+ "vaddss $dst,$tmp2,$tmp\t! add reduction2F" %}
+ ins_encode %{
+ __ vaddss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister); // AVX path: tmp2 = src1 + src2[0]
+ __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01); // bring src2[1] down to lane 0
+ __ vaddss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); // dst = (src1 + src2[0]) + src2[1]
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct rsadd4F_reduction_reg(regF dst, regF src1, vecX src2, regF tmp, regF tmp2) %{
+ predicate(UseSSE >= 1 && UseAVX == 0);
+ match(Set dst (AddReductionVF src1 src2));
+ effect(TEMP tmp, TEMP tmp2);
+ format %{ "movdqu $tmp,$src1\n\t"
+ "addss $tmp,$src2\n\t"
+ "pshufd $tmp2,$src2,0x01\n\t"
+ "addss $tmp,$tmp2\n\t"
+ "pshufd $tmp2,$src2,0x02\n\t"
+ "addss $tmp,$tmp2\n\t"
+ "pshufd $tmp2,$src2,0x03\n\t"
+ "addss $tmp,$tmp2\n\t"
+ "movdqu $dst,$tmp\t! add reduction4F" %}
+ ins_encode %{
+ __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister); // non-AVX path: accumulate src1 plus the four floats of src2, one element at a time in index order
+ __ addss($tmp$$XMMRegister, $src2$$XMMRegister); // tmp = src1 + src2[0]
+ __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x01); // lane 0 = src2[1]
+ __ addss($tmp$$XMMRegister, $tmp2$$XMMRegister);
+ __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x02); // lane 0 = src2[2]
+ __ addss($tmp$$XMMRegister, $tmp2$$XMMRegister);
+ __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x03); // lane 0 = src2[3]
+ __ addss($tmp$$XMMRegister, $tmp2$$XMMRegister);
+ __ movdqu($dst$$XMMRegister, $tmp$$XMMRegister); // publish the accumulated scalar
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct rvadd4F_reduction_reg(regF dst, regF src1, vecX src2, regF tmp, regF tmp2) %{
+ predicate(UseAVX > 0);
+ match(Set dst (AddReductionVF src1 src2));
+ effect(TEMP tmp, TEMP tmp2);
+ format %{ "vaddss $tmp2,$src1,$src2\n\t"
+ "pshufd $tmp,$src2,0x01\n\t"
+ "vaddss $tmp2,$tmp2,$tmp\n\t"
+ "pshufd $tmp,$src2,0x02\n\t"
+ "vaddss $tmp2,$tmp2,$tmp\n\t"
+ "pshufd $tmp,$src2,0x03\n\t"
+ "vaddss $dst,$tmp2,$tmp\t! add reduction4F" %}
+ ins_encode %{
+ __ vaddss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister); // AVX path: tmp2 accumulates, starting with src1 + src2[0]
+ __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01); // lane 0 = src2[1]
+ __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+ __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02); // lane 0 = src2[2]
+ __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+ __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03); // lane 0 = src2[3]
+ __ vaddss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); // last add writes the result straight into dst
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct radd8F_reduction_reg(regF dst, regF src1, vecY src2, regF tmp, regF tmp2, regF tmp3) %{
+ predicate(UseAVX > 0);
+ match(Set dst (AddReductionVF src1 src2));
+ effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
+ format %{ "vaddss $tmp2,$src1,$src2\n\t"
+ "pshufd $tmp,$src2,0x01\n\t"
+ "vaddss $tmp2,$tmp2,$tmp\n\t"
+ "pshufd $tmp,$src2,0x02\n\t"
+ "vaddss $tmp2,$tmp2,$tmp\n\t"
+ "pshufd $tmp,$src2,0x03\n\t"
+ "vaddss $tmp2,$tmp2,$tmp\n\t"
+ "vextractf128 $tmp3,$src2\n\t"
+ "vaddss $tmp2,$tmp2,$tmp3\n\t"
+ "pshufd $tmp,$tmp3,0x01\n\t"
+ "vaddss $tmp2,$tmp2,$tmp\n\t"
+ "pshufd $tmp,$tmp3,0x02\n\t"
+ "vaddss $tmp2,$tmp2,$tmp\n\t"
+ "pshufd $tmp,$tmp3,0x03\n\t"
+ "vaddss $dst,$tmp2,$tmp\t! add reduction8F" %}
+ ins_encode %{
+ __ vaddss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister); // tmp2 accumulates src1 plus the eight floats of src2 in index order; starts with src1 + src2[0]
+ __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01); // lane 0 = src2[1]
+ __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+ __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02); // lane 0 = src2[2]
+ __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+ __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03); // lane 0 = src2[3]
+ __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+ __ vextractf128h($tmp3$$XMMRegister, $src2$$XMMRegister); // tmp3 = upper 128 bits of src2 (elements 4..7)
+ __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister); // add element 4
+ __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x01); // lane 0 = element 5
+ __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+ __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x02); // lane 0 = element 6
+ __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+ __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x03); // lane 0 = element 7
+ __ vaddss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); // last add writes the result straight into dst
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct rsadd2D_reduction_reg(regD dst, regD src1, vecX src2, regD tmp) %{
+ predicate(UseSSE >= 1 && UseAVX == 0);
+ match(Set dst (AddReductionVD src1 src2));
+ effect(TEMP tmp, TEMP dst);
+ format %{ "movdqu $tmp,$src1\n\t"
+ "addsd $tmp,$src2\n\t"
+ "pshufd $dst,$src2,0xE\n\t"
+ "addsd $dst,$tmp\t! add reduction2D" %}
+ ins_encode %{
+ __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister); // non-AVX path: dst = src2[1] + (src1 + src2[0])
+ __ addsd($tmp$$XMMRegister, $src2$$XMMRegister); // tmp = src1 + src2[0]
+ __ pshufd($dst$$XMMRegister, $src2$$XMMRegister, 0xE); // 0xE moves the upper 64 bits (src2[1]) into the low 64 bits of dst
+ __ addsd($dst$$XMMRegister, $tmp$$XMMRegister); // dst = src2[1] + tmp
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct rvadd2D_reduction_reg(regD dst, regD src1, vecX src2, regD tmp, regD tmp2) %{
+ predicate(UseAVX > 0);
+ match(Set dst (AddReductionVD src1 src2));
+ effect(TEMP tmp, TEMP tmp2);
+ format %{ "vaddsd $tmp2,$src1,$src2\n\t"
+ "pshufd $tmp,$src2,0xE\n\t"
+ "vaddsd $dst,$tmp2,$tmp\t! add reduction2D" %}
+ ins_encode %{
+ __ vaddsd($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister); // AVX path: tmp2 = src1 + src2[0]
+ __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE); // 0xE moves the upper 64 bits (src2[1]) into the low 64 bits of tmp
+ __ vaddsd($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); // dst = (src1 + src2[0]) + src2[1]
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct rvadd4D_reduction_reg(regD dst, regD src1, vecY src2, regD tmp, regD tmp2, regD tmp3) %{
+ predicate(UseAVX > 0);
+ match(Set dst (AddReductionVD src1 src2));
+ effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
+ format %{ "vaddsd $tmp2,$src1,$src2\n\t"
+ "pshufd $tmp,$src2,0xE\n\t"
+ "vaddsd $tmp2,$tmp2,$tmp\n\t"
+ "vextractf128 $tmp3,$src2\n\t"
+ "vaddsd $tmp2,$tmp2,$tmp3\n\t"
+ "pshufd $tmp,$tmp3,0xE\n\t"
+ "vaddsd $dst,$tmp2,$tmp\t! add reduction4D" %}
+ ins_encode %{
+ __ vaddsd($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister); // tmp2 accumulates src1 plus the four doubles of src2 in index order; starts with src1 + src2[0]
+ __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE); // low 64 bits = src2[1]
+ __ vaddsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+ __ vextractf128h($tmp3$$XMMRegister, $src2$$XMMRegister); // tmp3 = upper 128 bits of src2 (elements 2..3)
+ __ vaddsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister); // add element 2
+ __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0xE); // low 64 bits = element 3
+ __ vaddsd($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); // last add writes the result straight into dst
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct rsmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
+ predicate(UseSSE > 3 && UseAVX == 0);
+ match(Set dst (MulReductionVI src1 src2));
+ effect(TEMP tmp, TEMP tmp2);
+ format %{ "pshufd $tmp2,$src2,0x1\n\t"
+ "pmulld $tmp2,$src2\n\t"
+ "movd $tmp,$src1\n\t"
+ "pmulld $tmp2,$tmp\n\t"
+ "movd $dst,$tmp2\t! mul reduction2I" %}
+ ins_encode %{
+ __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1); // non-AVX path: dst = src1 * src2[0] * src2[1]; lane 0 of tmp2 = src2[1]
+ __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister); // lane 0 = src2[1] * src2[0]
+ __ movdl($tmp$$XMMRegister, $src1$$Register); // scalar input into lane 0 of tmp
+ __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister); // lane 0 *= src1
+ __ movdl($dst$$Register, $tmp2$$XMMRegister); // result is lane 0
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct rvmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
+ predicate(UseAVX > 0);
+ match(Set dst (MulReductionVI src1 src2));
+ effect(TEMP tmp, TEMP tmp2);
+ format %{ "pshufd $tmp2,$src2,0x1\n\t"
+ "vpmulld $tmp,$src2,$tmp2\n\t"
+ "movd $tmp2,$src1\n\t"
+ "vpmulld $tmp2,$tmp,$tmp2\n\t"
+ "movd $dst,$tmp2\t! mul reduction2I" %}
+ ins_encode %{
+ __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1); // AVX path: dst = src1 * src2[0] * src2[1]; lane 0 of tmp2 = src2[1]
+ __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, false); // lane 0 = src2[0] * src2[1]
+ __ movdl($tmp2$$XMMRegister, $src1$$Register); // scalar input into lane 0 of tmp2
+ __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false); // lane 0 = product * src1
+ __ movdl($dst$$Register, $tmp2$$XMMRegister); // result is lane 0
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct rsmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
+ predicate(UseSSE > 3 && UseAVX == 0);
+ match(Set dst (MulReductionVI src1 src2));
+ effect(TEMP tmp, TEMP tmp2);
+ format %{ "pshufd $tmp2,$src2,0xE\n\t"
+ "pmulld $tmp2,$src2\n\t"
+ "pshufd $tmp,$tmp2,0x1\n\t"
+ "pmulld $tmp2,$tmp\n\t"
+ "movd $tmp,$src1\n\t"
+ "pmulld $tmp2,$tmp\n\t"
+ "movd $dst,$tmp2\t! mul reduction4I" %}
+ ins_encode %{
+ __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE); // non-AVX path: fold four ints by halving; lanes 0-1 of tmp2 = s2, s3
+ __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister); // lane 0 = s2 * s0, lane 1 = s3 * s1
+ __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x1); // lane 0 of tmp = lane 1 of tmp2
+ __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister); // lane 0 = s0 * s1 * s2 * s3
+ __ movdl($tmp$$XMMRegister, $src1$$Register); // scalar input into lane 0 of tmp
+ __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister); // lane 0 *= src1
+ __ movdl($dst$$Register, $tmp2$$XMMRegister); // result is lane 0
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct rvmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
+ predicate(UseAVX > 0);
+ match(Set dst (MulReductionVI src1 src2));
+ effect(TEMP tmp, TEMP tmp2);
+ format %{ "pshufd $tmp2,$src2,0xE\n\t"
+ "vpmulld $tmp,$src2,$tmp2\n\t"
+ "pshufd $tmp2,$tmp,0x1\n\t"
+ "vpmulld $tmp,$tmp,$tmp2\n\t"
+ "movd $tmp2,$src1\n\t"
+ "vpmulld $tmp2,$tmp,$tmp2\n\t"
+ "movd $dst,$tmp2\t! mul reduction4I" %}
+ ins_encode %{
+ __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE); // AVX path: fold four ints by halving; lanes 0-1 of tmp2 = s2, s3
+ __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, false); // lane 0 = s0 * s2, lane 1 = s1 * s3
+ __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1); // lane 0 of tmp2 = lane 1 of tmp
+ __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false); // lane 0 = s0 * s1 * s2 * s3
+ __ movdl($tmp2$$XMMRegister, $src1$$Register); // scalar input into lane 0 of tmp2
+ __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false); // lane 0 = product * src1
+ __ movdl($dst$$Register, $tmp2$$XMMRegister); // result is lane 0
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct rvmul8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
+ predicate(UseAVX > 0);
+ match(Set dst (MulReductionVI src1 src2));
+ effect(TEMP tmp, TEMP tmp2);
+ format %{ "vextractf128 $tmp,$src2\n\t"
+ "vpmulld $tmp,$tmp,$src2\n\t"
+ "pshufd $tmp2,$tmp,0xE\n\t"
+ "vpmulld $tmp,$tmp,$tmp2\n\t"
+ "pshufd $tmp2,$tmp,0x1\n\t"
+ "vpmulld $tmp,$tmp,$tmp2\n\t"
+ "movd $tmp2,$src1\n\t"
+ "vpmulld $tmp2,$tmp,$tmp2\n\t"
+ "movd $dst,$tmp2\t! mul reduction8I" %}
+ ins_encode %{
+ __ vextractf128h($tmp$$XMMRegister, $src2$$XMMRegister); // fold eight ints by halving; tmp = upper 128 bits of src2
+ __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, false); // 128-bit multiply: upper half times lower half, four partial products
+ __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE); // lanes 0-1 of tmp2 = lanes 2-3 of tmp
+ __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false); // two partial products left in lanes 0-1
+ __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1); // lane 0 of tmp2 = lane 1 of tmp
+ __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false); // lane 0 = product of all eight ints
+ __ movdl($tmp2$$XMMRegister, $src1$$Register); // scalar input into lane 0 of tmp2
+ __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false); // lane 0 = product * src1
+ __ movdl($dst$$Register, $tmp2$$XMMRegister); // result is lane 0
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct rsmul2F_reduction_reg(regF dst, regF src1, vecD src2, regF tmp, regF tmp2) %{
+ predicate(UseSSE >= 1 && UseAVX == 0);
+ match(Set dst (MulReductionVF src1 src2));
+ effect(TEMP tmp, TEMP tmp2);
+ format %{ "movdqu $tmp,$src1\n\t"
+ "mulss $tmp,$src2\n\t"
+ "pshufd $tmp2,$src2,0x01\n\t"
+ "mulss $tmp,$tmp2\n\t"
+ "movdqu $dst,$tmp\t! mul reduction2F" %}
+ ins_encode %{
+ __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister); // non-AVX path: dst = (src1 * src2[0]) * src2[1], accumulated in tmp
+ __ mulss($tmp$$XMMRegister, $src2$$XMMRegister); // tmp = src1 * src2[0]
+ __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x01); // bring src2[1] down to lane 0
+ __ mulss($tmp$$XMMRegister, $tmp2$$XMMRegister); // tmp *= src2[1]
+ __ movdqu($dst$$XMMRegister, $tmp$$XMMRegister); // publish the accumulated scalar
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct rvmul2F_reduction_reg(regF dst, regF src1, vecD src2, regF tmp, regF tmp2) %{
+ predicate(UseAVX > 0);
+ match(Set dst (MulReductionVF src1 src2));
+ effect(TEMP tmp, TEMP tmp2);
+ format %{ "vmulss $tmp2,$src1,$src2\n\t"
+ "pshufd $tmp,$src2,0x01\n\t"
+ "vmulss $dst,$tmp2,$tmp\t! mul reduction2F" %}
+ ins_encode %{
+ __ vmulss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister); // AVX path: tmp2 = src1 * src2[0]
+ __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01); // bring src2[1] down to lane 0
+ __ vmulss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); // dst = (src1 * src2[0]) * src2[1]
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct rsmul4F_reduction_reg(regF dst, regF src1, vecX src2, regF tmp, regF tmp2) %{
+ predicate(UseSSE >= 1 && UseAVX == 0);
+ match(Set dst (MulReductionVF src1 src2));
+ effect(TEMP tmp, TEMP tmp2);
+ format %{ "movdqu $tmp,$src1\n\t"
+ "mulss $tmp,$src2\n\t"
+ "pshufd $tmp2,$src2,0x01\n\t"
+ "mulss $tmp,$tmp2\n\t"
+ "pshufd $tmp2,$src2,0x02\n\t"
+ "mulss $tmp,$tmp2\n\t"
+ "pshufd $tmp2,$src2,0x03\n\t"
+ "mulss $tmp,$tmp2\n\t"
+ "movdqu $dst,$tmp\t! mul reduction4F" %}
+ ins_encode %{
+ __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister); // non-AVX path: multiply src1 by the four floats of src2, one element at a time in index order
+ __ mulss($tmp$$XMMRegister, $src2$$XMMRegister); // tmp = src1 * src2[0]
+ __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x01); // lane 0 = src2[1]
+ __ mulss($tmp$$XMMRegister, $tmp2$$XMMRegister);
+ __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x02); // lane 0 = src2[2]
+ __ mulss($tmp$$XMMRegister, $tmp2$$XMMRegister);
+ __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x03); // lane 0 = src2[3]
+ __ mulss($tmp$$XMMRegister, $tmp2$$XMMRegister);
+ __ movdqu($dst$$XMMRegister, $tmp$$XMMRegister); // publish the accumulated scalar
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct rvmul4F_reduction_reg(regF dst, regF src1, vecX src2, regF tmp, regF tmp2) %{
+ predicate(UseAVX > 0);
+ match(Set dst (MulReductionVF src1 src2));
+ effect(TEMP tmp, TEMP tmp2);
+ format %{ "vmulss $tmp2,$src1,$src2\n\t"
+ "pshufd $tmp,$src2,0x01\n\t"
+ "vmulss $tmp2,$tmp2,$tmp\n\t"
+ "pshufd $tmp,$src2,0x02\n\t"
+ "vmulss $tmp2,$tmp2,$tmp\n\t"
+ "pshufd $tmp,$src2,0x03\n\t"
+ "vmulss $dst,$tmp2,$tmp\t! mul reduction4F" %}
+ ins_encode %{
+ __ vmulss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister); // AVX path: tmp2 accumulates, starting with src1 * src2[0]
+ __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01); // lane 0 = src2[1]
+ __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+ __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02); // lane 0 = src2[2]
+ __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+ __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03); // lane 0 = src2[3]
+ __ vmulss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); // last multiply writes the result straight into dst
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct rvmul8F_reduction_reg(regF dst, regF src1, vecY src2, regF tmp, regF tmp2, regF tmp3) %{
+ predicate(UseAVX > 0);
+ match(Set dst (MulReductionVF src1 src2));
+ effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
+ format %{ "vmulss $tmp2,$src1,$src2\n\t"
+ "pshufd $tmp,$src2,0x01\n\t"
+ "vmulss $tmp2,$tmp2,$tmp\n\t"
+ "pshufd $tmp,$src2,0x02\n\t"
+ "vmulss $tmp2,$tmp2,$tmp\n\t"
+ "pshufd $tmp,$src2,0x03\n\t"
+ "vmulss $tmp2,$tmp2,$tmp\n\t"
+ "vextractf128 $tmp3,$src2\n\t"
+ "vmulss $tmp2,$tmp2,$tmp3\n\t"
+ "pshufd $tmp,$tmp3,0x01\n\t"
+ "vmulss $tmp2,$tmp2,$tmp\n\t"
+ "pshufd $tmp,$tmp3,0x02\n\t"
+ "vmulss $tmp2,$tmp2,$tmp\n\t"
+ "pshufd $tmp,$tmp3,0x03\n\t"
+ "vmulss $dst,$tmp2,$tmp\t! mul reduction8F" %}
+ ins_encode %{
+ __ vmulss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister); // tmp2 accumulates src1 times the eight floats of src2 in index order; starts with src1 * src2[0]
+ __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01); // lane 0 = src2[1]
+ __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+ __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02); // lane 0 = src2[2]
+ __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+ __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03); // lane 0 = src2[3]
+ __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+ __ vextractf128h($tmp3$$XMMRegister, $src2$$XMMRegister); // tmp3 = upper 128 bits of src2 (elements 4..7)
+ __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister); // multiply by element 4
+ __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x01); // lane 0 = element 5
+ __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+ __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x02); // lane 0 = element 6
+ __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+ __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x03); // lane 0 = element 7
+ __ vmulss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); // last multiply writes the result straight into dst
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct rsmul2D_reduction_reg(regD dst, regD src1, vecX src2, regD tmp) %{
+ predicate(UseSSE >= 1 && UseAVX == 0);
+ match(Set dst (MulReductionVD src1 src2));
+ effect(TEMP tmp, TEMP dst);
+ format %{ "movdqu $tmp,$src1\n\t"
+ "mulsd $tmp,$src2\n\t"
+ "pshufd $dst,$src2,0xE\n\t"
+ "mulsd $dst,$tmp\t! mul reduction2D" %}
+ ins_encode %{
+ __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister); // non-AVX path: dst = src2[1] * (src1 * src2[0])
+ __ mulsd($tmp$$XMMRegister, $src2$$XMMRegister); // tmp = src1 * src2[0]
+ __ pshufd($dst$$XMMRegister, $src2$$XMMRegister, 0xE); // 0xE moves the upper 64 bits (src2[1]) into the low 64 bits of dst
+ __ mulsd($dst$$XMMRegister, $tmp$$XMMRegister); // dst = src2[1] * tmp
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct rvmul2D_reduction_reg(regD dst, regD src1, vecX src2, regD tmp, regD tmp2) %{
+ predicate(UseAVX > 0);
+ match(Set dst (MulReductionVD src1 src2));
+ effect(TEMP tmp, TEMP tmp2);
+ format %{ "vmulsd $tmp2,$src1,$src2\n\t"
+ "pshufd $tmp,$src2,0xE\n\t"
+ "vmulsd $dst,$tmp2,$tmp\t! mul reduction2D" %}
+ ins_encode %{
+ __ vmulsd($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister); // AVX path: tmp2 = src1 * src2[0]
+ __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE); // 0xE moves the upper 64 bits (src2[1]) into the low 64 bits of tmp
+ __ vmulsd($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); // dst = (src1 * src2[0]) * src2[1]
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct rvmul4D_reduction_reg(regD dst, regD src1, vecY src2, regD tmp, regD tmp2, regD tmp3) %{
+ predicate(UseAVX > 0);
+ match(Set dst (MulReductionVD src1 src2));
+ effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
+ format %{ "vmulsd $tmp2,$src1,$src2\n\t"
+ "pshufd $tmp,$src2,0xE\n\t"
+ "vmulsd $tmp2,$tmp2,$tmp\n\t"
+ "vextractf128 $tmp3,$src2\n\t"
+ "vmulsd $tmp2,$tmp2,$tmp3\n\t"
+ "pshufd $tmp,$tmp3,0xE\n\t"
+ "vmulsd $dst,$tmp2,$tmp\t! mul reduction4D" %}
+ ins_encode %{
+ __ vmulsd($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister); // tmp2 accumulates src1 times the four doubles of src2 in index order; starts with src1 * src2[0]
+ __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE); // low 64 bits = src2[1]
+ __ vmulsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+ __ vextractf128h($tmp3$$XMMRegister, $src2$$XMMRegister); // tmp3 = upper 128 bits of src2 (elements 2..3)
+ __ vmulsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister); // multiply by element 2
+ __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0xE); // low 64 bits = element 3
+ __ vmulsd($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister); // last multiply writes the result straight into dst
+ %}
+ ins_pipe( pipe_slow );
+%}
+
// ====================VECTOR ARITHMETIC=======================================
// --------------------------------- ADD --------------------------------------
--- a/hotspot/src/share/vm/adlc/formssel.cpp Fri Mar 27 13:47:33 2015 +0100
+++ b/hotspot/src/share/vm/adlc/formssel.cpp Wed Apr 01 18:07:50 2015 -0700
@@ -4043,6 +4043,13 @@
strcmp(opType,"ReplicateL")==0 ||
strcmp(opType,"ReplicateF")==0 ||
strcmp(opType,"ReplicateD")==0 ||
+ strcmp(opType,"AddReductionVI")==0 ||
+ strcmp(opType,"AddReductionVL")==0 ||
+ strcmp(opType,"AddReductionVF")==0 ||
+ strcmp(opType,"AddReductionVD")==0 ||
+ strcmp(opType,"MulReductionVI")==0 ||
+ strcmp(opType,"MulReductionVF")==0 ||
+ strcmp(opType,"MulReductionVD")==0 ||
0 /* 0 to line up columns nicely */ )
return 1;
}
@@ -4135,6 +4142,10 @@
"MulVS","MulVI","MulVF","MulVD",
"DivVF","DivVD",
"AndV" ,"XorV" ,"OrV",
+ "AddReductionVI", "AddReductionVL",
+ "AddReductionVF", "AddReductionVD",
+ "MulReductionVI",
+ "MulReductionVF", "MulReductionVD",
"LShiftCntV","RShiftCntV",
"LShiftVB","LShiftVS","LShiftVI","LShiftVL",
"RShiftVB","RShiftVS","RShiftVI","RShiftVL",
--- a/hotspot/src/share/vm/opto/c2_globals.hpp Fri Mar 27 13:47:33 2015 +0100
+++ b/hotspot/src/share/vm/opto/c2_globals.hpp Wed Apr 01 18:07:50 2015 -0700
@@ -324,6 +324,9 @@
develop(bool, SuperWordRTDepCheck, false, \
"Enable runtime dependency checks.") \
\
+ product(bool, SuperWordReductions, true, \
+ "Enable reductions support in superword.") \
+ \
notproduct(bool, TraceSuperWord, false, \
"Trace superword transforms") \
\
--- a/hotspot/src/share/vm/opto/classes.hpp Fri Mar 27 13:47:33 2015 +0100
+++ b/hotspot/src/share/vm/opto/classes.hpp Wed Apr 01 18:07:50 2015 -0700
@@ -266,9 +266,13 @@
macro(AddVB)
macro(AddVS)
macro(AddVI)
+macro(AddReductionVI)
macro(AddVL)
+macro(AddReductionVL)
macro(AddVF)
+macro(AddReductionVF)
macro(AddVD)
+macro(AddReductionVD)
macro(SubVB)
macro(SubVS)
macro(SubVI)
@@ -277,8 +281,11 @@
macro(SubVD)
macro(MulVS)
macro(MulVI)
+macro(MulReductionVI)
macro(MulVF)
+macro(MulReductionVF)
macro(MulVD)
+macro(MulReductionVD)
macro(DivVF)
macro(DivVD)
macro(LShiftCntV)
--- a/hotspot/src/share/vm/opto/compile.cpp Fri Mar 27 13:47:33 2015 +0100
+++ b/hotspot/src/share/vm/opto/compile.cpp Wed Apr 01 18:07:50 2015 -0700
@@ -3049,6 +3049,15 @@
case Op_StoreVector:
break;
+ case Op_AddReductionVI:
+ case Op_AddReductionVL:
+ case Op_AddReductionVF:
+ case Op_AddReductionVD:
+ case Op_MulReductionVI:
+ case Op_MulReductionVF:
+ case Op_MulReductionVD:
+ break;
+
case Op_PackB:
case Op_PackS:
case Op_PackI:
--- a/hotspot/src/share/vm/opto/loopTransform.cpp Fri Mar 27 13:47:33 2015 +0100
+++ b/hotspot/src/share/vm/opto/loopTransform.cpp Wed Apr 01 18:07:50 2015 -0700
@@ -38,6 +38,7 @@
#include "opto/rootnode.hpp"
#include "opto/runtime.hpp"
#include "opto/subnode.hpp"
+#include "opto/vectornode.hpp"
//------------------------------is_loop_exit-----------------------------------
// Given an IfNode, return the loop-exiting projection or NULL if both
@@ -1524,6 +1525,44 @@
}
}
+void PhaseIdealLoop::mark_reductions(IdealLoopTree *loop) {
+  // Flags loop-carried arithmetic nodes as reduction candidates (Node::Flag_is_reduction).
+  if (!SuperWordReductions) return;
+
+  CountedLoopNode* cl = loop->_head->as_CountedLoop();
+  if (cl->unrolled_count() > 1) return;  // skip loops whose unrolled_count() already exceeds 1
+
+  Node* const iv_phi = cl->phi();
+  for (DUIterator_Fast imax, i = cl->fast_outs(imax); i < imax; i++) {
+    Node* phi = cl->fast_out(i);
+    // Only phis that have users and are not the trip-count phi are of interest.
+    if (!phi->is_Phi() || phi->outcnt() == 0 || phi == iv_phi) continue;
+
+    // The value fed back over the loop backedge is the candidate definition.
+    Node* def = phi->in(LoopNode::LoopBackControl);
+    if (def == NULL) continue;
+
+    // Its control must belong to this loop.
+    Node* def_ctrl = get_ctrl(def);
+    if (def_ctrl == NULL || !loop->is_member(get_loop(def_ctrl))) continue;
+
+    // Proceed only if ReductionNode::opcode() maps the opcode to a different one
+    // for this basic type, and the node has not been marked already.
+    int opc = def->Opcode();
+    if (opc == ReductionNode::opcode(opc, def->bottom_type()->basic_type())) continue;
+    if (def->is_reduction()) continue;
+
+    // A reduction consumes the phi and defines its backedge value: scan the inputs
+    // for the phi and mark on the first hit.
+    for (unsigned j = 1; j < def->req(); j++) {
+      if (def->in(j) == phi) {
+        def->add_flag(Node::Flag_is_reduction);
+        break;
+      }
+    }
+  }
+}
+
//------------------------------dominates_backedge---------------------------------
// Returns true if ctrl is executed on every complete iteration
bool IdealLoopTree::dominates_backedge(Node* ctrl) {
@@ -2361,8 +2400,10 @@
// an even number of trips). If we are peeling, we might enable some RCE
// and we'd rather unroll the post-RCE'd loop SO... do not unroll if
// peeling.
- if (should_unroll && !should_peel)
- phase->do_unroll(this,old_new, true);
+ if (should_unroll && !should_peel) {
+ phase->mark_reductions(this);
+ phase->do_unroll(this, old_new, true);
+ }
// Adjust the pre-loop limits to align the main body
// iterations.
--- a/hotspot/src/share/vm/opto/loopnode.hpp Fri Mar 27 13:47:33 2015 +0100
+++ b/hotspot/src/share/vm/opto/loopnode.hpp Wed Apr 01 18:07:50 2015 -0700
@@ -872,6 +872,9 @@
// Unroll the loop body one step - make each trip do 2 iterations.
void do_unroll( IdealLoopTree *loop, Node_List &old_new, bool adjust_min_trip );
+ // Mark vector reduction candidates before loop unrolling
+ void mark_reductions( IdealLoopTree *loop );
+
// Return true if exp is a constant times an induction var
bool is_scaled_iv(Node* exp, Node* iv, int* p_scale);
--- a/hotspot/src/share/vm/opto/node.hpp Fri Mar 27 13:47:33 2015 +0100
+++ b/hotspot/src/share/vm/opto/node.hpp Wed Apr 01 18:07:50 2015 -0700
@@ -673,7 +673,8 @@
Flag_avoid_back_to_back_before = Flag_may_be_short_branch << 1,
Flag_avoid_back_to_back_after = Flag_avoid_back_to_back_before << 1,
Flag_has_call = Flag_avoid_back_to_back_after << 1,
- Flag_is_expensive = Flag_has_call << 1,
+ Flag_is_reduction = Flag_has_call << 1,
+ Flag_is_expensive = Flag_is_reduction << 1,
_max_flags = (Flag_is_expensive << 1) - 1 // allow flags combination
};
@@ -701,6 +702,10 @@
const jushort flags() const { return _flags; }
+ void add_flag(jushort fl) { init_flags(fl); } // Public wrapper: forwards fl to init_flags()
+
+ void remove_flag(jushort fl) { clear_flag(fl); } // Public wrapper: forwards fl to clear_flag()
+
// Return a dense integer opcode number
virtual int Opcode() const;
@@ -852,6 +857,10 @@
// The node is expensive: the best control is set during loop opts
bool is_expensive() const { return (_flags & Flag_is_expensive) != 0 && in(0) != NULL; }
+ // An arithmetic node which accumulates data in a loop: it must have the loop's
+ // phi as input and provide a def to the phi. True iff Flag_is_reduction is set.
+ bool is_reduction() const { return (_flags & Flag_is_reduction) != 0; }
+
//----------------- Optimization
// Get the worst-case Type output for this Node.
--- a/hotspot/src/share/vm/opto/superword.cpp Fri Mar 27 13:47:33 2015 +0100
+++ b/hotspot/src/share/vm/opto/superword.cpp Wed Apr 01 18:07:50 2015 -0700
@@ -65,7 +65,8 @@
_lpt(NULL), // loop tree node
_lp(NULL), // LoopNode
_bb(NULL), // basic block
- _iv(NULL) // induction var
+ _iv(NULL), // induction var
+ _race_possible(false) // cases where SDMU is true
{}
//------------------------------transform_loop---------------------------
@@ -145,7 +146,6 @@
void SuperWord::SLP_extract() {
// Ready the block
-
if (!construct_bb())
return; // Exit if no interesting nodes or complex graph.
@@ -640,7 +640,7 @@
}
if (isomorphic(s1, s2)) {
- if (independent(s1, s2)) {
+ if (independent(s1, s2) || reduction(s1, s2)) {
if (!exists_at(s1, 0) && !exists_at(s2, 1)) {
if (!s1->is_Mem() || are_adjacent_refs(s1, s2)) {
int s1_align = alignment(s1);
@@ -718,6 +718,28 @@
return independent_path(shallow, deep);
}
+//------------------------------reduction---------------------------
+// Is there a data path between s1 and s2 and both are reductions?
+// True only when depth(s2) is exactly depth(s1) + 1, both nodes carry the
+// reduction flag, and s2 appears among the direct uses of s1.
+bool SuperWord::reduction(Node* s1, Node* s2) {
+  // The pair is ordered: s1 is expected to be the definition feeding s2.
+  if (depth(s1) + 1 != depth(s2)) {
+    return false;
+  }
+  if (!s1->is_reduction() || !s2->is_reduction()) {
+    return false;
+  }
+  for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
+    if (s1->fast_out(i) == s2) {
+      // Both nodes are reductions and directly connected; no need to
+      // look at the remaining uses.
+      return true;
+    }
+  }
+  return false;
+}
+
//------------------------------independent_path------------------------------
// Helper for independent
bool SuperWord::independent_path(Node* shallow, Node* deep, uint dp) {
@@ -761,6 +783,7 @@
void SuperWord::extend_packlist() {
bool changed;
do {
+ packset_sort(_packset.length());
changed = false;
for (int i = 0; i < _packset.length(); i++) {
Node_List* p = _packset.at(i);
@@ -769,6 +792,13 @@
}
} while (changed);
+ if (_race_possible) {
+ for (int i = 0; i < _packset.length(); i++) {
+ Node_List* p = _packset.at(i);
+ order_def_uses(p);
+ }
+ }
+
#ifndef PRODUCT
if (TraceSuperWord) {
tty->print_cr("\nAfter extend_packlist");
@@ -825,10 +855,12 @@
int align = alignment(s1);
int savings = -1;
+ int num_s1_uses = 0;
Node* u1 = NULL;
Node* u2 = NULL;
for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
Node* t1 = s1->fast_out(i);
+ num_s1_uses++;
if (!in_bb(t1)) continue;
for (DUIterator_Fast jmax, j = s2->fast_outs(jmax); j < jmax; j++) {
Node* t2 = s2->fast_out(j);
@@ -845,6 +877,9 @@
}
}
}
+ if (num_s1_uses > 1) {
+ _race_possible = true;
+ }
if (savings >= 0) {
Node_List* pair = new Node_List();
pair->push(u1);
@@ -856,9 +891,64 @@
return changed;
}
+//------------------------------order_def_uses---------------------------
+// For extended packsets, ordinally arrange uses packset by major component
+void SuperWord::order_def_uses(Node_List* p) {
+ Node* s1 = p->at(0);
+ // Nothing is reordered for a pack whose first node is a store.
+ if (s1->is_Store()) return;
+
+ // reductions are always managed beforehand
+ if (s1->is_reduction()) return;
+
+ for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
+ Node* t1 = s1->fast_out(i);
+
+ // Only allow operand swap on commuting operations
+ if (!t1->is_Add() && !t1->is_Mul()) {
+ break; // ends the scan of all remaining uses of s1, not only this one
+ }
+
+ // Now find t1's packset
+ Node_List* p2 = NULL;
+ for (int j = 0; j < _packset.length(); j++) {
+ p2 = _packset.at(j);
+ Node* first = p2->at(0);
+ if (t1 == first) {
+ break; // p2 is the pack whose first element is t1
+ }
+ p2 = NULL; // reset so that an unsuccessful search leaves p2 == NULL
+ }
+ // Arrange all sub components by the major component
+ if (p2 != NULL) {
+ for (uint j = 1; j < p->size(); j++) {
+ Node* d1 = p->at(j);
+ Node* u1 = p2->at(j); // NOTE(review): assumes p2 holds at least p->size() nodes - confirm
+ opnd_positions_match(s1, t1, d1, u1); // result ignored; presumably called for its edge swaps - confirm
+ }
+ }
+ }
+}
+
//---------------------------opnd_positions_match-------------------------
// Is the use of d1 in u1 at the same operand position as d2 in u2?
bool SuperWord::opnd_positions_match(Node* d1, Node* u1, Node* d2, Node* u2) {
+ // check reductions to see if they are marshalled to represent the reduction
+ // operator in a specified opnd
+ if (u1->is_reduction() && u2->is_reduction()) {
+ // ensure reductions have phis and reduction definitions feeding the 1st operand
+ Node* first = u1->in(2);
+ if (first->is_Phi() || first->is_reduction()) {
+ u1->swap_edges(1, 2);
+ }
+ // ensure reductions have phis and reduction definitions feeding the 1st operand
+ first = u2->in(2);
+ if (first->is_Phi() || first->is_reduction()) {
+ u2->swap_edges(1, 2);
+ }
+ return true;
+ }
+
uint ct = u1->req();
if (ct != u2->req()) return false;
uint i1 = 0;
@@ -940,7 +1030,8 @@
for (int i = 0; i < _packset.length(); i++) {
Node_List* p1 = _packset.at(i);
if (p1 == NULL) continue;
- for (int j = 0; j < _packset.length(); j++) {
+ // Because of sorting we can start at i + 1
+ for (int j = i + 1; j < _packset.length(); j++) {
Node_List* p2 = _packset.at(j);
if (p2 == NULL) continue;
if (i == j) continue;
@@ -1067,8 +1158,19 @@
//------------------------------implemented---------------------------
// Can code be generated for pack p?
bool SuperWord::implemented(Node_List* p) {
+ bool retValue = false;
Node* p0 = p->at(0);
- return VectorNode::implemented(p0->Opcode(), p->size(), velt_basic_type(p0));
+ if (p0 != NULL) {
+ int opc = p0->Opcode();
+ uint size = p->size();
+ if (p0->is_reduction()) {
+ const Type *arith_type = p0->bottom_type();
+ retValue = ReductionNode::implemented(opc, size, arith_type->basic_type());
+ } else {
+ retValue = VectorNode::implemented(opc, size, velt_basic_type(p0));
+ }
+ }
+ return retValue;
}
//------------------------------same_inputs--------------------------
@@ -1102,6 +1204,18 @@
if (!is_vector_use(p0, i))
return false;
}
+ // Check if reductions are connected
+ if (p0->is_reduction()) {
+ Node* second_in = p0->in(2);
+ Node_List* second_pk = my_pack(second_in);
+ if (second_pk == NULL) {
+ // Remove reduction flag if no parent pack, it is not profitable
+ p0->remove_flag(Node::Flag_is_reduction);
+ return false;
+ } else if (second_pk->size() != p->size()) {
+ return false;
+ }
+ }
if (VectorNode::is_shift(p0)) {
// For now, return false if shift count is vector or not scalar promotion
// case (different shift counts) because it is not supported yet.
@@ -1123,6 +1237,9 @@
for (uint k = 0; k < use->req(); k++) {
Node* n = use->in(k);
if (def == n) {
+ // reductions can be loop carried dependences
+ if (def->is_reduction() && use->is_Phi())
+ continue;
if (!is_vector_use(use, k)) {
return false;
}
@@ -1407,16 +1524,33 @@
vlen_in_bytes = vn->as_StoreVector()->memory_size();
} else if (n->req() == 3) {
// Promote operands to vector
- Node* in1 = vector_opd(p, 1);
+ Node* in1 = NULL;
+ bool node_isa_reduction = n->is_reduction();
+ if (node_isa_reduction) {
+ // the input to the first reduction operation is retained
+ in1 = low_adr->in(1);
+ } else {
+ in1 = vector_opd(p, 1);
+ }
Node* in2 = vector_opd(p, 2);
- if (VectorNode::is_invariant_vector(in1) && (n->is_Add() || n->is_Mul())) {
+ if (VectorNode::is_invariant_vector(in1) && (node_isa_reduction == false) && (n->is_Add() || n->is_Mul())) {
// Move invariant vector input into second position to avoid register spilling.
Node* tmp = in1;
in1 = in2;
in2 = tmp;
}
- vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
- vlen_in_bytes = vn->as_Vector()->length_in_bytes();
+ if (node_isa_reduction) {
+ const Type *arith_type = n->bottom_type();
+ vn = ReductionNode::make(opc, NULL, in1, in2, arith_type->basic_type());
+ if (in2->is_Load()) {
+ vlen_in_bytes = in2->as_LoadVector()->memory_size();
+ } else {
+ vlen_in_bytes = in2->as_Vector()->length_in_bytes();
+ }
+ } else {
+ vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
+ vlen_in_bytes = vn->as_Vector()->length_in_bytes();
+ }
} else {
ShouldNotReachHere();
}
@@ -1556,6 +1690,8 @@
_n_idx_list.pop();
Node* def = use->in(idx);
+ if (def->is_reduction()) continue;
+
// Insert extract operation
_igvn.hash_delete(def);
int def_pos = alignment(def) / data_size(def);
@@ -1576,6 +1712,7 @@
bool SuperWord::is_vector_use(Node* use, int u_idx) {
Node_List* u_pk = my_pack(use);
if (u_pk == NULL) return false;
+ if (use->is_reduction()) return true;
Node* def = use->in(u_idx);
Node_List* d_pk = my_pack(def);
if (d_pk == NULL) {
@@ -1613,7 +1750,7 @@
// by the visited and post_visited sets,
// and count number of nodes in block.
int bb_ct = 0;
- for (uint i = 0; i < lpt()->_body.size(); i++ ) {
+ for (uint i = 0; i < lpt()->_body.size(); i++) {
Node *n = lpt()->_body.at(i);
set_bb_idx(n, i); // Create a temporary map
if (in_bb(n)) {
@@ -1674,6 +1811,7 @@
// Do a depth first walk over out edges
int rpo_idx = bb_ct - 1;
int size;
+ int reduction_uses = 0;
while ((size = _stk.length()) > 0) {
Node* n = _stk.top(); // Leave node on stack
if (!visited_test_set(n)) {
@@ -1685,6 +1823,14 @@
if (in_bb(use) && !visited_test(use) &&
// Don't go around backedge
(!use->is_Phi() || n == entry)) {
+ if (use->is_reduction()) {
+ // First see if we can map the reduction on the given system we are on, then
+ // make a data entry operation for each reduction we see.
+ BasicType bt = use->bottom_type()->basic_type();
+ if (ReductionNode::implemented(use->Opcode(), Matcher::min_vector_size(bt), bt)) {
+ reduction_uses++;
+ }
+ }
_stk.push(use);
}
}
@@ -1708,7 +1854,8 @@
set_bb_idx(n, j);
}
- initialize_bb(); // Ensure extra info is allocated.
+ // Ensure extra info is allocated.
+ initialize_bb();
#ifndef PRODUCT
if (TraceSuperWord) {
@@ -1726,7 +1873,7 @@
}
#endif
assert(rpo_idx == -1 && bb_ct == _block.length(), "all block members found");
- return (_mem_slice_head.length() > 0) || (_data_entry.length() > 0);
+ return (_mem_slice_head.length() > 0) || (reduction_uses > 0) || (_data_entry.length() > 0);
}
//------------------------------initialize_bb---------------------------
@@ -1959,6 +2106,27 @@
_packset.remove_at(pos);
}
+void SuperWord::packset_sort(int n) {
+  // Bubble sort of the first n packs, ascending by the alignment of each pack's
+  // first node; a pass with no swaps ends it, so sorted input costs one O(n) pass.
+  while (n != 0) {
+    bool swapped = false;
+    for (int i = 1; i < n; i++) {
+      Node_List* q_low = _packset.at(i-1);
+      Node_List* q_i = _packset.at(i);
+      // only swap when we find something to swap
+      if (alignment(q_low->at(0)) > alignment(q_i->at(0))) {
+        *(_packset.adr_at(i)) = q_low;
+        *(_packset.adr_at(i-1)) = q_i;
+        swapped = true;
+      }
+    }
+    if (!swapped) break;
+    // The pack with the largest alignment has bubbled up to index n-1.
+    n--;
+  }
+}
+
//------------------------------executed_first---------------------------
// Return the node executed first in pack p. Uses the RPO block list
// to determine order.
--- a/hotspot/src/share/vm/opto/superword.hpp Fri Mar 27 13:47:33 2015 +0100
+++ b/hotspot/src/share/vm/opto/superword.hpp Wed Apr 01 18:07:50 2015 -0700
@@ -249,6 +249,7 @@
LoopNode* _lp; // Current LoopNode
Node* _bb; // Current basic block
PhiNode* _iv; // Induction var
+ bool _race_possible; // In cases where SDMU is true
// Accessors
Arena* arena() { return _arena; }
@@ -337,6 +338,8 @@
bool isomorphic(Node* s1, Node* s2);
// Is there no data path from s1 to s2 or s2 to s1?
bool independent(Node* s1, Node* s2);
+ // Is there a data path between s1 and s2 and both are reductions?
+ bool reduction(Node* s1, Node* s2);
// Helper for independent
bool independent_path(Node* shallow, Node* deep, uint dp=0);
void set_alignment(Node* s1, Node* s2, int align);
@@ -347,6 +350,8 @@
bool follow_use_defs(Node_List* p);
// Extend the packset by visiting uses of nodes in pack p
bool follow_def_uses(Node_List* p);
+ // For extended packsets, ordinally arrange uses packset by major component
+ void order_def_uses(Node_List* p);
// Estimate the savings from executing s1 and s2 as a pack
int est_savings(Node* s1, Node* s2);
int adjacent_profit(Node* s1, Node* s2);
@@ -419,9 +424,12 @@
void print_bb();
void print_stmt(Node* s);
char* blank(uint depth);
+
+ void packset_sort(int n);
};
+
//------------------------------SWPointer---------------------------
// Information about an address for dependence checking and vector alignment
class SWPointer VALUE_OBJ_CLASS_SPEC {
--- a/hotspot/src/share/vm/opto/vectornode.cpp Fri Mar 27 13:47:33 2015 +0100
+++ b/hotspot/src/share/vm/opto/vectornode.cpp Wed Apr 01 18:07:50 2015 -0700
@@ -250,7 +250,6 @@
int vopc = VectorNode::opcode(opc, bt);
// This method should not be called for unimplemented vectors.
guarantee(vopc > 0, err_msg_res("Vector for '%s' is not implemented", NodeClassNames[opc]));
-
switch (vopc) {
case Op_AddVB: return new AddVBNode(n1, n2, vt);
case Op_AddVS: return new AddVSNode(n1, n2, vt);
@@ -441,3 +440,72 @@
return NULL;
}
+int ReductionNode::opcode(int opc, BasicType bt) {
+  // Maps a scalar Add/Mul opcode to its reduction counterpart. An opcode with
+  // no reduction form is returned unchanged, so "result == opc" means "none".
+  // In debug builds bt is asserted to match the element type of the opcode.
+
+  switch (opc) {
+    // Additive reductions
+    case Op_AddI:
+      assert(bt == T_INT, "must be");
+      return Op_AddReductionVI;
+    case Op_AddL:
+      assert(bt == T_LONG, "must be");
+      return Op_AddReductionVL;
+    case Op_AddF:
+      assert(bt == T_FLOAT, "must be");
+      return Op_AddReductionVF;
+    case Op_AddD:
+      assert(bt == T_DOUBLE, "must be");
+      return Op_AddReductionVD;
+
+    // Multiplicative reductions
+    case Op_MulI:
+      assert(bt == T_INT, "must be");
+      return Op_MulReductionVI;
+    case Op_MulF:
+      assert(bt == T_FLOAT, "must be");
+      return Op_MulReductionVF;
+    case Op_MulD:
+      assert(bt == T_DOUBLE, "must be");
+      return Op_MulReductionVD;
+
+    // TODO: add MulL for targets that support it
+    default:
+      // No reduction form for this opcode.
+      return opc;
+  }
+}
+
+// Return the appropriate reduction node.
+ReductionNode* ReductionNode::make(int opc, Node *ctrl, Node* n1, Node* n2, BasicType bt) {
+  // Resolve the reduction opcode first; opcode() hands back opc itself when
+  // the operation has no reduction form, which is a caller error here.
+  const int red_opc = opcode(opc, bt);
+  guarantee(red_opc != opc, err_msg_res("Vector for '%s' is not implemented", NodeClassNames[opc]));
+
+  switch (red_opc) {
+    case Op_AddReductionVI: return new AddReductionVINode(ctrl, n1, n2);
+    case Op_AddReductionVL: return new AddReductionVLNode(ctrl, n1, n2);
+    case Op_AddReductionVF: return new AddReductionVFNode(ctrl, n1, n2);
+    case Op_AddReductionVD: return new AddReductionVDNode(ctrl, n1, n2);
+    case Op_MulReductionVI: return new MulReductionVINode(ctrl, n1, n2);
+    case Op_MulReductionVF: return new MulReductionVFNode(ctrl, n1, n2);
+    case Op_MulReductionVD: return new MulReductionVDNode(ctrl, n1, n2);
+    default: break;
+  }
+  fatal(err_msg_res("Missed vector creation for '%s'", NodeClassNames[red_opc]));
+  return NULL;
+}
+
+bool ReductionNode::implemented(int opc, uint vlen, BasicType bt) {
+  // Require a primitive element type and a supported power-of-two length > 1.
+  if (!is_java_primitive(bt)) return false;
+  if (vlen <= 1 || !is_power_of_2(vlen)) return false;
+  if (!Matcher::vector_size_supported(bt, vlen)) return false;
+  // A reduction exists only if opcode() maps opc to something different.
+  const int red_opc = ReductionNode::opcode(opc, bt);
+  return red_opc != opc && Matcher::match_rule_supported(red_opc);
+}
+
--- a/hotspot/src/share/vm/opto/vectornode.hpp Fri Mar 27 13:47:33 2015 +0100
+++ b/hotspot/src/share/vm/opto/vectornode.hpp Wed Apr 01 18:07:50 2015 -0700
@@ -90,6 +90,37 @@
virtual int Opcode() const;
};
+//------------------------------ReductionNode------------------------------------
+// Perform reduction of a vector. Common base class of the Add/Mul reduction nodes.
+class ReductionNode : public Node {
+ public:
+ ReductionNode(Node *ctrl, Node* in1, Node* in2) : Node(ctrl, in1, in2) {}
+ // Static helpers below are keyed by the scalar opcode (e.g. Op_AddI) being reduced.
+ static ReductionNode* make(int opc, Node *ctrl, Node* in1, Node* in2, BasicType bt); // build the node for opc
+ static int opcode(int opc, BasicType bt); // reduction opcode for opc, or opc itself if there is none
+ static bool implemented(int opc, uint vlen, BasicType bt); // can a vlen-element reduction of bt be matched?
+};
+
+//------------------------------AddReductionVINode--------------------------------------
+// Vector add int as a reduction; the node's own value is a scalar int.
+class AddReductionVINode : public ReductionNode {
+public:
+ AddReductionVINode(Node * ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
+ virtual int Opcode() const;
+ virtual const Type* bottom_type() const { return TypeInt::INT; } // result type: int
+ virtual uint ideal_reg() const { return Op_RegI; } // result register class: int
+};
+
+//------------------------------AddReductionVLNode--------------------------------------
+// Vector add long as a reduction; the node's own value is a scalar long.
+class AddReductionVLNode : public ReductionNode {
+public:
+ AddReductionVLNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
+ virtual int Opcode() const;
+ virtual const Type* bottom_type() const { return TypeLong::LONG; } // result type: long
+ virtual uint ideal_reg() const { return Op_RegL; } // result register class: long
+};
+
//------------------------------AddVLNode--------------------------------------
// Vector add long
class AddVLNode : public VectorNode {
@@ -106,6 +137,16 @@
virtual int Opcode() const;
};
+//------------------------------AddReductionVFNode--------------------------------------
+// Vector add float as a reduction; the node's own value is a scalar float.
+class AddReductionVFNode : public ReductionNode {
+public:
+ AddReductionVFNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
+ virtual int Opcode() const;
+ virtual const Type* bottom_type() const { return Type::FLOAT; } // result type: float
+ virtual uint ideal_reg() const { return Op_RegF; } // result register class: float
+};
+
//------------------------------AddVDNode--------------------------------------
// Vector add double
class AddVDNode : public VectorNode {
@@ -114,6 +155,16 @@
virtual int Opcode() const;
};
+//------------------------------AddReductionVDNode--------------------------------------
+// Vector add double as a reduction; the node's own value is a scalar double.
+class AddReductionVDNode : public ReductionNode {
+public:
+ AddReductionVDNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
+ virtual int Opcode() const;
+ virtual const Type* bottom_type() const { return Type::DOUBLE; } // result type: double
+ virtual uint ideal_reg() const { return Op_RegD; } // result register class: double
+};
+
//------------------------------SubVBNode--------------------------------------
// Vector subtract byte
class SubVBNode : public VectorNode {
@@ -178,6 +229,16 @@
virtual int Opcode() const;
};
+//------------------------------MulReductionVINode--------------------------------------
+// Vector multiply int as a reduction; the node's own value is a scalar int.
+class MulReductionVINode : public ReductionNode {
+public:
+ MulReductionVINode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
+ virtual int Opcode() const;
+ virtual const Type* bottom_type() const { return TypeInt::INT; } // result type: int
+ virtual uint ideal_reg() const { return Op_RegI; } // result register class: int
+};
+
//------------------------------MulVFNode--------------------------------------
// Vector multiply float
class MulVFNode : public VectorNode {
@@ -186,6 +247,16 @@
virtual int Opcode() const;
};
+//------------------------------MulReductionVFNode--------------------------------------
+// Vector multiply float as a reduction; the node's own value is a scalar float.
+class MulReductionVFNode : public ReductionNode {
+public:
+ MulReductionVFNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
+ virtual int Opcode() const;
+ virtual const Type* bottom_type() const { return Type::FLOAT; } // result type: float
+ virtual uint ideal_reg() const { return Op_RegF; } // result register class: float
+};
+
//------------------------------MulVDNode--------------------------------------
// Vector multiply double
class MulVDNode : public VectorNode {
@@ -194,6 +265,16 @@
virtual int Opcode() const;
};
+//------------------------------MulReductionVDNode--------------------------------------
+// Vector multiply double as a reduction; the node's own value is a scalar double.
+class MulReductionVDNode : public ReductionNode {
+public:
+ MulReductionVDNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
+ virtual int Opcode() const;
+ virtual const Type* bottom_type() const { return Type::DOUBLE; } // result type: double
+ virtual uint ideal_reg() const { return Op_RegD; } // result register class: double
+};
+
//------------------------------DivVFNode--------------------------------------
// Vector divide float
class DivVFNode : public VectorNode {
--- a/hotspot/src/share/vm/runtime/vmStructs.cpp Fri Mar 27 13:47:33 2015 +0100
+++ b/hotspot/src/share/vm/runtime/vmStructs.cpp Wed Apr 01 18:07:50 2015 -0700
@@ -1982,13 +1982,18 @@
declare_c2_type(PowDNode, Node) \
declare_c2_type(ReverseBytesINode, Node) \
declare_c2_type(ReverseBytesLNode, Node) \
+ declare_c2_type(ReductionNode, Node) \
declare_c2_type(VectorNode, Node) \
declare_c2_type(AddVBNode, VectorNode) \
declare_c2_type(AddVSNode, VectorNode) \
declare_c2_type(AddVINode, VectorNode) \
+ declare_c2_type(AddReductionVINode, ReductionNode) \
declare_c2_type(AddVLNode, VectorNode) \
+ declare_c2_type(AddReductionVLNode, ReductionNode) \
declare_c2_type(AddVFNode, VectorNode) \
+ declare_c2_type(AddReductionVFNode, ReductionNode) \
declare_c2_type(AddVDNode, VectorNode) \
+ declare_c2_type(AddReductionVDNode, ReductionNode) \
declare_c2_type(SubVBNode, VectorNode) \
declare_c2_type(SubVSNode, VectorNode) \
declare_c2_type(SubVINode, VectorNode) \
@@ -1997,8 +2002,11 @@
declare_c2_type(SubVDNode, VectorNode) \
declare_c2_type(MulVSNode, VectorNode) \
declare_c2_type(MulVINode, VectorNode) \
+ declare_c2_type(MulReductionVINode, ReductionNode) \
declare_c2_type(MulVFNode, VectorNode) \
+ declare_c2_type(MulReductionVFNode, ReductionNode) \
declare_c2_type(MulVDNode, VectorNode) \
+ declare_c2_type(MulReductionVDNode, ReductionNode) \
declare_c2_type(DivVFNode, VectorNode) \
declare_c2_type(DivVDNode, VectorNode) \
declare_c2_type(LShiftVBNode, VectorNode) \
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/test/compiler/loopopts/superword/ProdRed_Double.java Wed Apr 01 18:07:50 2015 -0700
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 8074981
+ * @summary Add C2 x86 Superword support for scalar product reduction optimizations : double test
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 -XX:CompileThresholdScaling=0.1 ProdRed_Double
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 -XX:CompileThresholdScaling=0.1 ProdRed_Double
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 ProdRed_Double
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 ProdRed_Double
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 ProdRed_Double
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 ProdRed_Double
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 -XX:CompileThresholdScaling=0.1 ProdRed_Double
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 -XX:CompileThresholdScaling=0.1 ProdRed_Double
+ */
+
+public class ProdRed_Double
+{
+  public static void main(String[] args) throws Exception {
+    double[] a = new double[256*1024];
+    double[] b = new double[256*1024];
+    prodReductionInit(a,b);
+    double valid = 2000;
+    double total = 0;
+    // Call the kernel repeatedly (presumably to get it JIT-compiled); only the last result, seeded with 2000, is checked.
+    for(int j = 0; j < 2000; j++) {
+      total = j + 1;
+      total = prodReductionImplement(a,b, total);
+    }
+    if(total == valid) {
+      System.out.println("Success");
+    } else {
+      System.out.println("Invalid product of elements variable in total: " + total);
+      System.out.println("Expected value = " + valid);
+      throw new Exception("Failed");
+    }
+  }
+
+  // Fills the arrays so that a[i] - b[i] == 1 for every index.
+  public static void prodReductionInit(double[] a, double[] b) {
+    for(int i = 0; i < a.length; i++)
+    {
+      a[i] = i + 2;
+      b[i] = i + 1;
+    }
+  }
+
+  // Product reduction under test: multiplies total by every a[i] - b[i] and returns it.
+  public static double prodReductionImplement(double[] a, double[] b, double total) {
+    for(int i = 0; i < a.length; i++)
+    {
+      total *= a[i] - b[i];
+    }
+    return total;
+  }
+}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/test/compiler/loopopts/superword/ProdRed_Float.java Wed Apr 01 18:07:50 2015 -0700
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 8074981
+ * @summary Add C2 x86 Superword support for scalar product reduction optimizations : float test
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 -XX:CompileThresholdScaling=0.1 ProdRed_Float
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 -XX:CompileThresholdScaling=0.1 ProdRed_Float
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 ProdRed_Float
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 ProdRed_Float
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 ProdRed_Float
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 ProdRed_Float
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 -XX:CompileThresholdScaling=0.1 ProdRed_Float
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 -XX:CompileThresholdScaling=0.1 ProdRed_Float
+ */
+
+public class ProdRed_Float
+{
+  public static void main(String[] args) throws Exception {
+    float[] a = new float[256*1024];
+    float[] b = new float[256*1024];
+    prodReductionInit(a,b);
+    float valid = 2000;
+    float total = 0;
+    // Call the kernel repeatedly (presumably to get it JIT-compiled); only the last result, seeded with 2000, is checked.
+    for(int j = 0; j < 2000; j++) {
+      total = j + 1;
+      total = prodReductionImplement(a,b, total);
+    }
+    if(total == valid) {
+      System.out.println("Success");
+    } else {
+      System.out.println("Invalid product of elements variable in total: " + total);
+      System.out.println("Expected value = " + valid);
+      throw new Exception("Failed");
+    }
+  }
+
+  // Fills the arrays so that a[i] - b[i] == 1 for every index.
+  public static void prodReductionInit(float[] a, float[] b) {
+    for(int i = 0; i < a.length; i++)
+    {
+      a[i] = i + 2;
+      b[i] = i + 1;
+    }
+  }
+
+  // Product reduction under test: multiplies total by every a[i] - b[i] and returns it.
+  public static float prodReductionImplement(float[] a, float[] b, float total) {
+    for(int i = 0; i < a.length; i++)
+    {
+      total *= a[i] - b[i];
+    }
+    return total;
+  }
+}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/test/compiler/loopopts/superword/ProdRed_Int.java Wed Apr 01 18:07:50 2015 -0700
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 8074981
+ * @summary Add C2 x86 Superword support for scalar product reduction optimizations : int test
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 -XX:CompileThresholdScaling=0.1 ProdRed_Int
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 -XX:CompileThresholdScaling=0.1 ProdRed_Int
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 ProdRed_Int
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 ProdRed_Int
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 ProdRed_Int
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 ProdRed_Int
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 -XX:CompileThresholdScaling=0.1 ProdRed_Int
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 -XX:CompileThresholdScaling=0.1 ProdRed_Int
+ */
+
+public class ProdRed_Int // Self-checking driver: runs an int product-reduction loop and compares the result with a fixed value.
+{
+ public static void main(String[] args) throws Exception { // prints "Success", or reports the mismatch and throws Exception("Failed")
+ int[] a = new int[256*1024];
+ int[] b = new int[256*1024];
+ prodReductionInit(a,b); // a[i] = i + 2, b[i] = i + 1
+ int valid = 419430401; // expected value after the 2000 chained passes below (int multiplication wraps on overflow)
+ int total = 1; // multiplicative identity as the initial seed
+ for(int j = 0; j < 2000; j++) { // total is carried from one pass into the next, never reset
+ total = prodReductionImplement(a,b,total);
+ }
+ if(total == valid) {
+ System.out.println("Success");
+ } else {
+ System.out.println("Invalid product of elements variable in total: " + total); // message names the product this test computes
+ System.out.println("Expected value = " + valid);
+ throw new Exception("Failed");
+ }
+ }
+
+ public static void prodReductionInit(int[] a, int[] b) // fills both arrays over a.length elements
+ {
+ for(int i = 0; i < a.length; i++) // b is indexed with a's range, so b must be at least as long as a
+ {
+ a[i] = i + 2;
+ b[i] = i + 1;
+ }
+ }
+
+ public static int prodReductionImplement(int[] a, int[] b, int total) // returns total multiplied by every (a[i] + b[i])
+ {
+ for(int i = 0; i < a.length; i++) // loop bound comes from a only
+ {
+ total *= a[i] + b[i]; // the product-reduction step
+ }
+ return total;
+ }
+
+}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/test/compiler/loopopts/superword/SumRed_Double.java Wed Apr 01 18:07:50 2015 -0700
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 8074981
+ * @summary Add C2 x86 Superword support for scalar sum reduction optimizations : double test
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 -XX:CompileThresholdScaling=0.1 SumRed_Double
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 -XX:CompileThresholdScaling=0.1 SumRed_Double
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 SumRed_Double
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 SumRed_Double
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 SumRed_Double
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 SumRed_Double
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 -XX:CompileThresholdScaling=0.1 SumRed_Double
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 -XX:CompileThresholdScaling=0.1 SumRed_Double
+ */
+
+public class SumRed_Double // Self-checking driver: runs a double sum-reduction loop and compares the result with a fixed value.
+{
+ public static void main(String[] args) throws Exception { // prints "Success", or reports the mismatch and throws Exception("Failed")
+ double[] a = new double[256*1024];
+ double[] b = new double[256*1024];
+ double[] c = new double[256*1024];
+ double[] d = new double[256*1024]; // written by sumReductionImplement; not initialized here
+ sumReductionInit(a,b,c);
+ double total = 0;
+ double valid = 3.6028590866691944E19; // expected accumulated total after the 2000 passes below
+ for(int j = 0; j < 2000; j++) { // total is carried from one pass into the next, never reset
+ total = sumReductionImplement(a,b,c,d,total);
+ }
+ if(total == valid) { // exact floating-point comparison: any difference in the result fails the test
+ System.out.println("Success");
+ } else {
+ System.out.println("Invalid sum of elements variable in total: " + total);
+ System.out.println("Expected value = " + valid);
+ throw new Exception("Failed");
+ }
+ }
+
+ public static void sumReductionInit( // fills a, b and c over a.length elements
+ double[] a,
+ double[] b,
+ double[] c)
+ {
+ for(int j = 0; j < 1; j++) // runs exactly once, with j == 0
+ {
+ for(int i = 0; i < a.length; i++)
+ {
+ a[i] = i * 1 + j; // with j == 0 this stores i
+ b[i] = i * 1 - j; // with j == 0 this stores i
+ c[i] = i + j; // with j == 0 this stores i
+ }
+ }
+ }
+
+ public static double sumReductionImplement( // stores a per-element value into d and returns total plus the sum of those values
+ double[] a,
+ double[] b,
+ double[] c,
+ double[] d,
+ double total)
+ {
+ for(int i = 0; i < a.length; i++) // loop bound comes from a only; b, c and d must be at least as long
+ {
+ d[i]= (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]); // sum of the three pairwise products
+ total += d[i]; // the sum-reduction step
+ }
+ return total;
+ }
+
+}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/test/compiler/loopopts/superword/SumRed_Float.java Wed Apr 01 18:07:50 2015 -0700
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 8074981
+ * @summary Add C2 x86 Superword support for scalar sum reduction optimizations : float test
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 -XX:CompileThresholdScaling=0.1 SumRed_Float
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 -XX:CompileThresholdScaling=0.1 SumRed_Float
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 SumRed_Float
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 SumRed_Float
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 SumRed_Float
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 SumRed_Float
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 -XX:CompileThresholdScaling=0.1 SumRed_Float
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 -XX:CompileThresholdScaling=0.1 SumRed_Float
+ */
+
+public class SumRed_Float // Self-checking driver: runs a float sum-reduction loop and compares the result with a fixed value.
+{
+ public static void main(String[] args) throws Exception { // prints "Success", or reports the mismatch and throws Exception("Failed")
+ float[] a = new float[256*1024];
+ float[] b = new float[256*1024];
+ float[] c = new float[256*1024];
+ float[] d = new float[256*1024]; // written by sumReductionImplement; not initialized here
+ sumReductionInit(a,b,c);
+ float total = 0;
+ float valid = (float)4.611686E18; // expected accumulated total after the 2000 passes below; double literal narrowed to float
+ for(int j = 0; j < 2000; j++) { // total is carried from one pass into the next, never reset
+ total = sumReductionImplement(a,b,c,d,total);
+ }
+ if(total == valid) { // exact floating-point comparison: any difference in the result fails the test
+ System.out.println("Success");
+ } else {
+ System.out.println("Invalid sum of elements variable in total: " + total);
+ System.out.println("Expected value = " + valid);
+ throw new Exception("Failed");
+ }
+ }
+
+ public static void sumReductionInit( // fills a, b and c over a.length elements
+ float[] a,
+ float[] b,
+ float[] c)
+ {
+ for(int j = 0; j < 1; j++) // runs exactly once, with j == 0
+ {
+ for(int i = 0; i < a.length; i++)
+ {
+ a[i] = i * 1 + j; // with j == 0 this stores i (converted to float)
+ b[i] = i * 1 - j; // with j == 0 this stores i (converted to float)
+ c[i] = i + j; // with j == 0 this stores i (converted to float)
+ }
+ }
+ }
+
+ public static float sumReductionImplement( // stores a per-element value into d and returns total plus the sum of those values
+ float[] a,
+ float[] b,
+ float[] c,
+ float[] d,
+ float total)
+ {
+ for(int i = 0; i < a.length; i++) // loop bound comes from a only; b, c and d must be at least as long
+ {
+ d[i]= (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]); // sum of the three pairwise products
+ total += d[i]; // the sum-reduction step
+ }
+ return total;
+ }
+
+}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/test/compiler/loopopts/superword/SumRed_Int.java Wed Apr 01 18:07:50 2015 -0700
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 8074981
+ * @summary Add C2 x86 Superword support for scalar sum reduction optimizations : int test
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 -XX:CompileThresholdScaling=0.1 SumRed_Int
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 -XX:CompileThresholdScaling=0.1 SumRed_Int
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 SumRed_Int
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 SumRed_Int
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 SumRed_Int
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 SumRed_Int
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 -XX:CompileThresholdScaling=0.1 SumRed_Int
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 -XX:CompileThresholdScaling=0.1 SumRed_Int
+ */
+
+public class SumRed_Int // Self-checking driver: runs an int sum-reduction loop and compares the result with a fixed value.
+{
+ public static void main(String[] args) throws Exception { // prints "Success", or reports the mismatch and throws Exception("Failed")
+ int[] a = new int[256*1024];
+ int[] b = new int[256*1024];
+ int[] c = new int[256*1024];
+ int[] d = new int[256*1024]; // written by sumReductionImplement; not initialized here
+ sumReductionInit(a,b,c);
+ int total = 0;
+ int valid = 262144000; // expected accumulated total after the 2000 passes below (int arithmetic wraps on overflow)
+ for(int j = 0; j < 2000; j++) { // total is carried from one pass into the next, never reset
+ total = sumReductionImplement(a,b,c,d,total);
+ }
+ if(total == valid) {
+ System.out.println("Success");
+ } else {
+ System.out.println("Invalid sum of elements variable in total: " + total);
+ System.out.println("Expected value = " + valid);
+ throw new Exception("Failed");
+ }
+ }
+
+ public static void sumReductionInit( // fills a, b and c over a.length elements
+ int[] a,
+ int[] b,
+ int[] c)
+ {
+ for(int j = 0; j < 1; j++) // runs exactly once, with j == 0
+ {
+ for(int i = 0; i < a.length; i++)
+ {
+ a[i] = i * 1 + j; // with j == 0 this stores i
+ b[i] = i * 1 - j; // with j == 0 this stores i
+ c[i] = i + j; // with j == 0 this stores i
+ }
+ }
+ }
+
+ public static int sumReductionImplement( // stores a per-element value into d and returns total plus the sum of those values
+ int[] a,
+ int[] b,
+ int[] c,
+ int[] d,
+ int total)
+ {
+ for(int i = 0; i < a.length; i++) // loop bound comes from a only; b, c and d must be at least as long
+ {
+ d[i]= (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]); // sum of the three pairwise products
+ total += d[i]; // the sum-reduction step
+ }
+ return total;
+ }
+
+}