8052081: Optimize C2-generated code for Intel's Atom processor
Summary: Allow vectorization and CRC32 optimizations to run on Atom. Enable UseFPUForSpilling by default on Intel (family 6) x86 CPUs that support SSE4.2.
Reviewed-by: roland
--- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp Tue Aug 05 08:19:03 2014 -0700
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp Tue Aug 05 15:02:10 2014 -0700
@@ -3854,6 +3854,15 @@
}
// Carry-Less Multiplication Quadword
+void Assembler::pclmulqdq(XMMRegister dst, XMMRegister src, int mask) {
+ assert(VM_Version::supports_clmul(), "");
+ int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A); // dst is passed as both destination and first source
+ emit_int8(0x44); // PCLMULQDQ opcode byte
+ emit_int8((unsigned char)(0xC0 | encode)); // 0xC0 sets ModRM mod=11: register-to-register form
+ emit_int8((unsigned char)mask); // imm8: selects which quadword of each operand is multiplied
+}
+
+// Carry-Less Multiplication Quadword
void Assembler::vpclmulqdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int mask) {
assert(VM_Version::supports_avx() && VM_Version::supports_clmul(), "");
bool vector256 = false;
--- a/hotspot/src/cpu/x86/vm/assembler_x86.hpp Tue Aug 05 08:19:03 2014 -0700
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.hpp Tue Aug 05 15:02:10 2014 -0700
@@ -1837,6 +1837,7 @@
void vpbroadcastd(XMMRegister dst, XMMRegister src);
// Carry-Less Multiplication Quadword
+ void pclmulqdq(XMMRegister dst, XMMRegister src, int mask);
void vpclmulqdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int mask);
// AVX instruction which is used to clear upper 128 bits of YMM registers and
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp Tue Aug 05 08:19:03 2014 -0700
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp Tue Aug 05 15:02:10 2014 -0700
@@ -7316,17 +7316,34 @@
* Fold 128-bit data chunk
*/
void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
- vpclmulhdq(xtmp, xK, xcrc); // [123:64]
- vpclmulldq(xcrc, xK, xcrc); // [63:0]
- vpxor(xcrc, xcrc, Address(buf, offset), false /* vector256 */);
- pxor(xcrc, xtmp);
+ if (UseAVX > 0) {
+ vpclmulhdq(xtmp, xK, xcrc); // [127:64]
+ vpclmulldq(xcrc, xK, xcrc); // [63:0]
+ vpxor(xcrc, xcrc, Address(buf, offset), false /* vector256 */);
+ pxor(xcrc, xtmp);
+ } else {
+ movdqa(xtmp, xcrc); // two-operand pclmul overwrites its first operand, so work on a copy of xcrc
+ pclmulhdq(xtmp, xK); // [127:64]
+ pclmulldq(xcrc, xK); // [63:0]
+ pxor(xcrc, xtmp);
+ movdqu(xtmp, Address(buf, offset)); // xtmp is free again: reuse it to load the data at buf+offset
+ pxor(xcrc, xtmp);
+ }
}
void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) {
- vpclmulhdq(xtmp, xK, xcrc);
- vpclmulldq(xcrc, xK, xcrc);
- pxor(xcrc, xbuf);
- pxor(xcrc, xtmp);
+ if (UseAVX > 0) {
+ vpclmulhdq(xtmp, xK, xcrc);
+ vpclmulldq(xcrc, xK, xcrc);
+ pxor(xcrc, xbuf);
+ pxor(xcrc, xtmp);
+ } else {
+ movdqa(xtmp, xcrc); // two-operand pclmul overwrites its first operand, so work on a copy of xcrc
+ pclmulhdq(xtmp, xK); // upper quadwords, same product as vpclmulhdq above
+ pclmulldq(xcrc, xK); // lower quadwords, same product as vpclmulldq above
+ pxor(xcrc, xbuf);
+ pxor(xcrc, xtmp);
+ }
}
/**
@@ -7444,9 +7461,17 @@
// Fold 128 bits in xmm1 down into 32 bits in crc register.
BIND(L_fold_128b);
movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()));
- vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
- vpand(xmm3, xmm0, xmm2, false /* vector256 */);
- vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
+ if (UseAVX > 0) {
+ vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
+ vpand(xmm3, xmm0, xmm2, false /* vector256 */);
+ vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
+ } else {
+ movdqa(xmm2, xmm0);
+ pclmulqdq(xmm2, xmm1, 0x1);
+ movdqa(xmm3, xmm0);
+ pand(xmm3, xmm2);
+ pclmulqdq(xmm0, xmm3, 0x1);
+ }
psrldq(xmm1, 8);
psrldq(xmm2, 4);
pxor(xmm0, xmm1);
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp Tue Aug 05 08:19:03 2014 -0700
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp Tue Aug 05 15:02:10 2014 -0700
@@ -966,6 +966,16 @@
void mulss(XMMRegister dst, Address src) { Assembler::mulss(dst, src); }
void mulss(XMMRegister dst, AddressLiteral src);
+ // Carry-Less Multiplication Quadword, two-operand form: dst is both the first source and the destination
+ void pclmulldq(XMMRegister dst, XMMRegister src) {
+ // 0x00 - multiply lower 64 bits [0:63] of dst and src
+ Assembler::pclmulqdq(dst, src, 0x00);
+ }
+ void pclmulhdq(XMMRegister dst, XMMRegister src) {
+ // 0x11 - multiply upper 64 bits [64:127] of dst and src
+ Assembler::pclmulqdq(dst, src, 0x11);
+ }
+
void sqrtsd(XMMRegister dst, XMMRegister src) { Assembler::sqrtsd(dst, src); }
void sqrtsd(XMMRegister dst, Address src) { Assembler::sqrtsd(dst, src); }
void sqrtsd(XMMRegister dst, AddressLiteral src);
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.cpp Tue Aug 05 08:19:03 2014 -0700
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.cpp Tue Aug 05 15:02:10 2014 -0700
@@ -559,7 +559,7 @@
FLAG_SET_DEFAULT(UseCLMUL, false);
}
- if (UseCLMUL && (UseAVX > 0) && (UseSSE > 2)) {
+ if (UseCLMUL && (UseSSE > 2)) {
if (FLAG_IS_DEFAULT(UseCRC32Intrinsics)) {
UseCRC32Intrinsics = true;
}
@@ -805,6 +805,21 @@
}
}
}
+ if ((cpu_family() == 0x06) &&
+ ((extended_cpu_model() == 0x36) || // Centerton
+ (extended_cpu_model() == 0x37) || // Silvermont
+ (extended_cpu_model() == 0x4D))) {
+#ifdef COMPILER2
+ if (FLAG_IS_DEFAULT(OptoScheduling)) {
+ OptoScheduling = true;
+ }
+#endif
+ if (supports_sse4_2()) { // Silvermont
+ if (FLAG_IS_DEFAULT(UseUnalignedLoadStores)) {
+ UseUnalignedLoadStores = true; // use movdqu on newest Intel cpus
+ }
+ }
+ }
}
// Use count leading zeros count instruction if available.
@@ -892,23 +907,25 @@
AllocatePrefetchDistance = allocate_prefetch_distance();
AllocatePrefetchStyle = allocate_prefetch_style();
- if( is_intel() && cpu_family() == 6 && supports_sse3() ) {
- if( AllocatePrefetchStyle == 2 ) { // watermark prefetching on Core
+ if (is_intel() && cpu_family() == 6 && supports_sse3()) {
+ if (AllocatePrefetchStyle == 2) { // watermark prefetching on Core
#ifdef _LP64
AllocatePrefetchDistance = 384;
#else
AllocatePrefetchDistance = 320;
#endif
}
- if( supports_sse4_2() && supports_ht() ) { // Nehalem based cpus
+ if (supports_sse4_2() && supports_ht()) { // Nehalem based cpus
AllocatePrefetchDistance = 192;
AllocatePrefetchLines = 4;
+ }
#ifdef COMPILER2
- if (AggressiveOpts && FLAG_IS_DEFAULT(UseFPUForSpilling)) {
+ if (supports_sse4_2()) {
+ if (FLAG_IS_DEFAULT(UseFPUForSpilling)) {
FLAG_SET_DEFAULT(UseFPUForSpilling, true);
}
+ }
#endif
- }
}
assert(AllocatePrefetchDistance % AllocatePrefetchStepSize == 0, "invalid value");
--- a/hotspot/src/share/vm/opto/lcm.cpp Tue Aug 05 08:19:03 2014 -0700
+++ b/hotspot/src/share/vm/opto/lcm.cpp Tue Aug 05 15:02:10 2014 -0700
@@ -464,7 +464,9 @@
iop == Op_CreateEx || // Create-exception must start block
iop == Op_CheckCastPP
) {
- worklist.map(i,worklist.pop());
+ // select the node n
+ // remove n from worklist and retain the order of remaining nodes
+ worklist.remove((uint)i);
return n;
}
@@ -550,7 +552,9 @@
assert(idx >= 0, "index should be set");
Node *n = worklist[(uint)idx]; // Get the winner
- worklist.map((uint)idx, worklist.pop()); // Compress worklist
+ // select the node n
+ // remove n from worklist and retain the order of remaining nodes
+ worklist.remove((uint)idx);
return n;
}
--- a/hotspot/src/share/vm/opto/superword.cpp Tue Aug 05 08:19:03 2014 -0700
+++ b/hotspot/src/share/vm/opto/superword.cpp Tue Aug 05 15:02:10 2014 -0700
@@ -1378,6 +1378,20 @@
if (n->is_Load()) {
Node* ctl = n->in(MemNode::Control);
Node* mem = first->in(MemNode::Memory);
+ SWPointer p1(n->as_Mem(), this);
+ // Identify the memory dependency for the new loadVector node by
+ // walking up through memory chain.
+ // This is done to give flexibility to the new loadVector node so that
+ // it can move above independent storeVector nodes.
+ while (mem->is_StoreVector()) {
+ SWPointer p2(mem->as_Mem(), this);
+ int cmp = p1.cmp(p2);
+ if (SWPointer::not_equal(cmp) || !SWPointer::comparable(cmp)) {
+ mem = mem->in(MemNode::Memory);
+ } else {
+ break; // dependent memory
+ }
+ }
Node* adr = low_adr->in(MemNode::Address);
const TypePtr* atyp = n->adr_type();
vn = LoadVectorNode::make(opc, ctl, mem, adr, atyp, vlen, velt_basic_type(n));