6940701: Don't align loops in stubs for Niagara sparc
Summary: Don't align loops in stubs for Niagara SPARC, where the padding NOPs are expensive: replace the hard-coded align(16) before stub loops with the OptoLoopAlignment flag, which defaults to 4 on Niagara.
Reviewed-by: twisti, never
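
For context before the patch itself, a minimal sketch of the padding behavior behind __ align(modulus) that this change tunes. This is an illustration under assumptions, not HotSpot's AbstractAssembler: ToyAssembler and its offset/emit_nop members are invented for the example. The idea is that the assembler pads the code buffer with NOPs until the current offset is a multiple of the requested modulus, so on SPARC (fixed 4-byte instructions) align(16) can emit up to three padding NOPs ahead of a loop, while align(OptoLoopAlignment) with the Niagara default of 4 (set in vm_version_sparc.cpp below) emits none.

#include <cstdio>
#include <vector>

struct ToyAssembler {
  std::vector<unsigned> code;  // one 4-byte SPARC instruction word per entry
  int offset() const { return 4 * (int)code.size(); }
  void emit_nop() { code.push_back(0x01000000); }  // SPARC nop (sethi 0, %g0)

  // Pad with NOPs until the code offset is a multiple of modulus --
  // a sketch of what __ align(modulus) asks the assembler to do.
  void align(int modulus) {
    while (offset() % modulus != 0) emit_nop();
  }
};

int main() {
  ToyAssembler a;
  a.emit_nop();  // pretend one instruction precedes the loop
  a.align(16);   // old stub code: pads with 3 NOPs here
  std::printf("align(16): offset %d, %zu words emitted\n", a.offset(), a.code.size());

  ToyAssembler b;
  b.emit_nop();
  b.align(4);    // Niagara default for OptoLoopAlignment: no padding
  std::printf("align(4):  offset %d, %zu words emitted\n", b.offset(), b.code.size());
  return 0;
}

The stub loops below (arraycopy and friends) are hot, and NOPs are not free on Niagara's simple in-order cores, so the patch threads the platform flag through instead of the hard-coded 16; x86 keeps OptoLoopAlignment at 16 (globals_x86.hpp below), so its stubs keep their current alignment.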
--- a/hotspot/src/cpu/sparc/vm/c1_LIRAssembler_sparc.cpp Tue Apr 06 15:18:10 2010 -0700
+++ b/hotspot/src/cpu/sparc/vm/c1_LIRAssembler_sparc.cpp Wed Apr 07 09:37:47 2010 -0700
@@ -2849,7 +2849,7 @@
void LIR_Assembler::align_backward_branch_target() {
- __ align(16);
+ __ align(OptoLoopAlignment);
}
--- a/hotspot/src/cpu/sparc/vm/c2_globals_sparc.hpp Tue Apr 06 15:18:10 2010 -0700
+++ b/hotspot/src/cpu/sparc/vm/c2_globals_sparc.hpp Wed Apr 07 09:37:47 2010 -0700
@@ -60,9 +60,6 @@
define_pd_global(intx, INTPRESSURE, 48); // large register set
define_pd_global(intx, InteriorEntryAlignment, 16); // = CodeEntryAlignment
define_pd_global(intx, NewSizeThreadIncrease, ScaleForWordSize(4*K));
-// The default setting 16/16 seems to work best.
-// (For _228_jack 16/16 is 2% better than 4/4, 16/4, 32/32, 32/16, or 16/32.)
-define_pd_global(intx, OptoLoopAlignment, 16); // = 4*wordSize
define_pd_global(intx, RegisterCostAreaRatio, 12000);
define_pd_global(bool, UseTLAB, true);
define_pd_global(bool, ResizeTLAB, true);
--- a/hotspot/src/cpu/sparc/vm/globals_sparc.hpp Tue Apr 06 15:18:10 2010 -0700
+++ b/hotspot/src/cpu/sparc/vm/globals_sparc.hpp Wed Apr 07 09:37:47 2010 -0700
@@ -40,6 +40,9 @@
define_pd_global(bool, UncommonNullCast, true); // Uncommon-trap NULLs passed to check cast
define_pd_global(intx, CodeEntryAlignment, 32);
+// The default setting 16/16 seems to work best.
+// (For _228_jack 16/16 is 2% better than 4/4, 16/4, 32/32, 32/16, or 16/32.)
+define_pd_global(intx, OptoLoopAlignment, 16); // = 4*wordSize
define_pd_global(intx, InlineFrequencyCount, 50); // we can use more inlining on the SPARC
define_pd_global(intx, InlineSmallCode, 1500);
#ifdef _LP64
--- a/hotspot/src/cpu/sparc/vm/stubGenerator_sparc.cpp Tue Apr 06 15:18:10 2010 -0700
+++ b/hotspot/src/cpu/sparc/vm/stubGenerator_sparc.cpp Wed Apr 07 09:37:47 2010 -0700
@@ -1148,7 +1148,7 @@
__ andn(from, 7, from); // Align address
__ ldx(from, 0, O3);
__ inc(from, 8);
- __ align(16);
+ __ align(OptoLoopAlignment);
__ BIND(L_loop);
__ ldx(from, 0, O4);
__ deccc(count, count_dec); // Can we do next iteration after this one?
@@ -1220,7 +1220,7 @@
//
__ andn(end_from, 7, end_from); // Align address
__ ldx(end_from, 0, O3);
- __ align(16);
+ __ align(OptoLoopAlignment);
__ BIND(L_loop);
__ ldx(end_from, -8, O4);
__ deccc(count, count_dec); // Can we do next iteration after this one?
@@ -1349,7 +1349,7 @@
__ BIND(L_copy_byte);
__ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
__ delayed()->nop();
- __ align(16);
+ __ align(OptoLoopAlignment);
__ BIND(L_copy_byte_loop);
__ ldub(from, offset, O3);
__ deccc(count);
@@ -1445,7 +1445,7 @@
L_aligned_copy, L_copy_byte);
}
// copy 4 elements (16 bytes) at a time
- __ align(16);
+ __ align(OptoLoopAlignment);
__ BIND(L_aligned_copy);
__ dec(end_from, 16);
__ ldx(end_from, 8, O3);
@@ -1461,7 +1461,7 @@
__ BIND(L_copy_byte);
__ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
__ delayed()->nop();
- __ align(16);
+ __ align(OptoLoopAlignment);
__ BIND(L_copy_byte_loop);
__ dec(end_from);
__ dec(end_to);
@@ -1577,7 +1577,7 @@
__ BIND(L_copy_2_bytes);
__ br_zero(Assembler::zero, false, Assembler::pt, count, L_exit);
__ delayed()->nop();
- __ align(16);
+ __ align(OptoLoopAlignment);
__ BIND(L_copy_2_bytes_loop);
__ lduh(from, offset, O3);
__ deccc(count);
@@ -1684,7 +1684,7 @@
L_aligned_copy, L_copy_2_bytes);
}
// copy 4 elements (16 bytes) at a time
- __ align(16);
+ __ align(OptoLoopAlignment);
__ BIND(L_aligned_copy);
__ dec(end_from, 16);
__ ldx(end_from, 8, O3);
@@ -1781,7 +1781,7 @@
// copy with shift 4 elements (16 bytes) at a time
__ dec(count, 4); // The cmp at the beginning guarantees count >= 4
- __ align(16);
+ __ align(OptoLoopAlignment);
__ BIND(L_copy_16_bytes);
__ ldx(from, 4, O4);
__ deccc(count, 4); // Can we do next iteration after this one?
@@ -1907,7 +1907,7 @@
// to form 2 aligned 8-byte chunks to store.
//
__ ldx(end_from, -4, O3);
- __ align(16);
+ __ align(OptoLoopAlignment);
__ BIND(L_copy_16_bytes);
__ ldx(end_from, -12, O4);
__ deccc(count, 4);
@@ -1929,7 +1929,7 @@
__ delayed()->inc(count, 4);
// copy 4 elements (16 bytes) at a time
- __ align(16);
+ __ align(OptoLoopAlignment);
__ BIND(L_aligned_copy);
__ dec(end_from, 16);
__ ldx(end_from, 8, O3);
@@ -2045,7 +2045,7 @@
__ mov(O3, count);
__ mov(from, from64);
- __ align(16);
+ __ align(OptoLoopAlignment);
__ BIND(L_copy_64_bytes);
for( int off = 0; off < 64; off += 16 ) {
__ ldx(from64, off+0, O4);
@@ -2065,7 +2065,7 @@
__ delayed()->add(offset0, 8, offset8);
// Copy in 16-byte chunks
- __ align(16);
+ __ align(OptoLoopAlignment);
__ BIND(L_copy_16_bytes);
__ ldx(from, offset0, O3);
__ ldx(from, offset8, G3);
@@ -2139,7 +2139,7 @@
__ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_8_bytes );
__ delayed()->sllx(count, LogBytesPerLong, offset8);
__ sub(offset8, 8, offset0);
- __ align(16);
+ __ align(OptoLoopAlignment);
__ BIND(L_copy_16_bytes);
__ ldx(from, offset8, O2);
__ ldx(from, offset0, O3);
@@ -2405,7 +2405,7 @@
// (O5 = 0; ; O5 += wordSize) --- offset from src, dest arrays
// (O2 = len; O2 != 0; O2--) --- number of oops *remaining*
// G3, G4, G5 --- current oop, oop.klass, oop.klass.super
- __ align(16);
+ __ align(OptoLoopAlignment);
__ BIND(store_element);
__ deccc(G1_remain); // decrement the count
--- a/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp Tue Apr 06 15:18:10 2010 -0700
+++ b/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp Wed Apr 07 09:37:47 2010 -0700
@@ -86,14 +86,14 @@
if (FLAG_IS_DEFAULT(InteriorEntryAlignment)) {
FLAG_SET_DEFAULT(InteriorEntryAlignment, 4);
}
- if (FLAG_IS_DEFAULT(OptoLoopAlignment)) {
- FLAG_SET_DEFAULT(OptoLoopAlignment, 4);
- }
if (is_niagara1_plus() && FLAG_IS_DEFAULT(AllocatePrefetchDistance)) {
// Use smaller prefetch distance on N2
FLAG_SET_DEFAULT(AllocatePrefetchDistance, 256);
}
#endif
+ if (FLAG_IS_DEFAULT(OptoLoopAlignment)) {
+ FLAG_SET_DEFAULT(OptoLoopAlignment, 4);
+ }
}
// Use hardware population count instruction if available.
--- a/hotspot/src/cpu/x86/vm/c2_globals_x86.hpp Tue Apr 06 15:18:10 2010 -0700
+++ b/hotspot/src/cpu/x86/vm/c2_globals_x86.hpp Wed Apr 07 09:37:47 2010 -0700
@@ -80,7 +80,6 @@
// Ergonomics related flags
define_pd_global(uint64_t,MaxRAM, 4ULL*G);
#endif // AMD64
-define_pd_global(intx, OptoLoopAlignment, 16);
define_pd_global(intx, RegisterCostAreaRatio, 16000);
// Peephole and CISC spilling both break the graph, and so makes the
--- a/hotspot/src/cpu/x86/vm/globals_x86.hpp Tue Apr 06 15:18:10 2010 -0700
+++ b/hotspot/src/cpu/x86/vm/globals_x86.hpp Wed Apr 07 09:37:47 2010 -0700
@@ -45,6 +45,7 @@
#else
define_pd_global(intx, CodeEntryAlignment, 16);
#endif // COMPILER2
+define_pd_global(intx, OptoLoopAlignment, 16);
define_pd_global(intx, InlineFrequencyCount, 100);
define_pd_global(intx, InlineSmallCode, 1000);
--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp Tue Apr 06 15:18:10 2010 -0700
+++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp Wed Apr 07 09:37:47 2010 -0700
@@ -812,7 +812,7 @@
Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
// Copy 64-byte chunks
__ jmpb(L_copy_64_bytes);
- __ align(16);
+ __ align(OptoLoopAlignment);
__ BIND(L_copy_64_bytes_loop);
if(UseUnalignedLoadStores) {
@@ -874,7 +874,7 @@
Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
// Copy 64-byte chunks
__ jmpb(L_copy_64_bytes);
- __ align(16);
+ __ align(OptoLoopAlignment);
__ BIND(L_copy_64_bytes_loop);
__ movq(mmx0, Address(from, 0));
__ movq(mmx1, Address(from, 8));
@@ -1144,7 +1144,7 @@
__ movl(Address(to, count, sf, 0), rdx);
__ jmpb(L_copy_8_bytes);
- __ align(16);
+ __ align(OptoLoopAlignment);
// Move 8 bytes
__ BIND(L_copy_8_bytes_loop);
if (UseXMMForArrayCopy) {
@@ -1235,7 +1235,7 @@
}
} else {
__ jmpb(L_copy_8_bytes);
- __ align(16);
+ __ align(OptoLoopAlignment);
__ BIND(L_copy_8_bytes_loop);
__ fild_d(Address(from, 0));
__ fistp_d(Address(from, to_from, Address::times_1));
@@ -1282,7 +1282,7 @@
__ jmpb(L_copy_8_bytes);
- __ align(16);
+ __ align(OptoLoopAlignment);
__ BIND(L_copy_8_bytes_loop);
if (VM_Version::supports_mmx()) {
if (UseXMMForArrayCopy) {
@@ -1454,7 +1454,7 @@
// Loop control:
// for (count = -count; count != 0; count++)
// Base pointers src, dst are biased by 8*count, to last element.
- __ align(16);
+ __ align(OptoLoopAlignment);
__ BIND(L_store_element);
__ movptr(to_element_addr, elem); // store the oop
--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp Tue Apr 06 15:18:10 2010 -0700
+++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp Wed Apr 07 09:37:47 2010 -0700
@@ -871,9 +871,8 @@
}
address generate_fp_mask(const char *stub_name, int64_t mask) {
+ __ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", stub_name);
-
- __ align(16);
address start = __ pc();
__ emit_data64( mask, relocInfo::none );
@@ -1268,7 +1267,7 @@
Label& L_copy_32_bytes, Label& L_copy_8_bytes) {
DEBUG_ONLY(__ stop("enter at entry label, not here"));
Label L_loop;
- __ align(16);
+ __ align(OptoLoopAlignment);
__ BIND(L_loop);
if(UseUnalignedLoadStores) {
__ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
@@ -1309,7 +1308,7 @@
Label& L_copy_32_bytes, Label& L_copy_8_bytes) {
DEBUG_ONLY(__ stop("enter at entry label, not here"));
Label L_loop;
- __ align(16);
+ __ align(OptoLoopAlignment);
__ BIND(L_loop);
if(UseUnalignedLoadStores) {
__ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16));
@@ -2229,7 +2228,7 @@
// Loop control:
// for (count = -count; count != 0; count++)
// Base pointers src, dst are biased by 8*(count-1), to last element.
- __ align(16);
+ __ align(OptoLoopAlignment);
__ BIND(L_store_element);
__ store_heap_oop(to_element_addr, rax_oop); // store the oop
--- a/hotspot/src/share/vm/opto/c2_globals.hpp Tue Apr 06 15:18:10 2010 -0700
+++ b/hotspot/src/share/vm/opto/c2_globals.hpp Wed Apr 07 09:37:47 2010 -0700
@@ -52,9 +52,6 @@
"Code alignment for interior entry points " \
"in generated code (in bytes)") \
\
- product_pd(intx, OptoLoopAlignment, \
- "Align inner loops to zero relative to this modulus") \
- \
product(intx, MaxLoopPad, (OptoLoopAlignment-1), \
"Align a loop if padding size in bytes is less or equal to this value") \
\
--- a/hotspot/src/share/vm/runtime/globals.hpp Tue Apr 06 15:18:10 2010 -0700
+++ b/hotspot/src/share/vm/runtime/globals.hpp Wed Apr 07 09:37:47 2010 -0700
@@ -3110,6 +3110,9 @@
develop_pd(intx, CodeEntryAlignment, \
"Code entry alignment for generated code (in bytes)") \
\
+ product_pd(intx, OptoLoopAlignment, \
+ "Align inner loops to zero relative to this modulus") \
+ \
product_pd(uintx, InitialCodeCacheSize, \
"Initial code cache size (in bytes)") \
\