7079329: Adjust allocation prefetching for T4
Summary: On T4, two BIS instructions should be issued to prefetch 64 bytes
Reviewed-by: iveresov, phh, twisti
--- a/hotspot/src/cpu/sparc/vm/assembler_sparc.hpp Tue Aug 16 11:53:57 2011 -0700
+++ b/hotspot/src/cpu/sparc/vm/assembler_sparc.hpp Tue Aug 16 16:59:46 2011 -0700
@@ -886,7 +886,11 @@
enum ASIs { // page 72, v9
ASI_PRIMARY = 0x80,
- ASI_PRIMARY_LITTLE = 0x88
+ ASI_PRIMARY_LITTLE = 0x88,
+ // Block initializing store
+ ASI_ST_BLKINIT_PRIMARY = 0xE2,
+ // Most-Recently-Used (MRU) BIS variant
+ ASI_ST_BLKINIT_MRU_PRIMARY = 0xF2
// add more from book as needed
};
--- a/hotspot/src/cpu/sparc/vm/sparc.ad Tue Aug 16 11:53:57 2011 -0700
+++ b/hotspot/src/cpu/sparc/vm/sparc.ad Tue Aug 16 16:59:46 2011 -0700
@@ -471,9 +471,6 @@
source %{
#define __ _masm.
-// Block initializing store
-#define ASI_BLK_INIT_QUAD_LDD_P 0xE2
-
// tertiary op of a LoadP or StoreP encoding
#define REGP_OP true
@@ -2819,10 +2816,10 @@
Register nof_bytes_arg = reg_to_register_object($cnt$$reg);
Register nof_bytes_tmp = reg_to_register_object($temp$$reg);
Register base_pointer_arg = reg_to_register_object($base$$reg);
-
+
Label loop;
__ mov(nof_bytes_arg, nof_bytes_tmp);
-
+
// Loop and clear, walking backwards through the array.
// nof_bytes_tmp (if >0) is always the number of bytes to zero
__ bind(loop);
@@ -6269,6 +6266,7 @@
instruct prefetchr( memory mem ) %{
match( PrefetchRead mem );
ins_cost(MEMORY_REF_COST);
+ size(4);
format %{ "PREFETCH $mem,0\t! Prefetch read-many" %}
opcode(Assembler::prefetch_op3);
@@ -6277,9 +6275,9 @@
%}
instruct prefetchw( memory mem ) %{
- predicate(AllocatePrefetchStyle != 3 );
match( PrefetchWrite mem );
ins_cost(MEMORY_REF_COST);
+ size(4);
format %{ "PREFETCH $mem,2\t! Prefetch write-many (and read)" %}
opcode(Assembler::prefetch_op3);
@@ -6287,24 +6285,62 @@
ins_pipe(iload_mem);
%}
-// Use BIS instruction to prefetch.
-instruct prefetchw_bis( memory mem ) %{
- predicate(AllocatePrefetchStyle == 3);
- match( PrefetchWrite mem );
- ins_cost(MEMORY_REF_COST);
-
- format %{ "STXA G0,$mem\t! // Block initializing store" %}
- ins_encode %{
- Register base = as_Register($mem$$base);
- int disp = $mem$$disp;
- if (disp != 0) {
- __ add(base, AllocatePrefetchStepSize, base);
- }
- __ stxa(G0, base, G0, ASI_BLK_INIT_QUAD_LDD_P);
+// Prefetch instructions for allocation.
+
+instruct prefetchAlloc( memory mem ) %{
+ predicate(AllocatePrefetchInstr == 0);
+ match( PrefetchAllocation mem );
+ ins_cost(MEMORY_REF_COST);
+ size(4);
+
+ format %{ "PREFETCH $mem,2\t! Prefetch allocation" %}
+ opcode(Assembler::prefetch_op3);
+ ins_encode( form3_mem_prefetch_write( mem ) );
+ ins_pipe(iload_mem);
+%}
+
+// Use BIS instruction to prefetch for allocation.
+// BIS could fault, so space must be reserved at the end of the TLAB.
+instruct prefetchAlloc_bis( iRegP dst ) %{
+ predicate(AllocatePrefetchInstr == 1);
+ match( PrefetchAllocation dst );
+ ins_cost(MEMORY_REF_COST);
+ size(4);
+
+ format %{ "STXA [$dst]\t! // Prefetch allocation using BIS" %}
+ ins_encode %{
+ __ stxa(G0, $dst$$Register, G0, Assembler::ASI_ST_BLKINIT_PRIMARY);
%}
ins_pipe(istore_mem_reg);
%}
+// The following code is used to find the address of the next cache line to prefetch.
+#ifndef _LP64
+instruct cacheLineAdr( iRegP dst, iRegP src, immI13 mask ) %{
+ match(Set dst (CastX2P (AndI (CastP2X src) mask)));
+ ins_cost(DEFAULT_COST);
+ size(4);
+
+ format %{ "AND $src,$mask,$dst\t! next cache line address" %}
+ ins_encode %{
+ __ and3($src$$Register, $mask$$constant, $dst$$Register);
+ %}
+ ins_pipe(ialu_reg_imm);
+%}
+#else
+instruct cacheLineAdr( iRegP dst, iRegP src, immL13 mask ) %{
+ match(Set dst (CastX2P (AndL (CastP2X src) mask)));
+ ins_cost(DEFAULT_COST);
+ size(4);
+
+ format %{ "AND $src,$mask,$dst\t! next cache line address" %}
+ ins_encode %{
+ __ and3($src$$Register, $mask$$constant, $dst$$Register);
+ %}
+ ins_pipe(ialu_reg_imm);
+%}
+#endif
+
//----------Store Instructions-------------------------------------------------
// Store Byte
instruct storeB(memory mem, iRegI src) %{
--- a/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp Tue Aug 16 11:53:57 2011 -0700
+++ b/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp Tue Aug 16 16:59:46 2011 -0700
@@ -44,20 +44,31 @@
PrefetchScanIntervalInBytes = prefetch_scan_interval_in_bytes();
PrefetchFieldsAhead = prefetch_fields_ahead();
+ assert(0 <= AllocatePrefetchInstr && AllocatePrefetchInstr <= 1, "invalid value");
+ if( AllocatePrefetchInstr < 0 ) AllocatePrefetchInstr = 0;
+ if( AllocatePrefetchInstr > 1 ) AllocatePrefetchInstr = 0;
+
// Allocation prefetch settings
- intx cache_line_size = L1_data_cache_line_size();
+ intx cache_line_size = prefetch_data_size();
if( cache_line_size > AllocatePrefetchStepSize )
AllocatePrefetchStepSize = cache_line_size;
- if( FLAG_IS_DEFAULT(AllocatePrefetchLines) )
- AllocatePrefetchLines = 3; // Optimistic value
- assert( AllocatePrefetchLines > 0, "invalid value");
- if( AllocatePrefetchLines < 1 ) // set valid value in product VM
- AllocatePrefetchLines = 1; // Conservative value
+
+ assert(AllocatePrefetchLines > 0, "invalid value");
+ if( AllocatePrefetchLines < 1 ) // set valid value in product VM
+ AllocatePrefetchLines = 3;
+ assert(AllocateInstancePrefetchLines > 0, "invalid value");
+ if( AllocateInstancePrefetchLines < 1 ) // set valid value in product VM
+ AllocateInstancePrefetchLines = 1;
AllocatePrefetchDistance = allocate_prefetch_distance();
AllocatePrefetchStyle = allocate_prefetch_style();
- assert(AllocatePrefetchDistance % AllocatePrefetchStepSize == 0, "invalid value");
+ assert((AllocatePrefetchDistance % AllocatePrefetchStepSize) == 0 &&
+ (AllocatePrefetchDistance > 0), "invalid value");
+ if ((AllocatePrefetchDistance % AllocatePrefetchStepSize) != 0 ||
+ (AllocatePrefetchDistance <= 0)) {
+ AllocatePrefetchDistance = AllocatePrefetchStepSize;
+ }
if (AllocatePrefetchStyle == 3 && !has_blk_init()) {
warning("BIS instructions are not available on this CPU");
@@ -66,7 +77,7 @@
UseSSE = 0; // Only on x86 and x64
- _supports_cx8 = has_v9();
+ _supports_cx8 = has_v9();
if (is_niagara()) {
// Indirect branch is the same cost as direct
@@ -99,19 +110,42 @@
FLAG_SET_DEFAULT(InteriorEntryAlignment, 4);
}
if (is_niagara_plus()) {
- if (has_blk_init() && AllocatePrefetchStyle > 0 &&
- FLAG_IS_DEFAULT(AllocatePrefetchStyle)) {
- // Use BIS instruction for allocation prefetch.
- FLAG_SET_DEFAULT(AllocatePrefetchStyle, 3);
+ if (has_blk_init() && UseTLAB &&
+ FLAG_IS_DEFAULT(AllocatePrefetchInstr)) {
+ // Use BIS instruction for TLAB allocation prefetch.
+ FLAG_SET_ERGO(intx, AllocatePrefetchInstr, 1);
+ if (FLAG_IS_DEFAULT(AllocatePrefetchStyle)) {
+ FLAG_SET_ERGO(intx, AllocatePrefetchStyle, 3);
+ }
if (FLAG_IS_DEFAULT(AllocatePrefetchDistance)) {
- // Use smaller prefetch distance on N2 with BIS
+ // Use smaller prefetch distance with BIS
FLAG_SET_DEFAULT(AllocatePrefetchDistance, 64);
}
}
+ if (is_T4()) {
+ // Double number of prefetched cache lines on T4
+ // since L2 cache line size is smaller (32 bytes).
+ if (FLAG_IS_DEFAULT(AllocatePrefetchLines)) {
+ FLAG_SET_ERGO(intx, AllocatePrefetchLines, AllocatePrefetchLines*2);
+ }
+ if (FLAG_IS_DEFAULT(AllocateInstancePrefetchLines)) {
+ FLAG_SET_ERGO(intx, AllocateInstancePrefetchLines, AllocateInstancePrefetchLines*2);
+ }
+ }
if (AllocatePrefetchStyle != 3 && FLAG_IS_DEFAULT(AllocatePrefetchDistance)) {
// Use different prefetch distance without BIS
FLAG_SET_DEFAULT(AllocatePrefetchDistance, 256);
}
+ if (AllocatePrefetchInstr == 1) {
+ // BIS is a store and faults when it touches memory outside the heap,
+ // so reserve room at the end of each TLAB for the farthest prefetch.
+ // Size it for the larger of the array and instance line counts;
+ // +1 line for rounding up to the next cache line, +1 to be safe.
+ int lines = MAX2(AllocatePrefetchLines, AllocateInstancePrefetchLines) + 2;
+ int step_size = AllocatePrefetchStepSize;
+ int distance = AllocatePrefetchDistance;
+ _reserve_for_allocation_prefetch = (distance + step_size*lines)/(int)HeapWordSize;
+ }
}
#endif
}
@@ -185,14 +219,20 @@
#ifndef PRODUCT
if (PrintMiscellaneous && Verbose) {
- tty->print("Allocation: ");
+ tty->print("Allocation");
if (AllocatePrefetchStyle <= 0) {
- tty->print_cr("no prefetching");
+ tty->print_cr(": no prefetching");
} else {
+ tty->print(" prefetching: ");
+ if (AllocatePrefetchInstr == 0) {
+ tty->print("PREFETCH");
+ } else if (AllocatePrefetchInstr == 1) {
+ tty->print("BIS");
+ }
if (AllocatePrefetchLines > 1) {
- tty->print_cr("PREFETCH %d, %d lines of size %d bytes", AllocatePrefetchDistance, AllocatePrefetchLines, AllocatePrefetchStepSize);
+ tty->print_cr(" at distance %d, %d lines of %d bytes", AllocatePrefetchDistance, AllocatePrefetchLines, AllocatePrefetchStepSize);
} else {
- tty->print_cr("PREFETCH %d, one line", AllocatePrefetchDistance);
+ tty->print_cr(" at distance %d, one line of %d bytes", AllocatePrefetchDistance, AllocatePrefetchStepSize);
}
}
if (PrefetchCopyIntervalInBytes > 0) {
--- a/hotspot/src/cpu/sparc/vm/vm_version_sparc.hpp Tue Aug 16 11:53:57 2011 -0700
+++ b/hotspot/src/cpu/sparc/vm/vm_version_sparc.hpp Tue Aug 16 16:59:46 2011 -0700
@@ -121,6 +121,7 @@
// Returns true if the platform is in the niagara line (T series)
// and newer than the niagara1.
static bool is_niagara_plus() { return is_T_family(_features) && !is_T1_model(_features); }
+ static bool is_T4() { return is_T_family(_features) && has_cbcond(); }
// Fujitsu SPARC64
static bool is_sparc64() { return (_features & sparc64_family_m) != 0; }
@@ -130,13 +131,17 @@
static bool has_fast_fxtof() { return is_niagara() || is_sparc64() || has_v9() && !is_ultra3(); }
static bool has_fast_idiv() { return is_niagara_plus() || is_sparc64(); }
+
// T4 and newer Sparc have fast RDPC instruction.
- static bool has_fast_rdpc() { return is_niagara_plus() && has_cbcond(); }
+ static bool has_fast_rdpc() { return is_T4(); }
+
+ // T4 and newer Sparc have Most-Recently-Used (MRU) BIS.
+ static bool has_mru_blk_init() { return has_blk_init() && is_T4(); }
static const char* cpu_features() { return _features_str; }
- static intx L1_data_cache_line_size() {
- return 64; // default prefetch block size on sparc
+ static intx prefetch_data_size() {
+ return is_T4() ? 32 : 64; // default prefetch block size on sparc
}
// Prefetch
--- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp Tue Aug 16 11:53:57 2011 -0700
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp Tue Aug 16 16:59:46 2011 -0700
@@ -2315,7 +2315,7 @@
}
void Assembler::prefetchr(Address src) {
- NOT_LP64(assert(VM_Version::supports_3dnow_prefetch(), "must support"));
+ assert(VM_Version::supports_3dnow_prefetch(), "must support");
InstructionMark im(this);
prefetch_prefix(src);
emit_byte(0x0D);
@@ -2347,7 +2347,7 @@
}
void Assembler::prefetchw(Address src) {
- NOT_LP64(assert(VM_Version::supports_3dnow_prefetch(), "must support"));
+ assert(VM_Version::supports_3dnow_prefetch(), "must support");
InstructionMark im(this);
prefetch_prefix(src);
emit_byte(0x0D);
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.cpp Tue Aug 16 11:53:57 2011 -0700
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.cpp Tue Aug 16 16:59:46 2011 -0700
@@ -557,14 +557,16 @@
if( !supports_sse() && supports_3dnow_prefetch() ) AllocatePrefetchInstr = 3;
// Allocation prefetch settings
- intx cache_line_size = L1_data_cache_line_size();
+ intx cache_line_size = prefetch_data_size();
if( cache_line_size > AllocatePrefetchStepSize )
AllocatePrefetchStepSize = cache_line_size;
- if( FLAG_IS_DEFAULT(AllocatePrefetchLines) )
- AllocatePrefetchLines = 3; // Optimistic value
+
assert(AllocatePrefetchLines > 0, "invalid value");
- if( AllocatePrefetchLines < 1 ) // set valid value in product VM
- AllocatePrefetchLines = 1; // Conservative value
+ if( AllocatePrefetchLines < 1 ) // set valid value in product VM
+ AllocatePrefetchLines = 3;
+ assert(AllocateInstancePrefetchLines > 0, "invalid value");
+ if( AllocateInstancePrefetchLines < 1 ) // set valid value in product VM
+ AllocateInstancePrefetchLines = 1;
AllocatePrefetchDistance = allocate_prefetch_distance();
AllocatePrefetchStyle = allocate_prefetch_style();
@@ -601,10 +603,11 @@
tty->print_cr("Logical CPUs per core: %u",
logical_processors_per_package());
tty->print_cr("UseSSE=%d",UseSSE);
- tty->print("Allocation: ");
+ tty->print("Allocation");
if (AllocatePrefetchStyle <= 0 || UseSSE == 0 && !supports_3dnow_prefetch()) {
- tty->print_cr("no prefetching");
+ tty->print_cr(": no prefetching");
} else {
+ tty->print(" prefetching: ");
if (UseSSE == 0 && supports_3dnow_prefetch()) {
tty->print("PREFETCHW");
} else if (UseSSE >= 1) {
@@ -619,9 +622,9 @@
}
}
if (AllocatePrefetchLines > 1) {
- tty->print_cr(" %d, %d lines with step %d bytes", AllocatePrefetchDistance, AllocatePrefetchLines, AllocatePrefetchStepSize);
+ tty->print_cr(" at distance %d, %d lines of %d bytes", AllocatePrefetchDistance, AllocatePrefetchLines, AllocatePrefetchStepSize);
} else {
- tty->print_cr(" %d, one line", AllocatePrefetchDistance);
+ tty->print_cr(" at distance %d, one line of %d bytes", AllocatePrefetchDistance, AllocatePrefetchStepSize);
}
}
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.hpp Tue Aug 16 11:53:57 2011 -0700
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.hpp Tue Aug 16 16:59:46 2011 -0700
@@ -419,7 +419,7 @@
return result;
}
- static intx L1_data_cache_line_size() {
+ static intx prefetch_data_size() {
intx result = 0;
if (is_intel()) {
result = (_cpuid_info.dcp_cpuid4_ebx.bits.L1_line_size + 1);
--- a/hotspot/src/cpu/x86/vm/x86_32.ad Tue Aug 16 11:53:57 2011 -0700
+++ b/hotspot/src/cpu/x86/vm/x86_32.ad Tue Aug 16 16:59:46 2011 -0700
@@ -7325,8 +7325,9 @@
ins_cost(100);
format %{ "PREFETCHR $mem\t! Prefetch into level 1 cache for read" %}
- opcode(0x0F, 0x0d); /* Opcode 0F 0d /0 */
- ins_encode(OpcP, OpcS, RMopc_Mem(0x00,mem));
+ ins_encode %{
+ __ prefetchr($mem$$Address);
+ %}
ins_pipe(ialu_mem);
%}
@@ -7336,8 +7337,9 @@
ins_cost(100);
format %{ "PREFETCHNTA $mem\t! Prefetch into non-temporal cache for read" %}
- opcode(0x0F, 0x18); /* Opcode 0F 18 /0 */
- ins_encode(OpcP, OpcS, RMopc_Mem(0x00,mem));
+ ins_encode %{
+ __ prefetchnta($mem$$Address);
+ %}
ins_pipe(ialu_mem);
%}
@@ -7347,8 +7349,9 @@
ins_cost(100);
format %{ "PREFETCHT0 $mem\t! Prefetch into L1 and L2 caches for read" %}
- opcode(0x0F, 0x18); /* Opcode 0F 18 /1 */
- ins_encode(OpcP, OpcS, RMopc_Mem(0x01,mem));
+ ins_encode %{
+ __ prefetcht0($mem$$Address);
+ %}
ins_pipe(ialu_mem);
%}
@@ -7358,8 +7361,9 @@
ins_cost(100);
format %{ "PREFETCHT2 $mem\t! Prefetch into L2 cache for read" %}
- opcode(0x0F, 0x18); /* Opcode 0F 18 /3 */
- ins_encode(OpcP, OpcS, RMopc_Mem(0x03,mem));
+ ins_encode %{
+ __ prefetcht2($mem$$Address);
+ %}
ins_pipe(ialu_mem);
%}
@@ -7374,46 +7378,86 @@
%}
instruct prefetchw( memory mem ) %{
- predicate(UseSSE==0 && VM_Version::supports_3dnow_prefetch() || AllocatePrefetchInstr==3);
+ predicate(UseSSE==0 && VM_Version::supports_3dnow_prefetch());
match( PrefetchWrite mem );
ins_cost(100);
format %{ "PREFETCHW $mem\t! Prefetch into L1 cache and mark modified" %}
- opcode(0x0F, 0x0D); /* Opcode 0F 0D /1 */
- ins_encode(OpcP, OpcS, RMopc_Mem(0x01,mem));
+ ins_encode %{
+ __ prefetchw($mem$$Address);
+ %}
ins_pipe(ialu_mem);
%}
instruct prefetchwNTA( memory mem ) %{
- predicate(UseSSE>=1 && AllocatePrefetchInstr==0);
+ predicate(UseSSE>=1);
match(PrefetchWrite mem);
ins_cost(100);
format %{ "PREFETCHNTA $mem\t! Prefetch into non-temporal cache for write" %}
- opcode(0x0F, 0x18); /* Opcode 0F 18 /0 */
- ins_encode(OpcP, OpcS, RMopc_Mem(0x00,mem));
+ ins_encode %{
+ __ prefetchnta($mem$$Address);
+ %}
ins_pipe(ialu_mem);
%}
-instruct prefetchwT0( memory mem ) %{
- predicate(UseSSE>=1 && AllocatePrefetchInstr==1);
- match(PrefetchWrite mem);
+// Prefetch instructions for allocation.
+
+instruct prefetchAlloc0( memory mem ) %{
+ predicate(UseSSE==0 && AllocatePrefetchInstr!=3);
+ match(PrefetchAllocation mem);
+ ins_cost(0);
+ size(0);
+ format %{ "Prefetch allocation (non-SSE is empty encoding)" %}
+ ins_encode();
+ ins_pipe(empty);
+%}
+
+instruct prefetchAlloc( memory mem ) %{
+ predicate(AllocatePrefetchInstr==3);
+ match( PrefetchAllocation mem );
ins_cost(100);
- format %{ "PREFETCHT0 $mem\t! Prefetch into L1 and L2 caches for write" %}
- opcode(0x0F, 0x18); /* Opcode 0F 18 /1 */
- ins_encode(OpcP, OpcS, RMopc_Mem(0x01,mem));
+ format %{ "PREFETCHW $mem\t! Prefetch allocation into L1 cache and mark modified" %}
+ ins_encode %{
+ __ prefetchw($mem$$Address);
+ %}
+ ins_pipe(ialu_mem);
+%}
+
+instruct prefetchAllocNTA( memory mem ) %{
+ predicate(UseSSE>=1 && AllocatePrefetchInstr==0);
+ match(PrefetchAllocation mem);
+ ins_cost(100);
+
+ format %{ "PREFETCHNTA $mem\t! Prefetch allocation into non-temporal cache for write" %}
+ ins_encode %{
+ __ prefetchnta($mem$$Address);
+ %}
ins_pipe(ialu_mem);
%}
-instruct prefetchwT2( memory mem ) %{
- predicate(UseSSE>=1 && AllocatePrefetchInstr==2);
- match(PrefetchWrite mem);
+instruct prefetchAllocT0( memory mem ) %{
+ predicate(UseSSE>=1 && AllocatePrefetchInstr==1);
+ match(PrefetchAllocation mem);
ins_cost(100);
- format %{ "PREFETCHT2 $mem\t! Prefetch into L2 cache for write" %}
- opcode(0x0F, 0x18); /* Opcode 0F 18 /3 */
- ins_encode(OpcP, OpcS, RMopc_Mem(0x03,mem));
+ format %{ "PREFETCHT0 $mem\t! Prefetch allocation into L1 and L2 caches for write" %}
+ ins_encode %{
+ __ prefetcht0($mem$$Address);
+ %}
+ ins_pipe(ialu_mem);
+%}
+
+instruct prefetchAllocT2( memory mem ) %{
+ predicate(UseSSE>=1 && AllocatePrefetchInstr==2);
+ match(PrefetchAllocation mem);
+ ins_cost(100);
+
+ format %{ "PREFETCHT2 $mem\t! Prefetch allocation into L2 cache for write" %}
+ ins_encode %{
+ __ prefetcht2($mem$$Address);
+ %}
ins_pipe(ialu_mem);
%}
--- a/hotspot/src/cpu/x86/vm/x86_64.ad Tue Aug 16 11:53:57 2011 -0700
+++ b/hotspot/src/cpu/x86/vm/x86_64.ad Tue Aug 16 16:59:46 2011 -0700
@@ -6617,8 +6617,9 @@
ins_cost(125);
format %{ "PREFETCHR $mem\t# Prefetch into level 1 cache" %}
- opcode(0x0F, 0x0D); /* Opcode 0F 0D /0 */
- ins_encode(REX_mem(mem), OpcP, OpcS, RM_opc_mem(0x00, mem));
+ ins_encode %{
+ __ prefetchr($mem$$Address);
+ %}
ins_pipe(ialu_mem);
%}
@@ -6628,8 +6629,9 @@
ins_cost(125);
format %{ "PREFETCHNTA $mem\t# Prefetch into non-temporal cache for read" %}
- opcode(0x0F, 0x18); /* Opcode 0F 18 /0 */
- ins_encode(REX_mem(mem), OpcP, OpcS, RM_opc_mem(0x00, mem));
+ ins_encode %{
+ __ prefetchnta($mem$$Address);
+ %}
ins_pipe(ialu_mem);
%}
@@ -6639,8 +6641,9 @@
ins_cost(125);
format %{ "PREFETCHT0 $mem\t# prefetch into L1 and L2 caches for read" %}
- opcode(0x0F, 0x18); /* Opcode 0F 18 /1 */
- ins_encode(REX_mem(mem), OpcP, OpcS, RM_opc_mem(0x01, mem));
+ ins_encode %{
+ __ prefetcht0($mem$$Address);
+ %}
ins_pipe(ialu_mem);
%}
@@ -6650,52 +6653,70 @@
ins_cost(125);
format %{ "PREFETCHT2 $mem\t# prefetch into L2 caches for read" %}
- opcode(0x0F, 0x18); /* Opcode 0F 18 /3 */
- ins_encode(REX_mem(mem), OpcP, OpcS, RM_opc_mem(0x03, mem));
- ins_pipe(ialu_mem);
-%}
-
-instruct prefetchw( memory mem ) %{
- predicate(AllocatePrefetchInstr==3);
- match(PrefetchWrite mem);
- ins_cost(125);
-
- format %{ "PREFETCHW $mem\t# Prefetch into level 1 cache and mark modified" %}
- opcode(0x0F, 0x0D); /* Opcode 0F 0D /1 */
- ins_encode(REX_mem(mem), OpcP, OpcS, RM_opc_mem(0x01, mem));
+ ins_encode %{
+ __ prefetcht2($mem$$Address);
+ %}
ins_pipe(ialu_mem);
%}
instruct prefetchwNTA( memory mem ) %{
- predicate(AllocatePrefetchInstr==0);
match(PrefetchWrite mem);
ins_cost(125);
format %{ "PREFETCHNTA $mem\t# Prefetch to non-temporal cache for write" %}
- opcode(0x0F, 0x18); /* Opcode 0F 18 /0 */
- ins_encode(REX_mem(mem), OpcP, OpcS, RM_opc_mem(0x00, mem));
+ ins_encode %{
+ __ prefetchnta($mem$$Address);
+ %}
ins_pipe(ialu_mem);
%}
-instruct prefetchwT0( memory mem ) %{
- predicate(AllocatePrefetchInstr==1);
- match(PrefetchWrite mem);
+// Prefetch instructions for allocation.
+
+instruct prefetchAlloc( memory mem ) %{
+ predicate(AllocatePrefetchInstr==3);
+ match(PrefetchAllocation mem);
+ ins_cost(125);
+
+ format %{ "PREFETCHW $mem\t# Prefetch allocation into level 1 cache and mark modified" %}
+ ins_encode %{
+ __ prefetchw($mem$$Address);
+ %}
+ ins_pipe(ialu_mem);
+%}
+
+instruct prefetchAllocNTA( memory mem ) %{
+ predicate(AllocatePrefetchInstr==0);
+ match(PrefetchAllocation mem);
ins_cost(125);
- format %{ "PREFETCHT0 $mem\t# Prefetch to level 1 and 2 caches for write" %}
- opcode(0x0F, 0x18); /* Opcode 0F 18 /1 */
- ins_encode(REX_mem(mem), OpcP, OpcS, RM_opc_mem(0x01, mem));
+ format %{ "PREFETCHNTA $mem\t# Prefetch allocation to non-temporal cache for write" %}
+ ins_encode %{
+ __ prefetchnta($mem$$Address);
+ %}
ins_pipe(ialu_mem);
%}
-instruct prefetchwT2( memory mem ) %{
- predicate(AllocatePrefetchInstr==2);
- match(PrefetchWrite mem);
+instruct prefetchAllocT0( memory mem ) %{
+ predicate(AllocatePrefetchInstr==1);
+ match(PrefetchAllocation mem);
ins_cost(125);
- format %{ "PREFETCHT2 $mem\t# Prefetch to level 2 cache for write" %}
- opcode(0x0F, 0x18); /* Opcode 0F 18 /3 */
- ins_encode(REX_mem(mem), OpcP, OpcS, RM_opc_mem(0x03, mem));
+ format %{ "PREFETCHT0 $mem\t# Prefetch allocation to level 1 and 2 caches for write" %}
+ ins_encode %{
+ __ prefetcht0($mem$$Address);
+ %}
+ ins_pipe(ialu_mem);
+%}
+
+instruct prefetchAllocT2( memory mem ) %{
+ predicate(AllocatePrefetchInstr==2);
+ match(PrefetchAllocation mem);
+ ins_cost(125);
+
+ format %{ "PREFETCHT2 $mem\t# Prefetch allocation to level 2 cache for write" %}
+ ins_encode %{
+ __ prefetcht2($mem$$Address);
+ %}
ins_pipe(ialu_mem);
%}
--- a/hotspot/src/share/vm/adlc/formssel.cpp Tue Aug 16 11:53:57 2011 -0700
+++ b/hotspot/src/share/vm/adlc/formssel.cpp Tue Aug 16 16:59:46 2011 -0700
@@ -3390,7 +3390,9 @@
"ClearArray"
};
int cnt = sizeof(needs_ideal_memory_list)/sizeof(char*);
- if( strcmp(_opType,"PrefetchRead")==0 || strcmp(_opType,"PrefetchWrite")==0 )
+ if( strcmp(_opType,"PrefetchRead")==0 ||
+ strcmp(_opType,"PrefetchWrite")==0 ||
+ strcmp(_opType,"PrefetchAllocation")==0 )
return 1;
if( _lChild ) {
const char *opType = _lChild->_opType;
--- a/hotspot/src/share/vm/memory/threadLocalAllocBuffer.hpp Tue Aug 16 11:53:57 2011 -0700
+++ b/hotspot/src/share/vm/memory/threadLocalAllocBuffer.hpp Tue Aug 16 16:59:46 2011 -0700
@@ -124,16 +124,7 @@
// Reserve space at the end of TLAB
static size_t end_reserve() {
int reserve_size = typeArrayOopDesc::header_size(T_INT);
- if (AllocatePrefetchStyle == 3) {
- // BIS is used to prefetch - we need a space for it.
- // +1 for rounding up to next cache line +1 to be safe
- int lines = AllocatePrefetchLines + 2;
- int step_size = AllocatePrefetchStepSize;
- int distance = AllocatePrefetchDistance;
- int prefetch_end = (distance + step_size*lines)/(int)HeapWordSize;
- reserve_size = MAX2(reserve_size, prefetch_end);
- }
- return reserve_size;
+ return MAX2(reserve_size, VM_Version::reserve_for_allocation_prefetch());
}
static size_t alignment_reserve() { return align_object_size(end_reserve()); }
static size_t alignment_reserve_in_bytes() { return alignment_reserve() * HeapWordSize; }
--- a/hotspot/src/share/vm/opto/classes.hpp Tue Aug 16 11:53:57 2011 -0700
+++ b/hotspot/src/share/vm/opto/classes.hpp Tue Aug 16 16:59:46 2011 -0700
@@ -196,6 +196,7 @@
macro(PopCountI)
macro(PopCountL)
macro(PowD)
+macro(PrefetchAllocation)
macro(PrefetchRead)
macro(PrefetchWrite)
macro(Proj)
--- a/hotspot/src/share/vm/opto/macro.cpp Tue Aug 16 11:53:57 2011 -0700
+++ b/hotspot/src/share/vm/opto/macro.cpp Tue Aug 16 16:59:46 2011 -0700
@@ -1590,7 +1590,7 @@
prefetch_adr = new (C, 4) AddPNode( old_pf_wm, new_pf_wmt,
_igvn.MakeConX(distance) );
transform_later(prefetch_adr);
- prefetch = new (C, 3) PrefetchWriteNode( i_o, prefetch_adr );
+ prefetch = new (C, 3) PrefetchAllocationNode( i_o, prefetch_adr );
transform_later(prefetch);
distance += step_size;
i_o = prefetch;
@@ -1611,13 +1611,14 @@
contended_phi_rawmem = pf_phi_rawmem;
i_o = pf_phi_abio;
} else if( UseTLAB && AllocatePrefetchStyle == 3 ) {
- // Insert a prefetch for each allocation only on the fast-path
+ // Insert a prefetch for each allocation.
+ // This code is used for Sparc with BIS.
Node *pf_region = new (C, 3) RegionNode(3);
Node *pf_phi_rawmem = new (C, 3) PhiNode( pf_region, Type::MEMORY,
TypeRawPtr::BOTTOM );
- // Generate several prefetch instructions only for arrays.
- uint lines = (length != NULL) ? AllocatePrefetchLines : 1;
+ // Generate several prefetch instructions.
+ uint lines = (length != NULL) ? AllocatePrefetchLines : AllocateInstancePrefetchLines;
uint step_size = AllocatePrefetchStepSize;
uint distance = AllocatePrefetchDistance;
@@ -1634,7 +1635,7 @@
transform_later(cache_adr);
// Prefetch
- Node *prefetch = new (C, 3) PrefetchWriteNode( contended_phi_rawmem, cache_adr );
+ Node *prefetch = new (C, 3) PrefetchAllocationNode( contended_phi_rawmem, cache_adr );
prefetch->set_req(0, needgc_false);
transform_later(prefetch);
contended_phi_rawmem = prefetch;
@@ -1644,7 +1645,7 @@
prefetch_adr = new (C, 4) AddPNode( cache_adr, cache_adr,
_igvn.MakeConX(distance) );
transform_later(prefetch_adr);
- prefetch = new (C, 3) PrefetchWriteNode( contended_phi_rawmem, prefetch_adr );
+ prefetch = new (C, 3) PrefetchAllocationNode( contended_phi_rawmem, prefetch_adr );
transform_later(prefetch);
distance += step_size;
contended_phi_rawmem = prefetch;
@@ -1653,15 +1654,15 @@
// Insert a prefetch for each allocation only on the fast-path
Node *prefetch_adr;
Node *prefetch;
- // Generate several prefetch instructions only for arrays.
- uint lines = (length != NULL) ? AllocatePrefetchLines : 1;
+ // Generate several prefetch instructions.
+ uint lines = (length != NULL) ? AllocatePrefetchLines : AllocateInstancePrefetchLines;
uint step_size = AllocatePrefetchStepSize;
uint distance = AllocatePrefetchDistance;
for ( uint i = 0; i < lines; i++ ) {
prefetch_adr = new (C, 4) AddPNode( old_eden_top, new_eden_top,
_igvn.MakeConX(distance) );
transform_later(prefetch_adr);
- prefetch = new (C, 3) PrefetchWriteNode( i_o, prefetch_adr );
+ prefetch = new (C, 3) PrefetchAllocationNode( i_o, prefetch_adr );
// Do not let it float too high, since if eden_top == eden_end,
// both might be null.
if( i == 0 ) { // Set control for first prefetch, next follows it
--- a/hotspot/src/share/vm/opto/matcher.cpp Tue Aug 16 11:53:57 2011 -0700
+++ b/hotspot/src/share/vm/opto/matcher.cpp Tue Aug 16 16:59:46 2011 -0700
@@ -826,6 +826,7 @@
switch (n->Opcode()) {
case Op_PrefetchRead:
case Op_PrefetchWrite:
+ case Op_PrefetchAllocation:
nidx = Compile::AliasIdxRaw;
nat = TypeRawPtr::BOTTOM;
break;
--- a/hotspot/src/share/vm/opto/memnode.hpp Tue Aug 16 11:53:57 2011 -0700
+++ b/hotspot/src/share/vm/opto/memnode.hpp Tue Aug 16 16:59:46 2011 -0700
@@ -1278,6 +1278,16 @@
virtual int Opcode() const;
virtual uint ideal_reg() const { return NotAMachineReg; }
virtual uint match_edge(uint idx) const { return idx==2; }
+ virtual const Type *bottom_type() const { return Type::ABIO; }
+};
+
+// Allocation prefetch which may fault; the TLAB size has to be adjusted.
+class PrefetchAllocationNode : public Node {
+public:
+ PrefetchAllocationNode(Node *mem, Node *adr) : Node(0,mem,adr) {}
+ virtual int Opcode() const;
+ virtual uint ideal_reg() const { return NotAMachineReg; }
+ virtual uint match_edge(uint idx) const { return idx==2; }
virtual const Type *bottom_type() const { return ( AllocatePrefetchStyle == 3 ) ? Type::MEMORY : Type::ABIO; }
};
--- a/hotspot/src/share/vm/runtime/globals.hpp Tue Aug 16 11:53:57 2011 -0700
+++ b/hotspot/src/share/vm/runtime/globals.hpp Tue Aug 16 16:59:46 2011 -0700
@@ -2897,8 +2897,11 @@
product(intx, AllocatePrefetchDistance, -1, \
"Distance to prefetch ahead of allocation pointer") \
\
- product(intx, AllocatePrefetchLines, 1, \
- "Number of lines to prefetch ahead of allocation pointer") \
+ product(intx, AllocatePrefetchLines, 3, \
+ "Number of lines to prefetch ahead of array allocation pointer") \
+ \
+ product(intx, AllocateInstancePrefetchLines, 1, \
+ "Number of lines to prefetch ahead of instance allocation pointer") \
\
product(intx, AllocatePrefetchStepSize, 16, \
"Step size in bytes of sequential prefetch instructions") \
--- a/hotspot/src/share/vm/runtime/vm_version.cpp Tue Aug 16 11:53:57 2011 -0700
+++ b/hotspot/src/share/vm/runtime/vm_version.cpp Tue Aug 16 16:59:46 2011 -0700
@@ -46,6 +46,7 @@
const char* Abstract_VM_Version::_s_internal_vm_info_string = Abstract_VM_Version::internal_vm_info_string();
bool Abstract_VM_Version::_supports_cx8 = false;
unsigned int Abstract_VM_Version::_logical_processors_per_package = 1U;
+int Abstract_VM_Version::_reserve_for_allocation_prefetch = 0;
#ifndef HOTSPOT_RELEASE_VERSION
#error HOTSPOT_RELEASE_VERSION must be defined
--- a/hotspot/src/share/vm/runtime/vm_version.hpp Tue Aug 16 11:53:57 2011 -0700
+++ b/hotspot/src/share/vm/runtime/vm_version.hpp Tue Aug 16 16:59:46 2011 -0700
@@ -44,6 +44,7 @@
static bool _initialized;
static int _parallel_worker_threads;
static bool _parallel_worker_threads_initialized;
+ static int _reserve_for_allocation_prefetch;
static unsigned int nof_parallel_worker_threads(unsigned int num,
unsigned int dem,
@@ -77,6 +78,12 @@
return _logical_processors_per_package;
}
+ // Need space at the end of the TLAB for prefetch instructions
+ // which may fault when accessing memory outside of heap.
+ static int reserve_for_allocation_prefetch() {
+ return _reserve_for_allocation_prefetch;
+ }
+
// ARCH specific policy for the BiasedLocking
static bool use_biased_locking() { return true; }