7079329: Adjust allocation prefetching for T4
author kvn
Tue, 16 Aug 2011 16:59:46 -0700
changeset 10267 8bdeec886dc4
parent 10266 2ea344c79e33
child 10268 3b789f46f950
7079329: Adjust allocation prefetching for T4
Summary: On T4, two BIS instructions should be issued to prefetch 64 bytes
Reviewed-by: iveresov, phh, twisti
hotspot/src/cpu/sparc/vm/assembler_sparc.hpp
hotspot/src/cpu/sparc/vm/sparc.ad
hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp
hotspot/src/cpu/sparc/vm/vm_version_sparc.hpp
hotspot/src/cpu/x86/vm/assembler_x86.cpp
hotspot/src/cpu/x86/vm/vm_version_x86.cpp
hotspot/src/cpu/x86/vm/vm_version_x86.hpp
hotspot/src/cpu/x86/vm/x86_32.ad
hotspot/src/cpu/x86/vm/x86_64.ad
hotspot/src/share/vm/adlc/formssel.cpp
hotspot/src/share/vm/memory/threadLocalAllocBuffer.hpp
hotspot/src/share/vm/opto/classes.hpp
hotspot/src/share/vm/opto/macro.cpp
hotspot/src/share/vm/opto/matcher.cpp
hotspot/src/share/vm/opto/memnode.hpp
hotspot/src/share/vm/runtime/globals.hpp
hotspot/src/share/vm/runtime/vm_version.cpp
hotspot/src/share/vm/runtime/vm_version.hpp
--- a/hotspot/src/cpu/sparc/vm/assembler_sparc.hpp	Tue Aug 16 11:53:57 2011 -0700
+++ b/hotspot/src/cpu/sparc/vm/assembler_sparc.hpp	Tue Aug 16 16:59:46 2011 -0700
@@ -886,7 +886,11 @@
 
   enum ASIs { // page 72, v9
     ASI_PRIMARY        = 0x80,
-    ASI_PRIMARY_LITTLE = 0x88
+    ASI_PRIMARY_LITTLE = 0x88,
+    // Block initializing store
+    ASI_ST_BLKINIT_PRIMARY = 0xE2,
+    // Most-Recently-Used (MRU) BIS variant
+    ASI_ST_BLKINIT_MRU_PRIMARY = 0xF2
     // add more from book as needed
   };
 
--- a/hotspot/src/cpu/sparc/vm/sparc.ad	Tue Aug 16 11:53:57 2011 -0700
+++ b/hotspot/src/cpu/sparc/vm/sparc.ad	Tue Aug 16 16:59:46 2011 -0700
@@ -471,9 +471,6 @@
 source %{
 #define __ _masm.
 
-// Block initializing store
-#define ASI_BLK_INIT_QUAD_LDD_P    0xE2
-
 // tertiary op of a LoadP or StoreP encoding
 #define REGP_OP true
 
@@ -2819,10 +2816,10 @@
     Register    nof_bytes_arg   = reg_to_register_object($cnt$$reg);
     Register    nof_bytes_tmp    = reg_to_register_object($temp$$reg);
     Register    base_pointer_arg = reg_to_register_object($base$$reg);
-
+  
     Label loop;
     __ mov(nof_bytes_arg, nof_bytes_tmp);
-
+  
     // Loop and clear, walking backwards through the array.
     // nof_bytes_tmp (if >0) is always the number of bytes to zero
     __ bind(loop);
@@ -6269,6 +6266,7 @@
 instruct prefetchr( memory mem ) %{
   match( PrefetchRead mem );
   ins_cost(MEMORY_REF_COST);
+  size(4);
 
   format %{ "PREFETCH $mem,0\t! Prefetch read-many" %}
   opcode(Assembler::prefetch_op3);
@@ -6277,9 +6275,9 @@
 %}
 
 instruct prefetchw( memory mem ) %{
-  predicate(AllocatePrefetchStyle != 3 );
   match( PrefetchWrite mem );
   ins_cost(MEMORY_REF_COST);
+  size(4);
 
   format %{ "PREFETCH $mem,2\t! Prefetch write-many (and read)" %}
   opcode(Assembler::prefetch_op3);
@@ -6287,24 +6285,62 @@
   ins_pipe(iload_mem);
 %}
 
-// Use BIS instruction to prefetch.
-instruct prefetchw_bis( memory mem ) %{
-  predicate(AllocatePrefetchStyle == 3);
-  match( PrefetchWrite mem );
-  ins_cost(MEMORY_REF_COST);
-
-  format %{ "STXA   G0,$mem\t! // Block initializing store" %}
-  ins_encode %{
-     Register base = as_Register($mem$$base);
-     int disp = $mem$$disp;
-     if (disp != 0) {
-       __ add(base, AllocatePrefetchStepSize, base);
-     }
-     __ stxa(G0, base, G0, ASI_BLK_INIT_QUAD_LDD_P);
+// Prefetch instructions for allocation.
+
+instruct prefetchAlloc( memory mem ) %{
+  predicate(AllocatePrefetchInstr == 0);
+  match( PrefetchAllocation mem );
+  ins_cost(MEMORY_REF_COST);
+  size(4);
+
+  format %{ "PREFETCH $mem,2\t! Prefetch allocation" %}
+  opcode(Assembler::prefetch_op3);
+  ins_encode( form3_mem_prefetch_write( mem ) );
+  ins_pipe(iload_mem);
+%}
+
+// Use BIS instruction to prefetch for allocation.
+// BIS is a store and could fault, so space must be reserved at the end of the TLAB.
+instruct prefetchAlloc_bis( iRegP dst ) %{
+  predicate(AllocatePrefetchInstr == 1);
+  match( PrefetchAllocation dst );
+  ins_cost(MEMORY_REF_COST);
+  size(4);
+
+  format %{ "STXA   [$dst]\t! // Prefetch allocation using BIS" %}
+  ins_encode %{
+    __ stxa(G0, $dst$$Register, G0, Assembler::ASI_ST_BLKINIT_PRIMARY);
   %}
   ins_pipe(istore_mem_reg);
 %}
 
+// The following instruction computes the address of the next cache line to prefetch.
+#ifndef _LP64
+instruct cacheLineAdr( iRegP dst, iRegP src, immI13 mask ) %{
+  match(Set dst (CastX2P (AndI (CastP2X src) mask)));
+  ins_cost(DEFAULT_COST);
+  size(4);
+
+  format %{ "AND    $src,$mask,$dst\t! next cache line address" %}
+  ins_encode %{
+    __ and3($src$$Register, $mask$$constant, $dst$$Register);
+  %}
+  ins_pipe(ialu_reg_imm);
+%}
+#else
+instruct cacheLineAdr( iRegP dst, iRegP src, immL13 mask ) %{
+  match(Set dst (CastX2P (AndL (CastP2X src) mask)));
+  ins_cost(DEFAULT_COST);
+  size(4);
+
+  format %{ "AND    $src,$mask,$dst\t! next cache line address" %}
+  ins_encode %{
+    __ and3($src$$Register, $mask$$constant, $dst$$Register);
+  %}
+  ins_pipe(ialu_reg_imm);
+%}
+#endif
+
 //----------Store Instructions-------------------------------------------------
 // Store Byte
 instruct storeB(memory mem, iRegI src) %{
--- a/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp	Tue Aug 16 11:53:57 2011 -0700
+++ b/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp	Tue Aug 16 16:59:46 2011 -0700
@@ -44,20 +44,31 @@
   PrefetchScanIntervalInBytes = prefetch_scan_interval_in_bytes();
   PrefetchFieldsAhead         = prefetch_fields_ahead();
 
+  assert(0 <= AllocatePrefetchInstr && AllocatePrefetchInstr <= 1, "invalid value");
+  if( AllocatePrefetchInstr < 0 ) AllocatePrefetchInstr = 0;
+  if( AllocatePrefetchInstr > 1 ) AllocatePrefetchInstr = 0;
+
   // Allocation prefetch settings
-  intx cache_line_size = L1_data_cache_line_size();
+  intx cache_line_size = prefetch_data_size();
   if( cache_line_size > AllocatePrefetchStepSize )
     AllocatePrefetchStepSize = cache_line_size;
-  if( FLAG_IS_DEFAULT(AllocatePrefetchLines) )
-    AllocatePrefetchLines = 3; // Optimistic value
-  assert( AllocatePrefetchLines > 0, "invalid value");
-  if( AllocatePrefetchLines < 1 ) // set valid value in product VM
-    AllocatePrefetchLines = 1; // Conservative value
+
+  assert(AllocatePrefetchLines > 0, "invalid value");
+  if( AllocatePrefetchLines < 1 )     // set valid value in product VM
+    AllocatePrefetchLines = 3;
+  assert(AllocateInstancePrefetchLines > 0, "invalid value");
+  if( AllocateInstancePrefetchLines < 1 ) // set valid value in product VM
+    AllocateInstancePrefetchLines = 1;
 
   AllocatePrefetchDistance = allocate_prefetch_distance();
   AllocatePrefetchStyle    = allocate_prefetch_style();
 
-  assert(AllocatePrefetchDistance % AllocatePrefetchStepSize == 0, "invalid value");
+  assert((AllocatePrefetchDistance % AllocatePrefetchStepSize) == 0 &&
+         (AllocatePrefetchDistance > 0), "invalid value");
+  if ((AllocatePrefetchDistance % AllocatePrefetchStepSize) != 0 ||
+      (AllocatePrefetchDistance <= 0)) {
+    AllocatePrefetchDistance = AllocatePrefetchStepSize;
+  }
 
   if (AllocatePrefetchStyle == 3 && !has_blk_init()) {
     warning("BIS instructions are not available on this CPU");
@@ -66,7 +77,7 @@
 
   UseSSE = 0; // Only on x86 and x64
 
-  _supports_cx8               = has_v9();
+  _supports_cx8 = has_v9();
 
   if (is_niagara()) {
     // Indirect branch is the same cost as direct
@@ -99,19 +110,42 @@
       FLAG_SET_DEFAULT(InteriorEntryAlignment, 4);
     }
     if (is_niagara_plus()) {
-      if (has_blk_init() && AllocatePrefetchStyle > 0 &&
-          FLAG_IS_DEFAULT(AllocatePrefetchStyle)) {
-        // Use BIS instruction for allocation prefetch.
-        FLAG_SET_DEFAULT(AllocatePrefetchStyle, 3);
+      if (has_blk_init() && UseTLAB &&
+          FLAG_IS_DEFAULT(AllocatePrefetchInstr)) {
+        // Use BIS instruction for TLAB allocation prefetch.
+        FLAG_SET_ERGO(intx, AllocatePrefetchInstr, 1);
+        if (FLAG_IS_DEFAULT(AllocatePrefetchStyle)) {
+          FLAG_SET_ERGO(intx, AllocatePrefetchStyle, 3);
+        }
         if (FLAG_IS_DEFAULT(AllocatePrefetchDistance)) {
-          // Use smaller prefetch distance on N2 with BIS
+          // Use smaller prefetch distance with BIS
           FLAG_SET_DEFAULT(AllocatePrefetchDistance, 64);
         }
       }
+      if (is_T4()) {
+        // Double number of prefetched cache lines on T4
+        // since L2 cache line size is smaller (32 bytes).
+        if (FLAG_IS_DEFAULT(AllocatePrefetchLines)) {
+          FLAG_SET_ERGO(intx, AllocatePrefetchLines, AllocatePrefetchLines*2);
+        }
+        if (FLAG_IS_DEFAULT(AllocateInstancePrefetchLines)) {
+          FLAG_SET_ERGO(intx, AllocateInstancePrefetchLines, AllocateInstancePrefetchLines*2);
+        }
+      }
       if (AllocatePrefetchStyle != 3 && FLAG_IS_DEFAULT(AllocatePrefetchDistance)) {
         // Use different prefetch distance without BIS
         FLAG_SET_DEFAULT(AllocatePrefetchDistance, 256);
       }
+      if (AllocatePrefetchInstr == 1) {
+        // Need a space at the end of TLAB for BIS since it
+        // will fault when accessing memory outside of heap.
+        // Arrays and instances prefetch different line counts; reserve for the larger.
+        // +1 for rounding up to next cache line, +1 to be safe
+        int lines = MAX2(AllocatePrefetchLines, AllocateInstancePrefetchLines) + 2;
+        int step_size = AllocatePrefetchStepSize;
+        int distance = AllocatePrefetchDistance;
+        _reserve_for_allocation_prefetch = (distance + step_size*lines)/(int)HeapWordSize;
+      }
     }
 #endif
   }
@@ -185,14 +219,20 @@
 
 #ifndef PRODUCT
   if (PrintMiscellaneous && Verbose) {
-    tty->print("Allocation: ");
+    tty->print("Allocation");
     if (AllocatePrefetchStyle <= 0) {
-      tty->print_cr("no prefetching");
+      tty->print_cr(": no prefetching");
     } else {
+      tty->print(" prefetching: ");
+      if (AllocatePrefetchInstr == 0) {
+          tty->print("PREFETCH");
+      } else if (AllocatePrefetchInstr == 1) {
+          tty->print("BIS");
+      }
       if (AllocatePrefetchLines > 1) {
-        tty->print_cr("PREFETCH %d, %d lines of size %d bytes", AllocatePrefetchDistance, AllocatePrefetchLines, AllocatePrefetchStepSize);
+        tty->print_cr(" at distance %d, %d lines of %d bytes", AllocatePrefetchDistance, AllocatePrefetchLines, AllocatePrefetchStepSize);
       } else {
-        tty->print_cr("PREFETCH %d, one line", AllocatePrefetchDistance);
+        tty->print_cr(" at distance %d, one line of %d bytes", AllocatePrefetchDistance, AllocatePrefetchStepSize);
       }
     }
     if (PrefetchCopyIntervalInBytes > 0) {
--- a/hotspot/src/cpu/sparc/vm/vm_version_sparc.hpp	Tue Aug 16 11:53:57 2011 -0700
+++ b/hotspot/src/cpu/sparc/vm/vm_version_sparc.hpp	Tue Aug 16 16:59:46 2011 -0700
@@ -121,6 +121,7 @@
   // Returns true if the platform is in the niagara line (T series)
   // and newer than the niagara1.
   static bool is_niagara_plus()         { return is_T_family(_features) && !is_T1_model(_features); }
+  static bool is_T4()                   { return is_T_family(_features) && has_cbcond(); }
 
   // Fujitsu SPARC64
   static bool is_sparc64()              { return (_features & sparc64_family_m) != 0; }
@@ -130,13 +131,17 @@
 
   static bool has_fast_fxtof()          { return is_niagara() || is_sparc64() || has_v9() && !is_ultra3(); }
   static bool has_fast_idiv()           { return is_niagara_plus() || is_sparc64(); }
+
   // T4 and newer Sparc have fast RDPC instruction.
-  static bool has_fast_rdpc()           { return is_niagara_plus() && has_cbcond(); }
+  static bool has_fast_rdpc()           { return is_T4(); }
+
+  // T4 and newer Sparc have Most-Recently-Used (MRU) BIS.
+  static bool has_mru_blk_init()        { return has_blk_init() && is_T4(); }
 
   static const char* cpu_features()     { return _features_str; }
 
-  static intx L1_data_cache_line_size()  {
-    return 64;  // default prefetch block size on sparc
+  static intx prefetch_data_size()  {
+    return is_T4() ? 32 : 64;  // default prefetch block size on sparc
   }
 
   // Prefetch
--- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp	Tue Aug 16 11:53:57 2011 -0700
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp	Tue Aug 16 16:59:46 2011 -0700
@@ -2315,7 +2315,7 @@
 }
 
 void Assembler::prefetchr(Address src) {
-  NOT_LP64(assert(VM_Version::supports_3dnow_prefetch(), "must support"));
+  assert(VM_Version::supports_3dnow_prefetch(), "must support");
   InstructionMark im(this);
   prefetch_prefix(src);
   emit_byte(0x0D);
@@ -2347,7 +2347,7 @@
 }
 
 void Assembler::prefetchw(Address src) {
-  NOT_LP64(assert(VM_Version::supports_3dnow_prefetch(), "must support"));
+  assert(VM_Version::supports_3dnow_prefetch(), "must support");
   InstructionMark im(this);
   prefetch_prefix(src);
   emit_byte(0x0D);
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.cpp	Tue Aug 16 11:53:57 2011 -0700
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.cpp	Tue Aug 16 16:59:46 2011 -0700
@@ -557,14 +557,16 @@
   if( !supports_sse() && supports_3dnow_prefetch() ) AllocatePrefetchInstr = 3;
 
   // Allocation prefetch settings
-  intx cache_line_size = L1_data_cache_line_size();
+  intx cache_line_size = prefetch_data_size();
   if( cache_line_size > AllocatePrefetchStepSize )
     AllocatePrefetchStepSize = cache_line_size;
-  if( FLAG_IS_DEFAULT(AllocatePrefetchLines) )
-    AllocatePrefetchLines = 3; // Optimistic value
+
   assert(AllocatePrefetchLines > 0, "invalid value");
-  if( AllocatePrefetchLines < 1 ) // set valid value in product VM
-    AllocatePrefetchLines = 1; // Conservative value
+  if( AllocatePrefetchLines < 1 )     // set valid value in product VM
+    AllocatePrefetchLines = 3;
+  assert(AllocateInstancePrefetchLines > 0, "invalid value");
+  if( AllocateInstancePrefetchLines < 1 ) // set valid value in product VM
+    AllocateInstancePrefetchLines = 1;
 
   AllocatePrefetchDistance = allocate_prefetch_distance();
   AllocatePrefetchStyle    = allocate_prefetch_style();
@@ -601,10 +603,11 @@
     tty->print_cr("Logical CPUs per core: %u",
                   logical_processors_per_package());
     tty->print_cr("UseSSE=%d",UseSSE);
-    tty->print("Allocation: ");
+    tty->print("Allocation");
     if (AllocatePrefetchStyle <= 0 || UseSSE == 0 && !supports_3dnow_prefetch()) {
-      tty->print_cr("no prefetching");
+      tty->print_cr(": no prefetching");
     } else {
+      tty->print(" prefetching: ");
       if (UseSSE == 0 && supports_3dnow_prefetch()) {
         tty->print("PREFETCHW");
       } else if (UseSSE >= 1) {
@@ -619,9 +622,9 @@
         }
       }
       if (AllocatePrefetchLines > 1) {
-        tty->print_cr(" %d, %d lines with step %d bytes", AllocatePrefetchDistance, AllocatePrefetchLines, AllocatePrefetchStepSize);
+        tty->print_cr(" at distance %d, %d lines of %d bytes", AllocatePrefetchDistance, AllocatePrefetchLines, AllocatePrefetchStepSize);
       } else {
-        tty->print_cr(" %d, one line", AllocatePrefetchDistance);
+        tty->print_cr(" at distance %d, one line of %d bytes", AllocatePrefetchDistance, AllocatePrefetchStepSize);
       }
     }
 
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.hpp	Tue Aug 16 11:53:57 2011 -0700
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.hpp	Tue Aug 16 16:59:46 2011 -0700
@@ -419,7 +419,7 @@
     return result;
   }
 
-  static intx L1_data_cache_line_size()  {
+  static intx prefetch_data_size()  {
     intx result = 0;
     if (is_intel()) {
       result = (_cpuid_info.dcp_cpuid4_ebx.bits.L1_line_size + 1);
--- a/hotspot/src/cpu/x86/vm/x86_32.ad	Tue Aug 16 11:53:57 2011 -0700
+++ b/hotspot/src/cpu/x86/vm/x86_32.ad	Tue Aug 16 16:59:46 2011 -0700
@@ -7325,8 +7325,9 @@
   ins_cost(100);
 
   format %{ "PREFETCHR $mem\t! Prefetch into level 1 cache for read" %}
-  opcode(0x0F, 0x0d);     /* Opcode 0F 0d /0 */
-  ins_encode(OpcP, OpcS, RMopc_Mem(0x00,mem));
+  ins_encode %{
+    __ prefetchr($mem$$Address);
+  %}
   ins_pipe(ialu_mem);
 %}
 
@@ -7336,8 +7337,9 @@
   ins_cost(100);
 
   format %{ "PREFETCHNTA $mem\t! Prefetch into non-temporal cache for read" %}
-  opcode(0x0F, 0x18);     /* Opcode 0F 18 /0 */
-  ins_encode(OpcP, OpcS, RMopc_Mem(0x00,mem));
+  ins_encode %{
+    __ prefetchnta($mem$$Address);
+  %}
   ins_pipe(ialu_mem);
 %}
 
@@ -7347,8 +7349,9 @@
   ins_cost(100);
 
   format %{ "PREFETCHT0 $mem\t! Prefetch into L1 and L2 caches for read" %}
-  opcode(0x0F, 0x18);     /* Opcode 0F 18 /1 */
-  ins_encode(OpcP, OpcS, RMopc_Mem(0x01,mem));
+  ins_encode %{
+    __ prefetcht0($mem$$Address);
+  %}
   ins_pipe(ialu_mem);
 %}
 
@@ -7358,8 +7361,9 @@
   ins_cost(100);
 
   format %{ "PREFETCHT2 $mem\t! Prefetch into L2 cache for read" %}
-  opcode(0x0F, 0x18);     /* Opcode 0F 18 /3 */
-  ins_encode(OpcP, OpcS, RMopc_Mem(0x03,mem));
+  ins_encode %{
+    __ prefetcht2($mem$$Address);
+  %}
   ins_pipe(ialu_mem);
 %}
 
@@ -7374,46 +7378,86 @@
 %}
 
 instruct prefetchw( memory mem ) %{
-  predicate(UseSSE==0 && VM_Version::supports_3dnow_prefetch() || AllocatePrefetchInstr==3);
+  predicate(UseSSE==0 && VM_Version::supports_3dnow_prefetch());
   match( PrefetchWrite mem );
   ins_cost(100);
 
   format %{ "PREFETCHW $mem\t! Prefetch into L1 cache and mark modified" %}
-  opcode(0x0F, 0x0D);     /* Opcode 0F 0D /1 */
-  ins_encode(OpcP, OpcS, RMopc_Mem(0x01,mem));
+  ins_encode %{
+    __ prefetchw($mem$$Address);
+  %}
   ins_pipe(ialu_mem);
 %}
 
 instruct prefetchwNTA( memory mem ) %{
-  predicate(UseSSE>=1 && AllocatePrefetchInstr==0);
+  predicate(UseSSE>=1);
   match(PrefetchWrite mem);
   ins_cost(100);
 
   format %{ "PREFETCHNTA $mem\t! Prefetch into non-temporal cache for write" %}
-  opcode(0x0F, 0x18);     /* Opcode 0F 18 /0 */
-  ins_encode(OpcP, OpcS, RMopc_Mem(0x00,mem));
+  ins_encode %{
+    __ prefetchnta($mem$$Address);
+  %}
   ins_pipe(ialu_mem);
 %}
 
-instruct prefetchwT0( memory mem ) %{
-  predicate(UseSSE>=1 && AllocatePrefetchInstr==1);
-  match(PrefetchWrite mem);
+// Prefetch instructions for allocation.
+
+instruct prefetchAlloc0( memory mem ) %{
+  predicate(UseSSE==0 && AllocatePrefetchInstr!=3);
+  match(PrefetchAllocation mem);
+  ins_cost(0);
+  size(0);
+  format %{ "Prefetch allocation (non-SSE is empty encoding)" %}
+  ins_encode();
+  ins_pipe(empty);
+%}
+
+instruct prefetchAlloc( memory mem ) %{
+  predicate(AllocatePrefetchInstr==3);
+  match( PrefetchAllocation mem );
   ins_cost(100);
 
-  format %{ "PREFETCHT0 $mem\t! Prefetch into L1 and L2 caches for write" %}
-  opcode(0x0F, 0x18);     /* Opcode 0F 18 /1 */
-  ins_encode(OpcP, OpcS, RMopc_Mem(0x01,mem));
+  format %{ "PREFETCHW $mem\t! Prefetch allocation into L1 cache and mark modified" %}
+  ins_encode %{
+    __ prefetchw($mem$$Address);
+  %}
+  ins_pipe(ialu_mem);
+%}
+
+instruct prefetchAllocNTA( memory mem ) %{
+  predicate(UseSSE>=1 && AllocatePrefetchInstr==0);
+  match(PrefetchAllocation mem);
+  ins_cost(100);
+
+  format %{ "PREFETCHNTA $mem\t! Prefetch allocation into non-temporal cache for write" %}
+  ins_encode %{
+    __ prefetchnta($mem$$Address);
+  %}
   ins_pipe(ialu_mem);
 %}
 
-instruct prefetchwT2( memory mem ) %{
-  predicate(UseSSE>=1 && AllocatePrefetchInstr==2);
-  match(PrefetchWrite mem);
+instruct prefetchAllocT0( memory mem ) %{
+  predicate(UseSSE>=1 && AllocatePrefetchInstr==1);
+  match(PrefetchAllocation mem);
   ins_cost(100);
 
-  format %{ "PREFETCHT2 $mem\t! Prefetch into L2 cache for write" %}
-  opcode(0x0F, 0x18);     /* Opcode 0F 18 /3 */
-  ins_encode(OpcP, OpcS, RMopc_Mem(0x03,mem));
+  format %{ "PREFETCHT0 $mem\t! Prefetch allocation into L1 and L2 caches for write" %}
+  ins_encode %{
+    __ prefetcht0($mem$$Address);
+  %}
+  ins_pipe(ialu_mem);
+%}
+
+instruct prefetchAllocT2( memory mem ) %{
+  predicate(UseSSE>=1 && AllocatePrefetchInstr==2);
+  match(PrefetchAllocation mem);
+  ins_cost(100);
+
+  format %{ "PREFETCHT2 $mem\t! Prefetch allocation into L2 cache for write" %}
+  ins_encode %{
+    __ prefetcht2($mem$$Address);
+  %}
   ins_pipe(ialu_mem);
 %}
 
--- a/hotspot/src/cpu/x86/vm/x86_64.ad	Tue Aug 16 11:53:57 2011 -0700
+++ b/hotspot/src/cpu/x86/vm/x86_64.ad	Tue Aug 16 16:59:46 2011 -0700
@@ -6617,8 +6617,9 @@
   ins_cost(125);
 
   format %{ "PREFETCHR $mem\t# Prefetch into level 1 cache" %}
-  opcode(0x0F, 0x0D);     /* Opcode 0F 0D /0 */
-  ins_encode(REX_mem(mem), OpcP, OpcS, RM_opc_mem(0x00, mem));
+  ins_encode %{
+    __ prefetchr($mem$$Address);
+  %}
   ins_pipe(ialu_mem);
 %}
 
@@ -6628,8 +6629,9 @@
   ins_cost(125);
 
   format %{ "PREFETCHNTA $mem\t# Prefetch into non-temporal cache for read" %}
-  opcode(0x0F, 0x18);     /* Opcode 0F 18 /0 */
-  ins_encode(REX_mem(mem), OpcP, OpcS, RM_opc_mem(0x00, mem));
+  ins_encode %{
+    __ prefetchnta($mem$$Address);
+  %}
   ins_pipe(ialu_mem);
 %}
 
@@ -6639,8 +6641,9 @@
   ins_cost(125);
 
   format %{ "PREFETCHT0 $mem\t# prefetch into L1 and L2 caches for read" %}
-  opcode(0x0F, 0x18); /* Opcode 0F 18 /1 */
-  ins_encode(REX_mem(mem), OpcP, OpcS, RM_opc_mem(0x01, mem));
+  ins_encode %{
+    __ prefetcht0($mem$$Address);
+  %}
   ins_pipe(ialu_mem);
 %}
 
@@ -6650,52 +6653,70 @@
   ins_cost(125);
 
   format %{ "PREFETCHT2 $mem\t# prefetch into L2 caches for read" %}
-  opcode(0x0F, 0x18); /* Opcode 0F 18 /3 */
-  ins_encode(REX_mem(mem), OpcP, OpcS, RM_opc_mem(0x03, mem));
-  ins_pipe(ialu_mem);
-%}
-
-instruct prefetchw( memory mem ) %{
-  predicate(AllocatePrefetchInstr==3);
-  match(PrefetchWrite mem);
-  ins_cost(125);
-
-  format %{ "PREFETCHW $mem\t# Prefetch into level 1 cache and mark modified" %}
-  opcode(0x0F, 0x0D);     /* Opcode 0F 0D /1 */
-  ins_encode(REX_mem(mem), OpcP, OpcS, RM_opc_mem(0x01, mem));
+  ins_encode %{
+    __ prefetcht2($mem$$Address);
+  %}
   ins_pipe(ialu_mem);
 %}
 
 instruct prefetchwNTA( memory mem ) %{
-  predicate(AllocatePrefetchInstr==0);
   match(PrefetchWrite mem);
   ins_cost(125);
 
   format %{ "PREFETCHNTA $mem\t# Prefetch to non-temporal cache for write" %}
-  opcode(0x0F, 0x18);     /* Opcode 0F 18 /0 */
-  ins_encode(REX_mem(mem), OpcP, OpcS, RM_opc_mem(0x00, mem));
+  ins_encode %{
+    __ prefetchnta($mem$$Address);
+  %}
   ins_pipe(ialu_mem);
 %}
 
-instruct prefetchwT0( memory mem ) %{
-  predicate(AllocatePrefetchInstr==1);
-  match(PrefetchWrite mem);
+// Prefetch instructions for allocation.
+
+instruct prefetchAlloc( memory mem ) %{
+  predicate(AllocatePrefetchInstr==3);
+  match(PrefetchAllocation mem);
+  ins_cost(125);
+
+  format %{ "PREFETCHW $mem\t# Prefetch allocation into level 1 cache and mark modified" %}
+  ins_encode %{
+    __ prefetchw($mem$$Address);
+  %}
+  ins_pipe(ialu_mem);
+%}
+
+instruct prefetchAllocNTA( memory mem ) %{
+  predicate(AllocatePrefetchInstr==0);
+  match(PrefetchAllocation mem);
   ins_cost(125);
 
-  format %{ "PREFETCHT0 $mem\t# Prefetch to level 1 and 2 caches for write" %}
-  opcode(0x0F, 0x18);     /* Opcode 0F 18 /1 */
-  ins_encode(REX_mem(mem), OpcP, OpcS, RM_opc_mem(0x01, mem));
+  format %{ "PREFETCHNTA $mem\t# Prefetch allocation to non-temporal cache for write" %}
+  ins_encode %{
+    __ prefetchnta($mem$$Address);
+  %}
   ins_pipe(ialu_mem);
 %}
 
-instruct prefetchwT2( memory mem ) %{
-  predicate(AllocatePrefetchInstr==2);
-  match(PrefetchWrite mem);
+instruct prefetchAllocT0( memory mem ) %{
+  predicate(AllocatePrefetchInstr==1);
+  match(PrefetchAllocation mem);
   ins_cost(125);
 
-  format %{ "PREFETCHT2 $mem\t# Prefetch to level 2 cache for write" %}
-  opcode(0x0F, 0x18);     /* Opcode 0F 18 /3 */
-  ins_encode(REX_mem(mem), OpcP, OpcS, RM_opc_mem(0x03, mem));
+  format %{ "PREFETCHT0 $mem\t# Prefetch allocation to level 1 and 2 caches for write" %}
+  ins_encode %{
+    __ prefetcht0($mem$$Address);
+  %}
+  ins_pipe(ialu_mem);
+%}
+
+instruct prefetchAllocT2( memory mem ) %{
+  predicate(AllocatePrefetchInstr==2);
+  match(PrefetchAllocation mem);
+  ins_cost(125);
+
+  format %{ "PREFETCHT2 $mem\t# Prefetch allocation to level 2 cache for write" %}
+  ins_encode %{
+    __ prefetcht2($mem$$Address);
+  %}
   ins_pipe(ialu_mem);
 %}
 
--- a/hotspot/src/share/vm/adlc/formssel.cpp	Tue Aug 16 11:53:57 2011 -0700
+++ b/hotspot/src/share/vm/adlc/formssel.cpp	Tue Aug 16 16:59:46 2011 -0700
@@ -3390,7 +3390,9 @@
     "ClearArray"
   };
   int cnt = sizeof(needs_ideal_memory_list)/sizeof(char*);
-  if( strcmp(_opType,"PrefetchRead")==0 || strcmp(_opType,"PrefetchWrite")==0 )
+  if( strcmp(_opType,"PrefetchRead")==0 ||
+      strcmp(_opType,"PrefetchWrite")==0 ||
+      strcmp(_opType,"PrefetchAllocation")==0 )
     return 1;
   if( _lChild ) {
     const char *opType = _lChild->_opType;
--- a/hotspot/src/share/vm/memory/threadLocalAllocBuffer.hpp	Tue Aug 16 11:53:57 2011 -0700
+++ b/hotspot/src/share/vm/memory/threadLocalAllocBuffer.hpp	Tue Aug 16 16:59:46 2011 -0700
@@ -124,16 +124,7 @@
   // Reserve space at the end of TLAB
   static size_t end_reserve() {
     int reserve_size = typeArrayOopDesc::header_size(T_INT);
-    if (AllocatePrefetchStyle == 3) {
-      // BIS is used to prefetch - we need a space for it.
-      // +1 for rounding up to next cache line +1 to be safe
-      int lines = AllocatePrefetchLines + 2;
-      int step_size = AllocatePrefetchStepSize;
-      int distance = AllocatePrefetchDistance;
-      int prefetch_end = (distance + step_size*lines)/(int)HeapWordSize;
-      reserve_size = MAX2(reserve_size, prefetch_end);
-    }
-    return reserve_size;
+    return MAX2(reserve_size, VM_Version::reserve_for_allocation_prefetch());
   }
   static size_t alignment_reserve()              { return align_object_size(end_reserve()); }
   static size_t alignment_reserve_in_bytes()     { return alignment_reserve() * HeapWordSize; }
--- a/hotspot/src/share/vm/opto/classes.hpp	Tue Aug 16 11:53:57 2011 -0700
+++ b/hotspot/src/share/vm/opto/classes.hpp	Tue Aug 16 16:59:46 2011 -0700
@@ -196,6 +196,7 @@
 macro(PopCountI)
 macro(PopCountL)
 macro(PowD)
+macro(PrefetchAllocation)
 macro(PrefetchRead)
 macro(PrefetchWrite)
 macro(Proj)
--- a/hotspot/src/share/vm/opto/macro.cpp	Tue Aug 16 11:53:57 2011 -0700
+++ b/hotspot/src/share/vm/opto/macro.cpp	Tue Aug 16 16:59:46 2011 -0700
@@ -1590,7 +1590,7 @@
         prefetch_adr = new (C, 4) AddPNode( old_pf_wm, new_pf_wmt,
                                             _igvn.MakeConX(distance) );
         transform_later(prefetch_adr);
-        prefetch = new (C, 3) PrefetchWriteNode( i_o, prefetch_adr );
+        prefetch = new (C, 3) PrefetchAllocationNode( i_o, prefetch_adr );
         transform_later(prefetch);
         distance += step_size;
         i_o = prefetch;
@@ -1611,13 +1611,14 @@
       contended_phi_rawmem = pf_phi_rawmem;
       i_o = pf_phi_abio;
    } else if( UseTLAB && AllocatePrefetchStyle == 3 ) {
-      // Insert a prefetch for each allocation only on the fast-path
+      // Insert a prefetch for each allocation.
+      // This code is used for Sparc with BIS.
       Node *pf_region = new (C, 3) RegionNode(3);
       Node *pf_phi_rawmem = new (C, 3) PhiNode( pf_region, Type::MEMORY,
                                                 TypeRawPtr::BOTTOM );
 
-      // Generate several prefetch instructions only for arrays.
-      uint lines = (length != NULL) ? AllocatePrefetchLines : 1;
+      // Generate several prefetch instructions.
+      uint lines = (length != NULL) ? AllocatePrefetchLines : AllocateInstancePrefetchLines;
       uint step_size = AllocatePrefetchStepSize;
       uint distance = AllocatePrefetchDistance;
 
@@ -1634,7 +1635,7 @@
       transform_later(cache_adr);
 
       // Prefetch
-      Node *prefetch = new (C, 3) PrefetchWriteNode( contended_phi_rawmem, cache_adr );
+      Node *prefetch = new (C, 3) PrefetchAllocationNode( contended_phi_rawmem, cache_adr );
       prefetch->set_req(0, needgc_false);
       transform_later(prefetch);
       contended_phi_rawmem = prefetch;
@@ -1644,7 +1645,7 @@
         prefetch_adr = new (C, 4) AddPNode( cache_adr, cache_adr,
                                             _igvn.MakeConX(distance) );
         transform_later(prefetch_adr);
-        prefetch = new (C, 3) PrefetchWriteNode( contended_phi_rawmem, prefetch_adr );
+        prefetch = new (C, 3) PrefetchAllocationNode( contended_phi_rawmem, prefetch_adr );
         transform_later(prefetch);
         distance += step_size;
         contended_phi_rawmem = prefetch;
@@ -1653,15 +1654,15 @@
       // Insert a prefetch for each allocation only on the fast-path
       Node *prefetch_adr;
       Node *prefetch;
-      // Generate several prefetch instructions only for arrays.
-      uint lines = (length != NULL) ? AllocatePrefetchLines : 1;
+      // Generate several prefetch instructions.
+      uint lines = (length != NULL) ? AllocatePrefetchLines : AllocateInstancePrefetchLines;
       uint step_size = AllocatePrefetchStepSize;
       uint distance = AllocatePrefetchDistance;
       for ( uint i = 0; i < lines; i++ ) {
         prefetch_adr = new (C, 4) AddPNode( old_eden_top, new_eden_top,
                                             _igvn.MakeConX(distance) );
         transform_later(prefetch_adr);
-        prefetch = new (C, 3) PrefetchWriteNode( i_o, prefetch_adr );
+        prefetch = new (C, 3) PrefetchAllocationNode( i_o, prefetch_adr );
         // Do not let it float too high, since if eden_top == eden_end,
         // both might be null.
         if( i == 0 ) { // Set control for first prefetch, next follows it
--- a/hotspot/src/share/vm/opto/matcher.cpp	Tue Aug 16 11:53:57 2011 -0700
+++ b/hotspot/src/share/vm/opto/matcher.cpp	Tue Aug 16 16:59:46 2011 -0700
@@ -826,6 +826,7 @@
     switch (n->Opcode()) {
     case Op_PrefetchRead:
     case Op_PrefetchWrite:
+    case Op_PrefetchAllocation:
       nidx = Compile::AliasIdxRaw;
       nat = TypeRawPtr::BOTTOM;
       break;
--- a/hotspot/src/share/vm/opto/memnode.hpp	Tue Aug 16 11:53:57 2011 -0700
+++ b/hotspot/src/share/vm/opto/memnode.hpp	Tue Aug 16 16:59:46 2011 -0700
@@ -1278,6 +1278,16 @@
   virtual int Opcode() const;
   virtual uint ideal_reg() const { return NotAMachineReg; }
   virtual uint match_edge(uint idx) const { return idx==2; }
+  virtual const Type *bottom_type() const { return Type::ABIO; }
+};
+
+// Allocation prefetch which may fault; the TLAB size has to be adjusted.
+class PrefetchAllocationNode : public Node {
+public:
+  PrefetchAllocationNode(Node *mem, Node *adr) : Node(0,mem,adr) {}
+  virtual int Opcode() const;
+  virtual uint ideal_reg() const { return NotAMachineReg; }
+  virtual uint match_edge(uint idx) const { return idx==2; }
   virtual const Type *bottom_type() const { return ( AllocatePrefetchStyle == 3 ) ? Type::MEMORY : Type::ABIO; }
 };
 
--- a/hotspot/src/share/vm/runtime/globals.hpp	Tue Aug 16 11:53:57 2011 -0700
+++ b/hotspot/src/share/vm/runtime/globals.hpp	Tue Aug 16 16:59:46 2011 -0700
@@ -2897,8 +2897,11 @@
   product(intx,  AllocatePrefetchDistance, -1,                              \
           "Distance to prefetch ahead of allocation pointer")               \
                                                                             \
-  product(intx,  AllocatePrefetchLines, 1,                                  \
-          "Number of lines to prefetch ahead of allocation pointer")        \
+  product(intx,  AllocatePrefetchLines, 3,                                  \
+          "Number of lines to prefetch ahead of array allocation pointer")  \
+                                                                            \
+  product(intx,  AllocateInstancePrefetchLines, 1,                          \
+          "Number of lines to prefetch ahead of instance allocation pointer") \
                                                                             \
   product(intx,  AllocatePrefetchStepSize, 16,                              \
           "Step size in bytes of sequential prefetch instructions")         \
--- a/hotspot/src/share/vm/runtime/vm_version.cpp	Tue Aug 16 11:53:57 2011 -0700
+++ b/hotspot/src/share/vm/runtime/vm_version.cpp	Tue Aug 16 16:59:46 2011 -0700
@@ -46,6 +46,7 @@
 const char* Abstract_VM_Version::_s_internal_vm_info_string = Abstract_VM_Version::internal_vm_info_string();
 bool Abstract_VM_Version::_supports_cx8 = false;
 unsigned int Abstract_VM_Version::_logical_processors_per_package = 1U;
+int Abstract_VM_Version::_reserve_for_allocation_prefetch = 0;
 
 #ifndef HOTSPOT_RELEASE_VERSION
   #error HOTSPOT_RELEASE_VERSION must be defined
--- a/hotspot/src/share/vm/runtime/vm_version.hpp	Tue Aug 16 11:53:57 2011 -0700
+++ b/hotspot/src/share/vm/runtime/vm_version.hpp	Tue Aug 16 16:59:46 2011 -0700
@@ -44,6 +44,7 @@
   static bool         _initialized;
   static int          _parallel_worker_threads;
   static bool         _parallel_worker_threads_initialized;
+  static int          _reserve_for_allocation_prefetch;
 
   static unsigned int nof_parallel_worker_threads(unsigned int num,
                                                   unsigned int dem,
@@ -77,6 +78,12 @@
     return _logical_processors_per_package;
   }
 
+  // Need a space at the end of TLAB for prefetch instructions
+  // which may fault when accessing memory outside of heap.
+  static int reserve_for_allocation_prefetch() {
+    return _reserve_for_allocation_prefetch;
+  }
+
   // ARCH specific policy for the BiasedLocking
   static bool use_biased_locking()  { return true; }