8005522: use fast-string instructions on x86 for zeroing
authorkvn
Thu, 03 Jan 2013 15:09:55 -0800
changeset 15114 4074553c678b
parent 15113 823590505eb4
child 15115 f8ef87f6f07f
8005522: use fast-string instructions on x86 for zeroing Summary: use 'rep stosb' instead of 'rep stosq' when fast-string operations are available. Reviewed-by: twisti, roland
hotspot/src/cpu/x86/vm/assembler_x86.cpp
hotspot/src/cpu/x86/vm/assembler_x86.hpp
hotspot/src/cpu/x86/vm/globals_x86.hpp
hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp
hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp
hotspot/src/cpu/x86/vm/vm_version_x86.cpp
hotspot/src/cpu/x86/vm/vm_version_x86.hpp
hotspot/src/cpu/x86/vm/x86_32.ad
hotspot/src/cpu/x86/vm/x86_64.ad
hotspot/src/share/vm/opto/memnode.cpp
--- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp	Sun Dec 23 17:08:22 2012 +0100
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp	Thu Jan 03 15:09:55 2013 -0800
@@ -2544,12 +2544,18 @@
   emit_int8((unsigned char)0xA5);
 }
 
+// sets rcx bytes with rax, value at [edi]
+void Assembler::rep_stosb() {
+  emit_int8((unsigned char)0xF3); // REP
+  LP64_ONLY(prefix(REX_W));
+  emit_int8((unsigned char)0xAA); // STOSB
+}
+
 // sets rcx pointer sized words with rax, value at [edi]
 // generic
-void Assembler::rep_set() { // rep_set
-  emit_int8((unsigned char)0xF3);
-  // STOSQ
-  LP64_ONLY(prefix(REX_W));
+void Assembler::rep_stos() {
+  emit_int8((unsigned char)0xF3); // REP
+  LP64_ONLY(prefix(REX_W));       // LP64:STOSQ, LP32:STOSD
   emit_int8((unsigned char)0xAB);
 }
 
--- a/hotspot/src/cpu/x86/vm/assembler_x86.hpp	Sun Dec 23 17:08:22 2012 +0100
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.hpp	Thu Jan 03 15:09:55 2013 -0800
@@ -832,7 +832,8 @@
 
   // These do register sized moves/scans
   void rep_mov();
-  void rep_set();
+  void rep_stos();
+  void rep_stosb();
   void repne_scan();
 #ifdef _LP64
   void repne_scanl();
--- a/hotspot/src/cpu/x86/vm/globals_x86.hpp	Sun Dec 23 17:08:22 2012 +0100
+++ b/hotspot/src/cpu/x86/vm/globals_x86.hpp	Thu Jan 03 15:09:55 2013 -0800
@@ -120,6 +120,9 @@
   product(bool, UseUnalignedLoadStores, false,                              \
           "Use SSE2 MOVDQU instruction for Arraycopy")                      \
                                                                             \
+  product(bool, UseFastStosb, false,                                        \
+          "Use fast-string operation for zeroing: rep stosb")               \
+                                                                            \
   /* assembler */                                                           \
   product(bool, Use486InstrsOnly, false,                                    \
           "Use 80486 Compliant instruction subset")                         \
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp	Sun Dec 23 17:08:22 2012 +0100
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp	Thu Jan 03 15:09:55 2013 -0800
@@ -5224,6 +5224,22 @@
 
 }
 
+void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp) {
+  // cnt - number of qwords (8-byte words).
+  // base - start address, qword aligned.
+  assert(base==rdi, "base register must be edi for rep stos");
+  assert(tmp==rax,   "tmp register must be eax for rep stos");
+  assert(cnt==rcx,   "cnt register must be ecx for rep stos");
+
+  xorptr(tmp, tmp);
+  if (UseFastStosb) {
+    shlptr(cnt,3); // convert to number of bytes
+    rep_stosb();
+  } else {
+    NOT_LP64(shlptr(cnt,1);) // convert to number of dwords for 32-bit VM
+    rep_stos();
+  }
+}
 
 // IndexOf for constant substrings with size >= 8 chars
 // which don't need to be loaded through stack.
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp	Sun Dec 23 17:08:22 2012 +0100
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp	Thu Jan 03 15:09:55 2013 -0800
@@ -1096,6 +1096,9 @@
   // C2 compiled method's prolog code.
   void verified_entry(int framesize, bool stack_bang, bool fp_mode_24b);
 
+  // clear memory of size 'cnt' qwords, starting at 'base'.
+  void clear_mem(Register base, Register cnt, Register rtmp);
+
   // IndexOf strings.
   // Small strings are loaded through stack if they cross page boundary.
   void string_indexof(Register str1, Register str2,
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.cpp	Sun Dec 23 17:08:22 2012 +0100
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.cpp	Thu Jan 03 15:09:55 2013 -0800
@@ -429,7 +429,7 @@
   }
 
   char buf[256];
-  jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
+  jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
                cores_per_cpu(), threads_per_core(),
                cpu_family(), _model, _stepping,
                (supports_cmov() ? ", cmov" : ""),
@@ -446,6 +446,7 @@
                (supports_avx()    ? ", avx" : ""),
                (supports_avx2()   ? ", avx2" : ""),
                (supports_aes()    ? ", aes" : ""),
+               (supports_erms()   ? ", erms" : ""),
                (supports_mmx_ext() ? ", mmxext" : ""),
                (supports_3dnow_prefetch() ? ", 3dnowpref" : ""),
                (supports_lzcnt()   ? ", lzcnt": ""),
@@ -671,6 +672,16 @@
     FLAG_SET_DEFAULT(UsePopCountInstruction, false);
   }
 
+  // Use fast-string operations if available.
+  if (supports_erms()) {
+    if (FLAG_IS_DEFAULT(UseFastStosb)) {
+      UseFastStosb = true;
+    }
+  } else if (UseFastStosb) {
+    warning("fast-string operations are not available on this CPU");
+    FLAG_SET_DEFAULT(UseFastStosb, false);
+  }
+
 #ifdef COMPILER2
   if (FLAG_IS_DEFAULT(AlignVector)) {
     // Modern processors allow misaligned memory operations for vectors.
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.hpp	Sun Dec 23 17:08:22 2012 +0100
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.hpp	Thu Jan 03 15:09:55 2013 -0800
@@ -204,7 +204,8 @@
                    avx2 : 1,
                         : 2,
                    bmi2 : 1,
-                        : 23;
+                   erms : 1,
+                        : 22;
     } bits;
   };
 
@@ -247,7 +248,8 @@
     CPU_TSCINV = (1 << 16),
     CPU_AVX    = (1 << 17),
     CPU_AVX2   = (1 << 18),
-    CPU_AES    = (1 << 19)
+    CPU_AES    = (1 << 19),
+    CPU_ERMS   = (1 << 20) // enhanced 'rep movsb/stosb' instructions
   } cpuFeatureFlags;
 
   enum {
@@ -425,6 +427,8 @@
       result |= CPU_TSCINV;
     if (_cpuid_info.std_cpuid1_ecx.bits.aes != 0)
       result |= CPU_AES;
+    if (_cpuid_info.sef_cpuid7_ebx.bits.erms != 0)
+      result |= CPU_ERMS;
 
     // AMD features.
     if (is_amd()) {
@@ -489,7 +493,7 @@
     return (_cpuid_info.std_max_function >= 0xB) &&
            // eax[4:0] | ebx[0:15] == 0 indicates invalid topology level.
            // Some cpus have max cpuid >= 0xB but do not support processor topology.
-           ((_cpuid_info.tpl_cpuidB0_eax & 0x1f | _cpuid_info.tpl_cpuidB0_ebx.bits.logical_cpus) != 0);
+           (((_cpuid_info.tpl_cpuidB0_eax & 0x1f) | _cpuid_info.tpl_cpuidB0_ebx.bits.logical_cpus) != 0);
   }
 
   static uint cores_per_cpu()  {
@@ -550,6 +554,7 @@
   static bool supports_avx2()     { return (_cpuFeatures & CPU_AVX2) != 0; }
   static bool supports_tsc()      { return (_cpuFeatures & CPU_TSC)    != 0; }
   static bool supports_aes()      { return (_cpuFeatures & CPU_AES) != 0; }
+  static bool supports_erms()     { return (_cpuFeatures & CPU_ERMS) != 0; }
 
   // Intel features
   static bool is_intel_family_core() { return is_intel() &&
--- a/hotspot/src/cpu/x86/vm/x86_32.ad	Sun Dec 23 17:08:22 2012 +0100
+++ b/hotspot/src/cpu/x86/vm/x86_32.ad	Thu Jan 03 15:09:55 2013 -0800
@@ -11572,15 +11572,28 @@
 // =======================================================================
 // fast clearing of an array
 instruct rep_stos(eCXRegI cnt, eDIRegP base, eAXRegI zero, Universe dummy, eFlagsReg cr) %{
+  predicate(!UseFastStosb);
   match(Set dummy (ClearArray cnt base));
   effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
-  format %{ "SHL    ECX,1\t# Convert doublewords to words\n\t"
-            "XOR    EAX,EAX\n\t"
+  format %{ "XOR    EAX,EAX\t# ClearArray:\n\t"
+            "SHL    ECX,1\t# Convert doublewords to words\n\t"
             "REP STOS\t# store EAX into [EDI++] while ECX--" %}
-  opcode(0,0x4);
-  ins_encode( Opcode(0xD1), RegOpc(ECX),
-              OpcRegReg(0x33,EAX,EAX),
-              Opcode(0xF3), Opcode(0xAB) );
+  ins_encode %{ 
+    __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rep_fast_stosb(eCXRegI cnt, eDIRegP base, eAXRegI zero, Universe dummy, eFlagsReg cr) %{
+  predicate(UseFastStosb);
+  match(Set dummy (ClearArray cnt base));
+  effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
+  format %{ "XOR    EAX,EAX\t# ClearArray:\n\t"
+            "SHL    ECX,3\t# Convert doublewords to bytes\n\t"
+            "REP STOSB\t# store EAX into [EDI++] while ECX--" %}
+  ins_encode %{ 
+    __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register);
+  %}
   ins_pipe( pipe_slow );
 %}
 
--- a/hotspot/src/cpu/x86/vm/x86_64.ad	Sun Dec 23 17:08:22 2012 +0100
+++ b/hotspot/src/cpu/x86/vm/x86_64.ad	Thu Jan 03 15:09:55 2013 -0800
@@ -10374,16 +10374,33 @@
 instruct rep_stos(rcx_RegL cnt, rdi_RegP base, rax_RegI zero, Universe dummy,
                   rFlagsReg cr)
 %{
+  predicate(!UseFastStosb);
   match(Set dummy (ClearArray cnt base));
   effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
 
-  format %{ "xorl    rax, rax\t# ClearArray:\n\t"
-            "rep stosq\t# Store rax to *rdi++ while rcx--" %}
-  ins_encode(opc_reg_reg(0x33, RAX, RAX), // xorl %eax, %eax
-             Opcode(0xF3), Opcode(0x48), Opcode(0xAB)); // rep REX_W stos
+  format %{ "xorq    rax, rax\t# ClearArray:\n\t"
+            "rep     stosq\t# Store rax to *rdi++ while rcx--" %}
+  ins_encode %{ 
+    __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register);
+  %}
   ins_pipe(pipe_slow);
 %}
 
+instruct rep_fast_stosb(rcx_RegL cnt, rdi_RegP base, rax_RegI zero, Universe dummy,
+                        rFlagsReg cr)
+%{
+  predicate(UseFastStosb);
+  match(Set dummy (ClearArray cnt base));
+  effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
+  format %{ "xorq    rax, rax\t# ClearArray:\n\t"
+            "shlq    rcx,3\t# Convert doublewords to bytes\n\t"
+            "rep     stosb\t# Store rax to *rdi++ while rcx--" %}
+  ins_encode %{ 
+    __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct string_compare(rdi_RegP str1, rcx_RegI cnt1, rsi_RegP str2, rdx_RegI cnt2,
                         rax_RegI result, regD tmp1, rFlagsReg cr)
 %{
--- a/hotspot/src/share/vm/opto/memnode.cpp	Sun Dec 23 17:08:22 2012 +0100
+++ b/hotspot/src/share/vm/opto/memnode.cpp	Thu Jan 03 15:09:55 2013 -0800
@@ -2725,10 +2725,8 @@
     zend  = phase->transform( new(C) URShiftXNode(zend,  shift) );
   }
 
+  // Bulk clear double-words
   Node* zsize = phase->transform( new(C) SubXNode(zend, zbase) );
-  Node* zinit = phase->zerocon((unit == BytesPerLong) ? T_LONG : T_INT);
-
-  // Bulk clear double-words
   Node* adr = phase->transform( new(C) AddPNode(dest, dest, start_offset) );
   mem = new (C) ClearArrayNode(ctl, mem, zsize, adr);
   return phase->transform(mem);