8146801: Allocating short arrays of non-constant size is slow
authorshade
Fri, 04 Mar 2016 01:30:11 +0300
changeset 36554 a7eb9ee4680c
parent 36553 203b2b5d149b
child 36555 4f37fd7a5a09
8146801: Allocating short arrays of non-constant size is slow Reviewed-by: kvn, twisti, vlivanov
hotspot/src/cpu/aarch64/vm/aarch64.ad
hotspot/src/cpu/aarch64/vm/globals_aarch64.hpp
hotspot/src/cpu/ppc/vm/globals_ppc.hpp
hotspot/src/cpu/ppc/vm/ppc.ad
hotspot/src/cpu/sparc/vm/globals_sparc.hpp
hotspot/src/cpu/sparc/vm/sparc.ad
hotspot/src/cpu/x86/vm/globals_x86.hpp
hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp
hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp
hotspot/src/cpu/x86/vm/x86_32.ad
hotspot/src/cpu/x86/vm/x86_64.ad
hotspot/src/share/vm/opto/matcher.hpp
hotspot/src/share/vm/opto/memnode.cpp
hotspot/src/share/vm/opto/memnode.hpp
hotspot/src/share/vm/runtime/commandLineFlagConstraintsCompiler.cpp
hotspot/src/share/vm/runtime/commandLineFlagConstraintsCompiler.hpp
hotspot/src/share/vm/runtime/globals.hpp
--- a/hotspot/src/cpu/aarch64/vm/aarch64.ad	Thu Mar 03 23:57:29 2016 +0300
+++ b/hotspot/src/cpu/aarch64/vm/aarch64.ad	Fri Mar 04 01:30:11 2016 +0300
@@ -3425,9 +3425,6 @@
 // false => size gets scaled to BytesPerLong, ok.
 const bool Matcher::init_array_count_is_in_bytes = false;
 
-// Threshold size for cleararray.
-const int Matcher::init_array_short_size = 18 * BytesPerLong;
-
 // Use conditional move (CMOVL)
 const int Matcher::long_cmove_cost() {
   // long cmoves are no more expensive than int cmoves
--- a/hotspot/src/cpu/aarch64/vm/globals_aarch64.hpp	Thu Mar 03 23:57:29 2016 +0300
+++ b/hotspot/src/cpu/aarch64/vm/globals_aarch64.hpp	Fri Mar 04 01:30:11 2016 +0300
@@ -76,6 +76,8 @@
 // avoid biased locking while we are bootstrapping the aarch64 build
 define_pd_global(bool, UseBiasedLocking, false);
 
+define_pd_global(intx, InitArrayShortSize, 18*BytesPerLong);
+
 #if defined(COMPILER1) || defined(COMPILER2)
 define_pd_global(intx, InlineSmallCode,          1000);
 #endif
--- a/hotspot/src/cpu/ppc/vm/globals_ppc.hpp	Thu Mar 03 23:57:29 2016 +0300
+++ b/hotspot/src/cpu/ppc/vm/globals_ppc.hpp	Fri Mar 04 01:30:11 2016 +0300
@@ -76,6 +76,8 @@
 
 define_pd_global(bool, CompactStrings, true);
 
+define_pd_global(intx, InitArrayShortSize, 8*BytesPerLong);
+
 // Platform dependent flag handling: flags only defined on this platform.
 #define ARCH_FLAGS(develop, product, diagnostic, experimental, notproduct, range, constraint)  \
                                                                             \
--- a/hotspot/src/cpu/ppc/vm/ppc.ad	Thu Mar 03 23:57:29 2016 +0300
+++ b/hotspot/src/cpu/ppc/vm/ppc.ad	Fri Mar 04 01:30:11 2016 +0300
@@ -2137,8 +2137,6 @@
   return decode;
 }
 */
-// Threshold size for cleararray.
-const int Matcher::init_array_short_size = 8 * BytesPerLong;
 
 // false => size gets scaled to BytesPerLong, ok.
 const bool Matcher::init_array_count_is_in_bytes = false;
--- a/hotspot/src/cpu/sparc/vm/globals_sparc.hpp	Thu Mar 03 23:57:29 2016 +0300
+++ b/hotspot/src/cpu/sparc/vm/globals_sparc.hpp	Fri Mar 04 01:30:11 2016 +0300
@@ -90,6 +90,8 @@
 
 define_pd_global(bool, CompactStrings, true);
 
+define_pd_global(intx, InitArrayShortSize, 8*BytesPerLong);
+
 #define ARCH_FLAGS(develop, product, diagnostic, experimental, notproduct, range, constraint) \
                                                                             \
   product(intx, UseVIS, 99,                                                 \
--- a/hotspot/src/cpu/sparc/vm/sparc.ad	Thu Mar 03 23:57:29 2016 +0300
+++ b/hotspot/src/cpu/sparc/vm/sparc.ad	Fri Mar 04 01:30:11 2016 +0300
@@ -1980,9 +1980,6 @@
 // No scaling for the parameter the ClearArray node.
 const bool Matcher::init_array_count_is_in_bytes = true;
 
-// Threshold size for cleararray.
-const int Matcher::init_array_short_size = 8 * BytesPerLong;
-
 // No additional cost for CMOVL.
 const int Matcher::long_cmove_cost() { return 0; }
 
--- a/hotspot/src/cpu/x86/vm/globals_x86.hpp	Thu Mar 03 23:57:29 2016 +0300
+++ b/hotspot/src/cpu/x86/vm/globals_x86.hpp	Fri Mar 04 01:30:11 2016 +0300
@@ -97,6 +97,8 @@
 
 define_pd_global(bool, PreserveFramePointer, false);
 
+define_pd_global(intx, InitArrayShortSize, 8*BytesPerLong);
+
 #define ARCH_FLAGS(develop, product, diagnostic, experimental, notproduct, range, constraint) \
                                                                             \
   develop(bool, IEEEPrecision, true,                                        \
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp	Thu Mar 03 23:57:29 2016 +0300
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp	Fri Mar 04 01:30:11 2016 +0300
@@ -7198,21 +7198,50 @@
 
 }
 
-void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp) {
+void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp, bool is_large) {
   // cnt - number of qwords (8-byte words).
   // base - start address, qword aligned.
+  // is_large - if optimizers know cnt is larger than InitArrayShortSize
   assert(base==rdi, "base register must be edi for rep stos");
   assert(tmp==rax,   "tmp register must be eax for rep stos");
   assert(cnt==rcx,   "cnt register must be ecx for rep stos");
+  assert(InitArrayShortSize % BytesPerLong == 0,
+    "InitArrayShortSize should be the multiple of BytesPerLong");
+
+  Label DONE;
 
   xorptr(tmp, tmp);
+
+  if (!is_large) {
+    Label LOOP, LONG;
+    cmpptr(cnt, InitArrayShortSize/BytesPerLong);
+    jccb(Assembler::greater, LONG);
+
+    NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
+
+    decrement(cnt);
+    jccb(Assembler::negative, DONE); // Zero length
+
+    // Use individual pointer-sized stores for small counts:
+    BIND(LOOP);
+    movptr(Address(base, cnt, Address::times_ptr), tmp);
+    decrement(cnt);
+    jccb(Assembler::greaterEqual, LOOP);
+    jmpb(DONE);
+
+    BIND(LONG);
+  }
+
+  // Use longer rep-prefixed ops for non-small counts:
   if (UseFastStosb) {
-    shlptr(cnt,3); // convert to number of bytes
+    shlptr(cnt, 3); // convert to number of bytes
     rep_stosb();
   } else {
-    NOT_LP64(shlptr(cnt,1);) // convert to number of dwords for 32-bit VM
+    NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
     rep_stos();
   }
+
+  BIND(DONE);
 }
 
 #ifdef COMPILER2
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp	Thu Mar 03 23:57:29 2016 +0300
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp	Fri Mar 04 01:30:11 2016 +0300
@@ -1284,8 +1284,9 @@
   // C2 compiled method's prolog code.
   void verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b);
 
-  // clear memory of size 'cnt' qwords, starting at 'base'.
-  void clear_mem(Register base, Register cnt, Register rtmp);
+  // clear memory of size 'cnt' qwords, starting at 'base';
+  // if 'is_large' is set, do not try to produce short loop
+  void clear_mem(Register base, Register cnt, Register rtmp, bool is_large);
 
 #ifdef COMPILER2
   void string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
--- a/hotspot/src/cpu/x86/vm/x86_32.ad	Thu Mar 03 23:57:29 2016 +0300
+++ b/hotspot/src/cpu/x86/vm/x86_32.ad	Fri Mar 04 01:30:11 2016 +0300
@@ -1420,9 +1420,6 @@
 // The ecx parameter to rep stos for the ClearArray node is in dwords.
 const bool Matcher::init_array_count_is_in_bytes = false;
 
-// Threshold size for cleararray.
-const int Matcher::init_array_short_size = 8 * BytesPerLong;
-
 // Needs 2 CMOV's for longs.
 const int Matcher::long_cmove_cost() { return 1; }
 
@@ -11369,27 +11366,54 @@
 // =======================================================================
 // fast clearing of an array
 instruct rep_stos(eCXRegI cnt, eDIRegP base, eAXRegI zero, Universe dummy, eFlagsReg cr) %{
-  predicate(!UseFastStosb);
+  predicate(!((ClearArrayNode*)n)->is_large());
   match(Set dummy (ClearArray cnt base));
   effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
-  format %{ "XOR    EAX,EAX\t# ClearArray:\n\t"
-            "SHL    ECX,1\t# Convert doublewords to words\n\t"
-            "REP STOS\t# store EAX into [EDI++] while ECX--" %}
-  ins_encode %{
-    __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct rep_fast_stosb(eCXRegI cnt, eDIRegP base, eAXRegI zero, Universe dummy, eFlagsReg cr) %{
-  predicate(UseFastStosb);
+
+  format %{ $$template
+    $$emit$$"XOR    EAX,EAX\t# ClearArray:\n\t"
+    $$emit$$"CMP    InitArrayShortSize,rcx\n\t"
+    $$emit$$"JG     LARGE\n\t"
+    $$emit$$"SHL    ECX, 1\n\t"
+    $$emit$$"DEC    ECX\n\t"
+    $$emit$$"JS     DONE\t# Zero length\n\t"
+    $$emit$$"MOV    EAX,(EDI,ECX,4)\t# LOOP\n\t"
+    $$emit$$"DEC    ECX\n\t"
+    $$emit$$"JGE    LOOP\n\t"
+    $$emit$$"JMP    DONE\n\t"
+    $$emit$$"# LARGE:\n\t"
+    if (UseFastStosb) {
+       $$emit$$"SHL    ECX,3\t# Convert doublewords to bytes\n\t"
+       $$emit$$"REP STOSB\t# store EAX into [EDI++] while ECX--\n\t"
+    } else {
+       $$emit$$"SHL    ECX,1\t# Convert doublewords to words\n\t"
+       $$emit$$"REP STOS\t# store EAX into [EDI++] while ECX--\n\t"
+    }
+    $$emit$$"# DONE"
+  %}
+  ins_encode %{
+    __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, false);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rep_stos_large(eCXRegI cnt, eDIRegP base, eAXRegI zero, Universe dummy, eFlagsReg cr) %{
+  predicate(((ClearArrayNode*)n)->is_large());
   match(Set dummy (ClearArray cnt base));
   effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
-  format %{ "XOR    EAX,EAX\t# ClearArray:\n\t"
-            "SHL    ECX,3\t# Convert doublewords to bytes\n\t"
-            "REP STOSB\t# store EAX into [EDI++] while ECX--" %}
-  ins_encode %{
-    __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register);
+  format %{ $$template
+    $$emit$$"XOR    EAX,EAX\t# ClearArray:\n\t"
+    if (UseFastStosb) {
+       $$emit$$"SHL    ECX,3\t# Convert doublewords to bytes\n\t"
+       $$emit$$"REP STOSB\t# store EAX into [EDI++] while ECX--\n\t"
+    } else {
+       $$emit$$"SHL    ECX,1\t# Convert doublewords to words\n\t"
+       $$emit$$"REP STOS\t# store EAX into [EDI++] while ECX--\n\t"
+    }
+    $$emit$$"# DONE"
+  %}
+  ins_encode %{
+    __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, true);
   %}
   ins_pipe( pipe_slow );
 %}
--- a/hotspot/src/cpu/x86/vm/x86_64.ad	Thu Mar 03 23:57:29 2016 +0300
+++ b/hotspot/src/cpu/x86/vm/x86_64.ad	Fri Mar 04 01:30:11 2016 +0300
@@ -1637,9 +1637,6 @@
 // The ecx parameter to rep stosq for the ClearArray node is in words.
 const bool Matcher::init_array_count_is_in_bytes = false;
 
-// Threshold size for cleararray.
-const int Matcher::init_array_short_size = 8 * BytesPerLong;
-
 // No additional cost for CMOVL.
 const int Matcher::long_cmove_cost() { return 0; }
 
@@ -10460,31 +10457,55 @@
 instruct rep_stos(rcx_RegL cnt, rdi_RegP base, rax_RegI zero, Universe dummy,
                   rFlagsReg cr)
 %{
-  predicate(!UseFastStosb);
+  predicate(!((ClearArrayNode*)n)->is_large());
   match(Set dummy (ClearArray cnt base));
   effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
 
-  format %{ "xorq    rax, rax\t# ClearArray:\n\t"
-            "rep     stosq\t# Store rax to *rdi++ while rcx--" %}
-  ins_encode %{
-    __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register);
+  format %{ $$template
+    $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
+    $$emit$$"cmp     InitArrayShortSize,rcx\n\t"
+    $$emit$$"jg      LARGE\n\t"
+    $$emit$$"dec     rcx\n\t"
+    $$emit$$"js      DONE\t# Zero length\n\t"
+    $$emit$$"mov     rax,(rdi,rcx,8)\t# LOOP\n\t"
+    $$emit$$"dec     rcx\n\t"
+    $$emit$$"jge     LOOP\n\t"
+    $$emit$$"jmp     DONE\n\t"
+    $$emit$$"# LARGE:\n\t"
+    if (UseFastStosb) {
+       $$emit$$"shlq    rcx,3\t# Convert doublewords to bytes\n\t"
+       $$emit$$"rep     stosb\t# Store rax to *rdi++ while rcx--\n\t"
+    } else {
+       $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--\n\t"
+    }
+    $$emit$$"# DONE"
+  %}
+  ins_encode %{
+    __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, false);
   %}
   ins_pipe(pipe_slow);
 %}
 
-instruct rep_fast_stosb(rcx_RegL cnt, rdi_RegP base, rax_RegI zero, Universe dummy,
-                        rFlagsReg cr)
-%{
-  predicate(UseFastStosb);
+instruct rep_stos_large(rcx_RegL cnt, rdi_RegP base, rax_RegI zero, Universe dummy,
+                  rFlagsReg cr)
+%{
+  predicate(((ClearArrayNode*)n)->is_large());
   match(Set dummy (ClearArray cnt base));
   effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
-  format %{ "xorq    rax, rax\t# ClearArray:\n\t"
-            "shlq    rcx,3\t# Convert doublewords to bytes\n\t"
-            "rep     stosb\t# Store rax to *rdi++ while rcx--" %}
-  ins_encode %{
-    __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register);
-  %}
-  ins_pipe( pipe_slow );
+
+  format %{ $$template
+    $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
+    if (UseFastStosb) {
+       $$emit$$"shlq    rcx,3\t# Convert doublewords to bytes\n\t"
+       $$emit$$"rep     stosb\t# Store rax to *rdi++ while rcx--"
+    } else {
+       $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--"
+    }
+  %}
+  ins_encode %{
+    __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, true);
+  %}
+  ins_pipe(pipe_slow);
 %}
 
 instruct string_compareL(rdi_RegP str1, rcx_RegI cnt1, rsi_RegP str2, rdx_RegI cnt2,
--- a/hotspot/src/share/vm/opto/matcher.hpp	Thu Mar 03 23:57:29 2016 +0300
+++ b/hotspot/src/share/vm/opto/matcher.hpp	Fri Mar 04 01:30:11 2016 +0300
@@ -399,10 +399,6 @@
   // Optional scaling for the parameter to the ClearArray/CopyArray node.
   static const bool init_array_count_is_in_bytes;
 
-  // Threshold small size (in bytes) for a ClearArray/CopyArray node.
-  // Anything this size or smaller may get converted to discrete scalar stores.
-  static const int init_array_short_size;
-
   // Some hardware needs 2 CMOV's for longs.
   static const int long_cmove_cost();
 
--- a/hotspot/src/share/vm/opto/memnode.cpp	Thu Mar 03 23:57:29 2016 +0300
+++ b/hotspot/src/share/vm/opto/memnode.cpp	Fri Mar 04 01:30:11 2016 +0300
@@ -2741,6 +2741,9 @@
 //------------------------------Idealize---------------------------------------
 // Clearing a short array is faster with stores
 Node *ClearArrayNode::Ideal(PhaseGVN *phase, bool can_reshape){
+  // Already know this is a large node, do not try to ideal it
+  if (_is_large) return NULL;
+
   const int unit = BytesPerLong;
   const TypeX* t = phase->type(in(2))->isa_intptr_t();
   if (!t)  return NULL;
@@ -2753,8 +2756,11 @@
   // (see jck test stmt114.stmt11402.val).
   if (size <= 0 || size % unit != 0)  return NULL;
   intptr_t count = size / unit;
-  // Length too long; use fast hardware clear
-  if (size > Matcher::init_array_short_size)  return NULL;
+  // Length too long; communicate this to matchers and assemblers.
+  // Assemblers are responsible to produce fast hardware clears for it.
+  if (size > InitArrayShortSize) {
+    return new ClearArrayNode(in(0), in(1), in(2), in(3), true);
+  }
   Node *mem = in(1);
   if( phase->type(mem)==Type::TOP ) return NULL;
   Node *adr = in(3);
@@ -2852,7 +2858,7 @@
   // Bulk clear double-words
   Node* zsize = phase->transform(new SubXNode(zend, zbase) );
   Node* adr = phase->transform(new AddPNode(dest, dest, start_offset) );
-  mem = new ClearArrayNode(ctl, mem, zsize, adr);
+  mem = new ClearArrayNode(ctl, mem, zsize, adr, false);
   return phase->transform(mem);
 }
 
@@ -3901,7 +3907,7 @@
                                               zeroes_done, zeroes_needed,
                                               phase);
         zeroes_done = zeroes_needed;
-        if (zsize > Matcher::init_array_short_size && ++big_init_gaps > 2)
+        if (zsize > InitArrayShortSize && ++big_init_gaps > 2)
           do_zeroing = false;   // leave the hole, next time
       }
     }
--- a/hotspot/src/share/vm/opto/memnode.hpp	Thu Mar 03 23:57:29 2016 +0300
+++ b/hotspot/src/share/vm/opto/memnode.hpp	Fri Mar 04 01:30:11 2016 +0300
@@ -1013,9 +1013,11 @@
 
 //------------------------------ClearArray-------------------------------------
 class ClearArrayNode: public Node {
+private:
+  bool _is_large;
 public:
-  ClearArrayNode( Node *ctrl, Node *arymem, Node *word_cnt, Node *base )
-    : Node(ctrl,arymem,word_cnt,base) {
+  ClearArrayNode( Node *ctrl, Node *arymem, Node *word_cnt, Node *base, bool is_large)
+    : Node(ctrl,arymem,word_cnt,base), _is_large(is_large) {
     init_class_id(Class_ClearArray);
   }
   virtual int         Opcode() const;
@@ -1026,6 +1028,7 @@
   virtual Node* Identity(PhaseGVN* phase);
   virtual Node *Ideal(PhaseGVN *phase, bool can_reshape);
   virtual uint match_edge(uint idx) const;
+  bool is_large() const { return _is_large; }
 
   // Clear the given area of an object or array.
   // The start offset must always be aligned mod BytesPerInt.
--- a/hotspot/src/share/vm/runtime/commandLineFlagConstraintsCompiler.cpp	Thu Mar 03 23:57:29 2016 +0300
+++ b/hotspot/src/share/vm/runtime/commandLineFlagConstraintsCompiler.cpp	Fri Mar 04 01:30:11 2016 +0300
@@ -354,6 +354,14 @@
   return Flag::SUCCESS;
 }
 
+Flag::Error InitArrayShortSizeConstraintFunc(intx value, bool verbose) {
+  if (value % BytesPerLong != 0) {
+    return Flag::VIOLATES_CONSTRAINT;
+  } else {
+    return Flag::SUCCESS;
+  }
+}
+
 #ifdef COMPILER2
 Flag::Error InteriorEntryAlignmentConstraintFunc(intx value, bool verbose) {
   if (InteriorEntryAlignment > CodeEntryAlignment) {
--- a/hotspot/src/share/vm/runtime/commandLineFlagConstraintsCompiler.hpp	Thu Mar 03 23:57:29 2016 +0300
+++ b/hotspot/src/share/vm/runtime/commandLineFlagConstraintsCompiler.hpp	Fri Mar 04 01:30:11 2016 +0300
@@ -62,6 +62,8 @@
 
 Flag::Error TypeProfileLevelConstraintFunc(uintx value, bool verbose);
 
+Flag::Error InitArrayShortSizeConstraintFunc(intx value, bool verbose);
+
 #ifdef COMPILER2
 Flag::Error InteriorEntryAlignmentConstraintFunc(intx value, bool verbose);
 
--- a/hotspot/src/share/vm/runtime/globals.hpp	Thu Mar 03 23:57:29 2016 +0300
+++ b/hotspot/src/share/vm/runtime/globals.hpp	Fri Mar 04 01:30:11 2016 +0300
@@ -4162,6 +4162,13 @@
              "in the loaded class C. "                                      \
              "Check (3) is available only in debug builds.")                \
                                                                             \
+  develop_pd(intx, InitArrayShortSize,                                      \
+          "Threshold small size (in bytes) for clearing arrays. "           \
+          "Anything this size or smaller may get converted to discrete "    \
+          "scalar stores.")                                                 \
+          range(0, max_intx)                                                \
+          constraint(InitArrayShortSizeConstraintFunc, AfterErgo)           \
+                                                                            \
   diagnostic(bool, CompilerDirectivesIgnoreCompileCommands, false,          \
              "Disable backwards compatibility for compile commands.")       \
                                                                             \