8146801: Allocating short arrays of non-constant size is slow
Reviewed-by: kvn, twisti, vlivanov
--- a/hotspot/src/cpu/aarch64/vm/aarch64.ad Thu Mar 03 23:57:29 2016 +0300
+++ b/hotspot/src/cpu/aarch64/vm/aarch64.ad Fri Mar 04 01:30:11 2016 +0300
@@ -3425,9 +3425,6 @@
// false => size gets scaled to BytesPerLong, ok.
const bool Matcher::init_array_count_is_in_bytes = false;
-// Threshold size for cleararray.
-const int Matcher::init_array_short_size = 18 * BytesPerLong;
-
// Use conditional move (CMOVL)
const int Matcher::long_cmove_cost() {
// long cmoves are no more expensive than int cmoves
--- a/hotspot/src/cpu/aarch64/vm/globals_aarch64.hpp Thu Mar 03 23:57:29 2016 +0300
+++ b/hotspot/src/cpu/aarch64/vm/globals_aarch64.hpp Fri Mar 04 01:30:11 2016 +0300
@@ -76,6 +76,8 @@
// avoid biased locking while we are bootstrapping the aarch64 build
define_pd_global(bool, UseBiasedLocking, false);
+define_pd_global(intx, InitArrayShortSize, 18*BytesPerLong); // ClearArray threshold in bytes (replaces Matcher::init_array_short_size)
+
#if defined(COMPILER1) || defined(COMPILER2)
define_pd_global(intx, InlineSmallCode, 1000);
#endif
--- a/hotspot/src/cpu/ppc/vm/globals_ppc.hpp Thu Mar 03 23:57:29 2016 +0300
+++ b/hotspot/src/cpu/ppc/vm/globals_ppc.hpp Fri Mar 04 01:30:11 2016 +0300
@@ -76,6 +76,8 @@
define_pd_global(bool, CompactStrings, true);
+define_pd_global(intx, InitArrayShortSize, 8*BytesPerLong); // ClearArray threshold in bytes (replaces Matcher::init_array_short_size)
+
// Platform dependent flag handling: flags only defined on this platform.
#define ARCH_FLAGS(develop, product, diagnostic, experimental, notproduct, range, constraint) \
\
--- a/hotspot/src/cpu/ppc/vm/ppc.ad Thu Mar 03 23:57:29 2016 +0300
+++ b/hotspot/src/cpu/ppc/vm/ppc.ad Fri Mar 04 01:30:11 2016 +0300
@@ -2137,8 +2137,6 @@
return decode;
}
*/
-// Threshold size for cleararray.
-const int Matcher::init_array_short_size = 8 * BytesPerLong;
// false => size gets scaled to BytesPerLong, ok.
const bool Matcher::init_array_count_is_in_bytes = false;
--- a/hotspot/src/cpu/sparc/vm/globals_sparc.hpp Thu Mar 03 23:57:29 2016 +0300
+++ b/hotspot/src/cpu/sparc/vm/globals_sparc.hpp Fri Mar 04 01:30:11 2016 +0300
@@ -90,6 +90,8 @@
define_pd_global(bool, CompactStrings, true);
+define_pd_global(intx, InitArrayShortSize, 8*BytesPerLong); // ClearArray threshold in bytes (replaces Matcher::init_array_short_size)
+
#define ARCH_FLAGS(develop, product, diagnostic, experimental, notproduct, range, constraint) \
\
product(intx, UseVIS, 99, \
--- a/hotspot/src/cpu/sparc/vm/sparc.ad Thu Mar 03 23:57:29 2016 +0300
+++ b/hotspot/src/cpu/sparc/vm/sparc.ad Fri Mar 04 01:30:11 2016 +0300
@@ -1980,9 +1980,6 @@
// No scaling for the parameter the ClearArray node.
const bool Matcher::init_array_count_is_in_bytes = true;
-// Threshold size for cleararray.
-const int Matcher::init_array_short_size = 8 * BytesPerLong;
-
// No additional cost for CMOVL.
const int Matcher::long_cmove_cost() { return 0; }
--- a/hotspot/src/cpu/x86/vm/globals_x86.hpp Thu Mar 03 23:57:29 2016 +0300
+++ b/hotspot/src/cpu/x86/vm/globals_x86.hpp Fri Mar 04 01:30:11 2016 +0300
@@ -97,6 +97,8 @@
define_pd_global(bool, PreserveFramePointer, false);
+define_pd_global(intx, InitArrayShortSize, 8*BytesPerLong); // ClearArray threshold in bytes (replaces Matcher::init_array_short_size)
+
#define ARCH_FLAGS(develop, product, diagnostic, experimental, notproduct, range, constraint) \
\
develop(bool, IEEEPrecision, true, \
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp Thu Mar 03 23:57:29 2016 +0300
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp Fri Mar 04 01:30:11 2016 +0300
@@ -7198,21 +7198,50 @@
}
-void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp) {
+void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp, bool is_large) {
// cnt - number of qwords (8-byte words).
// base - start address, qword aligned.
+ // is_large - true when the caller already knows cnt exceeds the InitArrayShortSize threshold; the short store loop is then not emitted
assert(base==rdi, "base register must be edi for rep stos");
assert(tmp==rax, "tmp register must be eax for rep stos");
assert(cnt==rcx, "cnt register must be ecx for rep stos");
+ assert(InitArrayShortSize % BytesPerLong == 0,
+ "InitArrayShortSize should be the multiple of BytesPerLong");
+
+ Label DONE; // common exit for both the short loop and the rep-prefixed path
xorptr(tmp, tmp);
+
+ if (!is_large) {
+ Label LOOP, LONG;
+ cmpptr(cnt, InitArrayShortSize/BytesPerLong); // the flag is in bytes, cnt is in qwords
+ jccb(Assembler::greater, LONG); // above the threshold: use the rep-prefixed path
+
+ NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
+
+ decrement(cnt);
+ jccb(Assembler::negative, DONE); // cnt was zero: nothing to clear
+
+ // Short path: one pointer-sized store of the zeroed tmp per iteration, indexed by cnt counting down to 0:
+ BIND(LOOP);
+ movptr(Address(base, cnt, Address::times_ptr), tmp);
+ decrement(cnt);
+ jccb(Assembler::greaterEqual, LOOP);
+ jmpb(DONE); // skip the rep-prefixed path below
+
+ BIND(LONG);
+ }
+
+ // Use longer rep-prefixed ops for non-small counts (the only path emitted when is_large):
if (UseFastStosb) {
- shlptr(cnt,3); // convert to number of bytes
+ shlptr(cnt, 3); // convert to number of bytes
rep_stosb();
} else {
- NOT_LP64(shlptr(cnt,1);) // convert to number of dwords for 32-bit VM
+ NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
rep_stos();
}
+
+ BIND(DONE);
}
#ifdef COMPILER2
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp Thu Mar 03 23:57:29 2016 +0300
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp Fri Mar 04 01:30:11 2016 +0300
@@ -1284,8 +1284,9 @@
// C2 compiled method's prolog code.
void verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b);
- // clear memory of size 'cnt' qwords, starting at 'base'.
- void clear_mem(Register base, Register cnt, Register rtmp);
+ // Zero 'cnt' qwords of memory starting at 'base', using 'rtmp' as the zero source.
+ // If 'is_large' is set, skip the short store loop and emit only the rep-prefixed clear.
+ void clear_mem(Register base, Register cnt, Register rtmp, bool is_large);
#ifdef COMPILER2
void string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
--- a/hotspot/src/cpu/x86/vm/x86_32.ad Thu Mar 03 23:57:29 2016 +0300
+++ b/hotspot/src/cpu/x86/vm/x86_32.ad Fri Mar 04 01:30:11 2016 +0300
@@ -1420,9 +1420,6 @@
// The ecx parameter to rep stos for the ClearArray node is in dwords.
const bool Matcher::init_array_count_is_in_bytes = false;
-// Threshold size for cleararray.
-const int Matcher::init_array_short_size = 8 * BytesPerLong;
-
// Needs 2 CMOV's for longs.
const int Matcher::long_cmove_cost() { return 1; }
@@ -11369,27 +11366,54 @@
// =======================================================================
// fast clearing of an array
instruct rep_stos(eCXRegI cnt, eDIRegP base, eAXRegI zero, Universe dummy, eFlagsReg cr) %{
- predicate(!UseFastStosb);
+ predicate(!((ClearArrayNode*)n)->is_large()); // size not known to be large: emit the short loop plus the rep fallback
match(Set dummy (ClearArray cnt base));
effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
- format %{ "XOR EAX,EAX\t# ClearArray:\n\t"
- "SHL ECX,1\t# Convert doublewords to words\n\t"
- "REP STOS\t# store EAX into [EDI++] while ECX--" %}
- ins_encode %{
- __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register);
- %}
- ins_pipe( pipe_slow );
-%}
-
-instruct rep_fast_stosb(eCXRegI cnt, eDIRegP base, eAXRegI zero, Universe dummy, eFlagsReg cr) %{
- predicate(UseFastStosb);
+
+ format %{ $$template
+ $$emit$$"XOR EAX,EAX\t# ClearArray:\n\t"
+ $$emit$$"CMP InitArrayShortSize,ECX\n\t"
+ $$emit$$"JG LARGE\n\t"
+ $$emit$$"SHL ECX, 1\n\t"
+ $$emit$$"DEC ECX\n\t"
+ $$emit$$"JS DONE\t# Zero length\n\t"
+ $$emit$$"MOV EAX,(EDI,ECX,4)\t# LOOP\n\t"
+ $$emit$$"DEC ECX\n\t"
+ $$emit$$"JGE LOOP\n\t"
+ $$emit$$"JMP DONE\n\t"
+ $$emit$$"# LARGE:\n\t"
+ if (UseFastStosb) {
+ $$emit$$"SHL ECX,3\t# Convert doublewords to bytes\n\t"
+ $$emit$$"REP STOSB\t# store EAX into [EDI++] while ECX--\n\t"
+ } else {
+ $$emit$$"SHL ECX,1\t# Convert doublewords to words\n\t"
+ $$emit$$"REP STOS\t# store EAX into [EDI++] while ECX--\n\t"
+ }
+ $$emit$$"# DONE"
+ %}
+ ins_encode %{
+ __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, false); // is_large == false: the assembler checks the count at run time
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct rep_stos_large(eCXRegI cnt, eDIRegP base, eAXRegI zero, Universe dummy, eFlagsReg cr) %{
+ predicate(((ClearArrayNode*)n)->is_large()); // node already flagged as large: no short-loop prologue is needed
match(Set dummy (ClearArray cnt base));
effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
- format %{ "XOR EAX,EAX\t# ClearArray:\n\t"
- "SHL ECX,3\t# Convert doublewords to bytes\n\t"
- "REP STOSB\t# store EAX into [EDI++] while ECX--" %}
- ins_encode %{
- __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register);
+ format %{ $$template
+ $$emit$$"XOR EAX,EAX\t# ClearArray:\n\t"
+ if (UseFastStosb) {
+ $$emit$$"SHL ECX,3\t# Convert doublewords to bytes\n\t"
+ $$emit$$"REP STOSB\t# store EAX into [EDI++] while ECX--\n\t"
+ } else {
+ $$emit$$"SHL ECX,1\t# Convert doublewords to words\n\t"
+ $$emit$$"REP STOS\t# store EAX into [EDI++] while ECX--\n\t"
+ }
+ $$emit$$"# DONE"
+ %}
+ ins_encode %{
+ __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, true); // is_large == true: only the rep-prefixed clear is emitted
%}
ins_pipe( pipe_slow );
%}
--- a/hotspot/src/cpu/x86/vm/x86_64.ad Thu Mar 03 23:57:29 2016 +0300
+++ b/hotspot/src/cpu/x86/vm/x86_64.ad Fri Mar 04 01:30:11 2016 +0300
@@ -1637,9 +1637,6 @@
// The ecx parameter to rep stosq for the ClearArray node is in words.
const bool Matcher::init_array_count_is_in_bytes = false;
-// Threshold size for cleararray.
-const int Matcher::init_array_short_size = 8 * BytesPerLong;
-
// No additional cost for CMOVL.
const int Matcher::long_cmove_cost() { return 0; }
@@ -10460,31 +10457,55 @@
instruct rep_stos(rcx_RegL cnt, rdi_RegP base, rax_RegI zero, Universe dummy,
rFlagsReg cr)
%{
- predicate(!UseFastStosb);
+ predicate(!((ClearArrayNode*)n)->is_large()); // size not known to be large: emit the short loop plus the rep fallback
match(Set dummy (ClearArray cnt base));
effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
- format %{ "xorq rax, rax\t# ClearArray:\n\t"
- "rep stosq\t# Store rax to *rdi++ while rcx--" %}
- ins_encode %{
- __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register);
+ format %{ $$template
+ $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
+ $$emit$$"cmp InitArrayShortSize,rcx\n\t"
+ $$emit$$"jg LARGE\n\t"
+ $$emit$$"dec rcx\n\t"
+ $$emit$$"js DONE\t# Zero length\n\t"
+ $$emit$$"mov rax,(rdi,rcx,8)\t# LOOP\n\t"
+ $$emit$$"dec rcx\n\t"
+ $$emit$$"jge LOOP\n\t"
+ $$emit$$"jmp DONE\n\t"
+ $$emit$$"# LARGE:\n\t"
+ if (UseFastStosb) {
+ $$emit$$"shlq rcx,3\t# Convert doublewords to bytes\n\t"
+ $$emit$$"rep stosb\t# Store rax to *rdi++ while rcx--\n\t"
+ } else {
+ $$emit$$"rep stosq\t# Store rax to *rdi++ while rcx--\n\t"
+ }
+ $$emit$$"# DONE"
+ %}
+ ins_encode %{
+ __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, false); // is_large == false: the assembler checks the count at run time
%}
ins_pipe(pipe_slow);
%}
-instruct rep_fast_stosb(rcx_RegL cnt, rdi_RegP base, rax_RegI zero, Universe dummy,
- rFlagsReg cr)
-%{
- predicate(UseFastStosb);
+instruct rep_stos_large(rcx_RegL cnt, rdi_RegP base, rax_RegI zero, Universe dummy,
+ rFlagsReg cr)
+%{
+ predicate(((ClearArrayNode*)n)->is_large()); // node already flagged as large: no short-loop prologue is needed
match(Set dummy (ClearArray cnt base));
effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
- format %{ "xorq rax, rax\t# ClearArray:\n\t"
- "shlq rcx,3\t# Convert doublewords to bytes\n\t"
- "rep stosb\t# Store rax to *rdi++ while rcx--" %}
- ins_encode %{
- __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register);
- %}
- ins_pipe( pipe_slow );
+
+ format %{ $$template
+ $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
+ if (UseFastStosb) {
+ $$emit$$"shlq rcx,3\t# Convert doublewords to bytes\n\t"
+ $$emit$$"rep stosb\t# Store rax to *rdi++ while rcx--"
+ } else {
+ $$emit$$"rep stosq\t# Store rax to *rdi++ while rcx--"
+ }
+ %}
+ ins_encode %{
+ __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, true); // is_large == true: only the rep-prefixed clear is emitted
+ %}
+ ins_pipe(pipe_slow);
%}
instruct string_compareL(rdi_RegP str1, rcx_RegI cnt1, rsi_RegP str2, rdx_RegI cnt2,
--- a/hotspot/src/share/vm/opto/matcher.hpp Thu Mar 03 23:57:29 2016 +0300
+++ b/hotspot/src/share/vm/opto/matcher.hpp Fri Mar 04 01:30:11 2016 +0300
@@ -399,10 +399,6 @@
// Optional scaling for the parameter to the ClearArray/CopyArray node.
static const bool init_array_count_is_in_bytes;
- // Threshold small size (in bytes) for a ClearArray/CopyArray node.
- // Anything this size or smaller may get converted to discrete scalar stores.
- static const int init_array_short_size;
-
// Some hardware needs 2 CMOV's for longs.
static const int long_cmove_cost();
--- a/hotspot/src/share/vm/opto/memnode.cpp Thu Mar 03 23:57:29 2016 +0300
+++ b/hotspot/src/share/vm/opto/memnode.cpp Fri Mar 04 01:30:11 2016 +0300
@@ -2741,6 +2741,9 @@
//------------------------------Idealize---------------------------------------
// Clearing a short array is faster with stores
Node *ClearArrayNode::Ideal(PhaseGVN *phase, bool can_reshape){
+ // Already know this is a large node, do not try to ideal it
+ if (_is_large) return NULL;
+
const int unit = BytesPerLong;
const TypeX* t = phase->type(in(2))->isa_intptr_t();
if (!t) return NULL;
@@ -2753,8 +2756,11 @@
// (see jck test stmt114.stmt11402.val).
if (size <= 0 || size % unit != 0) return NULL;
intptr_t count = size / unit;
- // Length too long; use fast hardware clear
- if (size > Matcher::init_array_short_size) return NULL;
+ // Length too long; communicate this to matchers and assemblers.
+ // Assemblers are responsible to produce fast hardware clears for it.
+ if (size > InitArrayShortSize) {
+ return new ClearArrayNode(in(0), in(1), in(2), in(3), true);
+ }
Node *mem = in(1);
if( phase->type(mem)==Type::TOP ) return NULL;
Node *adr = in(3);
@@ -2852,7 +2858,7 @@
// Bulk clear double-words
Node* zsize = phase->transform(new SubXNode(zend, zbase) );
Node* adr = phase->transform(new AddPNode(dest, dest, start_offset) );
- mem = new ClearArrayNode(ctl, mem, zsize, adr);
+ mem = new ClearArrayNode(ctl, mem, zsize, adr, false);
return phase->transform(mem);
}
@@ -3901,7 +3907,7 @@
zeroes_done, zeroes_needed,
phase);
zeroes_done = zeroes_needed;
- if (zsize > Matcher::init_array_short_size && ++big_init_gaps > 2)
+ if (zsize > InitArrayShortSize && ++big_init_gaps > 2)
do_zeroing = false; // leave the hole, next time
}
}
--- a/hotspot/src/share/vm/opto/memnode.hpp Thu Mar 03 23:57:29 2016 +0300
+++ b/hotspot/src/share/vm/opto/memnode.hpp Fri Mar 04 01:30:11 2016 +0300
@@ -1013,9 +1013,11 @@
//------------------------------ClearArray-------------------------------------
class ClearArrayNode: public Node {
+private:
+ bool _is_large;
public:
- ClearArrayNode( Node *ctrl, Node *arymem, Node *word_cnt, Node *base )
- : Node(ctrl,arymem,word_cnt,base) {
+ ClearArrayNode( Node *ctrl, Node *arymem, Node *word_cnt, Node *base, bool is_large) // is_large: the clear is already known to exceed InitArrayShortSize
+ : Node(ctrl,arymem,word_cnt,base), _is_large(is_large) { // NOTE(review): _is_large is state outside the input edges - confirm size_of() (and hash()/cmp() if needed) are overridden; not visible in this hunk
init_class_id(Class_ClearArray);
}
virtual int Opcode() const;
@@ -1026,6 +1028,7 @@
virtual Node* Identity(PhaseGVN* phase);
virtual Node *Ideal(PhaseGVN *phase, bool can_reshape);
virtual uint match_edge(uint idx) const;
+ bool is_large() const { return _is_large; } // read by the rep_stos / rep_stos_large matcher predicates
// Clear the given area of an object or array.
// The start offset must always be aligned mod BytesPerInt.
--- a/hotspot/src/share/vm/runtime/commandLineFlagConstraintsCompiler.cpp Thu Mar 03 23:57:29 2016 +0300
+++ b/hotspot/src/share/vm/runtime/commandLineFlagConstraintsCompiler.cpp Fri Mar 04 01:30:11 2016 +0300
@@ -354,6 +354,14 @@
return Flag::SUCCESS;
}
+Flag::Error InitArrayShortSizeConstraintFunc(intx value, bool verbose) {
+  if (value % BytesPerLong == 0) return Flag::SUCCESS; // the byte count must be a whole number of BytesPerLong units
+  // Say why the value was rejected instead of failing silently; 'verbose' is passed through to the printer.
+  CommandLineError::print(verbose, "InitArrayShortSize (" INTX_FORMAT ") must be "
+                          "a multiple of %d\n", value, BytesPerLong);
+  return Flag::VIOLATES_CONSTRAINT;
+}
+
#ifdef COMPILER2
Flag::Error InteriorEntryAlignmentConstraintFunc(intx value, bool verbose) {
if (InteriorEntryAlignment > CodeEntryAlignment) {
--- a/hotspot/src/share/vm/runtime/commandLineFlagConstraintsCompiler.hpp Thu Mar 03 23:57:29 2016 +0300
+++ b/hotspot/src/share/vm/runtime/commandLineFlagConstraintsCompiler.hpp Fri Mar 04 01:30:11 2016 +0300
@@ -62,6 +62,8 @@
Flag::Error TypeProfileLevelConstraintFunc(uintx value, bool verbose);
+Flag::Error InitArrayShortSizeConstraintFunc(intx value, bool verbose); // rejects values that are not a multiple of BytesPerLong
+
#ifdef COMPILER2
Flag::Error InteriorEntryAlignmentConstraintFunc(intx value, bool verbose);
--- a/hotspot/src/share/vm/runtime/globals.hpp Thu Mar 03 23:57:29 2016 +0300
+++ b/hotspot/src/share/vm/runtime/globals.hpp Fri Mar 04 01:30:11 2016 +0300
@@ -4162,6 +4162,13 @@
"in the loaded class C. " \
"Check (3) is available only in debug builds.") \
\
+ develop_pd(intx, InitArrayShortSize, \
+ "Threshold small size (in bytes) for clearing arrays. " \
+ "Anything this size or smaller may get converted to discrete " \
+ "scalar stores.") \
+ range(0, max_intx) \
+ constraint(InitArrayShortSizeConstraintFunc, AfterErgo) \
+ \
diagnostic(bool, CompilerDirectivesIgnoreCompileCommands, false, \
"Disable backwards compatibility for compile commands.") \
\