8005522: use fast-string instructions on x86 for zeroing
Summary: use 'rep stosb' instead of 'rep stosq' when fast-string operations are available.
Reviewed-by: twisti, roland
--- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp Sun Dec 23 17:08:22 2012 +0100
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp Thu Jan 03 15:09:55 2013 -0800
@@ -2544,12 +2544,18 @@
emit_int8((unsigned char)0xA5);
}
+// sets rcx bytes with al (the low byte of rax), value at [edi]
+void Assembler::rep_stosb() {
+ emit_int8((unsigned char)0xF3); // REP
+ LP64_ONLY(prefix(REX_W)); // LP64 only; NOTE(review): STOSB is byte-sized, so REX.W looks redundant here - confirm
+ emit_int8((unsigned char)0xAA); // STOSB (byte store)
+}
+
// sets rcx pointer sized words with rax, value at [edi]
// generic
-void Assembler::rep_set() { // rep_set
- emit_int8((unsigned char)0xF3);
- // STOSQ
- LP64_ONLY(prefix(REX_W));
+void Assembler::rep_stos() {
+ emit_int8((unsigned char)0xF3); // REP
+ LP64_ONLY(prefix(REX_W)); // LP64:STOSQ, LP32:STOSD
emit_int8((unsigned char)0xAB);
}
--- a/hotspot/src/cpu/x86/vm/assembler_x86.hpp Sun Dec 23 17:08:22 2012 +0100
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.hpp Thu Jan 03 15:09:55 2013 -0800
@@ -832,7 +832,8 @@
// These do register sized moves/scans
void rep_mov();
- void rep_set();
+ void rep_stos();
+ void rep_stosb();
void repne_scan();
#ifdef _LP64
void repne_scanl();
--- a/hotspot/src/cpu/x86/vm/globals_x86.hpp Sun Dec 23 17:08:22 2012 +0100
+++ b/hotspot/src/cpu/x86/vm/globals_x86.hpp Thu Jan 03 15:09:55 2013 -0800
@@ -120,6 +120,9 @@
product(bool, UseUnalignedLoadStores, false, \
"Use SSE2 MOVDQU instruction for Arraycopy") \
\
+ product(bool, UseFastStosb, false, \
+ "Use fast-string operation for zeroing: rep stosb") \
+ \
/* assembler */ \
product(bool, Use486InstrsOnly, false, \
"Use 80486 Compliant instruction subset") \
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp Sun Dec 23 17:08:22 2012 +0100
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp Thu Jan 03 15:09:55 2013 -0800
@@ -5224,6 +5224,22 @@
}
+void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp) {
+ // Emits code that zero-fills memory. cnt - number of qwords (8-byte words) to clear.
+ // base - start address, qword aligned. tmp - zeroed below and used as the store value.
+ assert(base==rdi, "base register must be edi for rep stos");
+ assert(tmp==rax, "tmp register must be eax for rep stos");
+ assert(cnt==rcx, "cnt register must be ecx for rep stos");
+
+ xorptr(tmp, tmp); // zero the value to be stored
+ if (UseFastStosb) {
+ shlptr(cnt,3); // convert to number of bytes (qwords * 8), since rep stosb counts bytes
+ rep_stosb();
+ } else {
+ NOT_LP64(shlptr(cnt,1);) // convert to number of dwords for 32-bit VM; 64-bit keeps the qword count
+ rep_stos(); // pointer-sized stores: STOSQ on LP64, STOSD on 32-bit
+ }
+}
// IndexOf for constant substrings with size >= 8 chars
// which don't need to be loaded through stack.
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp Sun Dec 23 17:08:22 2012 +0100
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp Thu Jan 03 15:09:55 2013 -0800
@@ -1096,6 +1096,9 @@
// C2 compiled method's prolog code.
void verified_entry(int framesize, bool stack_bang, bool fp_mode_24b);
+ // clear memory of size 'cnt' qwords, starting at 'base' (expects base == rdi, cnt == rcx, rtmp == rax).
+ void clear_mem(Register base, Register cnt, Register rtmp);
+
// IndexOf strings.
// Small strings are loaded through stack if they cross page boundary.
void string_indexof(Register str1, Register str2,
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.cpp Sun Dec 23 17:08:22 2012 +0100
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.cpp Thu Jan 03 15:09:55 2013 -0800
@@ -429,7 +429,7 @@
}
char buf[256];
- jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
+ jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
cores_per_cpu(), threads_per_core(),
cpu_family(), _model, _stepping,
(supports_cmov() ? ", cmov" : ""),
@@ -446,6 +446,7 @@
(supports_avx() ? ", avx" : ""),
(supports_avx2() ? ", avx2" : ""),
(supports_aes() ? ", aes" : ""),
+ (supports_erms() ? ", erms" : ""),
(supports_mmx_ext() ? ", mmxext" : ""),
(supports_3dnow_prefetch() ? ", 3dnowpref" : ""),
(supports_lzcnt() ? ", lzcnt": ""),
@@ -671,6 +672,16 @@
FLAG_SET_DEFAULT(UsePopCountInstruction, false);
}
+ // Enable fast-string operations by default when the CPU reports ERMS; if requested without ERMS, warn and disable.
+ if (supports_erms()) {
+ if (FLAG_IS_DEFAULT(UseFastStosb)) {
+ UseFastStosb = true;
+ }
+ } else if (UseFastStosb) {
+ warning("fast-string operations are not available on this CPU");
+ FLAG_SET_DEFAULT(UseFastStosb, false);
+ }
+
#ifdef COMPILER2
if (FLAG_IS_DEFAULT(AlignVector)) {
// Modern processors allow misaligned memory operations for vectors.
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.hpp Sun Dec 23 17:08:22 2012 +0100
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.hpp Thu Jan 03 15:09:55 2013 -0800
@@ -204,7 +204,8 @@
avx2 : 1,
: 2,
bmi2 : 1,
- : 23;
+ erms : 1,
+ : 22;
} bits;
};
@@ -247,7 +248,8 @@
CPU_TSCINV = (1 << 16),
CPU_AVX = (1 << 17),
CPU_AVX2 = (1 << 18),
- CPU_AES = (1 << 19)
+ CPU_AES = (1 << 19),
+ CPU_ERMS = (1 << 20) // enhanced 'rep movsb/stosb' instructions
} cpuFeatureFlags;
enum {
@@ -425,6 +427,8 @@
result |= CPU_TSCINV;
if (_cpuid_info.std_cpuid1_ecx.bits.aes != 0)
result |= CPU_AES;
+ if (_cpuid_info.sef_cpuid7_ebx.bits.erms != 0)
+ result |= CPU_ERMS;
// AMD features.
if (is_amd()) {
@@ -489,7 +493,7 @@
return (_cpuid_info.std_max_function >= 0xB) &&
// eax[4:0] | ebx[0:15] == 0 indicates invalid topology level.
// Some cpus have max cpuid >= 0xB but do not support processor topology.
- ((_cpuid_info.tpl_cpuidB0_eax & 0x1f | _cpuid_info.tpl_cpuidB0_ebx.bits.logical_cpus) != 0);
+ (((_cpuid_info.tpl_cpuidB0_eax & 0x1f) | _cpuid_info.tpl_cpuidB0_ebx.bits.logical_cpus) != 0);
}
static uint cores_per_cpu() {
@@ -550,6 +554,7 @@
static bool supports_avx2() { return (_cpuFeatures & CPU_AVX2) != 0; }
static bool supports_tsc() { return (_cpuFeatures & CPU_TSC) != 0; }
static bool supports_aes() { return (_cpuFeatures & CPU_AES) != 0; }
+ static bool supports_erms() { return (_cpuFeatures & CPU_ERMS) != 0; }
// Intel features
static bool is_intel_family_core() { return is_intel() &&
--- a/hotspot/src/cpu/x86/vm/x86_32.ad Sun Dec 23 17:08:22 2012 +0100
+++ b/hotspot/src/cpu/x86/vm/x86_32.ad Thu Jan 03 15:09:55 2013 -0800
@@ -11572,15 +11572,28 @@
// =======================================================================
// fast clearing of an array
instruct rep_stos(eCXRegI cnt, eDIRegP base, eAXRegI zero, Universe dummy, eFlagsReg cr) %{
+ predicate(!UseFastStosb);
match(Set dummy (ClearArray cnt base));
effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
- format %{ "SHL ECX,1\t# Convert doublewords to words\n\t"
- "XOR EAX,EAX\n\t"
+ format %{ "XOR EAX,EAX\t# ClearArray:\n\t"
+ "SHL ECX,1\t# Convert doublewords to words\n\t"
"REP STOS\t# store EAX into [EDI++] while ECX--" %}
- opcode(0,0x4);
- ins_encode( Opcode(0xD1), RegOpc(ECX),
- OpcRegReg(0x33,EAX,EAX),
- Opcode(0xF3), Opcode(0xAB) );
+ ins_encode %{
+ __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct rep_fast_stosb(eCXRegI cnt, eDIRegP base, eAXRegI zero, Universe dummy, eFlagsReg cr) %{
+ predicate(UseFastStosb); // ClearArray rule used only when UseFastStosb is set
+ match(Set dummy (ClearArray cnt base));
+ effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
+ format %{ "XOR EAX,EAX\t# ClearArray:\n\t"
+ "SHL ECX,3\t# Convert doublewords to bytes\n\t"
+ "REP STOSB\t# store EAX into [EDI++] while ECX--" %}
+ ins_encode %{
+ __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register); // code generation is delegated to MacroAssembler::clear_mem
+ %}
ins_pipe( pipe_slow );
%}
--- a/hotspot/src/cpu/x86/vm/x86_64.ad Sun Dec 23 17:08:22 2012 +0100
+++ b/hotspot/src/cpu/x86/vm/x86_64.ad Thu Jan 03 15:09:55 2013 -0800
@@ -10374,16 +10374,33 @@
instruct rep_stos(rcx_RegL cnt, rdi_RegP base, rax_RegI zero, Universe dummy,
rFlagsReg cr)
%{
+ predicate(!UseFastStosb);
match(Set dummy (ClearArray cnt base));
effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
- format %{ "xorl rax, rax\t# ClearArray:\n\t"
- "rep stosq\t# Store rax to *rdi++ while rcx--" %}
- ins_encode(opc_reg_reg(0x33, RAX, RAX), // xorl %eax, %eax
- Opcode(0xF3), Opcode(0x48), Opcode(0xAB)); // rep REX_W stos
+ format %{ "xorq rax, rax\t# ClearArray:\n\t"
+ "rep stosq\t# Store rax to *rdi++ while rcx--" %}
+ ins_encode %{
+ __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register);
+ %}
ins_pipe(pipe_slow);
%}
+instruct rep_fast_stosb(rcx_RegL cnt, rdi_RegP base, rax_RegI zero, Universe dummy,
+ rFlagsReg cr)
+%{
+ predicate(UseFastStosb); // ClearArray rule used only when UseFastStosb is set
+ match(Set dummy (ClearArray cnt base));
+ effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
+ format %{ "xorq rax, rax\t# ClearArray:\n\t"
+ "shlq rcx,3\t# Convert doublewords to bytes\n\t"
+ "rep stosb\t# Store rax to *rdi++ while rcx--" %}
+ ins_encode %{
+ __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register); // code generation is delegated to MacroAssembler::clear_mem
+ %}
+ ins_pipe( pipe_slow );
+%}
+
instruct string_compare(rdi_RegP str1, rcx_RegI cnt1, rsi_RegP str2, rdx_RegI cnt2,
rax_RegI result, regD tmp1, rFlagsReg cr)
%{
--- a/hotspot/src/share/vm/opto/memnode.cpp Sun Dec 23 17:08:22 2012 +0100
+++ b/hotspot/src/share/vm/opto/memnode.cpp Thu Jan 03 15:09:55 2013 -0800
@@ -2725,10 +2725,8 @@
zend = phase->transform( new(C) URShiftXNode(zend, shift) );
}
+ // Bulk clear double-words
Node* zsize = phase->transform( new(C) SubXNode(zend, zbase) );
- Node* zinit = phase->zerocon((unit == BytesPerLong) ? T_LONG : T_INT);
-
- // Bulk clear double-words
Node* adr = phase->transform( new(C) AddPNode(dest, dest, start_offset) );
mem = new (C) ClearArrayNode(ctl, mem, zsize, adr);
return phase->transform(mem);