8201193: Use XMM/YMM for objects initialization
Reviewed-by: jrose, kvn
Contributed-by: rohitarulraj@gmail.com
--- a/src/hotspot/cpu/x86/globals_x86.hpp Mon Jun 11 14:06:50 2018 -0700
+++ b/src/hotspot/cpu/x86/globals_x86.hpp Tue Jun 12 21:29:47 2018 -0700
@@ -150,6 +150,9 @@
product(bool, UseUnalignedLoadStores, false, \
"Use SSE2 MOVDQU instruction for Arraycopy") \
\
+ product(bool, UseXMMForObjInit, false, \
+ "Use XMM/YMM MOVDQU instruction for Object Initialization") \
+ \
product(bool, UseFastStosb, false, \
"Use fast-string operation for zeroing: rep stosb") \
\
--- a/src/hotspot/cpu/x86/macroAssembler_x86.cpp Mon Jun 11 14:06:50 2018 -0700
+++ b/src/hotspot/cpu/x86/macroAssembler_x86.cpp Tue Jun 12 21:29:47 2018 -0700
@@ -6777,7 +6777,59 @@
}
-void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp, bool is_large) {
+// Zero 'cnt' qwords of memory starting at 'base' using XMM/YMM stores; 'base' and 'cnt' are modified, 'xtmp' is zeroed and used as the store source.
+void MacroAssembler::xmm_clear_mem(Register base, Register cnt, XMMRegister xtmp) {
+ // cnt - number of qwords (8-byte words); used up as the loop counter.
+ // base - start address, qword aligned; advanced by the number of bytes cleared.
+ Label L_zero_64_bytes, L_loop, L_sloop, L_tail, L_end;
+ if (UseAVX >= 2) {
+ vpxor(xtmp, xtmp, xtmp, AVX_256bit); // zero the 256-bit register
+ } else {
+ pxor(xtmp, xtmp); // zero the 128-bit register
+ }
+ jmp(L_zero_64_bytes); // enter at the loop test so that cnt < 8 never runs the 64-byte body
+
+ BIND(L_loop); // main loop: clears 64 bytes (8 qwords) per iteration
+ if (UseAVX >= 2) {
+ vmovdqu(Address(base, 0), xtmp); // AVX2 path: two stores cover the 64 bytes
+ vmovdqu(Address(base, 32), xtmp);
+ } else {
+ movdqu(Address(base, 0), xtmp); // SSE path: four stores cover the 64 bytes
+ movdqu(Address(base, 16), xtmp);
+ movdqu(Address(base, 32), xtmp);
+ movdqu(Address(base, 48), xtmp);
+ }
+ addptr(base, 64);
+
+ BIND(L_zero_64_bytes);
+ subptr(cnt, 8); // cnt -= 8; a non-negative result means another full 64-byte chunk remains
+ jccb(Assembler::greaterEqual, L_loop);
+ addptr(cnt, 4); // cnt is now (remaining qwords - 4); negative means fewer than 4 qwords are left
+ jccb(Assembler::less, L_tail);
+ // Zero trailing 32 bytes
+ if (UseAVX >= 2) {
+ vmovdqu(Address(base, 0), xtmp);
+ } else {
+ movdqu(Address(base, 0), xtmp);
+ movdqu(Address(base, 16), xtmp);
+ }
+ addptr(base, 32);
+ subptr(cnt, 4); // re-bias by -4 so both paths reach L_tail with the same offset
+
+ BIND(L_tail);
+ addptr(cnt, 4); // undo the bias: cnt = qwords still to clear (0..3)
+ jccb(Assembler::lessEqual, L_end); // nothing left to clear
+ decrement(cnt); // pre-decrement so the loop below can exit on a greaterEqual test
+
+ BIND(L_sloop); // short loop: clears one qword per iteration
+ movq(Address(base, 0), xtmp);
+ addptr(base, 8);
+ decrement(cnt);
+ jccb(Assembler::greaterEqual, L_sloop);
+ BIND(L_end);
+}
+
+void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp, XMMRegister xtmp, bool is_large) {
// cnt - number of qwords (8-byte words).
// base - start address, qword aligned.
// is_large - if optimizers know cnt is larger than InitArrayShortSize
@@ -6789,7 +6841,9 @@
Label DONE;
- xorptr(tmp, tmp);
+ if (!is_large || !UseXMMForObjInit) {
+ xorptr(tmp, tmp);
+ }
if (!is_large) {
Label LOOP, LONG;
@@ -6815,6 +6869,9 @@
if (UseFastStosb) {
shlptr(cnt, 3); // convert to number of bytes
rep_stosb();
+ } else if (UseXMMForObjInit) {
+ movptr(tmp, base);
+ xmm_clear_mem(tmp, cnt, xtmp);
} else {
NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
rep_stos();
--- a/src/hotspot/cpu/x86/macroAssembler_x86.hpp Mon Jun 11 14:06:50 2018 -0700
+++ b/src/hotspot/cpu/x86/macroAssembler_x86.hpp Tue Jun 12 21:29:47 2018 -0700
@@ -1578,7 +1578,10 @@
// clear memory of size 'cnt' qwords, starting at 'base';
// if 'is_large' is set, do not try to produce short loop
- void clear_mem(Register base, Register cnt, Register rtmp, bool is_large);
+ void clear_mem(Register base, Register cnt, Register rtmp, XMMRegister xtmp, bool is_large); // xtmp: vector scratch register, handed to xmm_clear_mem when UseXMMForObjInit is chosen
+
+ // clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM registers; 'base' and 'cnt' are modified and 'xtmp' is zeroed
+ void xmm_clear_mem(Register base, Register cnt, XMMRegister xtmp);
#ifdef COMPILER2
void string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
--- a/src/hotspot/cpu/x86/vm_version_x86.cpp Mon Jun 11 14:06:50 2018 -0700
+++ b/src/hotspot/cpu/x86/vm_version_x86.cpp Tue Jun 12 21:29:47 2018 -0700
@@ -1396,6 +1396,16 @@
FLAG_SET_DEFAULT(UseFastStosb, false);
}
+ // Use XMM/YMM MOVDQU instruction for Object Initialization
+ if (!UseFastStosb && UseSSE >= 2 && UseUnalignedLoadStores) {
+ if (FLAG_IS_DEFAULT(UseXMMForObjInit)) {
+ UseXMMForObjInit = true;
+ }
+ } else if (UseXMMForObjInit) {
+ warning("UseXMMForObjInit requires SSE2 and unaligned load/stores. Feature is switched off.");
+ FLAG_SET_DEFAULT(UseXMMForObjInit, false);
+ }
+
#ifdef COMPILER2
if (FLAG_IS_DEFAULT(AlignVector)) {
// Modern processors allow misaligned memory operations for vectors.
--- a/src/hotspot/cpu/x86/x86_32.ad Mon Jun 11 14:06:50 2018 -0700
+++ b/src/hotspot/cpu/x86/x86_32.ad Tue Jun 12 21:29:47 2018 -0700
@@ -11482,10 +11482,10 @@
// =======================================================================
// fast clearing of an array
-instruct rep_stos(eCXRegI cnt, eDIRegP base, eAXRegI zero, Universe dummy, eFlagsReg cr) %{
+instruct rep_stos(eCXRegI cnt, eDIRegP base, regD tmp, eAXRegI zero, Universe dummy, eFlagsReg cr) %{
predicate(!((ClearArrayNode*)n)->is_large());
match(Set dummy (ClearArray cnt base));
- effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
+ effect(USE_KILL cnt, USE_KILL base, TEMP tmp, KILL zero, KILL cr);
format %{ $$template
$$emit$$"XOR EAX,EAX\t# ClearArray:\n\t"
@@ -11502,6 +11502,32 @@
if (UseFastStosb) {
$$emit$$"SHL ECX,3\t# Convert doublewords to bytes\n\t"
$$emit$$"REP STOSB\t# store EAX into [EDI++] while ECX--\n\t"
+ } else if (UseXMMForObjInit) {
+ $$emit$$"MOV RDI,RAX\n\t"
+ $$emit$$"VPXOR YMM0,YMM0,YMM0\n\t"
+ $$emit$$"JMPQ L_zero_64_bytes\n\t"
+ $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
+ $$emit$$"VMOVDQU YMM0,(RAX)\n\t"
+ $$emit$$"VMOVDQU YMM0,0x20(RAX)\n\t"
+ $$emit$$"ADD 0x40,RAX\n\t"
+ $$emit$$"# L_zero_64_bytes:\n\t"
+ $$emit$$"SUB 0x8,RCX\n\t"
+ $$emit$$"JGE L_loop\n\t"
+ $$emit$$"ADD 0x4,RCX\n\t"
+ $$emit$$"JL L_tail\n\t"
+ $$emit$$"VMOVDQU YMM0,(RAX)\n\t"
+ $$emit$$"ADD 0x20,RAX\n\t"
+ $$emit$$"SUB 0x4,RCX\n\t"
+ $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
+ $$emit$$"ADD 0x4,RCX\n\t"
+ $$emit$$"JLE L_end\n\t"
+ $$emit$$"DEC RCX\n\t"
+ $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
+ $$emit$$"VMOVQ XMM0,(RAX)\n\t"
+ $$emit$$"ADD 0x8,RAX\n\t"
+ $$emit$$"DEC RCX\n\t"
+ $$emit$$"JGE L_sloop\n\t"
+ $$emit$$"# L_end:\n\t"
} else {
$$emit$$"SHL ECX,1\t# Convert doublewords to words\n\t"
$$emit$$"REP STOS\t# store EAX into [EDI++] while ECX--\n\t"
@@ -11509,28 +11535,57 @@
$$emit$$"# DONE"
%}
ins_encode %{
- __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, false);
- %}
- ins_pipe( pipe_slow );
-%}
-
-instruct rep_stos_large(eCXRegI cnt, eDIRegP base, eAXRegI zero, Universe dummy, eFlagsReg cr) %{
+ __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
+ $tmp$$XMMRegister, false);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct rep_stos_large(eCXRegI cnt, eDIRegP base, regD tmp, eAXRegI zero, Universe dummy, eFlagsReg cr) %{
 predicate(((ClearArrayNode*)n)->is_large());
 match(Set dummy (ClearArray cnt base));
- effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
+ effect(USE_KILL cnt, USE_KILL base, TEMP tmp, KILL zero, KILL cr); // tmp: XMM scratch register passed through to clear_mem
 format %{ $$template
- $$emit$$"XOR EAX,EAX\t# ClearArray:\n\t"
 if (UseFastStosb) {
+ $$emit$$"XOR EAX,EAX\t# ClearArray:\n\t"
 $$emit$$"SHL ECX,3\t# Convert doublewords to bytes\n\t"
 $$emit$$"REP STOSB\t# store EAX into [EDI++] while ECX--\n\t"
+ } else if (UseXMMForObjInit) { // text below mirrors the sequence emitted by MacroAssembler::xmm_clear_mem
+ $$emit$$"MOV RDI,RAX\t# ClearArray:\n\t" // written source,destination: clear_mem copies base into the zero register; NOTE(review): 64-bit register names although the operands here are EDI/EAX/ECX
+ $$emit$$"VPXOR YMM0,YMM0,YMM0\n\t" // NOTE(review): always shows the YMM form; xmm_clear_mem emits pxor/movdqu when UseAVX < 2
+ $$emit$$"JMPQ L_zero_64_bytes\n\t"
+ $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
+ $$emit$$"VMOVDQU YMM0,(RAX)\n\t"
+ $$emit$$"VMOVDQU YMM0,0x20(RAX)\n\t"
+ $$emit$$"ADD 0x40,RAX\n\t"
+ $$emit$$"# L_zero_64_bytes:\n\t"
+ $$emit$$"SUB 0x8,RCX\n\t"
+ $$emit$$"JGE L_loop\n\t"
+ $$emit$$"ADD 0x4,RCX\n\t"
+ $$emit$$"JL L_tail\n\t"
+ $$emit$$"VMOVDQU YMM0,(RAX)\n\t"
+ $$emit$$"ADD 0x20,RAX\n\t"
+ $$emit$$"SUB 0x4,RCX\n\t"
+ $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
+ $$emit$$"ADD 0x4,RCX\n\t"
+ $$emit$$"JLE L_end\n\t"
+ $$emit$$"DEC RCX\n\t"
+ $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
+ $$emit$$"VMOVQ XMM0,(RAX)\n\t"
+ $$emit$$"ADD 0x8,RAX\n\t"
+ $$emit$$"DEC RCX\n\t"
+ $$emit$$"JGE L_sloop\n\t"
+ $$emit$$"# L_end:\n\t"
 } else {
+ $$emit$$"XOR EAX,EAX\t# ClearArray:\n\t"
 $$emit$$"SHL ECX,1\t# Convert doublewords to words\n\t"
 $$emit$$"REP STOS\t# store EAX into [EDI++] while ECX--\n\t"
 }
 $$emit$$"# DONE"
 %}
 ins_encode %{
- __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, true);
+ __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
+ $tmp$$XMMRegister, true); // final argument is is_large
 %}
 ins_pipe( pipe_slow );
%}
--- a/src/hotspot/cpu/x86/x86_64.ad Mon Jun 11 14:06:50 2018 -0700
+++ b/src/hotspot/cpu/x86/x86_64.ad Tue Jun 12 21:29:47 2018 -0700
@@ -10770,12 +10770,12 @@
// =======================================================================
// fast clearing of an array
-instruct rep_stos(rcx_RegL cnt, rdi_RegP base, rax_RegI zero, Universe dummy,
- rFlagsReg cr)
+instruct rep_stos(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegI zero,
+ Universe dummy, rFlagsReg cr)
%{
predicate(!((ClearArrayNode*)n)->is_large());
match(Set dummy (ClearArray cnt base));
- effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
+ effect(USE_KILL cnt, USE_KILL base, TEMP tmp, KILL zero, KILL cr);
format %{ $$template
$$emit$$"xorq rax, rax\t# ClearArray:\n\t"
@@ -10791,35 +10791,90 @@
if (UseFastStosb) {
$$emit$$"shlq rcx,3\t# Convert doublewords to bytes\n\t"
$$emit$$"rep stosb\t# Store rax to *rdi++ while rcx--\n\t"
+ } else if (UseXMMForObjInit) {
+ $$emit$$"mov rdi,rax\n\t"
+ $$emit$$"vpxor ymm0,ymm0,ymm0\n\t"
+ $$emit$$"jmpq L_zero_64_bytes\n\t"
+ $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
+ $$emit$$"vmovdqu ymm0,(rax)\n\t"
+ $$emit$$"vmovdqu ymm0,0x20(rax)\n\t"
+ $$emit$$"add 0x40,rax\n\t"
+ $$emit$$"# L_zero_64_bytes:\n\t"
+ $$emit$$"sub 0x8,rcx\n\t"
+ $$emit$$"jge L_loop\n\t"
+ $$emit$$"add 0x4,rcx\n\t"
+ $$emit$$"jl L_tail\n\t"
+ $$emit$$"vmovdqu ymm0,(rax)\n\t"
+ $$emit$$"add 0x20,rax\n\t"
+ $$emit$$"sub 0x4,rcx\n\t"
+ $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
+ $$emit$$"add 0x4,rcx\n\t"
+ $$emit$$"jle L_end\n\t"
+ $$emit$$"dec rcx\n\t"
+ $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
+ $$emit$$"vmovq xmm0,(rax)\n\t"
+ $$emit$$"add 0x8,rax\n\t"
+ $$emit$$"dec rcx\n\t"
+ $$emit$$"jge L_sloop\n\t"
+ $$emit$$"# L_end:\n\t"
} else {
$$emit$$"rep stosq\t# Store rax to *rdi++ while rcx--\n\t"
}
$$emit$$"# DONE"
%}
ins_encode %{
- __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, false);
+ __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
+ $tmp$$XMMRegister, false);
%}
ins_pipe(pipe_slow);
%}
-instruct rep_stos_large(rcx_RegL cnt, rdi_RegP base, rax_RegI zero, Universe dummy,
- rFlagsReg cr)
+instruct rep_stos_large(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegI zero,
+ Universe dummy, rFlagsReg cr)
%{
 predicate(((ClearArrayNode*)n)->is_large());
 match(Set dummy (ClearArray cnt base));
- effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
+ effect(USE_KILL cnt, USE_KILL base, TEMP tmp, KILL zero, KILL cr); // tmp: XMM scratch register passed through to clear_mem
 format %{ $$template
- $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
 if (UseFastStosb) {
+ $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
 $$emit$$"shlq rcx,3\t# Convert doublewords to bytes\n\t"
 $$emit$$"rep stosb\t# Store rax to *rdi++ while rcx--"
+ } else if (UseXMMForObjInit) { // text below mirrors the sequence emitted by MacroAssembler::xmm_clear_mem
+ $$emit$$"mov rdi,rax\t# ClearArray:\n\t" // written source,destination: clear_mem copies base into the zero register
+ $$emit$$"vpxor ymm0,ymm0,ymm0\n\t" // NOTE(review): always shows the ymm form; xmm_clear_mem emits pxor/movdqu when UseAVX < 2
+ $$emit$$"jmpq L_zero_64_bytes\n\t"
+ $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
+ $$emit$$"vmovdqu ymm0,(rax)\n\t"
+ $$emit$$"vmovdqu ymm0,0x20(rax)\n\t"
+ $$emit$$"add 0x40,rax\n\t"
+ $$emit$$"# L_zero_64_bytes:\n\t"
+ $$emit$$"sub 0x8,rcx\n\t"
+ $$emit$$"jge L_loop\n\t"
+ $$emit$$"add 0x4,rcx\n\t"
+ $$emit$$"jl L_tail\n\t"
+ $$emit$$"vmovdqu ymm0,(rax)\n\t"
+ $$emit$$"add 0x20,rax\n\t"
+ $$emit$$"sub 0x4,rcx\n\t"
+ $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
+ $$emit$$"add 0x4,rcx\n\t"
+ $$emit$$"jle L_end\n\t"
+ $$emit$$"dec rcx\n\t"
+ $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
+ $$emit$$"vmovq xmm0,(rax)\n\t"
+ $$emit$$"add 0x8,rax\n\t"
+ $$emit$$"dec rcx\n\t"
+ $$emit$$"jge L_sloop\n\t"
+ $$emit$$"# L_end:\n\t"
 } else {
+ $$emit$$"xorq rax, rax\t# ClearArray:\n\t"
 $$emit$$"rep stosq\t# Store rax to *rdi++ while rcx--"
 }
 %}
 ins_encode %{
- __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, true);
+ __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
+ $tmp$$XMMRegister, true); // final argument is is_large
 %}
 ins_pipe(pipe_slow);
%}