8201193: Use XMM/YMM for objects initialization
authorkvn
Tue, 12 Jun 2018 21:29:47 -0700
changeset 50534 a6a44177f99c
parent 50533 7c5fbc953121
child 50535 e1b3def12624
8201193: Use XMM/YMM for objects initialization Reviewed-by: jrose, kvn Contributed-by: rohitarulraj@gmail.com
src/hotspot/cpu/x86/globals_x86.hpp
src/hotspot/cpu/x86/macroAssembler_x86.cpp
src/hotspot/cpu/x86/macroAssembler_x86.hpp
src/hotspot/cpu/x86/vm_version_x86.cpp
src/hotspot/cpu/x86/x86_32.ad
src/hotspot/cpu/x86/x86_64.ad
--- a/src/hotspot/cpu/x86/globals_x86.hpp	Mon Jun 11 14:06:50 2018 -0700
+++ b/src/hotspot/cpu/x86/globals_x86.hpp	Tue Jun 12 21:29:47 2018 -0700
@@ -150,6 +150,9 @@
   product(bool, UseUnalignedLoadStores, false,                              \
           "Use SSE2 MOVDQU instruction for Arraycopy")                      \
                                                                             \
+  product(bool, UseXMMForObjInit, false,                                    \
+          "Use XMM/YMM MOVDQU instruction for Object Initialization")       \
+                                                                            \
   product(bool, UseFastStosb, false,                                        \
           "Use fast-string operation for zeroing: rep stosb")               \
                                                                             \
--- a/src/hotspot/cpu/x86/macroAssembler_x86.cpp	Mon Jun 11 14:06:50 2018 -0700
+++ b/src/hotspot/cpu/x86/macroAssembler_x86.cpp	Tue Jun 12 21:29:47 2018 -0700
@@ -6777,7 +6777,59 @@
 
 }
 
-void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp, bool is_large) {
+// clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM registers
+void MacroAssembler::xmm_clear_mem(Register base, Register cnt, XMMRegister xtmp) {
+  // cnt - number of qwords (8-byte words).
+  // base - start address, qword aligned.
+  Label L_zero_64_bytes, L_loop, L_sloop, L_tail, L_end;
+  if (UseAVX >= 2) {
+    vpxor(xtmp, xtmp, xtmp, AVX_256bit);
+  } else {
+    pxor(xtmp, xtmp);
+  }
+  jmp(L_zero_64_bytes);
+
+  BIND(L_loop);
+  if (UseAVX >= 2) {
+    vmovdqu(Address(base,  0), xtmp);
+    vmovdqu(Address(base, 32), xtmp);
+  } else {
+    movdqu(Address(base,  0), xtmp);
+    movdqu(Address(base, 16), xtmp);
+    movdqu(Address(base, 32), xtmp);
+    movdqu(Address(base, 48), xtmp);
+  }
+  addptr(base, 64);
+
+  BIND(L_zero_64_bytes);
+  subptr(cnt, 8);
+  jccb(Assembler::greaterEqual, L_loop);
+  addptr(cnt, 4);
+  jccb(Assembler::less, L_tail);
+  // Copy trailing 32 bytes
+  if (UseAVX >= 2) {
+    vmovdqu(Address(base, 0), xtmp);
+  } else {
+    movdqu(Address(base,  0), xtmp);
+    movdqu(Address(base, 16), xtmp);
+  }
+  addptr(base, 32);
+  subptr(cnt, 4);
+
+  BIND(L_tail);
+  addptr(cnt, 4);
+  jccb(Assembler::lessEqual, L_end);
+  decrement(cnt);
+
+  BIND(L_sloop);
+  movq(Address(base, 0), xtmp);
+  addptr(base, 8);
+  decrement(cnt);
+  jccb(Assembler::greaterEqual, L_sloop);
+  BIND(L_end);
+}
+
+void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp, XMMRegister xtmp, bool is_large) {
   // cnt - number of qwords (8-byte words).
   // base - start address, qword aligned.
   // is_large - if optimizers know cnt is larger than InitArrayShortSize
@@ -6789,7 +6841,9 @@
 
   Label DONE;
 
-  xorptr(tmp, tmp);
+  if (!is_large || !UseXMMForObjInit) {
+    xorptr(tmp, tmp);
+  }
 
   if (!is_large) {
     Label LOOP, LONG;
@@ -6815,6 +6869,9 @@
   if (UseFastStosb) {
     shlptr(cnt, 3); // convert to number of bytes
     rep_stosb();
+  } else if (UseXMMForObjInit) {
+    movptr(tmp, base);
+    xmm_clear_mem(tmp, cnt, xtmp);
   } else {
     NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
     rep_stos();
--- a/src/hotspot/cpu/x86/macroAssembler_x86.hpp	Mon Jun 11 14:06:50 2018 -0700
+++ b/src/hotspot/cpu/x86/macroAssembler_x86.hpp	Tue Jun 12 21:29:47 2018 -0700
@@ -1578,7 +1578,10 @@
 
   // clear memory of size 'cnt' qwords, starting at 'base';
   // if 'is_large' is set, do not try to produce short loop
-  void clear_mem(Register base, Register cnt, Register rtmp, bool is_large);
+  void clear_mem(Register base, Register cnt, Register rtmp, XMMRegister xtmp, bool is_large);
+
+  // clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM registers
+  void xmm_clear_mem(Register base, Register cnt, XMMRegister xtmp);
 
 #ifdef COMPILER2
   void string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
--- a/src/hotspot/cpu/x86/vm_version_x86.cpp	Mon Jun 11 14:06:50 2018 -0700
+++ b/src/hotspot/cpu/x86/vm_version_x86.cpp	Tue Jun 12 21:29:47 2018 -0700
@@ -1396,6 +1396,16 @@
     FLAG_SET_DEFAULT(UseFastStosb, false);
   }
 
+  // Use XMM/YMM MOVDQU instruction for Object Initialization
+  if (!UseFastStosb && UseSSE >= 2 && UseUnalignedLoadStores) {
+    if (FLAG_IS_DEFAULT(UseXMMForObjInit)) {
+      UseXMMForObjInit = true;
+    }
+  } else if (UseXMMForObjInit) {
+    warning("UseXMMForObjInit requires SSE2 and unaligned load/stores. Feature is switched off.");
+    FLAG_SET_DEFAULT(UseXMMForObjInit, false);
+  }
+
 #ifdef COMPILER2
   if (FLAG_IS_DEFAULT(AlignVector)) {
     // Modern processors allow misaligned memory operations for vectors.
--- a/src/hotspot/cpu/x86/x86_32.ad	Mon Jun 11 14:06:50 2018 -0700
+++ b/src/hotspot/cpu/x86/x86_32.ad	Tue Jun 12 21:29:47 2018 -0700
@@ -11482,10 +11482,10 @@
 
 // =======================================================================
 // fast clearing of an array
-instruct rep_stos(eCXRegI cnt, eDIRegP base, eAXRegI zero, Universe dummy, eFlagsReg cr) %{
+instruct rep_stos(eCXRegI cnt, eDIRegP base, regD tmp, eAXRegI zero, Universe dummy, eFlagsReg cr) %{
   predicate(!((ClearArrayNode*)n)->is_large());
   match(Set dummy (ClearArray cnt base));
-  effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
+  effect(USE_KILL cnt, USE_KILL base, TEMP tmp, KILL zero, KILL cr);
 
   format %{ $$template
     $$emit$$"XOR    EAX,EAX\t# ClearArray:\n\t"
@@ -11502,6 +11502,32 @@
     if (UseFastStosb) {
        $$emit$$"SHL    ECX,3\t# Convert doublewords to bytes\n\t"
        $$emit$$"REP STOSB\t# store EAX into [EDI++] while ECX--\n\t"
+    } else if (UseXMMForObjInit) {
+       $$emit$$"MOV     RDI,RAX\n\t"
+       $$emit$$"VPXOR    YMM0,YMM0,YMM0\n\t"
+       $$emit$$"JMPQ    L_zero_64_bytes\n\t"
+       $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
+       $$emit$$"VMOVDQU YMM0,(RAX)\n\t"
+       $$emit$$"VMOVDQU YMM0,0x20(RAX)\n\t"
+       $$emit$$"ADD     0x40,RAX\n\t"
+       $$emit$$"# L_zero_64_bytes:\n\t"
+       $$emit$$"SUB     0x8,RCX\n\t"
+       $$emit$$"JGE     L_loop\n\t"
+       $$emit$$"ADD     0x4,RCX\n\t"
+       $$emit$$"JL      L_tail\n\t"
+       $$emit$$"VMOVDQU YMM0,(RAX)\n\t"
+       $$emit$$"ADD     0x20,RAX\n\t"
+       $$emit$$"SUB     0x4,RCX\n\t"
+       $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
+       $$emit$$"ADD     0x4,RCX\n\t"
+       $$emit$$"JLE     L_end\n\t"
+       $$emit$$"DEC     RCX\n\t"
+       $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
+       $$emit$$"VMOVQ   XMM0,(RAX)\n\t"
+       $$emit$$"ADD     0x8,RAX\n\t"
+       $$emit$$"DEC     RCX\n\t"
+       $$emit$$"JGE     L_sloop\n\t"
+       $$emit$$"# L_end:\n\t"
     } else {
        $$emit$$"SHL    ECX,1\t# Convert doublewords to words\n\t"
        $$emit$$"REP STOS\t# store EAX into [EDI++] while ECX--\n\t"
@@ -11509,28 +11535,57 @@
     $$emit$$"# DONE"
   %}
   ins_encode %{
-    __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, false);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct rep_stos_large(eCXRegI cnt, eDIRegP base, eAXRegI zero, Universe dummy, eFlagsReg cr) %{
+    __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
+                 $tmp$$XMMRegister, false);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rep_stos_large(eCXRegI cnt, eDIRegP base, regD tmp, eAXRegI zero, Universe dummy, eFlagsReg cr) %{
   predicate(((ClearArrayNode*)n)->is_large());
   match(Set dummy (ClearArray cnt base));
-  effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
+  effect(USE_KILL cnt, USE_KILL base, TEMP tmp, KILL zero, KILL cr);
   format %{ $$template
-    $$emit$$"XOR    EAX,EAX\t# ClearArray:\n\t"
     if (UseFastStosb) {
+       $$emit$$"XOR    EAX,EAX\t# ClearArray:\n\t"
        $$emit$$"SHL    ECX,3\t# Convert doublewords to bytes\n\t"
        $$emit$$"REP STOSB\t# store EAX into [EDI++] while ECX--\n\t"
+    } else if (UseXMMForObjInit) {
+       $$emit$$"MOV     RDI,RAX\t# ClearArray:\n\t"
+       $$emit$$"VPXOR   YMM0,YMM0,YMM0\n\t"
+       $$emit$$"JMPQ    L_zero_64_bytes\n\t"
+       $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
+       $$emit$$"VMOVDQU YMM0,(RAX)\n\t"
+       $$emit$$"VMOVDQU YMM0,0x20(RAX)\n\t"
+       $$emit$$"ADD     0x40,RAX\n\t"
+       $$emit$$"# L_zero_64_bytes:\n\t"
+       $$emit$$"SUB     0x8,RCX\n\t"
+       $$emit$$"JGE     L_loop\n\t"
+       $$emit$$"ADD     0x4,RCX\n\t"
+       $$emit$$"JL      L_tail\n\t"
+       $$emit$$"VMOVDQU YMM0,(RAX)\n\t"
+       $$emit$$"ADD     0x20,RAX\n\t"
+       $$emit$$"SUB     0x4,RCX\n\t"
+       $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
+       $$emit$$"ADD     0x4,RCX\n\t"
+       $$emit$$"JLE     L_end\n\t"
+       $$emit$$"DEC     RCX\n\t"
+       $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
+       $$emit$$"VMOVQ   XMM0,(RAX)\n\t"
+       $$emit$$"ADD     0x8,RAX\n\t"
+       $$emit$$"DEC     RCX\n\t"
+       $$emit$$"JGE     L_sloop\n\t"
+       $$emit$$"# L_end:\n\t"
     } else {
+       $$emit$$"XOR    EAX,EAX\t# ClearArray:\n\t"
        $$emit$$"SHL    ECX,1\t# Convert doublewords to words\n\t"
        $$emit$$"REP STOS\t# store EAX into [EDI++] while ECX--\n\t"
     }
     $$emit$$"# DONE"
   %}
   ins_encode %{
-    __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, true);
+    __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
+                 $tmp$$XMMRegister, true);
   %}
   ins_pipe( pipe_slow );
 %}
--- a/src/hotspot/cpu/x86/x86_64.ad	Mon Jun 11 14:06:50 2018 -0700
+++ b/src/hotspot/cpu/x86/x86_64.ad	Tue Jun 12 21:29:47 2018 -0700
@@ -10770,12 +10770,12 @@
 
 // =======================================================================
 // fast clearing of an array
-instruct rep_stos(rcx_RegL cnt, rdi_RegP base, rax_RegI zero, Universe dummy,
-                  rFlagsReg cr)
+instruct rep_stos(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegI zero,
+                  Universe dummy, rFlagsReg cr)
 %{
   predicate(!((ClearArrayNode*)n)->is_large());
   match(Set dummy (ClearArray cnt base));
-  effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
+  effect(USE_KILL cnt, USE_KILL base, TEMP tmp, KILL zero, KILL cr);
 
   format %{ $$template
     $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
@@ -10791,35 +10791,90 @@
     if (UseFastStosb) {
        $$emit$$"shlq    rcx,3\t# Convert doublewords to bytes\n\t"
        $$emit$$"rep     stosb\t# Store rax to *rdi++ while rcx--\n\t"
+    } else if (UseXMMForObjInit) {
+       $$emit$$"mov     rdi,rax\n\t"
+       $$emit$$"vpxor   ymm0,ymm0,ymm0\n\t"
+       $$emit$$"jmpq    L_zero_64_bytes\n\t"
+       $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
+       $$emit$$"vmovdqu ymm0,(rax)\n\t"
+       $$emit$$"vmovdqu ymm0,0x20(rax)\n\t"
+       $$emit$$"add     0x40,rax\n\t"
+       $$emit$$"# L_zero_64_bytes:\n\t"
+       $$emit$$"sub     0x8,rcx\n\t"
+       $$emit$$"jge     L_loop\n\t"
+       $$emit$$"add     0x4,rcx\n\t"
+       $$emit$$"jl      L_tail\n\t"
+       $$emit$$"vmovdqu ymm0,(rax)\n\t"
+       $$emit$$"add     0x20,rax\n\t"
+       $$emit$$"sub     0x4,rcx\n\t"
+       $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
+       $$emit$$"add     0x4,rcx\n\t"
+       $$emit$$"jle     L_end\n\t"
+       $$emit$$"dec     rcx\n\t"
+       $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
+       $$emit$$"vmovq   xmm0,(rax)\n\t"
+       $$emit$$"add     0x8,rax\n\t"
+       $$emit$$"dec     rcx\n\t"
+       $$emit$$"jge     L_sloop\n\t"
+       $$emit$$"# L_end:\n\t"
     } else {
        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--\n\t"
     }
     $$emit$$"# DONE"
   %}
   ins_encode %{
-    __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, false);
+    __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
+                 $tmp$$XMMRegister, false);
   %}
   ins_pipe(pipe_slow);
 %}
 
-instruct rep_stos_large(rcx_RegL cnt, rdi_RegP base, rax_RegI zero, Universe dummy,
-                  rFlagsReg cr)
+instruct rep_stos_large(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegI zero, 
+                        Universe dummy, rFlagsReg cr)
 %{
   predicate(((ClearArrayNode*)n)->is_large());
   match(Set dummy (ClearArray cnt base));
-  effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
+  effect(USE_KILL cnt, USE_KILL base, TEMP tmp, KILL zero, KILL cr);
 
   format %{ $$template
-    $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
     if (UseFastStosb) {
+       $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
        $$emit$$"shlq    rcx,3\t# Convert doublewords to bytes\n\t"
        $$emit$$"rep     stosb\t# Store rax to *rdi++ while rcx--"
+    } else if (UseXMMForObjInit) {
+       $$emit$$"mov     rdi,rax\t# ClearArray:\n\t"
+       $$emit$$"vpxor   ymm0,ymm0,ymm0\n\t"
+       $$emit$$"jmpq    L_zero_64_bytes\n\t"
+       $$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
+       $$emit$$"vmovdqu ymm0,(rax)\n\t"
+       $$emit$$"vmovdqu ymm0,0x20(rax)\n\t"
+       $$emit$$"add     0x40,rax\n\t"
+       $$emit$$"# L_zero_64_bytes:\n\t"
+       $$emit$$"sub     0x8,rcx\n\t"
+       $$emit$$"jge     L_loop\n\t"
+       $$emit$$"add     0x4,rcx\n\t"
+       $$emit$$"jl      L_tail\n\t"
+       $$emit$$"vmovdqu ymm0,(rax)\n\t"
+       $$emit$$"add     0x20,rax\n\t"
+       $$emit$$"sub     0x4,rcx\n\t"
+       $$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
+       $$emit$$"add     0x4,rcx\n\t"
+       $$emit$$"jle     L_end\n\t"
+       $$emit$$"dec     rcx\n\t"
+       $$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
+       $$emit$$"vmovq   xmm0,(rax)\n\t"
+       $$emit$$"add     0x8,rax\n\t"
+       $$emit$$"dec     rcx\n\t"
+       $$emit$$"jge     L_sloop\n\t"
+       $$emit$$"# L_end:\n\t"
     } else {
+       $$emit$$"xorq    rax, rax\t# ClearArray:\n\t"
        $$emit$$"rep     stosq\t# Store rax to *rdi++ while rcx--"
     }
   %}
   ins_encode %{
-    __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, true);
+    __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, 
+                 $tmp$$XMMRegister, true);
   %}
   ins_pipe(pipe_slow);
 %}