8187472: AARCH64: array_equals intrinsic doesn't use prefetch for large arrays
authordpochepk
Mon, 09 Apr 2018 18:43:40 +0300
changeset 49724 bf7f42f2f025
parent 49723 06ef6db47ec7
child 49725 e740e1a38c96
8187472: AARCH64: array_equals intrinsic doesn't use prefetch for large arrays Reviewed-by: dsamersoff
src/hotspot/cpu/aarch64/aarch64.ad
src/hotspot/cpu/aarch64/globals_aarch64.hpp
src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp
src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp
src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp
src/hotspot/cpu/aarch64/stubRoutines_aarch64.cpp
src/hotspot/cpu/aarch64/stubRoutines_aarch64.hpp
src/hotspot/cpu/aarch64/vm_version_aarch64.cpp
--- a/src/hotspot/cpu/aarch64/aarch64.ad	Mon Apr 09 18:40:20 2018 +0300
+++ b/src/hotspot/cpu/aarch64/aarch64.ad	Mon Apr 09 18:43:40 2018 +0300
@@ -16167,9 +16167,8 @@
   format %{ "String Equals $str1,$str2,$cnt -> $result" %}
   ins_encode %{
     // Count is in 8-bit bytes; non-Compact chars are 16 bits.
-    __ arrays_equals($str1$$Register, $str2$$Register,
-                     $result$$Register, $cnt$$Register,
-                     1, /*is_string*/true);
+    __ string_equals($str1$$Register, $str2$$Register,
+                     $result$$Register, $cnt$$Register, 1);
   %}
   ins_pipe(pipe_class_memory);
 %}
@@ -16184,42 +16183,42 @@
   format %{ "String Equals $str1,$str2,$cnt -> $result" %}
   ins_encode %{
     // Count is in 8-bit bytes; non-Compact chars are 16 bits.
-    __ asrw($cnt$$Register, $cnt$$Register, 1);
-    __ arrays_equals($str1$$Register, $str2$$Register,
-                     $result$$Register, $cnt$$Register,
-                     2, /*is_string*/true);
+    __ string_equals($str1$$Register, $str2$$Register,
+                     $result$$Register, $cnt$$Register, 2);
   %}
   ins_pipe(pipe_class_memory);
 %}
 
 instruct array_equalsB(iRegP_R1 ary1, iRegP_R2 ary2, iRegI_R0 result,
-                      iRegP_R10 tmp, rFlagsReg cr)
+                       iRegP_R3 tmp1, iRegP_R4 tmp2, iRegP_R5 tmp3,
+                       iRegP_R10 tmp, rFlagsReg cr)
 %{
   predicate(((AryEqNode*)n)->encoding() == StrIntrinsicNode::LL);
   match(Set result (AryEq ary1 ary2));
-  effect(KILL tmp, USE_KILL ary1, USE_KILL ary2, KILL cr);
+  effect(KILL tmp, USE_KILL ary1, USE_KILL ary2, TEMP tmp1, TEMP tmp2, TEMP tmp3, KILL cr);
 
   format %{ "Array Equals $ary1,ary2 -> $result    // KILL $tmp" %}
   ins_encode %{
     __ arrays_equals($ary1$$Register, $ary2$$Register,
-                     $result$$Register, $tmp$$Register,
-                     1, /*is_string*/false);
+                     $tmp1$$Register, $tmp2$$Register, $tmp3$$Register,
+                     $result$$Register, $tmp$$Register, 1);
     %}
   ins_pipe(pipe_class_memory);
 %}
 
 instruct array_equalsC(iRegP_R1 ary1, iRegP_R2 ary2, iRegI_R0 result,
-                      iRegP_R10 tmp, rFlagsReg cr)
+                       iRegP_R3 tmp1, iRegP_R4 tmp2, iRegP_R5 tmp3,
+                       iRegP_R10 tmp, rFlagsReg cr)
 %{
   predicate(((AryEqNode*)n)->encoding() == StrIntrinsicNode::UU);
   match(Set result (AryEq ary1 ary2));
-  effect(KILL tmp, USE_KILL ary1, USE_KILL ary2, KILL cr);
+  effect(KILL tmp, USE_KILL ary1, USE_KILL ary2, TEMP tmp1, TEMP tmp2, TEMP tmp3, KILL cr);
 
   format %{ "Array Equals $ary1,ary2 -> $result    // KILL $tmp" %}
   ins_encode %{
     __ arrays_equals($ary1$$Register, $ary2$$Register,
-                     $result$$Register, $tmp$$Register,
-                     2, /*is_string*/false);
+                     $tmp1$$Register, $tmp2$$Register, $tmp3$$Register,
+                     $result$$Register, $tmp$$Register, 2);
   %}
   ins_pipe(pipe_class_memory);
 %}
--- a/src/hotspot/cpu/aarch64/globals_aarch64.hpp	Mon Apr 09 18:40:20 2018 +0300
+++ b/src/hotspot/cpu/aarch64/globals_aarch64.hpp	Mon Apr 09 18:43:40 2018 +0300
@@ -147,6 +147,10 @@
           "Use CRC32 instructions for CRC32 computation")               \
   product(bool, UseSIMDForMemoryOps, false,                             \
           "Use SIMD instructions in generated memory move code")        \
+  product(bool, UseSIMDForArrayEquals, true,                            \
+          "Use SIMD instructions in generated array equals code")       \
+  product(bool, UseSimpleArrayEquals, false,                            \
+          "Use simpliest and shortest implementation for array equals") \
   product(bool, AvoidUnalignedAccesses, false,                          \
           "Avoid generating unaligned memory accesses")                 \
   product(bool, UseLSE, false,                                          \
--- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp	Mon Apr 09 18:40:20 2018 +0300
+++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp	Mon Apr 09 18:43:40 2018 +0300
@@ -5182,28 +5182,11 @@
   BIND(DONE);
 }
 
-// Compare Strings or char/byte arrays.
-
-// is_string is true iff this is a string comparison.
-
-// For Strings we're passed the address of the first characters in a1
-// and a2 and the length in cnt1.
-
-// For byte and char arrays we're passed the arrays themselves and we
-// have to extract length fields and do null checks here.
-
-// elem_size is the element size in bytes: either 1 or 2.
-
-// There are two implementations.  For arrays >= 8 bytes, all
-// comparisons (including the final one, which may overlap) are
-// performed 8 bytes at a time.  For arrays < 8 bytes, we compare a
-// halfword, then a short, and then a byte.
-
-void MacroAssembler::arrays_equals(Register a1, Register a2,
-                                   Register result, Register cnt1,
-                                   int elem_size, bool is_string)
+void MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
+                                   Register tmp4, Register tmp5, Register result,
+                                   Register cnt1, int elem_size)
 {
-  Label SAME, DONE, SHORT, NEXT_WORD, ONE;
+  Label DONE;
   Register tmp1 = rscratch1;
   Register tmp2 = rscratch2;
   Register cnt2 = tmp2;  // cnt2 only used in array length compare
@@ -5212,6 +5195,7 @@
   int length_offset = arrayOopDesc::length_offset_in_bytes();
   int base_offset
     = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
+  int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 0 : 16);
 
   assert(elem_size == 1 || elem_size == 2, "must be char or byte");
   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
@@ -5220,43 +5204,229 @@
   {
     const char kind = (elem_size == 2) ? 'U' : 'L';
     char comment[64];
-    snprintf(comment, sizeof comment, "%s%c%s {",
-             is_string ? "string_equals" : "array_equals",
-             kind, "{");
+    snprintf(comment, sizeof comment, "array_equals%c{", kind);
+    BLOCK_COMMENT(comment);
+  }
+#endif
+  if (UseSimpleArrayEquals) {
+    Label NEXT_WORD, SHORT, SAME, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL;
+    // if (a1==a2)
+    //     return true;
+    // if (a==null || a2==null)
+    //     return false;
+    // a1 & a2 == 0 means (some-pointer is null) or
+    // (very-rare-or-even-probably-impossible-pointer-values)
+    // so, we can save one branch in most cases
+    eor(rscratch1, a1, a2);
+    tst(a1, a2);
+    mov(result, false);
+    cbz(rscratch1, SAME);
+    br(EQ, A_MIGHT_BE_NULL);
+    // if (a1.length != a2.length)
+    //      return false;
+    bind(A_IS_NOT_NULL);
+    ldrw(cnt1, Address(a1, length_offset));
+    ldrw(cnt2, Address(a2, length_offset));
+    eorw(tmp5, cnt1, cnt2);
+    cbnzw(tmp5, DONE);
+    lea(a1, Address(a1, base_offset));
+    lea(a2, Address(a2, base_offset));
+    // Check for short strings, i.e. smaller than wordSize.
+    subs(cnt1, cnt1, elem_per_word);
+    br(Assembler::LT, SHORT);
+    // Main 8 byte comparison loop.
+    bind(NEXT_WORD); {
+      ldr(tmp1, Address(post(a1, wordSize)));
+      ldr(tmp2, Address(post(a2, wordSize)));
+      subs(cnt1, cnt1, elem_per_word);
+      eor(tmp5, tmp1, tmp2);
+      cbnz(tmp5, DONE);
+    } br(GT, NEXT_WORD);
+    // Last longword.  In the case where length == 4 we compare the
+    // same longword twice, but that's still faster than another
+    // conditional branch.
+    // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
+    // length == 4.
+    if (log_elem_size > 0)
+      lsl(cnt1, cnt1, log_elem_size);
+    ldr(tmp3, Address(a1, cnt1));
+    ldr(tmp4, Address(a2, cnt1));
+    eor(tmp5, tmp3, tmp4);
+    cbnz(tmp5, DONE);
+    b(SAME);
+    bind(A_MIGHT_BE_NULL);
+    // in case both a1 and a2 are not-null, proceed with loads
+    cbz(a1, DONE);
+    cbz(a2, DONE);
+    b(A_IS_NOT_NULL);
+    bind(SHORT);
+
+    tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
+    {
+      ldrw(tmp1, Address(post(a1, 4)));
+      ldrw(tmp2, Address(post(a2, 4)));
+      eorw(tmp5, tmp1, tmp2);
+      cbnzw(tmp5, DONE);
+    }
+    bind(TAIL03);
+    tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
+    {
+      ldrh(tmp3, Address(post(a1, 2)));
+      ldrh(tmp4, Address(post(a2, 2)));
+      eorw(tmp5, tmp3, tmp4);
+      cbnzw(tmp5, DONE);
+    }
+    bind(TAIL01);
+    if (elem_size == 1) { // Only needed when comparing byte arrays.
+      tbz(cnt1, 0, SAME); // 0-1 bytes left.
+      {
+        ldrb(tmp1, a1);
+        ldrb(tmp2, a2);
+        eorw(tmp5, tmp1, tmp2);
+        cbnzw(tmp5, DONE);
+      }
+    }
+    bind(SAME);
+    mov(result, true);
+  } else {
+    Label NEXT_DWORD, A_IS_NULL, SHORT, TAIL, TAIL2, STUB, EARLY_OUT,
+        CSET_EQ, LAST_CHECK, LEN_IS_ZERO, SAME;
+    cbz(a1, A_IS_NULL);
+    ldrw(cnt1, Address(a1, length_offset));
+    cbz(a2, A_IS_NULL);
+    ldrw(cnt2, Address(a2, length_offset));
+    mov(result, false);
+    // on most CPUs a2 is still "locked"(surprisingly) in ldrw and it's
+    // faster to perform another branch before comparing a1 and a2
+    cmp(cnt1, elem_per_word);
+    br(LE, SHORT); // short or same
+    cmp(a1, a2);
+    br(EQ, SAME);
+    ldr(tmp3, Address(pre(a1, base_offset)));
+    cmp(cnt1, stubBytesThreshold);
+    br(GE, STUB);
+    ldr(tmp4, Address(pre(a2, base_offset)));
+    sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
+    cmp(cnt2, cnt1);
+    br(NE, DONE);
+
+    // Main 16 byte comparison loop with 2 exits
+    bind(NEXT_DWORD); {
+      ldr(tmp1, Address(pre(a1, wordSize)));
+      ldr(tmp2, Address(pre(a2, wordSize)));
+      subs(cnt1, cnt1, 2 * elem_per_word);
+      br(LE, TAIL);
+      eor(tmp4, tmp3, tmp4);
+      cbnz(tmp4, DONE);
+      ldr(tmp3, Address(pre(a1, wordSize)));
+      ldr(tmp4, Address(pre(a2, wordSize)));
+      cmp(cnt1, elem_per_word);
+      br(LE, TAIL2);
+      cmp(tmp1, tmp2);
+    } br(EQ, NEXT_DWORD);
+    b(DONE);
+
+    bind(TAIL);
+    eor(tmp4, tmp3, tmp4);
+    eor(tmp2, tmp1, tmp2);
+    lslv(tmp2, tmp2, tmp5);
+    orr(tmp5, tmp4, tmp2);
+    cmp(tmp5, zr);
+    b(CSET_EQ);
+
+    bind(TAIL2);
+    eor(tmp2, tmp1, tmp2);
+    cbnz(tmp2, DONE);
+    b(LAST_CHECK);
+
+    bind(STUB);
+    ldr(tmp4, Address(pre(a2, base_offset)));
+    cmp(cnt2, cnt1);
+    br(NE, DONE);
+    if (elem_size == 2) { // convert to byte counter
+      lsl(cnt1, cnt1, 1);
+    }
+    eor(tmp5, tmp3, tmp4);
+    cbnz(tmp5, DONE);
+    RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals());
+    assert(stub.target() != NULL, "array_equals_long stub has not been generated");
+    trampoline_call(stub);
+    b(DONE);
+
+    bind(SAME);
+    mov(result, true);
+    b(DONE);
+    bind(A_IS_NULL);
+    // a1 or a2 is null. if a2 == a2 then return true. else return false
+    cmp(a1, a2);
+    b(CSET_EQ);
+    bind(EARLY_OUT);
+    // (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2)
+    // so, if a2 == null => return false(0), else return true, so we can return a2
+    mov(result, a2);
+    b(DONE);
+    bind(LEN_IS_ZERO);
+    cmp(cnt2, zr);
+    b(CSET_EQ);
+    bind(SHORT);
+    cbz(cnt1, LEN_IS_ZERO);
+    sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
+    ldr(tmp3, Address(a1, base_offset));
+    ldr(tmp4, Address(a2, base_offset));
+    bind(LAST_CHECK);
+    eor(tmp4, tmp3, tmp4);
+    lslv(tmp5, tmp4, tmp5);
+    cmp(tmp5, zr);
+    bind(CSET_EQ);
+    cset(result, EQ);
+  }
+
+  // That's it.
+  bind(DONE);
+
+  BLOCK_COMMENT("} array_equals");
+}
+
+// Compare Strings
+
+// For Strings we're passed the address of the first characters in a1
+// and a2 and the length in cnt1.
+// elem_size is the element size in bytes: either 1 or 2.
+// There are two implementations.  For arrays >= 8 bytes, all
+// comparisons (including the final one, which may overlap) are
+// performed 8 bytes at a time.  For strings < 8 bytes, we compare a
+// halfword, then a short, and then a byte.
+
+void MacroAssembler::string_equals(Register a1, Register a2,
+                                   Register result, Register cnt1, int elem_size)
+{
+  Label SAME, DONE, SHORT, NEXT_WORD;
+  Register tmp1 = rscratch1;
+  Register tmp2 = rscratch2;
+  Register cnt2 = tmp2;  // cnt2 only used in array length compare
+
+  assert(elem_size == 1 || elem_size == 2, "must be 2 or 1 byte");
+  assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
+
+#ifndef PRODUCT
+  {
+    const char kind = (elem_size == 2) ? 'U' : 'L';
+    char comment[64];
+    snprintf(comment, sizeof comment, "{string_equals%c", kind);
     BLOCK_COMMENT(comment);
   }
 #endif
 
   mov(result, false);
 
-  if (!is_string) {
-    // if (a==a2)
-    //     return true;
-    eor(rscratch1, a1, a2);
-    cbz(rscratch1, SAME);
-    // if (a==null || a2==null)
-    //     return false;
-    cbz(a1, DONE);
-    cbz(a2, DONE);
-    // if (a1.length != a2.length)
-    //      return false;
-    ldrw(cnt1, Address(a1, length_offset));
-    ldrw(cnt2, Address(a2, length_offset));
-    eorw(tmp1, cnt1, cnt2);
-    cbnzw(tmp1, DONE);
-
-    lea(a1, Address(a1, base_offset));
-    lea(a2, Address(a2, base_offset));
-  }
-
   // Check for short strings, i.e. smaller than wordSize.
-  subs(cnt1, cnt1, elem_per_word);
+  subs(cnt1, cnt1, wordSize);
   br(Assembler::LT, SHORT);
   // Main 8 byte comparison loop.
   bind(NEXT_WORD); {
     ldr(tmp1, Address(post(a1, wordSize)));
     ldr(tmp2, Address(post(a2, wordSize)));
-    subs(cnt1, cnt1, elem_per_word);
+    subs(cnt1, cnt1, wordSize);
     eor(tmp1, tmp1, tmp2);
     cbnz(tmp1, DONE);
   } br(GT, NEXT_WORD);
@@ -5265,18 +5435,16 @@
   // conditional branch.
   // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
   // length == 4.
-  if (log_elem_size > 0)
-    lsl(cnt1, cnt1, log_elem_size);
   ldr(tmp1, Address(a1, cnt1));
   ldr(tmp2, Address(a2, cnt1));
-  eor(tmp1, tmp1, tmp2);
-  cbnz(tmp1, DONE);
+  eor(tmp2, tmp1, tmp2);
+  cbnz(tmp2, DONE);
   b(SAME);
 
   bind(SHORT);
   Label TAIL03, TAIL01;
 
-  tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
+  tbz(cnt1, 2, TAIL03); // 0-7 bytes left.
   {
     ldrw(tmp1, Address(post(a1, 4)));
     ldrw(tmp2, Address(post(a2, 4)));
@@ -5284,7 +5452,7 @@
     cbnzw(tmp1, DONE);
   }
   bind(TAIL03);
-  tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
+  tbz(cnt1, 1, TAIL01); // 0-3 bytes left.
   {
     ldrh(tmp1, Address(post(a1, 2)));
     ldrh(tmp2, Address(post(a2, 2)));
@@ -5292,7 +5460,7 @@
     cbnzw(tmp1, DONE);
   }
   bind(TAIL01);
-  if (elem_size == 1) { // Only needed when comparing byte arrays.
+  if (elem_size == 1) { // Only needed when comparing 1-byte elements
     tbz(cnt1, 0, SAME); // 0-1 bytes left.
     {
       ldrb(tmp1, a1);
@@ -5307,7 +5475,7 @@
 
   // That's it.
   bind(DONE);
-  BLOCK_COMMENT(is_string ? "} string_equals" : "} array_equals");
+  BLOCK_COMMENT("} string_equals");
 }
 
 
--- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp	Mon Apr 09 18:40:20 2018 +0300
+++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp	Mon Apr 09 18:43:40 2018 +0300
@@ -1225,9 +1225,11 @@
 
   void has_negatives(Register ary1, Register len, Register result);
 
-  void arrays_equals(Register a1, Register a2,
-                     Register result, Register cnt1,
-                     int elem_size, bool is_string);
+  void arrays_equals(Register a1, Register a2, Register result, Register cnt1,
+                     Register tmp1, Register tmp2, Register tmp3, int elem_size);
+
+  void string_equals(Register a1, Register a2, Register result, Register cnt1,
+                     int elem_size);
 
   void fill_words(Register base, Register cnt, Register value);
   void zero_words(Register base, u_int64_t cnt);
--- a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp	Mon Apr 09 18:40:20 2018 +0300
+++ b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp	Mon Apr 09 18:43:40 2018 +0300
@@ -3813,6 +3813,182 @@
     __ ret(lr);
     return entry;
   }
+
+  void generate_large_array_equals_loop_nonsimd(int loopThreshold,
+        bool usePrefetch, Label &NOT_EQUAL) {
+    Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
+        tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
+        tmp7 = r12, tmp8 = r13;
+    Label LOOP;
+
+    __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
+    __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
+    __ bind(LOOP);
+    if (usePrefetch) {
+      __ prfm(Address(a1, SoftwarePrefetchHintDistance));
+      __ prfm(Address(a2, SoftwarePrefetchHintDistance));
+    }
+    __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
+    __ eor(tmp1, tmp1, tmp2);
+    __ eor(tmp3, tmp3, tmp4);
+    __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
+    __ orr(tmp1, tmp1, tmp3);
+    __ cbnz(tmp1, NOT_EQUAL);
+    __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
+    __ eor(tmp5, tmp5, tmp6);
+    __ eor(tmp7, tmp7, tmp8);
+    __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
+    __ orr(tmp5, tmp5, tmp7);
+    __ cbnz(tmp5, NOT_EQUAL);
+    __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
+    __ eor(tmp1, tmp1, tmp2);
+    __ eor(tmp3, tmp3, tmp4);
+    __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
+    __ orr(tmp1, tmp1, tmp3);
+    __ cbnz(tmp1, NOT_EQUAL);
+    __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
+    __ eor(tmp5, tmp5, tmp6);
+    __ sub(cnt1, cnt1, 8 * wordSize);
+    __ eor(tmp7, tmp7, tmp8);
+    __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
+    __ cmp(cnt1, loopThreshold);
+    __ orr(tmp5, tmp5, tmp7);
+    __ cbnz(tmp5, NOT_EQUAL);
+    __ br(__ GE, LOOP);
+    // post-loop
+    __ eor(tmp1, tmp1, tmp2);
+    __ eor(tmp3, tmp3, tmp4);
+    __ orr(tmp1, tmp1, tmp3);
+    __ sub(cnt1, cnt1, 2 * wordSize);
+    __ cbnz(tmp1, NOT_EQUAL);
+  }
+
+  void generate_large_array_equals_loop_simd(int loopThreshold,
+        bool usePrefetch, Label &NOT_EQUAL) {
+    Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
+        tmp2 = rscratch2;
+    Label LOOP;
+
+    __ bind(LOOP);
+    if (usePrefetch) {
+      __ prfm(Address(a1, SoftwarePrefetchHintDistance));
+      __ prfm(Address(a2, SoftwarePrefetchHintDistance));
+    }
+    __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
+    __ sub(cnt1, cnt1, 8 * wordSize);
+    __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
+    __ cmp(cnt1, loopThreshold);
+    __ eor(v0, __ T16B, v0, v4);
+    __ eor(v1, __ T16B, v1, v5);
+    __ eor(v2, __ T16B, v2, v6);
+    __ eor(v3, __ T16B, v3, v7);
+    __ orr(v0, __ T16B, v0, v1);
+    __ orr(v1, __ T16B, v2, v3);
+    __ orr(v0, __ T16B, v0, v1);
+    __ umov(tmp1, v0, __ D, 0);
+    __ umov(tmp2, v0, __ D, 1);
+    __ orr(tmp1, tmp1, tmp2);
+    __ cbnz(tmp1, NOT_EQUAL);
+    __ br(__ GE, LOOP);
+  }
+
+  // a1 = r1 - array1 address
+  // a2 = r2 - array2 address
+  // result = r0 - return value. Already contains "false"
+  // cnt1 = r10 - amount of elements left to check, reduced by wordSize
+  // r3-r5 are reserved temporary registers
+  address generate_large_array_equals() {
+    StubCodeMark mark(this, "StubRoutines", "large_array_equals");
+    Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
+        tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
+        tmp7 = r12, tmp8 = r13;
+    Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
+        SMALL_LOOP, POST_LOOP;
+    const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
+    // calculate if at least 32 prefetched bytes are used
+    int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
+    int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
+    RegSet spilled_regs = RegSet::range(tmp6, tmp8);
+    assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
+        tmp5, tmp6, tmp7, tmp8);
+
+    __ align(CodeEntryAlignment);
+    address entry = __ pc();
+    __ enter();
+    __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
+    // also advance pointers to use post-increment instead of pre-increment
+    __ add(a1, a1, wordSize);
+    __ add(a2, a2, wordSize);
+    if (AvoidUnalignedAccesses) {
+      // both implementations (SIMD/nonSIMD) are using relatively large load
+      // instructions (ld1/ldp), which has huge penalty (up to x2 exec time)
+      // on some CPUs in case of address is not at least 16-byte aligned.
+      // Arrays are 8-byte aligned currently, so, we can make additional 8-byte
+      // load if needed at least for 1st address and make if 16-byte aligned.
+      Label ALIGNED16;
+      __ tbz(a1, 3, ALIGNED16);
+      __ ldr(tmp1, Address(__ post(a1, wordSize)));
+      __ ldr(tmp2, Address(__ post(a2, wordSize)));
+      __ sub(cnt1, cnt1, wordSize);
+      __ eor(tmp1, tmp1, tmp2);
+      __ cbnz(tmp1, NOT_EQUAL_NO_POP);
+      __ bind(ALIGNED16);
+    }
+    if (UseSIMDForArrayEquals) {
+      if (SoftwarePrefetchHintDistance >= 0) {
+        __ cmp(cnt1, prefetchLoopThreshold);
+        __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
+        generate_large_array_equals_loop_simd(prefetchLoopThreshold,
+            /* prfm = */ true, NOT_EQUAL);
+        __ cmp(cnt1, nonPrefetchLoopThreshold);
+        __ br(__ LT, TAIL);
+      }
+      __ bind(NO_PREFETCH_LARGE_LOOP);
+      generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
+          /* prfm = */ false, NOT_EQUAL);
+    } else {
+      __ push(spilled_regs, sp);
+      if (SoftwarePrefetchHintDistance >= 0) {
+        __ cmp(cnt1, prefetchLoopThreshold);
+        __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
+        generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
+            /* prfm = */ true, NOT_EQUAL);
+        __ cmp(cnt1, nonPrefetchLoopThreshold);
+        __ br(__ LT, TAIL);
+      }
+      __ bind(NO_PREFETCH_LARGE_LOOP);
+      generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
+          /* prfm = */ false, NOT_EQUAL);
+    }
+    __ bind(TAIL);
+      __ cbz(cnt1, EQUAL);
+      __ subs(cnt1, cnt1, wordSize);
+      __ br(__ LE, POST_LOOP);
+    __ bind(SMALL_LOOP);
+      __ ldr(tmp1, Address(__ post(a1, wordSize)));
+      __ ldr(tmp2, Address(__ post(a2, wordSize)));
+      __ subs(cnt1, cnt1, wordSize);
+      __ eor(tmp1, tmp1, tmp2);
+      __ cbnz(tmp1, NOT_EQUAL);
+      __ br(__ GT, SMALL_LOOP);
+    __ bind(POST_LOOP);
+      __ ldr(tmp1, Address(a1, cnt1));
+      __ ldr(tmp2, Address(a2, cnt1));
+      __ eor(tmp1, tmp1, tmp2);
+      __ cbnz(tmp1, NOT_EQUAL);
+    __ bind(EQUAL);
+      __ mov(result, true);
+    __ bind(NOT_EQUAL);
+      if (!UseSIMDForArrayEquals) {
+        __ pop(spilled_regs, sp);
+      }
+    __ bind(NOT_EQUAL_NO_POP);
+    __ leave();
+    __ ret(lr);
+    return entry;
+  }
+
+
   /**
    *  Arguments:
    *
@@ -4895,6 +5071,11 @@
     // has negatives stub for large arrays.
     StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
 
+    // array equals stub for large arrays.
+    if (!UseSimpleArrayEquals) {
+      StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
+    }
+
     if (UseMultiplyToLenIntrinsic) {
       StubRoutines::_multiplyToLen = generate_multiplyToLen();
     }
--- a/src/hotspot/cpu/aarch64/stubRoutines_aarch64.cpp	Mon Apr 09 18:40:20 2018 +0300
+++ b/src/hotspot/cpu/aarch64/stubRoutines_aarch64.cpp	Mon Apr 09 18:43:40 2018 +0300
@@ -46,6 +46,7 @@
 address StubRoutines::aarch64::_zero_blocks = NULL;
 address StubRoutines::aarch64::_has_negatives = NULL;
 address StubRoutines::aarch64::_has_negatives_long = NULL;
+address StubRoutines::aarch64::_large_array_equals = NULL;
 bool StubRoutines::aarch64::_completed = false;
 
 /**
--- a/src/hotspot/cpu/aarch64/stubRoutines_aarch64.hpp	Mon Apr 09 18:40:20 2018 +0300
+++ b/src/hotspot/cpu/aarch64/stubRoutines_aarch64.hpp	Mon Apr 09 18:43:40 2018 +0300
@@ -65,6 +65,7 @@
 
   static address _has_negatives;
   static address _has_negatives_long;
+  static address _large_array_equals;
   static bool _completed;
 
  public:
@@ -131,6 +132,10 @@
       return _has_negatives_long;
   }
 
+  static address large_array_equals() {
+      return _large_array_equals;
+  }
+
   static bool complete() {
     return _completed;
   }
--- a/src/hotspot/cpu/aarch64/vm_version_aarch64.cpp	Mon Apr 09 18:40:20 2018 +0300
+++ b/src/hotspot/cpu/aarch64/vm_version_aarch64.cpp	Mon Apr 09 18:43:40 2018 +0300
@@ -203,7 +203,11 @@
     if (FLAG_IS_DEFAULT(UseSIMDForMemoryOps)) {
       FLAG_SET_DEFAULT(UseSIMDForMemoryOps, (_variant > 0));
     }
+    if (FLAG_IS_DEFAULT(UseSIMDForArrayEquals)) {
+      FLAG_SET_DEFAULT(UseSIMDForArrayEquals, false);
+    }
   }
+
   // ThunderX2
   if ((_cpu == CPU_CAVIUM && (_model == 0xAF)) ||
       (_cpu == CPU_BROADCOM && (_model == 0x516))) {
@@ -218,7 +222,25 @@
     }
   }
 
-  if (_cpu == CPU_ARM && (_model == 0xd03 || _model2 == 0xd03)) _features |= CPU_A53MAC;
+  // Cortex A53
+  if (_cpu == CPU_ARM && (_model == 0xd03 || _model2 == 0xd03)) {
+    _features |= CPU_A53MAC;
+    if (FLAG_IS_DEFAULT(UseSIMDForArrayEquals)) {
+      FLAG_SET_DEFAULT(UseSIMDForArrayEquals, false);
+    }
+  }
+
+  // Cortex A73
+  if (_cpu == CPU_ARM && (_model == 0xd09 || _model2 == 0xd09)) {
+    if (FLAG_IS_DEFAULT(SoftwarePrefetchHintDistance)) {
+      FLAG_SET_DEFAULT(SoftwarePrefetchHintDistance, -1);
+    }
+    // A73 is faster with short-and-easy-for-speculative-execution-loop
+    if (FLAG_IS_DEFAULT(UseSimpleArrayEquals)) {
+      FLAG_SET_DEFAULT(UseSimpleArrayEquals, true);
+    }
+  }
+
   if (_cpu == CPU_ARM && (_model == 0xd07 || _model2 == 0xd07)) _features |= CPU_STXR_PREFETCH;
   // If an olde style /proc/cpuinfo (cpu_lines == 1) then if _model is an A57 (0xd07)
   // we assume the worst and assume we could be on a big little system and have