8202326: AARCH64: optimize string compare intrinsic
author dpochepk
Mon, 25 Jun 2018 16:31:37 +0300
changeset 50756 7ad092f40454
parent 50755 680d04ae76e9
child 50757 866c9aa29ee4
8202326: AARCH64: optimize string compare intrinsic Reviewed-by: dsamersoff
src/hotspot/cpu/aarch64/aarch64.ad
src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp
src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp
src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp
src/hotspot/cpu/aarch64/stubRoutines_aarch64.cpp
src/hotspot/cpu/aarch64/stubRoutines_aarch64.hpp
--- a/src/hotspot/cpu/aarch64/aarch64.ad	Mon Jun 25 16:31:18 2018 +0300
+++ b/src/hotspot/cpu/aarch64/aarch64.ad	Mon Jun 25 16:31:37 2018 +0300
@@ -15852,70 +15852,76 @@
 %}
 
 instruct string_compareU(iRegP_R1 str1, iRegI_R2 cnt1, iRegP_R3 str2, iRegI_R4 cnt2,
-                        iRegI_R0 result, iRegP_R10 tmp1, rFlagsReg cr)
+                        iRegI_R0 result, iRegP_R10 tmp1, iRegL_R11 tmp2, rFlagsReg cr)
 %{
   predicate(((StrCompNode*)n)->encoding() == StrIntrinsicNode::UU);
   match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2)));
-  effect(KILL tmp1, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL cr);
+  effect(KILL tmp1, KILL tmp2, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL cr);
 
-  format %{ "String Compare $str1,$cnt1,$str2,$cnt2 -> $result   # KILL $tmp1" %}
+  format %{ "String Compare $str1,$cnt1,$str2,$cnt2 -> $result   # KILL $tmp1, $tmp2" %}
   ins_encode %{
     // Count is in 8-bit bytes; non-Compact chars are 16 bits.
     __ string_compare($str1$$Register, $str2$$Register,
                       $cnt1$$Register, $cnt2$$Register, $result$$Register,
-                      $tmp1$$Register,
-                      fnoreg, fnoreg, StrIntrinsicNode::UU);
+                      $tmp1$$Register, $tmp2$$Register, // both GPR temps are overwritten
+                      fnoreg, fnoreg, fnoreg, StrIntrinsicNode::UU); // same-encoding compare takes no vector temps
   %}
   ins_pipe(pipe_class_memory);
 %}
 
 instruct string_compareL(iRegP_R1 str1, iRegI_R2 cnt1, iRegP_R3 str2, iRegI_R4 cnt2,
-                        iRegI_R0 result, iRegP_R10 tmp1, rFlagsReg cr)
+                        iRegI_R0 result, iRegP_R10 tmp1, iRegL_R11 tmp2, rFlagsReg cr)
 %{
   predicate(((StrCompNode*)n)->encoding() == StrIntrinsicNode::LL);
   match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2)));
-  effect(KILL tmp1, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL cr);
+  effect(KILL tmp1, KILL tmp2, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL cr);
 
-  format %{ "String Compare $str1,$cnt1,$str2,$cnt2 -> $result   # KILL $tmp1" %}
+  format %{ "String Compare $str1,$cnt1,$str2,$cnt2 -> $result   # KILL $tmp1, $tmp2" %}
   ins_encode %{
     __ string_compare($str1$$Register, $str2$$Register,
                       $cnt1$$Register, $cnt2$$Register, $result$$Register,
-                      $tmp1$$Register,
-                      fnoreg, fnoreg, StrIntrinsicNode::LL);
+                      $tmp1$$Register, $tmp2$$Register, // both GPR temps are overwritten
+                      fnoreg, fnoreg, fnoreg, StrIntrinsicNode::LL); // same-encoding compare takes no vector temps
   %}
   ins_pipe(pipe_class_memory);
 %}
 
 instruct string_compareUL(iRegP_R1 str1, iRegI_R2 cnt1, iRegP_R3 str2, iRegI_R4 cnt2,
-                        iRegI_R0 result, vRegD vtmp1, vRegD vtmp2, iRegP_R10 tmp1, rFlagsReg cr)
+                        iRegI_R0 result, iRegP_R10 tmp1, iRegL_R11 tmp2,
+                        vRegD_V0 vtmp1, vRegD_V1 vtmp2, vRegD_V2 vtmp3, rFlagsReg cr)
 %{
   predicate(((StrCompNode*)n)->encoding() == StrIntrinsicNode::UL);
   match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2)));
-  effect(KILL tmp1, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, TEMP vtmp1, TEMP vtmp2, KILL cr);
-
-  format %{ "String Compare $str1,$cnt1,$str2,$cnt2 -> $result   # KILL $tmp1" %}
+  effect(KILL tmp1, KILL tmp2, KILL vtmp1, KILL vtmp2, KILL vtmp3,
+         USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL cr);
+
+  format %{ "String Compare $str1,$cnt1,$str2,$cnt2 -> $result   # KILL $tmp1, $tmp2, $vtmp1, $vtmp2, $vtmp3" %}
   ins_encode %{
     __ string_compare($str1$$Register, $str2$$Register,
                       $cnt1$$Register, $cnt2$$Register, $result$$Register,
-                      $tmp1$$Register,
-                      $vtmp1$$FloatRegister, $vtmp2$$FloatRegister, StrIntrinsicNode::UL);
+                      $tmp1$$Register, $tmp2$$Register,
+                      $vtmp1$$FloatRegister, $vtmp2$$FloatRegister, // used as the zero vector and the data vector
+                      $vtmp3$$FloatRegister, StrIntrinsicNode::UL); // third temp: the mixed-encoding stub works on v0, v1 and v2
   %}
   ins_pipe(pipe_class_memory);
 %}
 
 instruct string_compareLU(iRegP_R1 str1, iRegI_R2 cnt1, iRegP_R3 str2, iRegI_R4 cnt2,
-                        iRegI_R0 result, vRegD vtmp1, vRegD vtmp2, iRegP_R10 tmp1, rFlagsReg cr)
+                        iRegI_R0 result, iRegP_R10 tmp1, iRegL_R11 tmp2,
+                        vRegD_V0 vtmp1, vRegD_V1 vtmp2, vRegD_V2 vtmp3, rFlagsReg cr)
 %{
   predicate(((StrCompNode*)n)->encoding() == StrIntrinsicNode::LU);
   match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2)));
-  effect(KILL tmp1, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, TEMP vtmp1, TEMP vtmp2, KILL cr);
-
-  format %{ "String Compare $str1,$cnt1,$str2,$cnt2 -> $result   # KILL $tmp1" %}
+  effect(KILL tmp1, KILL tmp2, KILL vtmp1, KILL vtmp2, KILL vtmp3,
+         USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL cr);
+
+  format %{ "String Compare $str1,$cnt1,$str2,$cnt2 -> $result   # KILL $tmp1, $tmp2, $vtmp1, $vtmp2, $vtmp3" %}
   ins_encode %{
     __ string_compare($str1$$Register, $str2$$Register,
                       $cnt1$$Register, $cnt2$$Register, $result$$Register,
-                      $tmp1$$Register,
-                      $vtmp1$$FloatRegister, $vtmp2$$FloatRegister, StrIntrinsicNode::LU);
+                      $tmp1$$Register, $tmp2$$Register,
+                      $vtmp1$$FloatRegister, $vtmp2$$FloatRegister, // used as the zero vector and the data vector
+                      $vtmp3$$FloatRegister, StrIntrinsicNode::LU); // third temp: the mixed-encoding stub works on v0, v1 and v2
   %}
   ins_pipe(pipe_class_memory);
 %}
--- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp	Mon Jun 25 16:31:18 2018 +0300
+++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp	Mon Jun 25 16:31:37 2018 +0300
@@ -4733,12 +4733,13 @@
 
 // Compare strings.
 void MacroAssembler::string_compare(Register str1, Register str2,
-                                    Register cnt1, Register cnt2, Register result,
-                                    Register tmp1,
-                                    FloatRegister vtmp, FloatRegister vtmpZ, int ae) {
-  Label LENGTH_DIFF, DONE, SHORT_LOOP, SHORT_STRING,
-    NEXT_WORD, DIFFERENCE;
-
+    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
+    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
+  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
+      DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
+      SHORT_LOOP_START, TAIL_CHECK;
+
+  const int STUB_THRESHOLD = 64 + 8;
   bool isLL = ae == StrIntrinsicNode::LL;
   bool isLU = ae == StrIntrinsicNode::LU;
   bool isUL = ae == StrIntrinsicNode::UL;
@@ -4750,7 +4751,9 @@
   int str2_chr_shift = str2_isL ? 0 : 1;
   int str1_chr_size = str1_isL ? 1 : 2;
   int str2_chr_size = str2_isL ? 1 : 2;
-
+  int minCharsInWord = isLL ? wordSize : wordSize/2;
+
+  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                       (chr_insn)&MacroAssembler::ldrh;
   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
@@ -4766,73 +4769,116 @@
   if (!str2_isL) asrw(cnt2, cnt2, 1);
 
   // Compute the minimum of the string lengths and save the difference.
-  subsw(tmp1, cnt1, cnt2);
+  subsw(result, cnt1, cnt2);
   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
 
   // A very short string
-  cmpw(cnt2, isLL ? 8:4);
+  cmpw(cnt2, minCharsInWord);
   br(Assembler::LT, SHORT_STRING);
 
-  // Check if the strings start at the same location.
-  cmp(str1, str2);
-  br(Assembler::EQ, LENGTH_DIFF);
-
   // Compare longwords
+  // load first parts of strings and finish initialization while loading
   {
-    subw(cnt2, cnt2, isLL ? 8:4); // The last longword is a special case
-
-    // Move both string pointers to the last longword of their
-    // strings, negate the remaining count, and convert it to bytes.
-    lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
-    lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
-    if (isLU || isUL) {
-      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
+    if (str1_isL == str2_isL) { // LL or UU
+      ldr(tmp1, Address(str1));
+      cmp(str1, str2);
+      br(Assembler::EQ, DONE);
+      ldr(tmp2, Address(str2));
+      cmp(cnt2, STUB_THRESHOLD);
+      br(GE, STUB);
+      subsw(cnt2, cnt2, minCharsInWord);
+      br(EQ, TAIL_CHECK);
+      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
+      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
+      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
+    } else if (isLU) {
+      ldrs(vtmp, Address(str1));
+      cmp(str1, str2);
+      br(Assembler::EQ, DONE);
+      ldr(tmp2, Address(str2));
+      cmp(cnt2, STUB_THRESHOLD);
+      br(GE, STUB);
+      subsw(cnt2, cnt2, 4);
+      br(EQ, TAIL_CHECK);
       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
-    }
-    sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
-
-    // Loop, loading longwords and comparing them into rscratch2.
-    bind(NEXT_WORD);
-    if (isLU) {
-      ldrs(vtmp, Address(str1, cnt1));
+      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
+      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
       zip1(vtmp, T8B, vtmp, vtmpZ);
-      umov(result, vtmp, D, 0);
-    } else {
-      ldr(result, Address(str1, isUL ? cnt1:cnt2));
-    }
-    if (isUL) {
-      ldrs(vtmp, Address(str2, cnt2));
+      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
+      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
+      add(cnt1, cnt1, 4);
+      fmovd(tmp1, vtmp);
+    } else { // UL case
+      ldr(tmp1, Address(str1));
+      cmp(str1, str2);
+      br(Assembler::EQ, DONE);
+      ldrs(vtmp, Address(str2));
+      cmp(cnt2, STUB_THRESHOLD);
+      br(GE, STUB);
+      subsw(cnt2, cnt2, 4);
+      br(EQ, TAIL_CHECK);
+      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
+      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
+      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
+      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
       zip1(vtmp, T8B, vtmp, vtmpZ);
-      umov(rscratch1, vtmp, D, 0);
-    } else {
-      ldr(rscratch1, Address(str2, cnt2));
+      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
+      add(cnt1, cnt1, 8);
+      fmovd(tmp2, vtmp);
     }
-    adds(cnt2, cnt2, isUL ? 4:8);
-    if (isLU || isUL) add(cnt1, cnt1, isLU ? 4:8);
-    eor(rscratch2, result, rscratch1);
+    adds(cnt2, cnt2, isUL ? 4 : 8);
+    br(GE, TAIL);
+    eor(rscratch2, tmp1, tmp2);
     cbnz(rscratch2, DIFFERENCE);
-    br(Assembler::LT, NEXT_WORD);
-
+    // main loop
+    bind(NEXT_WORD);
+    if (str1_isL == str2_isL) {
+      ldr(tmp1, Address(str1, cnt2));
+      ldr(tmp2, Address(str2, cnt2));
+      adds(cnt2, cnt2, 8);
+    } else if (isLU) {
+      ldrs(vtmp, Address(str1, cnt1));
+      ldr(tmp2, Address(str2, cnt2));
+      add(cnt1, cnt1, 4);
+      zip1(vtmp, T8B, vtmp, vtmpZ);
+      fmovd(tmp1, vtmp);
+      adds(cnt2, cnt2, 8);
+    } else { // UL
+      ldrs(vtmp, Address(str2, cnt2));
+      ldr(tmp1, Address(str1, cnt1));
+      zip1(vtmp, T8B, vtmp, vtmpZ);
+      add(cnt1, cnt1, 8);
+      fmovd(tmp2, vtmp);
+      adds(cnt2, cnt2, 4);
+    }
+    br(GE, TAIL);
+
+    eor(rscratch2, tmp1, tmp2);
+    cbz(rscratch2, NEXT_WORD);
+    b(DIFFERENCE);
+    bind(TAIL);
+    eor(rscratch2, tmp1, tmp2);
+    cbnz(rscratch2, DIFFERENCE);
     // Last longword.  In the case where length == 4 we compare the
     // same longword twice, but that's still faster than another
     // conditional branch.
-
-    if (isLU) {
+    if (str1_isL == str2_isL) {
+      ldr(tmp1, Address(str1));
+      ldr(tmp2, Address(str2));
+    } else if (isLU) {
       ldrs(vtmp, Address(str1));
+      ldr(tmp2, Address(str2));
       zip1(vtmp, T8B, vtmp, vtmpZ);
-      umov(result, vtmp, D, 0);
-    } else {
-      ldr(result, Address(str1));
+      fmovd(tmp1, vtmp);
+    } else { // UL
+      ldrs(vtmp, Address(str2));
+      ldr(tmp1, Address(str1));
+      zip1(vtmp, T8B, vtmp, vtmpZ);
+      fmovd(tmp2, vtmp);
     }
-    if (isUL) {
-      ldrs(vtmp, Address(str2));
-      zip1(vtmp, T8B, vtmp, vtmpZ);
-      umov(rscratch1, vtmp, D, 0);
-    } else {
-      ldr(rscratch1, Address(str2));
-    }
-    eor(rscratch2, result, rscratch1);
-    cbz(rscratch2, LENGTH_DIFF);
+    bind(TAIL_CHECK);
+    eor(rscratch2, tmp1, tmp2);
+    cbz(rscratch2, DONE);
 
     // Find the first different characters in the longwords and
     // compute their difference.
@@ -4840,31 +4886,78 @@
     rev(rscratch2, rscratch2);
     clz(rscratch2, rscratch2);
     andr(rscratch2, rscratch2, isLL ? -8 : -16);
-    lsrv(result, result, rscratch2);
-    (this->*ext_chr)(result, result);
-    lsrv(rscratch1, rscratch1, rscratch2);
-    (this->*ext_chr)(rscratch1, rscratch1);
-    subw(result, result, rscratch1);
+    lsrv(tmp1, tmp1, rscratch2);
+    (this->*ext_chr)(tmp1, tmp1);
+    lsrv(tmp2, tmp2, rscratch2);
+    (this->*ext_chr)(tmp2, tmp2);
+    subw(result, tmp1, tmp2);
     b(DONE);
   }
 
+  bind(STUB);
+    RuntimeAddress stub = NULL;
+    switch(ae) {
+      case StrIntrinsicNode::LL:
+        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
+        break;
+      case StrIntrinsicNode::UU:
+        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
+        break;
+      case StrIntrinsicNode::LU:
+        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
+        break;
+      case StrIntrinsicNode::UL:
+        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
+        break;
+      default:
+        ShouldNotReachHere();
+     }
+    assert(stub.target() != NULL, "compare_long_string stub has not been generated");
+    trampoline_call(stub);
+    b(DONE);
+
   bind(SHORT_STRING);
   // Is the minimum length zero?
-  cbz(cnt2, LENGTH_DIFF);
-
+  cbz(cnt2, DONE);
+  // arrange code to do most branches while loading and loading next characters
+  // while comparing previous
+  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
+  subs(cnt2, cnt2, 1);
+  br(EQ, SHORT_LAST_INIT);
+  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
+  b(SHORT_LOOP_START);
   bind(SHORT_LOOP);
-  (this->*str1_load_chr)(result, Address(post(str1, str1_chr_size)));
+  subs(cnt2, cnt2, 1);
+  br(EQ, SHORT_LAST);
+  bind(SHORT_LOOP_START);
+  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
+  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
+  cmp(tmp1, cnt1);
+  br(NE, SHORT_LOOP_TAIL);
+  subs(cnt2, cnt2, 1);
+  br(EQ, SHORT_LAST2);
+  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
-  subw(result, result, cnt1);
-  cbnz(result, DONE);
-  sub(cnt2, cnt2, 1);
-  cbnz(cnt2, SHORT_LOOP);
-
-  // Strings are equal up to min length.  Return the length difference.
-  bind(LENGTH_DIFF);
-  mov(result, tmp1);
-
-  // That's it
+  cmp(tmp2, rscratch1);
+  br(EQ, SHORT_LOOP);
+  sub(result, tmp2, rscratch1);
+  b(DONE);
+  bind(SHORT_LOOP_TAIL);
+  sub(result, tmp1, cnt1);
+  b(DONE);
+  bind(SHORT_LAST2);
+  cmp(tmp2, rscratch1);
+  br(EQ, DONE);
+  sub(result, tmp2, rscratch1);
+
+  b(DONE);
+  bind(SHORT_LAST_INIT);
+  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
+  bind(SHORT_LAST);
+  cmp(tmp1, cnt1);
+  br(EQ, DONE);
+  sub(result, tmp1, cnt1);
+
   bind(DONE);
 
   BLOCK_COMMENT("} string_compare");
--- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp	Mon Jun 25 16:31:18 2018 +0300
+++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp	Mon Jun 25 16:31:37 2018 +0300
@@ -1212,8 +1212,8 @@
 
   void string_compare(Register str1, Register str2,
                       Register cnt1, Register cnt2, Register result,
-                      Register tmp1,
-                      FloatRegister vtmp, FloatRegister vtmpZ, int ae);
+                      Register tmp1, Register tmp2, FloatRegister vtmp1,
+                      FloatRegister vtmp2, FloatRegister vtmp3, int ae);
 
   void has_negatives(Register ary1, Register len, Register result);
 
--- a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp	Mon Jun 25 16:31:18 2018 +0300
+++ b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp	Mon Jun 25 16:31:37 2018 +0300
@@ -4014,6 +4014,317 @@
     return entry;
   }
 
+  // code for comparing 16 bytes of strings with same encoding; callers branch on a cmp issued before it, so it must not change the flags
+  void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) {
+    Register str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11;
+    __ ldr(rscratch1, Address(__ post(str1, 8))); // next word of str1
+    __ eor(rscratch2, tmp1, tmp2);                // check the pair loaded by the previous step
+    __ ldr(cnt1, Address(__ post(str2, 8)));      // next word of str2 (cnt1 is reused as scratch)
+    __ cbnz(rscratch2, DIFF1);                    // mismatch is in tmp1/tmp2
+    __ ldr(tmp1, Address(__ post(str1, 8)));      // pre-load the pair for the following step
+    __ eor(rscratch2, rscratch1, cnt1);
+    __ ldr(tmp2, Address(__ post(str2, 8)));
+    __ cbnz(rscratch2, DIFF2);                    // mismatch is in rscratch1/cnt1
+  }
+
+  // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
+  void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
+      Label &DIFF2) {
+    Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
+    FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
+    // tmp2: Latin1 load pointer, cnt1: UTF-16 load pointer, tmp3: UTF-16 word pre-loaded by the caller
+    __ ldrq(vtmp, Address(__ post(tmp2, 16)));
+    __ ldr(tmpU, Address(__ post(cnt1, 8)));
+    __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
+    // vtmp3 now holds the first 8 characters widened to U; the other 8 are widened by the zip2 below
+
+    __ fmovd(tmpL, vtmp3);
+    __ eor(rscratch2, tmp3, tmpL);
+    __ cbnz(rscratch2, DIFF2); // DIFF2: the U word of the mismatch is in tmp3
+
+    __ ldr(tmp3, Address(__ post(cnt1, 8)));
+    __ umov(tmpL, vtmp3, __ D, 1);
+    __ eor(rscratch2, tmpU, tmpL);
+    __ cbnz(rscratch2, DIFF1); // DIFF1: the U word of the mismatch is in tmpU
+
+    __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
+    __ ldr(tmpU, Address(__ post(cnt1, 8)));
+    __ fmovd(tmpL, vtmp);
+    __ eor(rscratch2, tmp3, tmpL);
+    __ cbnz(rscratch2, DIFF2);
+
+    __ ldr(tmp3, Address(__ post(cnt1, 8))); // pre-load for the next step (not compared here)
+    __ umov(tmpL, vtmp, __ D, 1);
+    __ eor(rscratch2, tmpU, tmpL);
+    __ cbnz(rscratch2, DIFF1);
+  }
+
+  // Long-string compare stub for strings with different encodings (one
+  // Latin1, one UTF-16); isLU is true when str1 is the Latin1 operand.
+  // MacroAssembler::string_compare branches here once the common length
+  // reaches its stub threshold, after loading the first word of each string.
+  //
+  // r0  = result (length difference on entry; kept if no character differs)
+  // r1  = str1
+  // r2  = cnt1 (reused as the UTF-16 load pointer)
+  // r3  = str2
+  // r4  = cnt2 (number of characters to compare)
+  // r10 = tmp1 (UL: first 4 characters of str1)
+  // r11 = tmp2 (LU: first 4 characters of str2)
+  // v1  = first 4 Latin1 characters, still one byte each
+  address generate_compare_long_string_different_encoding(bool isLU) {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", isLU
+        ? "compare_long_string_different_encoding LU"
+        : "compare_long_string_different_encoding UL");
+    address entry = __ pc();
+    Label SMALL_LOOP, TAIL, LOAD_LAST, DIFF1, DIFF2,
+        DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
+        LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
+    Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
+        tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
+    FloatRegister vtmpZ = v0, vtmp = v1;
+    RegSet spilled_regs = RegSet::of(tmp3, tmp4); // saved: not in the intrinsic's KILL list
+
+    // One pass of the large loop consumes 64 characters, so it must never be
+    // entered or repeated with fewer than 64 left, whatever the distance is.
+    int prefetchLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance/2);
+
+    __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
+    // cnt2 == amount of characters left to compare
+    // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL))
+    __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
+    __ add(str1, str1, isLU ? wordSize/2 : wordSize);
+    __ add(str2, str2, isLU ? wordSize : wordSize/2);
+    __ fmovd(isLU ? tmp1 : tmp2, vtmp);
+    __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
+    __ add(str1, str1, cnt2, __ LSL, isLU ? 0 : 1);
+    __ eor(rscratch2, tmp1, tmp2);
+    __ add(str2, str2, cnt2, __ LSL, isLU ? 1 : 0);
+    __ mov(rscratch1, tmp2);
+    __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
+    Register strU = isLU ? str2 : str1,
+             strL = isLU ? str1 : str2,
+             tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
+             tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
+    __ push(spilled_regs, sp);
+    __ sub(tmp2, strL, cnt2); // strL pointer to load from
+    __ sub(cnt1, strU, cnt2, __ LSL, 1); // strU pointer to load from
+    // tmp3 always holds the 4 UTF-16 characters that end at cnt1
+    __ ldr(tmp3, Address(__ post(cnt1, 8)));
+
+    if (SoftwarePrefetchHintDistance >= 0) {
+      __ cmp(cnt2, prefetchLoopExitCondition);
+      __ br(__ LT, NO_PREFETCH);
+      __ bind(LARGE_LOOP_PREFETCH);
+        __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
+        __ mov(tmp4, 2);
+        __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
+        __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
+          compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
+          __ subs(tmp4, tmp4, 1);
+          __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
+          __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
+          __ mov(tmp4, 2);
+        __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
+          compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
+          __ subs(tmp4, tmp4, 1);
+          __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
+          __ sub(cnt2, cnt2, 64);
+          __ cmp(cnt2, prefetchLoopExitCondition);
+          __ br(__ GE, LARGE_LOOP_PREFETCH);
+    }
+    __ cbz(cnt2, LOAD_LAST); // no characters left except last load
+
+    __ bind(NO_PREFETCH);
+    // From here on cnt2 runs 16 characters ahead of the data: every subs is
+    // issued before the 16-character step it guards, so a non-negative
+    // result means that one more full step is available afterwards.
+    // compare_string_16_x_LU leaves the flags alone, which is what lets the
+    // loop branch on the subs in front of it. No step is ever started with
+    // fewer than 16 characters left, so neither string is read beyond the
+    // range being compared.
+    __ subs(cnt2, cnt2, 16);
+    __ br(__ LT, TAIL);
+
+    __ bind(SMALL_LOOP); // smaller loop
+      __ subs(cnt2, cnt2, 16);
+      compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
+      __ br(__ GE, SMALL_LOOP);
+      // cnt2 == (characters left - 16) here, i.e. -16 when only the last 4
+      // characters remain
+      __ adds(rscratch2, cnt2, 16);
+      __ br(__ EQ, LOAD_LAST);
+
+    __ bind(TAIL); // 1..15 characters left; cnt2 == left - 16 (negative)
+      // Step both load pointers back so that exactly 16 characters remain
+      // in front of the last 4. Some characters already known to be equal
+      // are compared a second time; in exchange every load stays inside the
+      // strings. Stepping back cannot leave them either: the caller only
+      // uses this stub for 72 or more characters.
+      __ add(cnt1, cnt1, cnt2, __ LSL, 1); // UTF-16: 2 bytes per character
+      __ add(tmp2, tmp2, cnt2);            // Latin1: 1 byte per character
+      // tmp3 must hold the 4 UTF-16 characters that end at cnt1 before the
+      // helper runs, so reload it for the new position
+      __ ldr(tmp3, Address(cnt1, -8));
+      compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
+      __ b(LOAD_LAST);
+
+    // DIFF2: the mismatching UTF-16 word is in tmp3, DIFF1: it is in tmpU.
+    // In both cases the widened Latin1 word is in tmpL and rscratch2 holds
+    // the XOR of the two, as CALCULATE_DIFFERENCE expects.
+    __ bind(DIFF2);
+      __ mov(tmpU, tmp3);
+    __ bind(DIFF1);
+      __ pop(spilled_regs, sp);
+      __ b(CALCULATE_DIFFERENCE);
+    __ bind(LOAD_LAST);
+      __ pop(spilled_regs, sp);
+      // strL/strU were set up at entry to point at the last 4 characters
+      __ ldrs(vtmp, Address(strL));
+      __ ldr(tmpU, Address(strU));
+      __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
+      __ fmovd(tmpL, vtmp);
+
+      __ eor(rscratch2, tmpU, tmpL);
+      __ cbz(rscratch2, DONE);
+
+    // Find the first different characters in the longwords and
+    // compute their difference.
+    __ bind(CALCULATE_DIFFERENCE);
+      __ rev(rscratch2, rscratch2);
+      __ clz(rscratch2, rscratch2);
+      __ andr(rscratch2, rscratch2, -16);
+      __ lsrv(tmp1, tmp1, rscratch2);
+      __ uxthw(tmp1, tmp1);
+      __ lsrv(rscratch1, rscratch1, rscratch2);
+      __ uxthw(rscratch1, rscratch1);
+      __ subw(result, tmp1, rscratch1); // tmp1 carries str1's characters, rscratch1 str2's
+    __ bind(DONE);
+      __ ret(lr);
+    return entry;
+  }
+
+  // Long-string compare stub for same-encoding strings (isLL: Latin1, else UTF-16)
+  // r0  = result (length difference on entry; kept if no character differs)
+  // r1  = str1
+  // r2  = cnt1 (free to reuse as a scratch register)
+  // r3  = str2
+  // r4  = cnt2 (number of characters to compare)
+  // r10 = tmp1 (first 8 bytes of str1, loaded by the caller)
+  // r11 = tmp2 (first 8 bytes of str2, loaded by the caller)
+  address generate_compare_long_string_same_encoding(bool isLL) {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", isLL
+        ? "compare_long_string_same_encoding LL"
+        : "compare_long_string_same_encoding UU");
+    address entry = __ pc();
+    Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
+        tmp1 = r10, tmp2 = r11;
+    Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL,
+        LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF;
+    // exit from large loop when less than 64 bytes left to read or we're about
+    // to prefetch memory behind array border
+    int largeLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
+    // cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used
+    // update cnt2 counter with already loaded 8 bytes
+    __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
+    // update pointers, because of previous read
+    __ add(str1, str1, wordSize);
+    __ add(str2, str2, wordSize);
+    if (SoftwarePrefetchHintDistance >= 0) {
+      __ bind(LARGE_LOOP_PREFETCH);
+        __ prfm(Address(str1, SoftwarePrefetchHintDistance));
+        __ prfm(Address(str2, SoftwarePrefetchHintDistance));
+        compare_string_16_bytes_same(DIFF, DIFF2);
+        compare_string_16_bytes_same(DIFF, DIFF2);
+        __ sub(cnt2, cnt2, isLL ? 64 : 32);
+        compare_string_16_bytes_same(DIFF, DIFF2);
+        __ cmp(cnt2, largeLoopExitCondition);
+        compare_string_16_bytes_same(DIFF, DIFF2); // leaves the flags of the cmp above intact
+        __ br(__ GT, LARGE_LOOP_PREFETCH);
+        __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left?
+    }
+    // less than 16 bytes left? (must also run when the prefetch loop is not generated: SMALL_LOOP expects cnt2 biased by one step)
+    __ subs(cnt2, cnt2, isLL ? 16 : 8);
+    __ br(__ LT, TAIL);
+    __ bind(SMALL_LOOP);
+      compare_string_16_bytes_same(DIFF, DIFF2);
+      __ subs(cnt2, cnt2, isLL ? 16 : 8);
+      __ br(__ GE, SMALL_LOOP);
+    __ bind(TAIL);
+      __ adds(cnt2, cnt2, isLL ? 16 : 8); // undo the bias: cnt2 == characters not loaded yet
+      __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF);
+      __ subs(cnt2, cnt2, isLL ? 8 : 4);
+      __ br(__ LE, CHECK_LAST);
+      __ eor(rscratch2, tmp1, tmp2);
+      __ cbnz(rscratch2, DIFF);
+      __ ldr(tmp1, Address(__ post(str1, 8)));
+      __ ldr(tmp2, Address(__ post(str2, 8)));
+      __ sub(cnt2, cnt2, isLL ? 8 : 4);
+    __ bind(CHECK_LAST);
+      if (!isLL) {
+        __ add(cnt2, cnt2, cnt2); // now in bytes
+      }
+      __ eor(rscratch2, tmp1, tmp2);
+      __ cbnz(rscratch2, DIFF);
+      __ ldr(rscratch1, Address(str1, cnt2)); // cnt2 <= 0: the last 8 bytes, overlapping data already compared
+      __ ldr(cnt1, Address(str2, cnt2));
+      __ eor(rscratch2, rscratch1, cnt1);
+      __ cbz(rscratch2, LENGTH_DIFF);
+      // Find the first different characters in the longwords and
+      // compute their difference.
+    __ bind(DIFF2);
+      __ rev(rscratch2, rscratch2);
+      __ clz(rscratch2, rscratch2);
+      __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
+      __ lsrv(rscratch1, rscratch1, rscratch2);
+      if (isLL) {
+        __ lsrv(cnt1, cnt1, rscratch2);
+        __ uxtbw(rscratch1, rscratch1);
+        __ uxtbw(cnt1, cnt1);
+      } else {
+        __ lsrv(cnt1, cnt1, rscratch2);
+        __ uxthw(rscratch1, rscratch1);
+        __ uxthw(cnt1, cnt1);
+      }
+      __ subw(result, rscratch1, cnt1);
+      __ b(LENGTH_DIFF);
+    __ bind(DIFF);
+      __ rev(rscratch2, rscratch2);
+      __ clz(rscratch2, rscratch2);
+      __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
+      __ lsrv(tmp1, tmp1, rscratch2);
+      if (isLL) {
+        __ lsrv(tmp2, tmp2, rscratch2);
+        __ uxtbw(tmp1, tmp1);
+        __ uxtbw(tmp2, tmp2);
+      } else {
+        __ lsrv(tmp2, tmp2, rscratch2);
+        __ uxthw(tmp1, tmp1);
+        __ uxthw(tmp2, tmp2);
+      }
+      __ subw(result, tmp1, tmp2);
+      __ b(LENGTH_DIFF);
+    __ bind(LAST_CHECK_AND_LENGTH_DIFF);
+      __ eor(rscratch2, tmp1, tmp2);
+      __ cbnz(rscratch2, DIFF);
+    __ bind(LENGTH_DIFF); // result still holds the length difference unless a DIFF path overwrote it
+      __ ret(lr);
+    return entry;
+  }
+
+  void generate_compare_long_strings() { // emits the four long-string compare stubs and stores their entry points
+      StubRoutines::aarch64::_compare_long_string_LL
+          = generate_compare_long_string_same_encoding(true);   // isLL
+      StubRoutines::aarch64::_compare_long_string_UU
+          = generate_compare_long_string_same_encoding(false);  // !isLL
+      StubRoutines::aarch64::_compare_long_string_LU
+          = generate_compare_long_string_different_encoding(true);   // isLU
+      StubRoutines::aarch64::_compare_long_string_UL
+          = generate_compare_long_string_different_encoding(false);  // !isLU
+  }
+
   /**
    *  Arguments:
    *
@@ -5113,6 +5424,8 @@
       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
     }
 
+    generate_compare_long_strings();
+
     if (UseMultiplyToLenIntrinsic) {
       StubRoutines::_multiplyToLen = generate_multiplyToLen();
     }
--- a/src/hotspot/cpu/aarch64/stubRoutines_aarch64.cpp	Mon Jun 25 16:31:18 2018 +0300
+++ b/src/hotspot/cpu/aarch64/stubRoutines_aarch64.cpp	Mon Jun 25 16:31:37 2018 +0300
@@ -48,6 +48,10 @@
 address StubRoutines::aarch64::_has_negatives = NULL;
 address StubRoutines::aarch64::_has_negatives_long = NULL;
 address StubRoutines::aarch64::_large_array_equals = NULL;
+address StubRoutines::aarch64::_compare_long_string_LL = NULL;
+address StubRoutines::aarch64::_compare_long_string_UU = NULL;
+address StubRoutines::aarch64::_compare_long_string_LU = NULL;
+address StubRoutines::aarch64::_compare_long_string_UL = NULL;
 bool StubRoutines::aarch64::_completed = false;
 
 /**
--- a/src/hotspot/cpu/aarch64/stubRoutines_aarch64.hpp	Mon Jun 25 16:31:18 2018 +0300
+++ b/src/hotspot/cpu/aarch64/stubRoutines_aarch64.hpp	Mon Jun 25 16:31:37 2018 +0300
@@ -66,6 +66,10 @@
   static address _has_negatives;
   static address _has_negatives_long;
   static address _large_array_equals;
+  static address _compare_long_string_LL;
+  static address _compare_long_string_LU;
+  static address _compare_long_string_UL;
+  static address _compare_long_string_UU;
   static bool _completed;
 
  public:
@@ -136,6 +140,22 @@
       return _large_array_equals;
   }
 
+  static address compare_long_string_LL() {
+      return _compare_long_string_LL; // NULL until the stub generator has stored the entry point
+  }
+
+  static address compare_long_string_LU() {
+      return _compare_long_string_LU; // NULL until the stub generator has stored the entry point
+  }
+
+  static address compare_long_string_UL() {
+      return _compare_long_string_UL; // NULL until the stub generator has stored the entry point
+  }
+
+  static address compare_long_string_UU() {
+      return _compare_long_string_UU; // NULL until the stub generator has stored the entry point
+  }
+
   static bool complete() {
     return _completed;
   }