8189113: AARCH64: StringLatin1 inflate intrinsic doesn't use prefetch instruction
author dpochepk
Mon, 25 Jun 2018 16:32:23 +0300
changeset 50758 afca3c78ea0f
parent 50757 866c9aa29ee4
child 50759 00c4edaf2017
8189113: AARCH64: StringLatin1 inflate intrinsic doesn't use prefetch instruction Reviewed-by: aph
src/hotspot/cpu/aarch64/aarch64.ad
src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp
src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp
src/hotspot/cpu/aarch64/stubRoutines_aarch64.cpp
src/hotspot/cpu/aarch64/stubRoutines_aarch64.hpp
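
The change teaches the AArch64 StrInflatedCopy intrinsic (byte[] to char[] inflation) to use PRFM software prefetch: short and medium inputs stay inline in MacroAssembler::byte_array_inflate, while large inputs branch to a new large_byte_array_inflate stub. Semantically the intrinsic is still a plain widening copy; a minimal C++ sketch of what it computes (inflate_sketch is a hypothetical name, not JDK code):

    #include <cstddef>
    #include <cstdint>

    // Each Latin-1 byte is zero-extended to a 16-bit char. The intrinsic
    // does this 8 bytes per zip1, and the new stub 64 bytes per iteration
    // with prefetch hints for both the source and destination streams.
    static void inflate_sketch(const uint8_t* src, uint16_t* dst, size_t len) {
      for (size_t i = 0; i < len; i++) {
        dst[i] = src[i];   // implicit zero extension
      }
    }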
--- a/src/hotspot/cpu/aarch64/aarch64.ad	Mon Jun 25 16:32:02 2018 +0300
+++ b/src/hotspot/cpu/aarch64/aarch64.ad	Mon Jun 25 16:32:23 2018 +0300
@@ -16168,7 +16168,7 @@
 
 // fast byte[] to char[] inflation
 instruct string_inflate(Universe dummy, iRegP_R0 src, iRegP_R1 dst, iRegI_R2 len,
-                        vRegD tmp1, vRegD tmp2, vRegD tmp3, iRegP_R3 tmp4, rFlagsReg cr)
+                        vRegD_V0 tmp1, vRegD_V1 tmp2, vRegD_V2 tmp3, iRegP_R3 tmp4, rFlagsReg cr)
 %{
   match(Set dummy (StrInflatedCopy src (Binary dst len)));
   effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, USE_KILL src, USE_KILL dst, USE_KILL len, KILL cr);
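
The temps move from generic vRegD to the fixed vRegD_V0..V2 classes because the inline code can now tail into the out-of-line stub, which has a fixed register contract (r0-r3 plus v0 zeroed and v1 preloaded, per the comments in stubGenerator_aarch64.cpp below) and clobbers further vector registers. The zip1-against-zero trick both paths rely on can be modeled as follows (zip1_with_zero is my illustration, not JDK code):

    #include <cstdint>
    #include <cstring>

    // Models zip1(vtmp3, T16B, data, zero): each source byte is interleaved
    // with a zero byte, which on little-endian AArch64 reads back as a
    // zero-extended 16-bit char.
    static void zip1_with_zero(const uint8_t src[8], uint16_t dst[8]) {
      for (int i = 0; i < 8; i++) {
        uint8_t lanes[2] = { src[i], 0 };   // data byte, then zero byte
        std::memcpy(&dst[i], lanes, 2);     // low byte first
      }
    }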
--- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp	Mon Jun 25 16:32:02 2018 +0300
+++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp	Mon Jun 25 16:32:23 2018 +0300
@@ -5681,26 +5681,24 @@
 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
                                         FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
                                         Register tmp4) {
-  Label big, done;
+  Label big, done, after_init, to_stub;
 
   assert_different_registers(src, dst, len, tmp4, rscratch1);
 
-  fmovd(vtmp1 , zr);
-  lsrw(rscratch1, len, 3);
-
-  cbnzw(rscratch1, big);
-
+  fmovd(vtmp1, zr);
+  lsrw(tmp4, len, 3);
+  bind(after_init);
+  cbnzw(tmp4, big);
   // Short string: less than 8 bytes.
   {
-    Label loop, around, tiny;
-
-    subsw(len, len, 4);
-    andw(len, len, 3);
-    br(LO, tiny);
-
+    Label loop, tiny;
+
+    cmpw(len, 4);
+    br(LT, tiny);
     // Use SIMD to do 4 bytes.
     ldrs(vtmp2, post(src, 4));
     zip1(vtmp3, T8B, vtmp2, vtmp1);
+    subw(len, len, 4);
     strd(vtmp3, post(dst, 8));
 
     cbzw(len, done);
@@ -5714,35 +5712,65 @@
     bind(tiny);
     cbnz(len, loop);
 
-    bind(around);
     b(done);
   }
 
+  if (SoftwarePrefetchHintDistance >= 0) {
+    bind(to_stub);
+      RuntimeAddress stub =  RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
+      assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
+      trampoline_call(stub);
+      b(after_init);
+  }
+
   // Unpack the bytes 8 at a time.
   bind(big);
-  andw(len, len, 7);
-
   {
-    Label loop, around;
-
-    bind(loop);
-    ldrd(vtmp2, post(src, 8));
-    sub(rscratch1, rscratch1, 1);
-    zip1(vtmp3, T16B, vtmp2, vtmp1);
-    st1(vtmp3, T8H, post(dst, 16));
-    cbnz(rscratch1, loop);
-
-    bind(around);
+    Label loop, around, loop_last, loop_start;
+
+    if (SoftwarePrefetchHintDistance >= 0) {
+      const int large_loop_threshold = (64 + 16)/8;
+      ldrd(vtmp2, post(src, 8));
+      andw(len, len, 7);
+      cmp(tmp4, large_loop_threshold);
+      br(GE, to_stub);
+      b(loop_start);
+
+      bind(loop);
+      ldrd(vtmp2, post(src, 8));
+      bind(loop_start);
+      subs(tmp4, tmp4, 1);
+      br(EQ, loop_last);
+      zip1(vtmp2, T16B, vtmp2, vtmp1);
+      ldrd(vtmp3, post(src, 8));
+      st1(vtmp2, T8H, post(dst, 16));
+      subs(tmp4, tmp4, 1);
+      zip1(vtmp3, T16B, vtmp3, vtmp1);
+      st1(vtmp3, T8H, post(dst, 16));
+      br(NE, loop);
+      b(around);
+      bind(loop_last);
+      zip1(vtmp2, T16B, vtmp2, vtmp1);
+      st1(vtmp2, T8H, post(dst, 16));
+      bind(around);
+      cbz(len, done);
+    } else {
+      andw(len, len, 7);
+      bind(loop);
+      ldrd(vtmp2, post(src, 8));
+      sub(tmp4, tmp4, 1);
+      zip1(vtmp3, T16B, vtmp2, vtmp1);
+      st1(vtmp3, T8H, post(dst, 16));
+      cbnz(tmp4, loop);
+    }
   }
 
   // Do the tail of up to 8 bytes.
-  sub(src, src, 8);
-  add(src, src, len, ext::uxtw, 0);
-  ldrd(vtmp2, Address(src));
-  sub(dst, dst, 16);
+  add(src, src, len);
+  ldrd(vtmp3, Address(src, -8));
   add(dst, dst, len, ext::uxtw, 1);
-  zip1(vtmp3, T16B, vtmp2, vtmp1);
-  st1(vtmp3, T8H, Address(dst));
+  zip1(vtmp3, T16B, vtmp3, vtmp1);
+  strq(vtmp3, Address(dst, -16));
 
   bind(done);
 }
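
Besides the two-way unrolling (the next ldrd is issued before the previous st1 when SoftwarePrefetchHintDistance >= 0), the tail handling changes: instead of a scalar loop, the final 1..8 bytes are re-inflated with one 8-byte load and one overlapping 16-byte strq anchored at the end of the arrays. A hedged sketch of that overlapping-store idea (tail_sketch is my model; it assumes the big path's len >= 8 precondition, so the bytes it re-writes were already stored correctly):

    #include <cstddef>
    #include <cstdint>

    // 'rem' = len & 7; src/dst already point past the processed portion.
    // Re-inflating the last 8 source bytes rewrites up to 8 - rem chars
    // that earlier iterations produced, which is harmless.
    static void tail_sketch(const uint8_t* src, uint16_t* dst, size_t rem) {
      const uint8_t* s = src + rem - 8;   // ldrd(vtmp3, Address(src, -8))
      uint16_t*      d = dst + rem - 8;   // strq(vtmp3, Address(dst, -16))
      for (int i = 0; i < 8; i++) {
        d[i] = s[i];
      }
    }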
--- a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp	Mon Jun 25 16:32:02 2018 +0300
+++ b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp	Mon Jun 25 16:32:23 2018 +0300
@@ -4624,6 +4624,68 @@
     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
   }
 
+  void inflate_and_store_2_fp_registers(bool generatePrfm,
+      FloatRegister src1, FloatRegister src2) {
+    Register dst = r1;
+    __ zip1(v1, __ T16B, src1, v0);
+    __ zip2(v2, __ T16B, src1, v0);
+    if (generatePrfm) {
+      __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
+    }
+    __ zip1(v3, __ T16B, src2, v0);
+    __ zip2(v4, __ T16B, src2, v0);
+    __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
+  }
+
+  // R0 = src
+  // R1 = dst
+  // R2 = len
+  // R3 = len >> 3
+  // V0 = 0
+  // v1 = loaded 8 bytes
+  address generate_large_byte_array_inflate() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
+    address entry = __ pc();
+    Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
+    Register src = r0, dst = r1, len = r2, octetCounter = r3;
+    const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
+
+    // do one more 8-byte read to have address 16-byte aligned in most cases
+    // also use single store instruction
+    __ ldrd(v2, __ post(src, 8));
+    __ sub(octetCounter, octetCounter, 2);
+    __ zip1(v1, __ T16B, v1, v0);
+    __ zip1(v2, __ T16B, v2, v0);
+    __ st1(v1, v2, __ T16B, __ post(dst, 32));
+    __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
+    __ cmp(octetCounter, large_loop_threshold);
+    __ br(__ LE, LOOP_START);
+    __ b(LOOP_PRFM_START);
+    __ bind(LOOP_PRFM);
+      __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
+    __ bind(LOOP_PRFM_START);
+      __ prfm(Address(src, SoftwarePrefetchHintDistance));
+      __ sub(octetCounter, octetCounter, 8);
+      __ cmp(octetCounter, large_loop_threshold);
+      inflate_and_store_2_fp_registers(true, v3, v4);
+      inflate_and_store_2_fp_registers(true, v5, v6);
+      __ br(__ GT, LOOP_PRFM);
+      __ cmp(octetCounter, 8);
+      __ br(__ LT, DONE);
+    __ bind(LOOP);
+      __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
+      __ bind(LOOP_START);
+      __ sub(octetCounter, octetCounter, 8);
+      __ cmp(octetCounter, 8);
+      inflate_and_store_2_fp_registers(false, v3, v4);
+      inflate_and_store_2_fp_registers(false, v5, v6);
+      __ br(__ GE, LOOP);
+    __ bind(DONE);
+      __ ret(lr);
+    return entry;
+  }
+
   /**
    *  Arguments:
    *
@@ -5727,6 +5789,9 @@
 
     generate_string_indexof_stubs();
 
+    // byte_array_inflate stub for large arrays.
+    StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
+
     if (UseMultiplyToLenIntrinsic) {
       StubRoutines::_multiplyToLen = generate_multiplyToLen();
     }
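
The stub consumes 64 source bytes (8 octets) per pass: inflate_and_store_2_fp_registers turns two 16-byte vectors into four via zip1/zip2 and stores them with a single st1, adding a PSTL1STRM hint for the destination, while the PRFM in the loop header prefetches the source SoftwarePrefetchHintDistance bytes ahead; large_loop_threshold keeps that prefetch address inside the array. An approximate model of the counter flow (stub_flow is my simplified reading; loads, zips and stores are elided, and the exact compare-after-subtract ordering is glossed over):

    // On entry octets = len >> 3; the caller has already loaded (but not
    // counted) the first octet into v1.
    static long stub_flow(long octets, long threshold) {
      octets -= 2;                              // prologue stores 16 bytes
      while (octets > threshold) octets -= 8;   // LOOP_PRFM, with prefetch
      while (octets >= 8)        octets -= 8;   // LOOP, plain 64-byte pass
      return octets;                            // 0..7 octets remain; the
    }                                           // caller finishes the tail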
--- a/src/hotspot/cpu/aarch64/stubRoutines_aarch64.cpp	Mon Jun 25 16:32:02 2018 +0300
+++ b/src/hotspot/cpu/aarch64/stubRoutines_aarch64.cpp	Mon Jun 25 16:32:23 2018 +0300
@@ -55,6 +55,7 @@
 address StubRoutines::aarch64::_string_indexof_linear_ll = NULL;
 address StubRoutines::aarch64::_string_indexof_linear_uu = NULL;
 address StubRoutines::aarch64::_string_indexof_linear_ul = NULL;
+address StubRoutines::aarch64::_large_byte_array_inflate = NULL;
 bool StubRoutines::aarch64::_completed = false;
 
 /**
--- a/src/hotspot/cpu/aarch64/stubRoutines_aarch64.hpp	Mon Jun 25 16:32:02 2018 +0300
+++ b/src/hotspot/cpu/aarch64/stubRoutines_aarch64.hpp	Mon Jun 25 16:32:23 2018 +0300
@@ -73,6 +73,7 @@
   static address _string_indexof_linear_ll;
   static address _string_indexof_linear_uu;
   static address _string_indexof_linear_ul;
+  static address _large_byte_array_inflate;
   static bool _completed;
 
  public:
@@ -171,6 +172,10 @@
       return _string_indexof_linear_uu;
   }
 
+  static address large_byte_array_inflate() {
+      return _large_byte_array_inflate;
+  }
+
   static bool complete() {
     return _completed;
   }