8151502: optimize pd_disjoint_words and pd_conjoint_words
author enevill
Thu, 10 Mar 2016 14:53:09 +0000
changeset 36595 3322a76f3a00
parent 36594 5a2cfca38c3d
child 36596 d0e06b2ecfbe
child 36597 ee256e343585
8151502: optimize pd_disjoint_words and pd_conjoint_words Summary: optimize copy routines using inline assembler Reviewed-by: aph
hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp
hotspot/src/os_cpu/linux_aarch64/vm/copy_linux_aarch64.inline.hpp
hotspot/src/os_cpu/linux_aarch64/vm/copy_linux_aarch64.s
--- a/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp	Thu Mar 10 13:04:43 2016 -0800
+++ b/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp	Thu Mar 10 14:53:09 2016 +0000
@@ -849,8 +849,8 @@
       __ bind(L1);
 
       if (direction == copy_forwards) {
-        __ add(s, s, 2 * wordSize);
-        __ add(d, d, 2 * wordSize);
+        __ add(s, s, bias);
+        __ add(d, d, bias);
       }
 
       __ tbz(count, 1, L2);
--- a/hotspot/src/os_cpu/linux_aarch64/vm/copy_linux_aarch64.inline.hpp	Thu Mar 10 13:04:43 2016 -0800
+++ b/hotspot/src/os_cpu/linux_aarch64/vm/copy_linux_aarch64.inline.hpp	Thu Mar 10 14:53:09 2016 +0000
@@ -26,44 +26,108 @@
 #ifndef OS_CPU_LINUX_AARCH64_VM_COPY_LINUX_AARCH64_INLINE_HPP
 #define OS_CPU_LINUX_AARCH64_VM_COPY_LINUX_AARCH64_INLINE_HPP
 
+#define COPY_SMALL(from, to, count)             /* inline copy of 0..8 words; count is NOT range-checked here */ \
+{                                                                       \
+        long tmp0, tmp1, tmp2, tmp3;                                    \
+        long tmp4, tmp5, tmp6, tmp7;            /* tmp0-tmp7: scratch values bound to %[t0]-%[t7] */ \
+  __asm volatile(                                                       \
+"       adr     %[t0], 0f;"                     /* t0 = address of slot 0 (label 0) */ \
+"       add     %[t0], %[t0], %[cnt], lsl #5;"  /* t0 += count * 32: one 32-byte slot per count */ \
+"       br      %[t0];"                         /* jump to the slot selected by count */ \
+"       .align  5;"                             /* every slot starts on a 32-byte boundary */ \
+"0:"                                            /* slot 0: nothing to copy */ \
+"       b       1f;"                                                    \
+"       .align  5;"                             /* slot 1: 1 word */ \
+"       ldr     %[t0], [%[s], #0];"                                     \
+"       str     %[t0], [%[d], #0];"                                     \
+"       b       1f;"                                                    \
+"       .align  5;"                             /* slot 2: 2 words */ \
+"       ldp     %[t0], %[t1], [%[s], #0];"                              \
+"       stp     %[t0], %[t1], [%[d], #0];"                              \
+"       b       1f;"                                                    \
+"       .align  5;"                             /* slot 3: 3 words */ \
+"       ldp     %[t0], %[t1], [%[s], #0];"                              \
+"       ldr     %[t2], [%[s], #16];"                                    \
+"       stp     %[t0], %[t1], [%[d], #0];"                              \
+"       str     %[t2], [%[d], #16];"                                    \
+"       b       1f;"                                                    \
+"       .align  5;"                             /* slot 4: 4 words */ \
+"       ldp     %[t0], %[t1], [%[s], #0];"                              \
+"       ldp     %[t2], %[t3], [%[s], #16];"                             \
+"       stp     %[t0], %[t1], [%[d], #0];"                              \
+"       stp     %[t2], %[t3], [%[d], #16];"                             \
+"       b       1f;"                                                    \
+"       .align  5;"                             /* slot 5: 5 words */ \
+"       ldp     %[t0], %[t1], [%[s], #0];"                              \
+"       ldp     %[t2], %[t3], [%[s], #16];"                             \
+"       ldr     %[t4], [%[s], #32];"                                    \
+"       stp     %[t0], %[t1], [%[d], #0];"                              \
+"       stp     %[t2], %[t3], [%[d], #16];"                             \
+"       str     %[t4], [%[d], #32];"                                    \
+"       b       1f;"                                                    \
+"       .align  5;"                             /* slot 6: 6 words */ \
+"       ldp     %[t0], %[t1], [%[s], #0];"                              \
+"       ldp     %[t2], %[t3], [%[s], #16];"                             \
+"       ldp     %[t4], %[t5], [%[s], #32];"                             \
+"2:"                                            /* 6-word store tail, re-entered from slot 7 */ \
+"       stp     %[t0], %[t1], [%[d], #0];"                              \
+"       stp     %[t2], %[t3], [%[d], #16];"                             \
+"       stp     %[t4], %[t5], [%[d], #32];"                             \
+"       b       1f;"                                                    \
+"       .align  5;"                             /* slot 7: 7 words = 1 word via t6 + the 6-word tail */ \
+"       ldr     %[t6], [%[s], #0];"                                     \
+"       ldp     %[t0], %[t1], [%[s], #8];"                              \
+"       ldp     %[t2], %[t3], [%[s], #24];"                             \
+"       ldp     %[t4], %[t5], [%[s], #40];"                             \
+"       str     %[t6], [%[d]], #8;"             /* store word 0, post-increment d by 8 */ \
+"       b       2b;"                            /* store the remaining 6 words at label 2 */ \
+"       .align  5;"                             /* slot 8: 8 words; fills the slot and falls into label 1 */ \
+"       ldp     %[t0], %[t1], [%[s], #0];"                              \
+"       ldp     %[t2], %[t3], [%[s], #16];"                             \
+"       ldp     %[t4], %[t5], [%[s], #32];"                             \
+"       ldp     %[t6], %[t7], [%[s], #48];"                             \
+"       stp     %[t0], %[t1], [%[d], #0];"                              \
+"       stp     %[t2], %[t3], [%[d], #16];"                             \
+"       stp     %[t4], %[t5], [%[d], #32];"                             \
+"       stp     %[t6], %[t7], [%[d], #48];"                             \
+"1:"                                            /* common exit for all slots */ \
+                                                                        \
+  : [s]"+r"(from), [d]"+r"(to), [cnt]"+r"(count),  /* read-write operands; slot 7 leaves 'to' advanced by 8 */ \
+    [t0]"=&r"(tmp0), [t1]"=&r"(tmp1), [t2]"=&r"(tmp2), [t3]"=&r"(tmp3), \
+    [t4]"=&r"(tmp4), [t5]"=&r"(tmp5), [t6]"=&r"(tmp6), [t7]"=&r"(tmp7)  /* "=&r": early-clobber scratch outputs */ \
+  :                                                                     \
+  : "memory", "cc");                            /* the asm reads and writes memory behind the compiler's back */ \
+}
+
 static void pd_conjoint_words(HeapWord* from, HeapWord* to, size_t count) {
-  (void)memmove(to, from, count * HeapWordSize);
+  __asm volatile( "prfm pldl1strm, [%[s], #0];" :: [s]"r"(from) : "memory");  // prefetch hint for the first source word
+  if (__builtin_expect(count > 8, 0)) {         // uncommon: more words than COPY_SMALL has slots for
+    _Copy_conjoint_words(from, to, count);      // larger copies go to the out-of-line routine
+    return;
+  }
+  COPY_SMALL(from, to, count);                  // expected path: 0..8 words copied inline
 }
 
 static void pd_disjoint_words(HeapWord* from, HeapWord* to, size_t count) {
-  switch (count) {
-  case 8:  to[7] = from[7];
-  case 7:  to[6] = from[6];
-  case 6:  to[5] = from[5];
-  case 5:  to[4] = from[4];
-  case 4:  to[3] = from[3];
-  case 3:  to[2] = from[2];
-  case 2:  to[1] = from[1];
-  case 1:  to[0] = from[0];
-  case 0:  break;
-  default:
-    (void)memcpy(to, from, count * HeapWordSize);
-    break;
+  if (__builtin_constant_p(count)) {            // count is a compile-time constant: hand the copy to memcpy
+    memcpy(to, from, count * sizeof(HeapWord));
+    return;
   }
+  __asm volatile( "prfm pldl1strm, [%[s], #0];" :: [s]"r"(from) : "memory");  // prefetch hint for the first source word
+  if (__builtin_expect(count > 8, 0)) {         // uncommon: more words than COPY_SMALL has slots for
+    _Copy_disjoint_words(from, to, count);      // larger copies go to the out-of-line routine
+    return;
+  }
+  COPY_SMALL(from, to, count);                  // expected path: 0..8 words copied inline
 }
 
 static void pd_disjoint_words_atomic(HeapWord* from, HeapWord* to, size_t count) {
-  switch (count) {
-  case 8:  to[7] = from[7];
-  case 7:  to[6] = from[6];
-  case 6:  to[5] = from[5];
-  case 5:  to[4] = from[4];
-  case 4:  to[3] = from[3];
-  case 3:  to[2] = from[2];
-  case 2:  to[1] = from[1];
-  case 1:  to[0] = from[0];
-  case 0:  break;
-  default:
-    while (count-- > 0) {
-      *to++ = *from++;
-    }
-    break;
+  __asm volatile( "prfm pldl1strm, [%[s], #0];" :: [s]"r"(from) : "memory");  // prefetch hint for the first source word
+  if (__builtin_expect(count > 8, 0)) {         // uncommon: more words than COPY_SMALL has slots for
+    _Copy_disjoint_words(from, to, count);      // larger copies go to the out-of-line routine
+    return;
   }
+  COPY_SMALL(from, to, count);                  // expected path: 0..8 words copied inline
 }
 
 static void pd_aligned_conjoint_words(HeapWord* from, HeapWord* to, size_t count) {
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/os_cpu/linux_aarch64/vm/copy_linux_aarch64.s	Thu Mar 10 14:53:09 2016 +0000
@@ -0,0 +1,236 @@
+/*
+ * Copyright (c) 2016, Linaro Ltd. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+        .global _Copy_conjoint_words    // exported: word copy that tolerates overlapping regions
+        .global _Copy_disjoint_words    // exported: forward word copy
+
+s       .req    x0                      // source address (first argument register)
+d       .req    x1                      // destination address (second argument register)
+count   .req    x2                      // number of 8-byte words (third argument register)
+t0      .req    x3                      // t0-t7: data being moved; t0 also holds computed branch targets
+t1      .req    x4
+t2      .req    x5
+t3      .req    x6
+t4      .req    x7
+t5      .req    x8
+t6      .req    x9
+t7      .req    x10
+
+        .align  6                       // start the routine on a 64-byte boundary
+_Copy_disjoint_words:                   // forward copy of count 8-byte words from s to d; always moves at least 8 words (the C++ callers in this change only pass count > 8)
+        // Make s 16-byte (2 word) aligned: if bit 3 is set, copy one word first (presumably s is already 8-byte aligned)
+        tbz     s, #3, fwd_copy_aligned
+        ldr     t0, [s], #8
+        str     t0, [d], #8
+        sub     count, count, #1
+
+fwd_copy_aligned:
+        // Bias s & d by -16 so that only the last access of each 8-word group needs pre-index writeback
+        sub     s, s, #16
+        sub     d, d, #16
+
+        ldp     t0, t1, [s, #16]        // preload the first 8 words into t0-t7
+        ldp     t2, t3, [s, #32]
+        ldp     t4, t5, [s, #48]
+        ldp     t6, t7, [s, #64]!       // "!" writes back: s += 64
+
+        subs    count, count, #16       // account for the 8 words just loaded plus the 8 a first loop pass would load
+        blo     fwd_copy_drain          // fewer than 16 words: skip the bulk loop
+
+fwd_copy_again:                         // each pass stores the 8 words held in t0-t7 and loads the next 8
+        prfm    pldl1keep, [s, #256]    // prefetch 256 bytes beyond s
+        stp     t0, t1, [d, #16]
+        ldp     t0, t1, [s, #16]
+        stp     t2, t3, [d, #32]
+        ldp     t2, t3, [s, #32]
+        stp     t4, t5, [d, #48]
+        ldp     t4, t5, [s, #48]
+        stp     t6, t7, [d, #64]!       // d += 64
+        ldp     t6, t7, [s, #64]!       // s += 64
+        subs    count, count, #8
+        bhs     fwd_copy_again          // repeat while at least 8 unloaded words remain
+
+fwd_copy_drain:                         // store the last 8 words still held in t0-t7
+        stp     t0, t1, [d, #16]
+        stp     t2, t3, [d, #32]
+        stp     t4, t5, [d, #48]
+        stp     t6, t7, [d, #64]!
+
+        // count is now -8..-1 for 0..7 words to copy
+        adr     t0, 0f                  // t0 = label 0, just past the last 32-byte slot
+        add     t0, t0, count, lsl #5   // count is negative: step back -count slots of 32 bytes
+        br      t0                      // jump to the slot for the remaining word count
+
+        .align  5                       // every slot below starts on a 32-byte boundary
+        ret                             // count == -8: 0 words
+        .align  5
+        ldr     t0, [s, #16]            // count == -7: 1 word
+        str     t0, [d, #16]
+        ret
+        .align  5
+        ldp     t0, t1, [s, #16]        // count == -6: 2 words
+        stp     t0, t1, [d, #16]
+        ret
+        .align  5
+        ldp     t0, t1, [s, #16]        // count == -5: 3 words
+        ldr     t2, [s, #32]
+        stp     t0, t1, [d, #16]
+        str     t2, [d, #32]
+        ret
+        .align  5
+        ldp     t0, t1, [s, #16]        // count == -4: 4 words
+        ldp     t2, t3, [s, #32]
+        stp     t0, t1, [d, #16]
+        stp     t2, t3, [d, #32]
+        ret
+        .align  5
+        ldp     t0, t1, [s, #16]        // count == -3: 5 words
+        ldp     t2, t3, [s, #32]
+        ldr     t4, [s, #48]
+        stp     t0, t1, [d, #16]
+        stp     t2, t3, [d, #32]
+        str     t4, [d, #48]
+        ret
+        .align  5
+        ldp     t0, t1, [s, #16]        // count == -2: 6 words
+        ldp     t2, t3, [s, #32]
+        ldp     t4, t5, [s, #48]
+        stp     t0, t1, [d, #16]
+        stp     t2, t3, [d, #32]
+        stp     t4, t5, [d, #48]
+        ret
+        .align  5
+        ldp     t0, t1, [s, #16]        // count == -1: 7 words
+        ldp     t2, t3, [s, #32]
+        ldp     t4, t5, [s, #48]
+        ldr     t6, [s, #64]
+        stp     t0, t1, [d, #16]
+        stp     t2, t3, [d, #32]
+        stp     t4, t5, [d, #48]
+        str     t6, [d, #64]
+        // The 7-word slot fills all 32 bytes, leaving no room for its own ret, so it
+        // falls through to the ret at label 0; the .align below therefore adds no padding.
+        .align  5
+0:
+        ret
+
+        .align  6                       // start the routine on a 64-byte boundary
+_Copy_conjoint_words:                   // copy count 8-byte words from s to d where the regions may overlap; the backward path always moves at least 8 words
+        sub     t0, d, s                // t0 = d - s
+        cmp     t0, count, lsl #3       // compare with the copy length in bytes
+        bhs     _Copy_disjoint_words    // unsigned d - s >= length (also true when d < s): a forward copy is used
+
+        add     s, s, count, lsl #3     // otherwise copy backwards: point s and d one past the last word
+        add     d, d, count, lsl #3
+
+        // Make the end pointer s 16-byte (2 word) aligned: if bit 3 is set, copy the last word first
+        tbz     s, #3, bwd_copy_aligned
+        ldr     t0, [s, #-8]!
+        str     t0, [d, #-8]!
+        sub     count, count, #1
+
+bwd_copy_aligned:
+        ldp     t0, t1, [s, #-16]       // preload the last 8 words into t0-t7
+        ldp     t2, t3, [s, #-32]
+        ldp     t4, t5, [s, #-48]
+        ldp     t6, t7, [s, #-64]!      // "!" writes back: s -= 64
+
+        subs    count, count, #16       // account for the 8 words just loaded plus the 8 a first loop pass would load
+        blo     bwd_copy_drain          // fewer than 16 words: skip the bulk loop
+
+bwd_copy_again:                         // each pass stores the 8 words held in t0-t7 and loads the 8 below them
+        prfm    pldl1keep, [s, #-256]   // prefetch 256 bytes below s
+        stp     t0, t1, [d, #-16]
+        ldp     t0, t1, [s, #-16]
+        stp     t2, t3, [d, #-32]
+        ldp     t2, t3, [s, #-32]
+        stp     t4, t5, [d, #-48]
+        ldp     t4, t5, [s, #-48]
+        stp     t6, t7, [d, #-64]!      // d -= 64
+        ldp     t6, t7, [s, #-64]!      // s -= 64
+        subs    count, count, #8
+        bhs     bwd_copy_again          // repeat while at least 8 unloaded words remain
+
+bwd_copy_drain:                         // store the last 8 words still held in t0-t7
+        stp     t0, t1, [d, #-16]
+        stp     t2, t3, [d, #-32]
+        stp     t4, t5, [d, #-48]
+        stp     t6, t7, [d, #-64]!
+
+        // count is now -8..-1 for 0..7 words to copy
+        adr     t0, 0f                  // t0 = label 0, just past the last 32-byte slot
+        add     t0, t0, count, lsl #5   // count is negative: step back -count slots of 32 bytes
+        br      t0                      // jump to the slot for the remaining word count
+
+        .align  5                       // every slot below starts on a 32-byte boundary
+        ret                             // count == -8: 0 words
+        .align  5
+        ldr     t0, [s, #-8]            // count == -7: 1 word
+        str     t0, [d, #-8]
+        ret
+        .align  5
+        ldp     t0, t1, [s, #-16]       // count == -6: 2 words
+        stp     t0, t1, [d, #-16]
+        ret
+        .align  5
+        ldp     t0, t1, [s, #-16]       // count == -5: 3 words
+        ldr     t2, [s, #-24]
+        stp     t0, t1, [d, #-16]
+        str     t2, [d, #-24]
+        ret
+        .align  5
+        ldp     t0, t1, [s, #-16]       // count == -4: 4 words
+        ldp     t2, t3, [s, #-32]
+        stp     t0, t1, [d, #-16]
+        stp     t2, t3, [d, #-32]
+        ret
+        .align  5
+        ldp     t0, t1, [s, #-16]       // count == -3: 5 words
+        ldp     t2, t3, [s, #-32]
+        ldr     t4, [s, #-40]
+        stp     t0, t1, [d, #-16]
+        stp     t2, t3, [d, #-32]
+        str     t4, [d, #-40]
+        ret
+        .align  5
+        ldp     t0, t1, [s, #-16]       // count == -2: 6 words
+        ldp     t2, t3, [s, #-32]
+        ldp     t4, t5, [s, #-48]
+        stp     t0, t1, [d, #-16]
+        stp     t2, t3, [d, #-32]
+        stp     t4, t5, [d, #-48]
+        ret
+        .align  5
+        ldp     t0, t1, [s, #-16]       // count == -1: 7 words
+        ldp     t2, t3, [s, #-32]
+        ldp     t4, t5, [s, #-48]
+        ldr     t6, [s, #-56]
+        stp     t0, t1, [d, #-16]
+        stp     t2, t3, [d, #-32]
+        stp     t4, t5, [d, #-48]
+        str     t6, [d, #-56]
+        // The 7-word slot fills all 32 bytes, leaving no room for its own ret, so it
+        // falls through to the ret at label 0; the .align below therefore adds no padding.
+        .align  5
+0:
+        ret