hotspot/src/os_cpu/linux_arm/vm/linux_arm_64.s
changeset 42664 29142a56c193
equal deleted inserted replaced
42663:2335df372367 42664:29142a56c193
       
     1 # 
       
     2 # Copyright (c) 2008, 2013, Oracle and/or its affiliates. All rights reserved.
       
     3 # DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
       
     4 #
       
     5 # This code is free software; you can redistribute it and/or modify it
       
     6 # under the terms of the GNU General Public License version 2 only, as
       
     7 # published by the Free Software Foundation.
       
     8 #
       
     9 # This code is distributed in the hope that it will be useful, but WITHOUT
       
    10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
       
    11 # FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
       
    12 # version 2 for more details (a copy is included in the LICENSE file that
       
    13 # accompanied this code).
       
    14 #
       
    15 # You should have received a copy of the GNU General Public License version
       
    16 # 2 along with this work; if not, write to the Free Software Foundation,
       
    17 # Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
       
    18 #
       
    19 # Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
       
    20 # or visit www.oracle.com if you need additional information or have any
       
    21 # questions.
       
    22 # 
       
    23 
       
    24         # TODO-AARCH64
       
    25         
       
    26         # NOTE WELL!  The _Copy functions are called directly
       
    27         # from server-compiler-generated code via CallLeafNoFP,
       
    28         # which means that they *must* either not use floating
       
    29         # point or use it in the same manner as does the server
       
    30         # compiler.
       
     31         
       
                # Symbol declarations for the copy entry points: each one is made
                # global and typed as a function here; the labels themselves are
                # defined further down in this file.
     32         .globl _Copy_conjoint_bytes
       
     33         .type _Copy_conjoint_bytes, %function
       
     34         .globl _Copy_arrayof_conjoint_bytes
       
     35         .type _Copy_arrayof_conjoint_bytes, %function
       
     36         .globl _Copy_disjoint_words
       
     37         .type _Copy_disjoint_words, %function
       
     38         .globl _Copy_conjoint_words
       
     39         .type _Copy_conjoint_words, %function
       
     40         .globl _Copy_conjoint_jshorts_atomic
       
     41         .type _Copy_conjoint_jshorts_atomic, %function
       
     42         .globl _Copy_arrayof_conjoint_jshorts
       
     43         .type _Copy_arrayof_conjoint_jshorts, %function
       
     44         .globl _Copy_conjoint_jints_atomic
       
     45         .type _Copy_conjoint_jints_atomic, %function
       
     46         .globl _Copy_arrayof_conjoint_jints
       
     47         .type _Copy_arrayof_conjoint_jints, %function
       
     48         .globl _Copy_conjoint_jlongs_atomic
       
     49         .type _Copy_conjoint_jlongs_atomic, %function
       
     50         .globl _Copy_arrayof_conjoint_jlongs
       
     51         .type _Copy_arrayof_conjoint_jlongs, %function
       
     52 
       
                # SpinPause: issues a single yield hint and returns.
                # No register is written; in particular x0 is left as it was on entry.
                # NOTE(review): if callers read a return value from x0 they get the
                # caller's own x0 back - confirm that this is intended.
     53         .text
       
     54         .globl  SpinPause
       
     55         .type SpinPause, %function
       
     56 SpinPause:
       
     57         yield
       
     58         ret
       
    59 
       
     60         # Support for void Copy::conjoint_bytes(void* from,
       
     61         #                                       void* to,
       
     62         #                                       size_t count)
                #
                # Not implemented in this file: the body is a single hlt instruction
                # with immediate 1002, so no bytes are copied.
                # NOTE(review): the immediate presumably identifies which stub was
                # reached - confirm.
       
     63 _Copy_conjoint_bytes:
       
     64         hlt 1002
       
    65 
       
     66         # Support for void Copy::arrayof_conjoint_bytes(void* from,
       
     67         #                                               void* to,
       
     68         #                                               size_t count)
                #
                # Not implemented in this file: the body is a single hlt instruction
                # with immediate 1003, so no bytes are copied.
                # NOTE(review): the immediate presumably identifies which stub was
                # reached - confirm.
       
     69 _Copy_arrayof_conjoint_bytes:
       
     70         hlt 1003
       
    71 
       
    72 
       
     73         # Support for void Copy::disjoint_words(void* from,
       
     74         #                                       void* to,
       
     75         #                                       size_t count)
                #
                # Register use, as read from the code below:
                #   x0 = from, x1 = to, x2 = count. The code treats x2 as a number of
                #   BYTES (it copies [x0, x0 + x2)), not as a number of words.
                #   NOTE(review): the bit tests on x2 only work when x2 is a multiple
                #   of 8 - presumably callers scale the word count; confirm.
                #   Scratch: x3-x18 and the condition flags. x0, x1 and x2 are
                #   modified on the dw_large path.
                # Data is always copied from low to high addresses.
       
     76 _Copy_disjoint_words:
       
     77         # These and further memory prefetches may hit out of array ranges.
       
     78         # Experiments showed that prefetching of inaccessible memory doesn't result in exceptions.
       
     79         prfm    pldl1keep,  [x0, #0]
       
     80         prfm    pstl1keep,  [x1, #0]
       
     81         prfm    pldl1keep,  [x0, #64]
       
     82         prfm    pstl1keep,  [x1, #64]
       
     83 
       
                # x18 = x2 - 128; 128 bytes or more go through the loop at dw_large,
                # anything smaller is handled by the jump table below.
     84         subs    x18, x2,  #128
       
     85         b.ge    dw_large
       
     86 
       
     87 dw_lt_128:
       
     88         # Copy [x0, x0 + x2) to [x1, x1 + x2)
       
     89         
       
     90         adr     x15,  dw_tail_table_base
       
     91         and     x16,  x2,  #~8
       
     92 
       
     93         # Calculate address to jump and store it to x15:
       
     94         #   Each pair of instructions before dw_tail_table_base copies 16 bytes.
       
     95         #   x16 is count of bytes to copy aligned down by 16.
       
     96         #   So x16/16 pairs of instructions should be executed. 
       
     97         #   Each pair takes 8 bytes, so x15 = dw_tail_table_base - (x16/16)*8 = x15 - x16/2
       
     98         sub     x15,  x15, x16, lsr #1
       
                # Instruction prefetch of the table entry that "br x15" will jump to.
     99         prfm    plil1keep, [x15]
       
    100     
       
                # x17 = end of the source range, x18 = end of the destination range;
                # the table entries address the data at negative offsets from these.
    101         add     x17,  x0,  x2
       
    102         add     x18,  x1,  x2
       
    103 
       
    104         # If x2 = x16 + 8, then copy 8 bytes and x16 bytes after that.
       
    105         # Otherwise x2 = x16, so proceed to copy x16 bytes.
       
    106         tbz     x2, #3, dw_lt_128_even
       
    107         ldr     x3, [x0]
       
    108         str     x3, [x1]
       
    109 dw_lt_128_even:
       
    110         # Copy [x17 - x16, x17) to [x18 - x16, x18)
       
    111         # x16 is aligned by 16 and less than 128
       
    112 
       
    113         # Execute (x16/16) ldp-stp pairs; each pair copies 16 bytes
       
    114         br      x15
       
    115 
       
                # Jump table: execution enters at one of the ldp instructions below
                # (or directly at dw_tail_table_base when x16 is 0) and falls through
                # to the ret. x15 and x16 are reused as data registers by the last
                # pair, which is safe because the branch has already been taken.
    116         ldp     x3,  x4,  [x17, #-112]
       
    117         stp     x3,  x4,  [x18, #-112]
       
    118         ldp     x5,  x6,  [x17, #-96]
       
    119         stp     x5,  x6,  [x18, #-96]
       
    120         ldp     x7,  x8,  [x17, #-80]
       
    121         stp     x7,  x8,  [x18, #-80]
       
    122         ldp     x9,  x10, [x17, #-64]
       
    123         stp     x9,  x10, [x18, #-64]
       
    124         ldp     x11, x12, [x17, #-48]
       
    125         stp     x11, x12, [x18, #-48]
       
    126         ldp     x13, x14, [x17, #-32]
       
    127         stp     x13, x14, [x18, #-32]
       
    128         ldp     x15, x16, [x17, #-16]
       
    129         stp     x15, x16, [x18, #-16]
       
    130 dw_tail_table_base:
       
    131         ret
       
    132 
       
                # Padding only: these nops sit between the ret above and the dw_large
                # label, so no execution path runs through them. 12 nops plus the
                # 4 ldp instructions at dw_large make 16 instructions (64 bytes).
    133 .p2align  6
       
    134 .rept   12
       
    135         nop
       
    136 .endr
       
    137 dw_large:
       
    138         # x18 >= 0;
       
    139         # Copy [x0, x0 + x18 + 128) to [x1, x1 + x18 + 128)
       
    140 
       
    141         ldp     x3,  x4,  [x0], #64
       
    142         ldp     x5,  x6,  [x0, #-48]
       
    143         ldp     x7,  x8,  [x0, #-32]
       
    144         ldp     x9,  x10, [x0, #-16]
       
    145 
       
    146         # Before and after each iteration of loop registers x3-x10 contain [x0 - 64, x0),
       
    147         # and x1 is a place to copy this data;
       
    148         # x18 contains number of bytes to be stored minus 128
       
    149 
       
    150         # Exactly 16 instructions from p2align, so dw_loop starts from cache line boundary
       
    151         # Checking it explicitly by aligning with "hlt 1000" instructions
                # (if the count above were wrong, the gap would be filled with the
                # word 0xd4407d00 instead of being empty).
       
    152 .p2alignl  6, 0xd4407d00
       
    153 dw_loop:
       
    154         prfm    pldl1keep,  [x0, #64]
       
    155         # The next line actually hurt memory copy performance (for the interpreter) - JDK-8078120
       
    156         # prfm    pstl1keep,  [x1, #64]
       
    157 
       
                # The flags set here are consumed by the b.ge at the bottom of the
                # loop; the stp/ldp/add instructions in between do not change them.
    158         subs    x18, x18, #64
       
    159 
       
    160         stp     x3,  x4,  [x1, #0]
       
    161         ldp     x3,  x4,  [x0, #0]
       
    162         stp     x5,  x6,  [x1, #16]
       
    163         ldp     x5,  x6,  [x0, #16]
       
    164         stp     x7,  x8,  [x1, #32]
       
    165         ldp     x7,  x8,  [x0, #32]
       
    166         stp     x9,  x10, [x1, #48]
       
    167         ldp     x9,  x10, [x0, #48]
       
    168         
       
    169         add     x1,  x1,  #64
       
    170         add     x0,  x0,  #64
       
    171 
       
    172         b.ge    dw_loop
       
    173 
       
    174         # 13 instructions from dw_loop, so the loop body fits into one cache line
       
    175 
       
    176 dw_loop_end:
                # The flags set by this adds are consumed by the b.ne below; the four
                # stp instructions in between do not change them.
       
    177         adds    x2,  x18, #64
       
    178 
       
    179         stp     x3,  x4,  [x1], #64
       
    180         stp     x5,  x6,  [x1, #-48]
       
    181         stp     x7,  x8,  [x1, #-32]
       
    182         stp     x9,  x10, [x1, #-16]
       
    183 
       
    184         # Increased x18 by 64, but stored 64 bytes, so x2 contains exact number of bytes to be stored
       
    185 
       
    186         # If this number is not zero, also copy remaining bytes
       
    187         b.ne    dw_lt_128
       
    188         ret
       
   189 
       
   190 
       
    191         # Support for void Copy::conjoint_words(void* from,
       
    192         #                                       void* to,
       
    193         #                                       size_t count)
                #
                # x0 = from, x1 = to, x2 = count. The code uses x2 as a byte count: it
                # is compared with (to - from) and added directly to x0 and x1.
                # NOTE(review): the bit-3 tests assume x2 is a multiple of 8 - confirm.
                # Unless (from < to) and (to - from < count), control is handed to
                # _Copy_disjoint_words with a plain branch (its ret returns to our
                # caller). Otherwise the copy below runs from high to low addresses.
                # Scratch on the backward path: x3, x4, x5 and the condition flags;
                # x0 and x1 are modified.
       
    194 _Copy_conjoint_words:
       
    195         subs    x3, x1, x0
       
    196         # hi condition is met <=> from < to
       
    197         ccmp    x2, x3, #0, hi
       
    198         # hi condition is met <=> (from < to) and (to - from < count)
       
    199         # otherwise _Copy_disjoint_words may be used, because it performs forward copying,
       
    200         # so it also works when ranges overlap but to <= from
       
    201         b.ls    _Copy_disjoint_words
       
    202 
       
    203         # The overlapping case should be rare, so it is not worth optimizing
       
    204 
       
    205         ands    x3,  x2,  #~8
       
    206         # x3 is count aligned down by 2*wordSize
       
    207         add     x0,  x0,  x2
       
    208         add     x1,  x1,  x2
       
    209         sub     x3,  x3,  #16
       
    210         # Skip loop if 0 or 1 words
                # (the Z flag tested here still comes from the ands above: the add and
                # sub instructions in between are the non-flag-setting forms)
       
    211         b.eq    cw_backward_loop_end
       
    212 
       
    213         # x3 >= 0
       
    214         # Copy [x0 - x3 - 16, x0) to [x1 - x3 - 16, x1) backward
       
    215 cw_backward_loop:
       
    216         subs    x3,  x3,  #16
       
    217         ldp     x4,  x5,  [x0, #-16]!
       
    218         stp     x4,  x5,  [x1, #-16]!
       
    219         b.ge    cw_backward_loop
       
    220 
       
    221 cw_backward_loop_end:
       
    222         # Copy remaining 0 or 1 words
                # (x0 and x1 now point just past that word, hence the -8 offsets)
       
    223         tbz     x2,  #3,  cw_finish
       
    224         ldr     x3, [x0, #-8]
       
    225         str     x3, [x1, #-8]
       
    226 
       
    227 cw_finish:
       
    228         ret
       
   229 
       
   230 
       
    231         # Support for void Copy::conjoint_jshorts_atomic(void* from,
       
    232         #                                                void* to,
       
    233         #                                                size_t count)
                #
                # x0 = from, x1 = to, x2 = count. The code uses x2 as a byte count
                # (x0 + x2 and x1 + x2 are taken as the end addresses).
                # NOTE(review): the tail code only handles even values of x2 - confirm
                # against callers.
                # Every element is moved with one ldrh/strh pair.
                # Direction: backward when (from < to) and (to - from < count),
                # forward otherwise.
                # Scratch: x3-x10, x17, x18 and the condition flags. The forward path
                # advances x0/x1 and keeps x17/x18 fixed; the backward path moves
                # x17/x18 down and keeps x0/x1 fixed.
       
    234 _Copy_conjoint_jshorts_atomic:
                # x17 = end of source, x18 = end of destination
       
    235         add     x17, x0, x2
       
    236         add     x18, x1, x2
       
    237 
       
    238         subs    x3, x1, x0
       
    239         # hi is met <=> (from < to) and (to - from < count)
       
    240         ccmp    x2, x3, #0, hi
       
    241         b.hi    cs_backward
       
    242         
       
    243         subs    x3, x2, #14
       
    244         b.ge    cs_forward_loop
       
    245 
       
    246         # Copy x2 < 14 bytes from x0 to x1
       
    247 cs_forward_lt14:
                # x7 = x2 & 7 = bytes left over after the optional 8-byte group.
                # The Z flag set here is consumed by the b.eq at cs_forward_lt8;
                # tbz, ldrh and strh in between do not change the flags.
       
    248         ands    x7, x2, #7
       
    249         tbz     x2, #3, cs_forward_lt8
       
    250         ldrh    w3, [x0, #0]
       
    251         ldrh    w4, [x0, #2]
       
    252         ldrh    w5, [x0, #4]
       
    253         ldrh    w6, [x0, #6]
       
    254 
       
    255         strh    w3, [x1, #0]
       
    256         strh    w4, [x1, #2]
       
    257         strh    w5, [x1, #4]
       
    258         strh    w6, [x1, #6]
       
    259 
       
    260         # Copy x7 < 8 bytes from x17 - x7 to x18 - x7
       
    261 cs_forward_lt8:
       
    262         b.eq    cs_forward_0
       
    263         cmp     x7, #4
       
    264         b.lt    cs_forward_2
       
    265         b.eq    cs_forward_4
       
    266 
                # Fall-through chain: entering at _6, _4 or _2 copies the last 3, 2
                # or 1 halfwords, addressed backwards from the end pointers.
       
    267 cs_forward_6:
       
    268         ldrh    w3, [x17, #-6]
       
    269         strh    w3, [x18, #-6]
       
    270 cs_forward_4:
       
    271         ldrh    w4, [x17, #-4]
       
    272         strh    w4, [x18, #-4]
       
    273 cs_forward_2:
       
    274         ldrh    w5, [x17, #-2]
       
    275         strh    w5, [x18, #-2]
       
    276 cs_forward_0:
       
    277         ret
       
    278 
       
    279 
       
    280         # Copy [x0, x0 + x3 + 14) to [x1, x1 + x3 + 14)
       
    281         # x3 >= 0
       
    282 .p2align 6
       
    283 cs_forward_loop:
                # Flags from this subs are consumed by the b.ge at the end of the
                # loop; the ldrh/strh instructions in between do not change them.
       
    284         subs    x3, x3, #14
       
    285         
       
    286         ldrh    w4, [x0], #14
       
    287         ldrh    w5, [x0, #-12]
       
    288         ldrh    w6, [x0, #-10]
       
    289         ldrh    w7, [x0, #-8]
       
    290         ldrh    w8, [x0, #-6]
       
    291         ldrh    w9, [x0, #-4]
       
    292         ldrh    w10, [x0, #-2]
       
    293 
       
    294         strh    w4, [x1], #14
       
    295         strh    w5, [x1, #-12]
       
    296         strh    w6, [x1, #-10]
       
    297         strh    w7, [x1, #-8]
       
    298         strh    w8, [x1, #-6]
       
    299         strh    w9, [x1, #-4]
       
    300         strh    w10, [x1, #-2]
       
    301 
       
    302         b.ge    cs_forward_loop
       
    303         # Exactly 16 instructions from cs_forward_loop, so loop fits into one cache line
       
    304 
       
    305         adds    x2, x3, #14
       
    306         # x2 bytes should be copied from x0 to x1
       
    307         b.ne    cs_forward_lt14
       
    308         ret
       
    309         
       
    310         # Very similar to forward copying
                # (here the 8-byte group is taken from the end pointers x17/x18 and the
                # final 1-3 halfwords from the start pointers x0/x1, highest first)
       
    311 cs_backward:
       
    312         subs    x3, x2, #14
       
    313         b.ge    cs_backward_loop
       
    314 
       
    315 cs_backward_lt14:
                # As in cs_forward_lt14: the Z flag from this ands is consumed by the
                # b.eq at cs_backward_lt8.
       
    316         ands    x7, x2, #7
       
    317         tbz     x2, #3, cs_backward_lt8
       
    318 
       
    319         ldrh    w3, [x17, #-8]
       
    320         ldrh    w4, [x17, #-6]
       
    321         ldrh    w5, [x17, #-4]
       
    322         ldrh    w6, [x17, #-2]
       
    323         
       
    324         strh    w3, [x18, #-8]
       
    325         strh    w4, [x18, #-6]
       
    326         strh    w5, [x18, #-4]
       
    327         strh    w6, [x18, #-2]
       
    328 
       
    329 cs_backward_lt8:
       
    330         b.eq    cs_backward_0
       
    331         cmp     x7, #4
       
    332         b.lt    cs_backward_2
       
    333         b.eq    cs_backward_4
       
    334 
       
    335 cs_backward_6:
       
    336         ldrh    w3, [x0, #4]
       
    337         strh    w3, [x1, #4]
       
    338 
       
    339 cs_backward_4:
       
    340         ldrh    w4, [x0, #2]
       
    341         strh    w4, [x1, #2]
       
    342 
       
    343 cs_backward_2:
       
    344         ldrh    w5, [x0, #0]
       
    345         strh    w5, [x1, #0]
       
    346 
       
    347 cs_backward_0:
       
    348         ret
       
    349 
       
    350 
       
    351 .p2align 6
       
    352 cs_backward_loop:
                # Each iteration moves x17/x18 down by 14 and copies the 7 halfwords
                # that start at the new addresses; flags from the subs are consumed
                # by the b.ge at the end of the loop.
       
    353         subs    x3, x3, #14
       
    354 
       
    355         ldrh    w4, [x17, #-14]!
       
    356         ldrh    w5, [x17, #2]
       
    357         ldrh    w6, [x17, #4]
       
    358         ldrh    w7, [x17, #6]
       
    359         ldrh    w8, [x17, #8]
       
    360         ldrh    w9, [x17, #10]
       
    361         ldrh    w10, [x17, #12]
       
    362 
       
    363         strh    w4, [x18, #-14]!
       
    364         strh    w5, [x18, #2]
       
    365         strh    w6, [x18, #4]
       
    366         strh    w7, [x18, #6]
       
    367         strh    w8, [x18, #8]
       
    368         strh    w9, [x18, #10]
       
    369         strh    w10, [x18, #12]
       
    370 
       
    371         b.ge    cs_backward_loop
                # x2 = bytes still to copy, in [0, 14)
       
    372         adds    x2, x3, #14
       
    373         b.ne    cs_backward_lt14
       
    374         ret
       
   375 
       
   376 
       
    377         # Support for void Copy::arrayof_conjoint_jshorts(void* from,
       
    378         #                                                 void* to,
       
    379         #                                                 size_t count)
                #
                # Not implemented in this file: the body is a single hlt instruction
                # with immediate 1007, so nothing is copied.
                # NOTE(review): the immediate presumably identifies which stub was
                # reached - confirm.
       
    380 _Copy_arrayof_conjoint_jshorts:
       
    381         hlt 1007
       
   382 
       
   383 
       
    384         # Support for void Copy::conjoint_jlongs_atomic(jlong* from,
       
    385         #                                               jlong* to,
       
    386         #                                               size_t count)
                #
                # Both labels below name the same address. Not implemented in this
                # file: the shared body is a single hlt instruction with immediate
                # 1009, so nothing is copied.
                # NOTE(review): the immediate presumably identifies which stub was
                # reached - confirm.
       
    387 _Copy_conjoint_jlongs_atomic:
       
    388 _Copy_arrayof_conjoint_jlongs:
       
    389         hlt 1009
       
   390 
       
   391 
       
    392         # Support for void Copy::conjoint_jints_atomic(void* from,
       
    393         #                                              void* to,
       
    394         #                                              size_t count)
                #
                # Both labels below name the same entry point.
                # x0 = from, x1 = to, x2 = count. The code uses x2 as a byte count (it
                # copies [x0, x0 + x2)).
                # NOTE(review): the bit-2 tests assume x2 is a multiple of 4 - confirm
                # against callers.
                # Data is moved with 32-bit ldr/str and 32-bit register pairs (ldp/stp
                # on w registers).
                # Direction: backward when (from < to) and (to - from < count),
                # forward otherwise.
                # Scratch: x3-x18 and the condition flags; x0, x1 and x2 are modified.
       
    395 _Copy_conjoint_jints_atomic:
       
    396 _Copy_arrayof_conjoint_jints:
       
    397         # These and further memory prefetches may hit out of array ranges.
       
    398         # Experiments showed that prefetching of inaccessible memory doesn't result in exceptions.
       
    399         prfm    pldl1keep,  [x0, #0]
       
    400         prfm    pstl1keep,  [x1, #0]
       
    401         prfm    pldl1keep,  [x0, #32]
       
    402         prfm    pstl1keep,  [x1, #32]
       
    403 
       
    404         subs    x3, x1, x0
       
    405         # hi condition is met <=> from < to
       
    406         ccmp    x2, x3, #0, hi
       
    407         # hi condition is met <=> (from < to) and (to - from < count)
       
    408         b.hi    ci_backward
       
    409 
       
                # x18 = x2 - 64; 64 bytes or more go through the loop at
                # ci_forward_large, anything smaller through the jump table below.
    410         subs    x18, x2,  #64
       
    411         b.ge    ci_forward_large
       
    412 
       
    413 ci_forward_lt_64:
       
    414         # Copy [x0, x0 + x2) to [x1, x1 + x2)
       
    415         
       
    416         adr     x15,  ci_forward_tail_table_base
       
    417         and     x16,  x2,  #~4
       
    418 
       
    419         # Calculate address to jump and store it to x15:
       
    420         #   Each pair of instructions before ci_forward_tail_table_base copies 8 bytes.
       
    421         #   x16 is count of bytes to copy aligned down by 8.
       
    422         #   So x16/8 pairs of instructions should be executed. 
       
    423         #   Each pair takes 8 bytes, so x15 = ci_forward_tail_table_base - (x16/8)*8 = x15 - x16
       
    424         sub     x15,  x15, x16
       
                # Instruction prefetch of the table entry that "br x15" will jump to.
    425         prfm    plil1keep, [x15]
       
    426     
       
                # x17 = end of the source range, x18 = end of the destination range;
                # the table entries address the data at negative offsets from these.
    427         add     x17,  x0,  x2
       
    428         add     x18,  x1,  x2
       
    429 
       
    430         # If x2 = x16 + 4, then copy 4 bytes and x16 bytes after that.
       
    431         # Otherwise x2 = x16, so proceed to copy x16 bytes.
       
    432         tbz     x2, #2, ci_forward_lt_64_even
       
    433         ldr     w3, [x0]
       
    434         str     w3, [x1]
       
    435 ci_forward_lt_64_even:
       
    436         # Copy [x17 - x16, x17) to [x18 - x16, x18)
       
    437         # x16 is aligned by 8 and less than 64
       
    438 
       
    439         # Execute (x16/8) ldp-stp pairs; each pair copies 8 bytes
       
    440         br      x15
       
    441 
       
                # Jump table: execution enters at one of the ldp instructions below
                # (or directly at ci_forward_tail_table_base when x16 is 0) and falls
                # through to the ret. w15 and w16 are reused as data registers by the
                # last pair, which is safe because the branch has already been taken.
    442         ldp     w3,  w4,  [x17, #-56]
       
    443         stp     w3,  w4,  [x18, #-56]
       
    444         ldp     w5,  w6,  [x17, #-48]
       
    445         stp     w5,  w6,  [x18, #-48]
       
    446         ldp     w7,  w8,  [x17, #-40]
       
    447         stp     w7,  w8,  [x18, #-40]
       
    448         ldp     w9,  w10, [x17, #-32]
       
    449         stp     w9,  w10, [x18, #-32]
       
    450         ldp     w11, w12, [x17, #-24]
       
    451         stp     w11, w12, [x18, #-24]
       
    452         ldp     w13, w14, [x17, #-16]
       
    453         stp     w13, w14, [x18, #-16]
       
    454         ldp     w15, w16, [x17, #-8]
       
    455         stp     w15, w16, [x18, #-8]
       
    456 ci_forward_tail_table_base:
       
    457         ret
       
    458 
       
                # Padding only: these nops sit between the ret above and the
                # ci_forward_large label, so no execution path runs through them.
                # 12 nops plus the 4 ldp instructions at ci_forward_large make
                # 16 instructions (64 bytes).
    459 .p2align  6
       
    460 .rept   12
       
    461         nop
       
    462 .endr
       
    463 ci_forward_large:
       
    464         # x18 >= 0;
       
    465         # Copy [x0, x0 + x18 + 64) to [x1, x1 + x18 + 64)
       
    466 
       
    467         ldp     w3,  w4,  [x0], #32
       
    468         ldp     w5,  w6,  [x0, #-24]
       
    469         ldp     w7,  w8,  [x0, #-16]
       
    470         ldp     w9,  w10, [x0, #-8]
       
    471 
       
    472         # Before and after each iteration of loop registers w3-w10 contain [x0 - 32, x0),
       
    473         # and x1 is a place to copy this data;
       
    474         # x18 contains number of bytes to be stored minus 64
       
    475 
       
    476         # Exactly 16 instructions from p2align, so ci_forward_loop starts from cache line boundary
       
    477         # Checking it explicitly by aligning with "hlt 1000" instructions
                # (if the count above were wrong, the gap would be filled with the
                # word 0xd4407d00 instead of being empty).
       
    478 .p2alignl  6, 0xd4407d00
       
    479 ci_forward_loop:
       
    480         prfm    pldl1keep,  [x0, #32]
       
    481         prfm    pstl1keep,  [x1, #32]
       
    482 
       
                # The flags set here are consumed by the b.ge at the bottom of the
                # loop; the stp/ldp/add instructions in between do not change them.
    483         subs    x18, x18, #32
       
    484 
       
    485         stp     w3,  w4,  [x1, #0]
       
    486         ldp     w3,  w4,  [x0, #0]
       
    487         stp     w5,  w6,  [x1, #8]
       
    488         ldp     w5,  w6,  [x0, #8]
       
    489         stp     w7,  w8,  [x1, #16]
       
    490         ldp     w7,  w8,  [x0, #16]
       
    491         stp     w9,  w10, [x1, #24]
       
    492         ldp     w9,  w10, [x0, #24]
       
    493         
       
    494         add     x1,  x1,  #32
       
    495         add     x0,  x0,  #32
       
    496 
       
    497         b.ge    ci_forward_loop
       
    498 
       
    499         # 14 instructions from ci_forward_loop, so the loop body fits into one cache line
       
    500 
       
    501 ci_forward_loop_end:
                # The flags set by this adds are consumed by the b.ne below; the four
                # stp instructions in between do not change them.
       
    502         adds    x2,  x18, #32
       
    503 
       
    504         stp     w3,  w4,  [x1], #32
       
    505         stp     w5,  w6,  [x1, #-24]
       
    506         stp     w7,  w8,  [x1, #-16]
       
    507         stp     w9,  w10, [x1, #-8]
       
    508 
       
    509         # Increased x18 by 32, but stored 32 bytes, so x2 contains exact number of bytes to be stored
       
    510 
       
    511         # If this number is not zero, also copy remaining bytes
       
    512         b.ne    ci_forward_lt_64
       
    513         ret
       
    514 
       
    515 ci_backward:
       
    516 
       
    517         # The overlapping case should be rare, so it is not worth optimizing
       
    518 
       
    519         ands    x3,  x2,  #~4
       
    520         # x3 is count aligned down by 2*jintSize
       
    521         add     x0,  x0,  x2
       
    522         add     x1,  x1,  x2
       
    523         sub     x3,  x3,  #8
       
    524         # Skip loop if 0 or 1 jints
                # (the Z flag tested here still comes from the ands above: the add and
                # sub instructions in between are the non-flag-setting forms)
       
    525         b.eq    ci_backward_loop_end
       
    526 
       
    527         # x3 >= 0
       
    528         # Copy [x0 - x3 - 8, x0) to [x1 - x3 - 8, x1) backward
       
    529 ci_backward_loop:
       
    530         subs    x3,  x3,  #8
       
    531         ldp     w4,  w5,  [x0, #-8]!
       
    532         stp     w4,  w5,  [x1, #-8]!
       
    533         b.ge    ci_backward_loop
       
    534 
       
    535 ci_backward_loop_end:
       
    536         # Copy remaining 0 or 1 jints
                # (x0 and x1 now point just past that jint, hence the -4 offsets)
       
    537         tbz     x2,  #2,  ci_backward_finish
       
    538         ldr     w3, [x0, #-4]
       
    539         str     w3, [x1, #-4]
       
    540 
       
    541 ci_backward_finish:
       
    542         ret