42664
|
1 |
#
|
|
2 |
# Copyright (c) 2008, 2013, Oracle and/or its affiliates. All rights reserved.
|
|
3 |
# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
|
4 |
#
|
|
5 |
# This code is free software; you can redistribute it and/or modify it
|
|
6 |
# under the terms of the GNU General Public License version 2 only, as
|
|
7 |
# published by the Free Software Foundation.
|
|
8 |
#
|
|
9 |
# This code is distributed in the hope that it will be useful, but WITHOUT
|
|
10 |
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
|
11 |
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
12 |
# version 2 for more details (a copy is included in the LICENSE file that
|
|
13 |
# accompanied this code).
|
|
14 |
#
|
|
15 |
# You should have received a copy of the GNU General Public License version
|
|
16 |
# 2 along with this work; if not, write to the Free Software Foundation,
|
|
17 |
# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
18 |
#
|
|
19 |
# Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
|
20 |
# or visit www.oracle.com if you need additional information or have any
|
|
21 |
# questions.
|
|
22 |
#
|
|
23 |
|
|
24 |
# TODO-AARCH64
|
|
25 |
|
|
26 |
# NOTE WELL!  The _Copy functions are called directly
# from server-compiler-generated code via CallLeafNoFP,
# which means that they *must* either not use floating
# point or use it in the same manner as does the server
# compiler.
|
|
31 |
|
|
32 |
# Exported copy entry points. Each symbol is made global and typed as a
# function; the definitions follow further down in this file.

        # Byte copies
        .globl  _Copy_conjoint_bytes
        .type   _Copy_conjoint_bytes, %function
        .globl  _Copy_arrayof_conjoint_bytes
        .type   _Copy_arrayof_conjoint_bytes, %function

        # Word copies
        .globl  _Copy_disjoint_words
        .type   _Copy_disjoint_words, %function
        .globl  _Copy_conjoint_words
        .type   _Copy_conjoint_words, %function

        # jshort copies
        .globl  _Copy_conjoint_jshorts_atomic
        .type   _Copy_conjoint_jshorts_atomic, %function
        .globl  _Copy_arrayof_conjoint_jshorts
        .type   _Copy_arrayof_conjoint_jshorts, %function

        # jint copies
        .globl  _Copy_conjoint_jints_atomic
        .type   _Copy_conjoint_jints_atomic, %function
        .globl  _Copy_arrayof_conjoint_jints
        .type   _Copy_arrayof_conjoint_jints, %function

        # jlong copies
        .globl  _Copy_conjoint_jlongs_atomic
        .type   _Copy_conjoint_jlongs_atomic, %function
        .globl  _Copy_arrayof_conjoint_jlongs
        .type   _Copy_arrayof_conjoint_jlongs, %function

        # Everything below is code.
        .text
|
|
54 |
# SpinPause: issue a YIELD hint and return immediately.
# No general-purpose register is written and no memory is accessed.
        .globl  SpinPause
        .type   SpinPause, %function
SpinPause:
        # Hint instruction only; it has no architectural effect on registers.
        yield
        ret
|
|
59 |
|
|
60 |
# Support for void Copy::conjoint_bytes(void* from,
#                                       void* to,
#                                       size_t count)
#
# Placeholder: the body consists solely of a HLT with the distinct
# immediate 1002, so reaching this entry point halts instead of copying.
_Copy_conjoint_bytes:
        hlt     #1002
|
|
65 |
|
|
66 |
# Support for void Copy::arrayof_conjoint_bytes(void* from,
#                                               void* to,
#                                               size_t count)
#
# Placeholder: the body consists solely of a HLT with the distinct
# immediate 1003, so reaching this entry point halts instead of copying.
_Copy_arrayof_conjoint_bytes:
        hlt     #1003
|
|
71 |
|
|
72 |
|
|
73 |
# Support for void Copy::disjoint_words(void* from,
#                                       void* to,
#                                       size_t count)
#
# Registers, as consumed by the code below:
#   x0 - source address
#   x1 - destination address
#   x2 - length in bytes; handled as a whole number of 8-byte words
#        (bits 0-2 are never examined)
# Scratch: x3-x18 and the condition flags. Nothing is saved or restored.
#
# Data always moves from low addresses towards high addresses.
_Copy_disjoint_words:
        # Prefetch offsets 0 and 64 of both ranges. These and the later
        # prefetches may fall outside the arrays; experiments showed that
        # prefetching inaccessible memory does not raise an exception.
        prfm    pldl1keep, [x0]
        prfm    pstl1keep, [x1]
        prfm    pldl1keep, [x0, #64]
        prfm    pstl1keep, [x1, #64]

        # x18 = length - 128. Lengths of 128 bytes and more use the bulk loop.
        subs    x18, x2, #128
        b.ge    .Ldw_bulk

.Ldw_tail:
        # Fewer than 128 bytes: copy [x0, x0 + x2) to [x1, x1 + x2).
        #
        # Every ldp/stp pair in front of .Ldw_tail_end moves 16 bytes and
        # occupies 8 bytes of code. x16 is x2 with bit 3 cleared, i.e. the
        # length rounded down to a multiple of sixteen (for a word-multiple
        # x2), so x16/16 pairs have to run and the entry address is
        #   .Ldw_tail_end - (x16/16)*8 = .Ldw_tail_end - x16/2
        adr     x15, .Ldw_tail_end
        and     x16, x2, #~8
        sub     x15, x15, x16, lsr #1
        prfm    plil1keep, [x15]

        # The table addresses its data relative to the ends of both ranges.
        add     x17, x0, x2
        add     x18, x1, x2

        # If x2 = x16 + 8 there is one leading word the pairs do not cover:
        # copy it first. Otherwise x2 = x16 and only the pairs are needed.
        tbz     x2, #3, .Ldw_tail_pairs
        ldr     x3, [x0]
        str     x3, [x1]
.Ldw_tail_pairs:
        # Copy [x17 - x16, x17) to [x18 - x16, x18); x16 is a multiple of
        # sixteen and below 128. Jumping into the table executes exactly
        # x16/16 pairs.
        br      x15

        ldp     x3, x4, [x17, #-112]
        stp     x3, x4, [x18, #-112]
        ldp     x5, x6, [x17, #-96]
        stp     x5, x6, [x18, #-96]
        ldp     x7, x8, [x17, #-80]
        stp     x7, x8, [x18, #-80]
        ldp     x9, x10, [x17, #-64]
        stp     x9, x10, [x18, #-64]
        ldp     x11, x12, [x17, #-48]
        stp     x11, x12, [x18, #-48]
        ldp     x13, x14, [x17, #-32]
        stp     x13, x14, [x18, #-32]
        # The jump target and the pair count are no longer needed, so x15
        # and x16 double as data registers for the final pair.
        ldp     x15, x16, [x17, #-16]
        stp     x15, x16, [x18, #-16]
.Ldw_tail_end:
        ret

        # Twelve nops plus the four loads at .Ldw_bulk are exactly sixteen
        # instructions after this 64-byte alignment point, which places
        # .Ldw_loop on a cache line boundary.
        .p2align 6
        .rept   12
        nop
        .endr
.Ldw_bulk:
        # Here x18 >= 0.
        # Copy [x0, x0 + x18 + 128) to [x1, x1 + x18 + 128).
        # Start by loading the first 64 bytes and stepping x0 past them.
        ldp     x3, x4, [x0], #64
        ldp     x5, x6, [x0, #-48]
        ldp     x7, x8, [x0, #-32]
        ldp     x9, x10, [x0, #-16]

        # Loop invariant (before and after every iteration):
        #   x3-x10 hold the bytes of [x0 - 64, x0),
        #   x1 is where that data has to be stored,
        #   x18 is the number of bytes still to be stored minus 128.
        #
        # The alignment below is expected to emit nothing. It is an explicit
        # check: any padding would consist of "hlt 1000" instructions.
        .p2alignl 6, 0xd4407d00
.Ldw_loop:
        prfm    pldl1keep, [x0, #64]
        # The matching store prefetch (prfm pstl1keep, [x1, #64]) is left out
        # on purpose: it hurt memory copy performance (for the interpreter),
        # see JDK-8078120.

        subs    x18, x18, #64

        # Store the buffered 64 bytes while refilling each register pair with
        # the next 64 bytes. Loads and stores do not touch the flags, so the
        # result of the subs above is still valid at the b.ge.
        stp     x3, x4, [x1]
        ldp     x3, x4, [x0]
        stp     x5, x6, [x1, #16]
        ldp     x5, x6, [x0, #16]
        stp     x7, x8, [x1, #32]
        ldp     x7, x8, [x0, #32]
        stp     x9, x10, [x1, #48]
        ldp     x9, x10, [x0, #48]

        add     x1, x1, #64
        add     x0, x0, #64

        b.ge    .Ldw_loop
        # Thirteen instructions since .Ldw_loop: the loop body fits into one
        # cache line.

        # x18 went up by 64 and 64 buffered bytes get stored below, so x2 is
        # exactly the number of bytes that remain afterwards (0 <= x2 < 64).
        adds    x2, x18, #64

        stp     x3, x4, [x1], #64
        stp     x5, x6, [x1, #-48]
        stp     x7, x8, [x1, #-32]
        stp     x9, x10, [x1, #-16]

        # Flags still come from the adds: a non-zero remainder is finished by
        # the tail code, using the advanced x0 and x1.
        b.ne    .Ldw_tail
        ret
|
|
189 |
|
|
190 |
|
|
191 |
# Support for void Copy::conjoint_words(void* from,
#                                       void* to,
#                                       size_t count)
#
# Registers, as consumed by the code below:
#   x0 - source address
#   x1 - destination address
#   x2 - length in bytes; handled as a whole number of 8-byte words
# Scratch on the backward path: x3-x5 and the condition flags.
_Copy_conjoint_words:
        # x3 = to - from. After the subs, "hi" holds iff from < to.
        subs    x3, x1, x0
        # When from < to, compare the length with (to - from); otherwise load
        # NZCV with 0, which makes "hi" false. Afterwards "hi" holds iff
        # (from < to) and (to - from < count).
        ccmp    x2, x3, #0, hi
        # In every other case _Copy_disjoint_words can do the job: it copies
        # forward, so it also works for overlapping ranges with to <= from.
        b.ls    _Copy_disjoint_words

        # The overlapping case should be rare, so it is not worth optimizing.

        # x3 = length rounded down to a multiple of 2*wordSize. The Z flag of
        # this ands is what the b.eq below tests; the add and sub instructions
        # in between leave the flags alone.
        ands    x3, x2, #~8
        # Move both pointers to the ends of their ranges.
        add     x0, x0, x2
        add     x1, x1, x2
        sub     x3, x3, #16
        # Skip the loop when there are only 0 or 1 words.
        b.eq    .Lcw_last_word

        # Here x3 >= 0.
        # Copy [x0 - x3 - 16, x0) to [x1 - x3 - 16, x1), highest pair first.
.Lcw_pair_loop:
        subs    x3, x3, #16
        ldp     x4, x5, [x0, #-16]!
        stp     x4, x5, [x1, #-16]!
        b.ge    .Lcw_pair_loop

.Lcw_last_word:
        # Bit 3 of the length tells whether a single word is left; it sits
        # directly below the current x0 / x1.
        tbz     x2, #3, .Lcw_done
        ldr     x3, [x0, #-8]
        str     x3, [x1, #-8]

.Lcw_done:
        ret
|
|
229 |
|
|
230 |
|
|
231 |
# Support for void Copy::conjoint_jshorts_atomic(void* from,
#                                                void* to,
#                                                size_t count)
#
# Registers, as consumed by the code below:
#   x0 - source address
#   x1 - destination address
#   x2 - length in bytes; handled as a whole number of 2-byte elements
# Scratch: x3-x10, x17, x18 and the condition flags.
#
# Every element is moved with one halfword load (ldrh) and one halfword
# store (strh). The main loops move 14 bytes (seven halfwords) per pass.
_Copy_conjoint_jshorts_atomic:
        # x17 / x18 = end of the source / destination range.
        add     x17, x0, x2
        add     x18, x1, x2

        # x3 = to - from. After the ccmp, "hi" holds iff
        # (from < to) and (to - from < count); that case is copied backward.
        subs    x3, x1, x0
        ccmp    x2, x3, #0, hi
        b.hi    .Lcs_bwd

        # Forward copy. x3 = length - 14; short lengths skip the loop.
        subs    x3, x2, #14
        b.ge    .Lcs_fwd_loop

.Lcs_fwd_tail:
        # Copy x2 < 14 bytes from x0 to x1.
        # x7 = x2 mod 8. The Z flag of this ands is consumed by the b.eq at
        # .Lcs_fwd_rest; tbz, ldrh and strh leave the flags untouched.
        ands    x7, x2, #7
        tbz     x2, #3, .Lcs_fwd_rest
        # Bit 3 set: move the leading 8 bytes, all loads before all stores.
        ldrh    w3, [x0]
        ldrh    w4, [x0, #2]
        ldrh    w5, [x0, #4]
        ldrh    w6, [x0, #6]

        strh    w3, [x1]
        strh    w4, [x1, #2]
        strh    w5, [x1, #4]
        strh    w6, [x1, #6]

.Lcs_fwd_rest:
        # Copy the x7 < 8 bytes of [x17 - x7, x17) to [x18 - x7, x18).
        b.eq    .Lcs_fwd_done
        cmp     x7, #4
        b.lt    .Lcs_fwd_last_one
        b.eq    .Lcs_fwd_last_two

        # Six bytes left: fall through all three element copies.
        ldrh    w3, [x17, #-6]
        strh    w3, [x18, #-6]
.Lcs_fwd_last_two:
        ldrh    w4, [x17, #-4]
        strh    w4, [x18, #-4]
.Lcs_fwd_last_one:
        ldrh    w5, [x17, #-2]
        strh    w5, [x18, #-2]
.Lcs_fwd_done:
        ret


        # Here x3 >= 0.
        # Copy [x0, x0 + x3 + 14) to [x1, x1 + x3 + 14).
        .p2align 6
.Lcs_fwd_loop:
        subs    x3, x3, #14

        # Load seven halfwords, stepping x0 past them with the first load.
        ldrh    w4, [x0], #14
        ldrh    w5, [x0, #-12]
        ldrh    w6, [x0, #-10]
        ldrh    w7, [x0, #-8]
        ldrh    w8, [x0, #-6]
        ldrh    w9, [x0, #-4]
        ldrh    w10, [x0, #-2]

        # Store them the same way through x1.
        strh    w4, [x1], #14
        strh    w5, [x1, #-12]
        strh    w6, [x1, #-10]
        strh    w7, [x1, #-8]
        strh    w8, [x1, #-6]
        strh    w9, [x1, #-4]
        strh    w10, [x1, #-2]

        # Flags are still those of the subs at the top of the loop.
        b.ge    .Lcs_fwd_loop
        # Exactly sixteen instructions since .Lcs_fwd_loop, so the loop fits
        # into one cache line.

        # x2 = bytes that still have to be copied from x0 to x1 (below 14).
        adds    x2, x3, #14
        b.ne    .Lcs_fwd_tail
        ret

        # Backward copy; the structure mirrors the forward copy.
.Lcs_bwd:
        subs    x3, x2, #14
        b.ge    .Lcs_bwd_loop

.Lcs_bwd_tail:
        # Copy the x2 < 14 bytes of [x0, x17), highest addresses first.
        # As above, the Z flag of the ands survives until .Lcs_bwd_rest.
        ands    x7, x2, #7
        tbz     x2, #3, .Lcs_bwd_rest

        # Bit 3 set: move the trailing 8 bytes, all loads before all stores.
        ldrh    w3, [x17, #-8]
        ldrh    w4, [x17, #-6]
        ldrh    w5, [x17, #-4]
        ldrh    w6, [x17, #-2]

        strh    w3, [x18, #-8]
        strh    w4, [x18, #-6]
        strh    w5, [x18, #-4]
        strh    w6, [x18, #-2]

.Lcs_bwd_rest:
        # Copy the leading x7 < 8 bytes at x0 to x1, highest element first.
        b.eq    .Lcs_bwd_done
        cmp     x7, #4
        b.lt    .Lcs_bwd_first_one
        b.eq    .Lcs_bwd_first_two

        # Six bytes left: fall through all three element copies.
        ldrh    w3, [x0, #4]
        strh    w3, [x1, #4]

.Lcs_bwd_first_two:
        ldrh    w4, [x0, #2]
        strh    w4, [x1, #2]

.Lcs_bwd_first_one:
        ldrh    w5, [x0]
        strh    w5, [x1]

.Lcs_bwd_done:
        ret


        .p2align 6
.Lcs_bwd_loop:
        subs    x3, x3, #14

        # Step x17 down by 14 with the first load, then read the remaining six
        # halfwords above it. All seven loads complete before the first store.
        ldrh    w4, [x17, #-14]!
        ldrh    w5, [x17, #2]
        ldrh    w6, [x17, #4]
        ldrh    w7, [x17, #6]
        ldrh    w8, [x17, #8]
        ldrh    w9, [x17, #10]
        ldrh    w10, [x17, #12]

        strh    w4, [x18, #-14]!
        strh    w5, [x18, #2]
        strh    w6, [x18, #4]
        strh    w7, [x18, #6]
        strh    w8, [x18, #8]
        strh    w9, [x18, #10]
        strh    w10, [x18, #12]

        # Flags are still those of the subs at the top of the loop.
        b.ge    .Lcs_bwd_loop
        # x2 = bytes still to be copied; they end at the lowered x17 / x18.
        adds    x2, x3, #14
        b.ne    .Lcs_bwd_tail
        ret
|
|
375 |
|
|
376 |
|
|
377 |
# Support for void Copy::arrayof_conjoint_jshorts(void* from,
#                                                 void* to,
#                                                 size_t count)
#
# Placeholder: the body consists solely of a HLT with the distinct
# immediate 1007, so reaching this entry point halts instead of copying.
_Copy_arrayof_conjoint_jshorts:
        hlt     #1007
|
|
382 |
|
|
383 |
|
|
384 |
# Support for void Copy::conjoint_jlongs_atomic(jlong* from,
#                                               jlong* to,
#                                               size_t count)
#
# Both jlong entry points share this address. Placeholder: the body consists
# solely of a HLT with the distinct immediate 1009, so reaching either entry
# point halts instead of copying.
_Copy_conjoint_jlongs_atomic:
_Copy_arrayof_conjoint_jlongs:
        hlt     #1009
|
|
390 |
|
|
391 |
|
|
392 |
# Support for void Copy::conjoint_jints_atomic(void* from,
#                                              void* to,
#                                              size_t count)
#
# _Copy_arrayof_conjoint_jints is a second name for the same code.
#
# Registers, as consumed by the code below:
#   x0 - source address
#   x1 - destination address
#   x2 - length in bytes; handled as a whole number of 4-byte elements
#        (bits 0-1 are never examined)
# Scratch: x3-x18 and the condition flags. Nothing is saved or restored.
#
# All data is moved through 32-bit registers (ldr/str and ldp/stp on w
# registers).
_Copy_conjoint_jints_atomic:
_Copy_arrayof_conjoint_jints:
        # Prefetch offsets 0 and 32 of both ranges. These and the later
        # prefetches may fall outside the arrays; experiments showed that
        # prefetching inaccessible memory does not raise an exception.
        prfm    pldl1keep, [x0]
        prfm    pstl1keep, [x1]
        prfm    pldl1keep, [x0, #32]
        prfm    pstl1keep, [x1, #32]

        # x3 = to - from. After the subs, "hi" holds iff from < to; after the
        # ccmp it holds iff (from < to) and (to - from < count). That case is
        # copied backward.
        subs    x3, x1, x0
        ccmp    x2, x3, #0, hi
        b.hi    .Lci_bwd

        # x18 = length - 64. Lengths of 64 bytes and more use the bulk loop.
        subs    x18, x2, #64
        b.ge    .Lci_fwd_bulk

.Lci_fwd_tail:
        # Fewer than 64 bytes: copy [x0, x0 + x2) to [x1, x1 + x2).
        #
        # Every ldp/stp pair in front of .Lci_fwd_tail_end moves 8 bytes and
        # occupies 8 bytes of code. x16 is x2 with bit 2 cleared, i.e. the
        # length rounded down to a multiple of eight (for a jint-multiple
        # x2), so x16/8 pairs have to run and the entry address is
        #   .Lci_fwd_tail_end - (x16/8)*8 = .Lci_fwd_tail_end - x16
        adr     x15, .Lci_fwd_tail_end
        and     x16, x2, #~4
        sub     x15, x15, x16
        prfm    plil1keep, [x15]

        # The table addresses its data relative to the ends of both ranges.
        add     x17, x0, x2
        add     x18, x1, x2

        # If x2 = x16 + 4 there is one leading jint the pairs do not cover:
        # copy it first. Otherwise x2 = x16 and only the pairs are needed.
        tbz     x2, #2, .Lci_fwd_tail_pairs
        ldr     w3, [x0]
        str     w3, [x1]
.Lci_fwd_tail_pairs:
        # Copy [x17 - x16, x17) to [x18 - x16, x18); x16 is a multiple of
        # eight and below 64. Jumping into the table executes exactly x16/8
        # pairs.
        br      x15

        ldp     w3, w4, [x17, #-56]
        stp     w3, w4, [x18, #-56]
        ldp     w5, w6, [x17, #-48]
        stp     w5, w6, [x18, #-48]
        ldp     w7, w8, [x17, #-40]
        stp     w7, w8, [x18, #-40]
        ldp     w9, w10, [x17, #-32]
        stp     w9, w10, [x18, #-32]
        ldp     w11, w12, [x17, #-24]
        stp     w11, w12, [x18, #-24]
        ldp     w13, w14, [x17, #-16]
        stp     w13, w14, [x18, #-16]
        # The jump target and the pair count are no longer needed, so w15
        # and w16 double as data registers for the final pair.
        ldp     w15, w16, [x17, #-8]
        stp     w15, w16, [x18, #-8]
.Lci_fwd_tail_end:
        ret

        # Twelve nops plus the four loads at .Lci_fwd_bulk are exactly sixteen
        # instructions after this 64-byte alignment point, which places
        # .Lci_fwd_loop on a cache line boundary.
        .p2align 6
        .rept   12
        nop
        .endr
.Lci_fwd_bulk:
        # Here x18 >= 0.
        # Copy [x0, x0 + x18 + 64) to [x1, x1 + x18 + 64).
        # Start by loading the first 32 bytes and stepping x0 past them.
        ldp     w3, w4, [x0], #32
        ldp     w5, w6, [x0, #-24]
        ldp     w7, w8, [x0, #-16]
        ldp     w9, w10, [x0, #-8]

        # Loop invariant (before and after every iteration):
        #   w3-w10 hold the bytes of [x0 - 32, x0),
        #   x1 is where that data has to be stored,
        #   x18 is the number of bytes still to be stored minus 64.
        #
        # The alignment below is expected to emit nothing. It is an explicit
        # check: any padding would consist of "hlt 1000" instructions.
        .p2alignl 6, 0xd4407d00
.Lci_fwd_loop:
        prfm    pldl1keep, [x0, #32]
        prfm    pstl1keep, [x1, #32]

        subs    x18, x18, #32

        # Store the buffered 32 bytes while refilling each register pair with
        # the next 32 bytes. Loads and stores do not touch the flags, so the
        # result of the subs above is still valid at the b.ge.
        stp     w3, w4, [x1]
        ldp     w3, w4, [x0]
        stp     w5, w6, [x1, #8]
        ldp     w5, w6, [x0, #8]
        stp     w7, w8, [x1, #16]
        ldp     w7, w8, [x0, #16]
        stp     w9, w10, [x1, #24]
        ldp     w9, w10, [x0, #24]

        add     x1, x1, #32
        add     x0, x0, #32

        b.ge    .Lci_fwd_loop
        # Fourteen instructions since .Lci_fwd_loop: the loop body fits into
        # one cache line.

        # x18 went up by 32 and 32 buffered bytes get stored below, so x2 is
        # exactly the number of bytes that remain afterwards (0 <= x2 < 32).
        adds    x2, x18, #32

        stp     w3, w4, [x1], #32
        stp     w5, w6, [x1, #-24]
        stp     w7, w8, [x1, #-16]
        stp     w9, w10, [x1, #-8]

        # Flags still come from the adds: a non-zero remainder is finished by
        # the tail code, using the advanced x0 and x1.
        b.ne    .Lci_fwd_tail
        ret

.Lci_bwd:
        # The overlapping case should be rare, so it is not worth optimizing.

        # x3 = length rounded down to a multiple of 2*jintSize. The Z flag of
        # this ands is what the b.eq below tests; the add and sub instructions
        # in between leave the flags alone.
        ands    x3, x2, #~4
        # Move both pointers to the ends of their ranges.
        add     x0, x0, x2
        add     x1, x1, x2
        sub     x3, x3, #8
        # Skip the loop when there are only 0 or 1 jints.
        b.eq    .Lci_bwd_last_jint

        # Here x3 >= 0.
        # Copy [x0 - x3 - 8, x0) to [x1 - x3 - 8, x1), highest pair first.
.Lci_bwd_loop:
        subs    x3, x3, #8
        ldp     w4, w5, [x0, #-8]!
        stp     w4, w5, [x1, #-8]!
        b.ge    .Lci_bwd_loop

.Lci_bwd_last_jint:
        # Bit 2 of the length tells whether a single jint is left; it sits
        # directly below the current x0 / x1.
        tbz     x2, #2, .Lci_bwd_done
        ldr     w3, [x0, #-4]
        str     w3, [x1, #-4]

.Lci_bwd_done:
        ret
|