|
1 # |
|
2 # Copyright (c) 2008, 2013, Oracle and/or its affiliates. All rights reserved. |
|
3 # DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
|
4 # |
|
5 # This code is free software; you can redistribute it and/or modify it |
|
6 # under the terms of the GNU General Public License version 2 only, as |
|
7 # published by the Free Software Foundation. |
|
8 # |
|
9 # This code is distributed in the hope that it will be useful, but WITHOUT |
|
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
|
11 # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
|
12 # version 2 for more details (a copy is included in the LICENSE file that |
|
13 # accompanied this code). |
|
14 # |
|
15 # You should have received a copy of the GNU General Public License version |
|
16 # 2 along with this work; if not, write to the Free Software Foundation, |
|
17 # Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
|
18 # |
|
19 # Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
|
20 # or visit www.oracle.com if you need additional information or have any |
|
21 # questions. |
|
22 # |
|
23 |
|
24 # TODO-AARCH64 |
|
25 |
|
26 # NOTE WELL! The _Copy functions are called directly |
|
27 # from server-compiler-generated code via CallLeafNoFP, |
|
28 # which means that they *must* either not use floating |
|
29 # point or use it in the same manner as does the server |
|
30 # compiler. |
|
31 |
|
# Copy entry points provided by this file. Every name is exported with
# .globl and marked as a function symbol with .type.
        .globl  _Copy_conjoint_bytes
        .globl  _Copy_arrayof_conjoint_bytes
        .globl  _Copy_disjoint_words
        .globl  _Copy_conjoint_words
        .globl  _Copy_conjoint_jshorts_atomic
        .globl  _Copy_arrayof_conjoint_jshorts
        .globl  _Copy_conjoint_jints_atomic
        .globl  _Copy_arrayof_conjoint_jints
        .globl  _Copy_conjoint_jlongs_atomic
        .globl  _Copy_arrayof_conjoint_jlongs

        .type   _Copy_conjoint_bytes, %function
        .type   _Copy_arrayof_conjoint_bytes, %function
        .type   _Copy_disjoint_words, %function
        .type   _Copy_conjoint_words, %function
        .type   _Copy_conjoint_jshorts_atomic, %function
        .type   _Copy_arrayof_conjoint_jshorts, %function
        .type   _Copy_conjoint_jints_atomic, %function
        .type   _Copy_arrayof_conjoint_jints, %function
        .type   _Copy_conjoint_jlongs_atomic, %function
        .type   _Copy_arrayof_conjoint_jlongs, %function
|
52 |
|
# SpinPause: issue a YIELD hint (the caller is presumably inside a spin-wait
# loop) and return.
#
# w0 is set to 1 before returning, so a caller that reads an integer result
# sees a defined value instead of whatever happened to be left in w0.
# NOTE(review): the C/C++ prototype is not visible in this file; confirm that
# callers expect (or at least tolerate) an int result of 1.
#
# Clobbers: w0. No memory is accessed and the stack is not used.
        .text
        .globl  SpinPause
        .type   SpinPause, %function
SpinPause:
        yield
        mov     w0, #1
        ret
|
59 |
|
# void Copy::conjoint_bytes(void* from, void* to, size_t count)
#
# Placeholder only: there is no copy logic here. Entering this symbol
# executes HLT with immediate 1002 (the immediate identifies this stub);
# no return instruction follows it.
_Copy_conjoint_bytes:
        hlt     #1002
|
65 |
|
# void Copy::arrayof_conjoint_bytes(void* from, void* to, size_t count)
#
# Placeholder only: there is no copy logic here. Entering this symbol
# executes HLT with immediate 1003 (the immediate identifies this stub);
# no return instruction follows it.
_Copy_arrayof_conjoint_bytes:
        hlt     #1003
|
71 |
|
72 |
|
# void Copy::disjoint_words(void* from, void* to, size_t count)
#
# Registers on entry:
#   x0 = from, x1 = to, x2 = length.
#   x2 is added directly to both pointers and compared with 128, i.e. the
#   code handles it as a byte count. Only bit 3 is tested for an odd word,
#   so a multiple of 8 is expected.
# Data is moved from low addresses towards high addresses.
# Modifies x0, x1, x2; uses x3-x18 and the condition flags as scratch.

# One ldp/stp pair of the tail table: moves the 16 bytes that end
# (dist - 16) bytes before the end pointers x17 (source) and x18 (dest).
# Each expansion must stay exactly two instructions (8 bytes) because the
# table is entered through a computed branch.
        .macro  DW_TAIL_PAIR ra, rb, dist
        ldp     \ra, \rb, [x17, #-\dist]
        stp     \ra, \rb, [x18, #-\dist]
        .endm

_Copy_disjoint_words:
        # These and the later prefetches may reach outside the arrays.
        # Experiments showed that prefetching inaccessible memory does not
        # raise exceptions.
        prfm    pldl1keep, [x0]
        prfm    pstl1keep, [x1]
        prfm    pldl1keep, [x0, #64]
        prfm    pstl1keep, [x1, #64]

        # x18 = x2 - 128; at least 128 bytes go through the bulk loop.
        subs    x18, x2, #128
        b.ge    .Ldw_bulk

.Ldw_small:
        # Copy [x0, x0 + x2) to [x1, x1 + x2); x2 is below 128 here.

        adr     x15, .Ldw_tail_end
        and     x16, x2, #~8

        # x16 is x2 rounded down to a multiple of 16, so x16/16 table pairs
        # have to run. A pair occupies 8 bytes of code, hence the entry
        # point is .Ldw_tail_end - (x16/16)*8 = .Ldw_tail_end - x16/2.
        sub     x15, x15, x16, lsr #1
        prfm    plil1keep, [x15]

        # End pointers; the table addresses data relative to them.
        add     x17, x0, x2
        add     x18, x1, x2

        # When bit 3 of x2 is set there is one extra leading word: move it
        # first, the remaining x16 bytes are handled by the table.
        tbz     x2, #3, .Ldw_small_pairs
        ldr     x3, [x0]
        str     x3, [x1]
.Ldw_small_pairs:
        # Copy [x17 - x16, x17) to [x18 - x16, x18); x16 is a multiple of
        # 16 and below 128. Enter the table so that exactly x16/16 pairs run.
        br      x15

        DW_TAIL_PAIR x3,  x4,  112
        DW_TAIL_PAIR x5,  x6,  96
        DW_TAIL_PAIR x7,  x8,  80
        DW_TAIL_PAIR x9,  x10, 64
        DW_TAIL_PAIR x11, x12, 48
        DW_TAIL_PAIR x13, x14, 32
        DW_TAIL_PAIR x15, x16, 16
.Ldw_tail_end:
        ret

        .purgem DW_TAIL_PAIR

        # Padding: together with the four loads below this makes 16
        # instructions after the alignment point, which places the loop
        # head on a 64-byte boundary.
        .p2align 6
        .rept   12
        nop
        .endr
.Ldw_bulk:
        # x18 >= 0.
        # Copy [x0, x0 + x18 + 128) to [x1, x1 + x18 + 128).

        # Preload the first 64 bytes; x0 ends up just past them.
        ldp     x3, x4, [x0], #64
        ldp     x5, x6, [x0, #-48]
        ldp     x7, x8, [x0, #-32]
        ldp     x9, x10, [x0, #-16]

        # Loop invariant (before and after each iteration):
        #   x3-x10 hold the bytes [x0 - 64, x0),
        #   x1 is where those bytes have to be stored,
        #   x18 is the number of bytes still to be stored, minus 128.

        # The loop head is expected to be on a cache line boundary already.
        # Check that explicitly: any padding needed here is filled with
        # "hlt 1000" instructions.
        .p2alignl 6, 0xd4407d00
.Ldw_bulk_loop:
        prfm    pldl1keep, [x0, #64]
        # A store prefetch here actually hurt memory copy performance
        # (for the interpreter) - JDK-8078120
        # prfm  pstl1keep, [x1, #64]

        subs    x18, x18, #64

        # Store the held 64 bytes while loading the next 64. The flags set
        # by subs stay intact until the branch.
        stp     x3, x4, [x1]
        ldp     x3, x4, [x0]
        stp     x5, x6, [x1, #16]
        ldp     x5, x6, [x0, #16]
        stp     x7, x8, [x1, #32]
        ldp     x7, x8, [x0, #32]
        stp     x9, x10, [x1, #48]
        ldp     x9, x10, [x0, #48]

        add     x1, x1, #64
        add     x0, x0, #64

        b.ge    .Ldw_bulk_loop

        # The loop is 13 instructions long, so its body fits in one
        # cache line.

.Ldw_bulk_drain:
        # x18 went up by 64 and 64 bytes are stored below, so x2 becomes
        # the exact number of bytes still to be copied (below 64).
        adds    x2, x18, #64

        stp     x3, x4, [x1], #64
        stp     x5, x6, [x1, #-48]
        stp     x7, x8, [x1, #-32]
        stp     x9, x10, [x1, #-16]

        # Flags still come from the adds: a non-zero remainder is finished
        # by the small-copy path.
        b.ne    .Ldw_small
        ret
|
189 |
|
190 |
|
# void Copy::conjoint_words(void* from, void* to, size_t count)
#
# Registers on entry:
#   x0 = from, x1 = to, x2 = length (added directly to the pointers, i.e.
#   handled as a byte count; only bit 3 is tested for an odd word, so a
#   multiple of 8 is expected).
# Modifies x0, x1; uses x3-x5 and the condition flags as scratch (plus
# whatever _Copy_disjoint_words uses when control is handed over to it).
_Copy_conjoint_words:
        # x3 = to - from; "hi" holds afterwards exactly when from < to.
        subs    x3, x1, x0
        # When "hi" holds, compare x2 with (to - from); otherwise the flags
        # are forced to 0, which makes "ls" true.
        ccmp    x2, x3, #0, hi
        # Now "hi" <=> (from < to) and (to - from < x2). In every other
        # case _Copy_disjoint_words is usable: it copies forward, which is
        # also correct for overlapping ranges with to <= from.
        b.ls    _Copy_disjoint_words

        # The overlapping case should be rare; it is not worth optimizing.

        # x3 = x2 rounded down to a multiple of 2*wordSize; Z is set when
        # that is zero.
        ands    x3, x2, #~8
        # Move both pointers to the end of their ranges.
        add     x0, x0, x2
        add     x1, x1, x2
        # sub leaves the flags from ands untouched.
        sub     x3, x3, #16
        # Nothing for the pair loop to do with 0 or 1 words.
        b.eq    .Lcw_odd_word

        # x3 >= 0.
        # Copy [x0 - x3 - 16, x0) to [x1 - x3 - 16, x1), highest pair first.
.Lcw_pair_loop:
        subs    x3, x3, #16
        ldp     x4, x5, [x0, #-16]!
        stp     x4, x5, [x1, #-16]!
        b.ge    .Lcw_pair_loop

.Lcw_odd_word:
        # One word is left at the low end when bit 3 of x2 is set.
        tbz     x2, #3, .Lcw_done
        ldr     x3, [x0, #-8]
        str     x3, [x1, #-8]

.Lcw_done:
        ret
|
229 |
|
230 |
|
# void Copy::conjoint_jshorts_atomic(void* from, void* to, size_t count)
#
# Registers on entry:
#   x0 = from, x1 = to, x2 = length (added directly to the pointers, i.e.
#   handled as a byte count; the tail code distinguishes remainders of
#   0, 2, 4 and 6 bytes, so an even value is expected).
# Every element is moved with its own ldrh/strh.
# Copies backward when (from < to) and (to - from < x2), forward otherwise.
# Modifies x0, x1, x2; uses x3-x10, x17, x18 and the flags as scratch.
_Copy_conjoint_jshorts_atomic:
        # End pointers of both ranges.
        add     x17, x0, x2
        add     x18, x1, x2

        # After the ccmp: "hi" <=> (from < to) and (to - from < x2).
        subs    x3, x1, x0
        ccmp    x2, x3, #0, hi
        b.hi    .Lcs_bwd

        subs    x3, x2, #14
        b.ge    .Lcs_fwd_loop

        # Forward copy of x2 < 14 bytes from x0 to x1.
.Lcs_fwd_tail:
        # x7 = x2 mod 8; Z is set when that is zero. Neither tbz nor the
        # halfword loads/stores change the flags, so the b.eq at
        # .Lcs_fwd_lt8 still sees the result of this ands.
        ands    x7, x2, #7
        tbz     x2, #3, .Lcs_fwd_lt8
        ldrh    w3, [x0]
        ldrh    w4, [x0, #2]
        ldrh    w5, [x0, #4]
        ldrh    w6, [x0, #6]

        strh    w3, [x1]
        strh    w4, [x1, #2]
        strh    w5, [x1, #4]
        strh    w6, [x1, #6]

        # Copy the last x7 < 8 bytes: [x17 - x7, x17) to [x18 - x7, x18).
.Lcs_fwd_lt8:
        b.eq    .Lcs_fwd_0
        cmp     x7, #4
        b.lt    .Lcs_fwd_2
        b.eq    .Lcs_fwd_4

.Lcs_fwd_6:
        ldrh    w3, [x17, #-6]
        strh    w3, [x18, #-6]
.Lcs_fwd_4:
        ldrh    w4, [x17, #-4]
        strh    w4, [x18, #-4]
.Lcs_fwd_2:
        ldrh    w5, [x17, #-2]
        strh    w5, [x18, #-2]
.Lcs_fwd_0:
        ret


        # Copy [x0, x0 + x3 + 14) to [x1, x1 + x3 + 14), 14 bytes per
        # iteration; x3 >= 0 on entry.
        .p2align 6
.Lcs_fwd_loop:
        subs    x3, x3, #14

        ldrh    w4, [x0], #14
        ldrh    w5, [x0, #-12]
        ldrh    w6, [x0, #-10]
        ldrh    w7, [x0, #-8]
        ldrh    w8, [x0, #-6]
        ldrh    w9, [x0, #-4]
        ldrh    w10, [x0, #-2]

        strh    w4, [x1], #14
        strh    w5, [x1, #-12]
        strh    w6, [x1, #-10]
        strh    w7, [x1, #-8]
        strh    w8, [x1, #-6]
        strh    w9, [x1, #-4]
        strh    w10, [x1, #-2]

        b.ge    .Lcs_fwd_loop
        # Exactly 16 instructions from .Lcs_fwd_loop, so the loop fits
        # into one cache line.

        # x2 = bytes still to be copied from x0 to x1 (below 14).
        adds    x2, x3, #14
        b.ne    .Lcs_fwd_tail
        ret

        # Backward copy: mirrors the forward code, but works down from the
        # end pointers x17/x18 and leaves x0/x1 at the start of the ranges.
.Lcs_bwd:
        subs    x3, x2, #14
        b.ge    .Lcs_bwd_loop

        # Backward copy of the x2 < 14 bytes [x0, x17) to [x1, x18).
.Lcs_bwd_tail:
        # Same flag usage as in .Lcs_fwd_tail.
        ands    x7, x2, #7
        tbz     x2, #3, .Lcs_bwd_lt8

        ldrh    w3, [x17, #-8]
        ldrh    w4, [x17, #-6]
        ldrh    w5, [x17, #-4]
        ldrh    w6, [x17, #-2]

        strh    w3, [x18, #-8]
        strh    w4, [x18, #-6]
        strh    w5, [x18, #-4]
        strh    w6, [x18, #-2]

        # Copy the first x7 < 8 bytes, [x0, x0 + x7) to [x1, x1 + x7),
        # highest halfword first.
.Lcs_bwd_lt8:
        b.eq    .Lcs_bwd_0
        cmp     x7, #4
        b.lt    .Lcs_bwd_2
        b.eq    .Lcs_bwd_4

.Lcs_bwd_6:
        ldrh    w3, [x0, #4]
        strh    w3, [x1, #4]

.Lcs_bwd_4:
        ldrh    w4, [x0, #2]
        strh    w4, [x1, #2]

.Lcs_bwd_2:
        ldrh    w5, [x0]
        strh    w5, [x1]

.Lcs_bwd_0:
        ret


        # Copy 14 bytes per iteration, moving x17/x18 downwards;
        # x3 >= 0 on entry.
        .p2align 6
.Lcs_bwd_loop:
        subs    x3, x3, #14

        ldrh    w4, [x17, #-14]!
        ldrh    w5, [x17, #2]
        ldrh    w6, [x17, #4]
        ldrh    w7, [x17, #6]
        ldrh    w8, [x17, #8]
        ldrh    w9, [x17, #10]
        ldrh    w10, [x17, #12]

        strh    w4, [x18, #-14]!
        strh    w5, [x18, #2]
        strh    w6, [x18, #4]
        strh    w7, [x18, #6]
        strh    w8, [x18, #8]
        strh    w9, [x18, #10]
        strh    w10, [x18, #12]

        b.ge    .Lcs_bwd_loop
        # x2 = bytes still to be copied (below 14).
        adds    x2, x3, #14
        b.ne    .Lcs_bwd_tail
        ret
|
375 |
|
376 |
|
# void Copy::arrayof_conjoint_jshorts(void* from, void* to, size_t count)
#
# Placeholder only: there is no copy logic here. Entering this symbol
# executes HLT with immediate 1007 (the immediate identifies this stub);
# no return instruction follows it.
_Copy_arrayof_conjoint_jshorts:
        hlt     #1007
|
382 |
|
383 |
|
# void Copy::conjoint_jlongs_atomic(jlong* from, jlong* to, size_t count)
#
# Placeholder shared by two entry points (_Copy_conjoint_jlongs_atomic and
# _Copy_arrayof_conjoint_jlongs): there is no copy logic here. Entering
# either symbol executes HLT with immediate 1009; no return instruction
# follows it.
_Copy_conjoint_jlongs_atomic:
_Copy_arrayof_conjoint_jlongs:
        hlt     #1009
|
390 |
|
391 |
|
# void Copy::conjoint_jints_atomic(void* from, void* to, size_t count)
#
# Two entry points share this code: _Copy_conjoint_jints_atomic and
# _Copy_arrayof_conjoint_jints.
#
# Registers on entry:
#   x0 = from, x1 = to, x2 = length (added directly to the pointers and
#   compared with 64, i.e. handled as a byte count; only bit 2 is tested
#   for an odd jint, so a multiple of 4 is expected).
# All data is moved with 32-bit register loads and stores (ldr/str and
# ldp/stp on w registers).
# Copies backward when (from < to) and (to - from < x2), forward otherwise.
# Modifies x0, x1, x2; uses x3-x18 and the condition flags as scratch.

# One ldp/stp pair of the forward tail table: moves the 8 bytes that end
# (dist - 8) bytes before the end pointers x17 (source) and x18 (dest).
# Each expansion must stay exactly two instructions (8 bytes) because the
# table is entered through a computed branch.
        .macro  CI_TAIL_PAIR ra, rb, dist
        ldp     \ra, \rb, [x17, #-\dist]
        stp     \ra, \rb, [x18, #-\dist]
        .endm

_Copy_conjoint_jints_atomic:
_Copy_arrayof_conjoint_jints:
        # These and the later prefetches may reach outside the arrays.
        # Experiments showed that prefetching inaccessible memory does not
        # raise exceptions.
        prfm    pldl1keep, [x0]
        prfm    pstl1keep, [x1]
        prfm    pldl1keep, [x0, #32]
        prfm    pstl1keep, [x1, #32]

        # x3 = to - from; "hi" holds afterwards exactly when from < to.
        subs    x3, x1, x0
        # When "hi" holds, compare x2 with (to - from); otherwise the flags
        # are forced to 0, so "hi" is false.
        ccmp    x2, x3, #0, hi
        # Now "hi" <=> (from < to) and (to - from < x2).
        b.hi    .Lci_bwd

        # x18 = x2 - 64; at least 64 bytes go through the bulk loop.
        subs    x18, x2, #64
        b.ge    .Lci_fwd_bulk

.Lci_fwd_small:
        # Copy [x0, x0 + x2) to [x1, x1 + x2); x2 is below 64 here.

        adr     x15, .Lci_fwd_tail_end
        and     x16, x2, #~4

        # x16 is x2 rounded down to a multiple of 8, so x16/8 table pairs
        # have to run. A pair occupies 8 bytes of code, hence the entry
        # point is .Lci_fwd_tail_end - (x16/8)*8 = .Lci_fwd_tail_end - x16.
        sub     x15, x15, x16
        prfm    plil1keep, [x15]

        # End pointers; the table addresses data relative to them.
        add     x17, x0, x2
        add     x18, x1, x2

        # When bit 2 of x2 is set there is one extra leading jint: move it
        # first, the remaining x16 bytes are handled by the table.
        tbz     x2, #2, .Lci_fwd_small_pairs
        ldr     w3, [x0]
        str     w3, [x1]
.Lci_fwd_small_pairs:
        # Copy [x17 - x16, x17) to [x18 - x16, x18); x16 is a multiple of
        # 8 and below 64. Enter the table so that exactly x16/8 pairs run.
        br      x15

        CI_TAIL_PAIR w3,  w4,  56
        CI_TAIL_PAIR w5,  w6,  48
        CI_TAIL_PAIR w7,  w8,  40
        CI_TAIL_PAIR w9,  w10, 32
        CI_TAIL_PAIR w11, w12, 24
        CI_TAIL_PAIR w13, w14, 16
        CI_TAIL_PAIR w15, w16, 8
.Lci_fwd_tail_end:
        ret

        .purgem CI_TAIL_PAIR

        # Padding: together with the four loads below this makes 16
        # instructions after the alignment point, which places the loop
        # head on a 64-byte boundary.
        .p2align 6
        .rept   12
        nop
        .endr
.Lci_fwd_bulk:
        # x18 >= 0.
        # Copy [x0, x0 + x18 + 64) to [x1, x1 + x18 + 64).

        # Preload the first 32 bytes; x0 ends up just past them.
        ldp     w3, w4, [x0], #32
        ldp     w5, w6, [x0, #-24]
        ldp     w7, w8, [x0, #-16]
        ldp     w9, w10, [x0, #-8]

        # Loop invariant (before and after each iteration):
        #   w3-w10 hold the bytes [x0 - 32, x0),
        #   x1 is where those bytes have to be stored,
        #   x18 is the number of bytes still to be stored, minus 64.

        # The loop head is expected to be on a cache line boundary already.
        # Check that explicitly: any padding needed here is filled with
        # "hlt 1000" instructions.
        .p2alignl 6, 0xd4407d00
.Lci_fwd_loop:
        prfm    pldl1keep, [x0, #32]
        prfm    pstl1keep, [x1, #32]

        subs    x18, x18, #32

        # Store the held 32 bytes while loading the next 32. The flags set
        # by subs stay intact until the branch.
        stp     w3, w4, [x1]
        ldp     w3, w4, [x0]
        stp     w5, w6, [x1, #8]
        ldp     w5, w6, [x0, #8]
        stp     w7, w8, [x1, #16]
        ldp     w7, w8, [x0, #16]
        stp     w9, w10, [x1, #24]
        ldp     w9, w10, [x0, #24]

        add     x1, x1, #32
        add     x0, x0, #32

        b.ge    .Lci_fwd_loop

        # The loop is 14 instructions long, so its body fits in one
        # cache line.

.Lci_fwd_drain:
        # x18 went up by 32 and 32 bytes are stored below, so x2 becomes
        # the exact number of bytes still to be copied (below 32).
        adds    x2, x18, #32

        stp     w3, w4, [x1], #32
        stp     w5, w6, [x1, #-24]
        stp     w7, w8, [x1, #-16]
        stp     w9, w10, [x1, #-8]

        # Flags still come from the adds: a non-zero remainder is finished
        # by the small-copy path.
        b.ne    .Lci_fwd_small
        ret

.Lci_bwd:

        # The overlapping case should be rare; it is not worth optimizing.

        # x3 = x2 rounded down to a multiple of 2*jintSize; Z is set when
        # that is zero.
        ands    x3, x2, #~4
        # Move both pointers to the end of their ranges.
        add     x0, x0, x2
        add     x1, x1, x2
        # sub leaves the flags from ands untouched.
        sub     x3, x3, #8
        # Nothing for the pair loop to do with 0 or 1 jints.
        b.eq    .Lci_bwd_odd

        # x3 >= 0.
        # Copy [x0 - x3 - 8, x0) to [x1 - x3 - 8, x1), highest pair first.
.Lci_bwd_loop:
        subs    x3, x3, #8
        ldp     w4, w5, [x0, #-8]!
        stp     w4, w5, [x1, #-8]!
        b.ge    .Lci_bwd_loop

.Lci_bwd_odd:
        # One jint is left at the low end when bit 2 of x2 is set.
        tbz     x2, #2, .Lci_bwd_done
        ldr     w3, [x0, #-4]
        str     w3, [x1, #-4]

.Lci_bwd_done:
        ret