author | jwilhelm |
Thu, 12 Sep 2019 03:21:11 +0200 | |
changeset 58094 | 0f6c749acd15 |
parent 55398 | e53ec3b362f4 |
permissions | -rw-r--r-- |
36595
3322a76f3a00
8151502: optimize pd_disjoint_words and pd_conjoint_words
enevill
parents:
diff
changeset
|
1 |
/*
 * Copyright (c) 2016, Linaro Ltd. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */
3322a76f3a00
8151502: optimize pd_disjoint_words and pd_conjoint_words
enevill
parents:
diff
changeset
|
24 |
// Exported entry points of this file.
        .global _Copy_conjoint_words
        .global _Copy_disjoint_words

// Symbolic register names shared by both copy routines.
//
//   s, d, count : x0, x1, x2.  The routines load from [s], store to [d] and
//                 scale count by 8 (lsl #3), i.e. count is a number of
//                 8-byte words.
//   t0 .. t7    : x3 .. x10, eight scratch registers that together hold one
//                 64-byte (8-word) chunk in flight.
//
// All of x0-x10 are caller-saved under AAPCS64, so neither routine needs to
// save or restore anything.
        s       .req    x0
        d       .req    x1
        count   .req    x2
        t0      .req    x3
        t1      .req    x4
        t2      .req    x5
        t3      .req    x6
        t4      .req    x7
        t5      .req    x8
        t6      .req    x9
        t7      .req    x10
3322a76f3a00
8151502: optimize pd_disjoint_words and pd_conjoint_words
enevill
parents:
diff
changeset
|
38 |
|
3322a76f3a00
8151502: optimize pd_disjoint_words and pd_conjoint_words
enevill
parents:
diff
changeset
|
39 |
// -----------------------------------------------------------------------
// _Copy_disjoint_words
//
// Copies `count` 8-byte words from s to d in ascending address order.
//
// In:     s (x0)     = source address
//         d (x1)     = destination address
//         count (x2) = number of 8-byte words
// Out:    nothing; s, d and count are consumed
// Clobb:  t0-t7 (x3-x10), condition flags
//
// Structure:
//   1. If bit 3 of s is set, copy a single word so that the ldp stream
//      starts on a 16-byte boundary.
//   2. Software-pipelined main loop: each iteration stores the 8 words
//      loaded by the previous step while loading the next 8.
//   3. Drain: store the last 8 words that are still in registers.
//   4. Tail: computed branch into a table of eight 32-byte slots that copy
//      the remaining 0..7 words.
//
// NOTE(review): eight words are loaded before count is first tested, so the
// routine presumably relies on the caller passing at least 8 words (9 when
// the alignment word is copied) -- confirm against the callers.
// NOTE(review): the alignment step assumes s is at least 8-byte aligned --
// confirm.
// -----------------------------------------------------------------------
        .align  6
_Copy_disjoint_words:
        // Ensure 2 word aligned
        tbz     s, #3, fwd_copy_aligned         // bit 3 clear: already 16-byte aligned
        ldr     t0, [s], #8
        str     t0, [d], #8
        sub     count, count, #1

fwd_copy_aligned:
        // Bias s & d so we only pre index on the last copy
        // (with the -16 bias, offsets #16..#64 address words 0..7 and the
        // final "#64]!" access advances the pointer by exactly 64 bytes).
        sub     s, s, #16
        sub     d, d, #16

        // Prime the pipeline: load the first 8 words.
        ldp     t0, t1, [s, #16]
        ldp     t2, t3, [s, #32]
        ldp     t4, t5, [s, #48]
        ldp     t6, t7, [s, #64]!

        // 8 words are in flight and the loop loads 8 more, so it may only be
        // entered when at least 16 words remain.
        subs    count, count, #16
        blo     fwd_copy_drain

fwd_copy_again:
        prfm    pldl1keep, [s, #256]
        // Store the chunk loaded previously, interleaved with loading the
        // next one.
        stp     t0, t1, [d, #16]
        ldp     t0, t1, [s, #16]
        stp     t2, t3, [d, #32]
        ldp     t2, t3, [s, #32]
        stp     t4, t5, [d, #48]
        ldp     t4, t5, [s, #48]
        stp     t6, t7, [d, #64]!
        ldp     t6, t7, [s, #64]!
        subs    count, count, #8
        bhs     fwd_copy_again                  // no borrow: another full chunk fits

fwd_copy_drain:
        // Flush the 8 words still held in t0-t7.
        stp     t0, t1, [d, #16]
        stp     t2, t3, [d, #32]
        stp     t4, t5, [d, #48]
        stp     t6, t7, [d, #64]!

        // count is now -8..-1 for 0..7 words to copy
        // Branch target = 0f + count * 32.  The eight slots below are each
        // 32 bytes and end at label 0, so count == -8 selects the first slot
        // and count == -1 the last.  Do not add or remove instructions in a
        // slot without re-checking this layout.
        adr     t0, 0f
        add     t0, t0, count, lsl #5
        br      t0

        .align  5
        ret                                     // -8 == 0 words
        .align  5
        ldr     t0, [s, #16]                    // -7 == 1 word
        str     t0, [d, #16]
        ret
        .align  5
        ldp     t0, t1, [s, #16]                // -6 = 2 words
        stp     t0, t1, [d, #16]
        ret
        .align  5
        ldp     t0, t1, [s, #16]                // -5 = 3 words
        ldr     t2, [s, #32]
        stp     t0, t1, [d, #16]
        str     t2, [d, #32]
        ret
        .align  5
        ldp     t0, t1, [s, #16]                // -4 = 4 words
        ldp     t2, t3, [s, #32]
        stp     t0, t1, [d, #16]
        stp     t2, t3, [d, #32]
        ret
        .align  5
        ldp     t0, t1, [s, #16]                // -3 = 5 words
        ldp     t2, t3, [s, #32]
        ldr     t4, [s, #48]
        stp     t0, t1, [d, #16]
        stp     t2, t3, [d, #32]
        str     t4, [d, #48]
        ret
        .align  5
        ldp     t0, t1, [s, #16]                // -2 = 6 words
        ldp     t2, t3, [s, #32]
        ldp     t4, t5, [s, #48]
        stp     t0, t1, [d, #16]
        stp     t2, t3, [d, #32]
        stp     t4, t5, [d, #48]
        ret
        .align  5
        ldp     t0, t1, [s, #16]                // -1 = 7 words
        ldp     t2, t3, [s, #32]
        ldp     t4, t5, [s, #48]
        ldr     t6, [s, #64]
        stp     t0, t1, [d, #16]
        stp     t2, t3, [d, #32]
        stp     t4, t5, [d, #48]
        str     t6, [d, #64]
        // Is always aligned here, code for 7 words is one instruction
        // too large so it just falls through.
        // (The slot above is exactly 8 instructions = 32 bytes, so it has no
        // room for its own ret and uses the one at label 0.)
        .align  5
0:
        ret
3322a76f3a00
8151502: optimize pd_disjoint_words and pd_conjoint_words
enevill
parents:
diff
changeset
|
136 |
|
3322a76f3a00
8151502: optimize pd_disjoint_words and pd_conjoint_words
enevill
parents:
diff
changeset
|
137 |
// -----------------------------------------------------------------------
// _Copy_conjoint_words
//
// Copies `count` 8-byte words from s to d where the two ranges may overlap.
//
// In:     s (x0)     = source address
//         d (x1)     = destination address
//         count (x2) = number of 8-byte words
// Out:    nothing; s, d and count are consumed
// Clobb:  t0-t7 (x3-x10), condition flags
//
// If (d - s), taken as an unsigned value, is at least count * 8, an
// ascending copy cannot overwrite source words that are still to be read,
// so control tail-branches to _Copy_disjoint_words.  Otherwise both
// pointers are moved to the end of their ranges and the copy runs in
// descending address order, with the same shape as the forward routine:
// optional one-word alignment step, pipelined 8-word loop, drain, and a
// computed branch into a table of 32-byte tail slots.
//
// NOTE(review): eight words are loaded before count is first tested, so the
// routine presumably relies on the caller passing at least 8 words (9 when
// the alignment word is copied) -- confirm against the callers.
// NOTE(review): the alignment step assumes s is at least 8-byte aligned --
// confirm.
// -----------------------------------------------------------------------
        .align  6
_Copy_conjoint_words:
        sub     t0, d, s
        cmp     t0, count, lsl #3
        bhs     _Copy_disjoint_words            // unsigned (d - s) >= byte length: forward copy is safe

        // Point s and d one past the last word of their ranges.
        add     s, s, count, lsl #3
        add     d, d, count, lsl #3

        // Ensure 2 word aligned
        tbz     s, #3, bwd_copy_aligned         // bit 3 clear: end address already 16-byte aligned
        ldr     t0, [s, #-8]!
        str     t0, [d, #-8]!
        sub     count, count, #1

bwd_copy_aligned:
        // Prime the pipeline: load the last 8 words.  The final "#-64]!"
        // access moves s down by 64 bytes.
        ldp     t0, t1, [s, #-16]
        ldp     t2, t3, [s, #-32]
        ldp     t4, t5, [s, #-48]
        ldp     t6, t7, [s, #-64]!

        // 8 words are in flight and the loop loads 8 more, so it may only be
        // entered when at least 16 words remain.
        subs    count, count, #16
        blo     bwd_copy_drain

bwd_copy_again:
        // prfum (unscaled-offset prefetch) is used because the offset is
        // negative; the immediate-offset form of prfm only encodes
        // non-negative scaled offsets.  This line was changed by
        // "8224851: AArch64: fix warnings and errors with Clang and GCC 8.3".
        prfum   pldl1keep, [s, #-256]
        // Store the chunk loaded previously, interleaved with loading the
        // next (lower) one.
        stp     t0, t1, [d, #-16]
        ldp     t0, t1, [s, #-16]
        stp     t2, t3, [d, #-32]
        ldp     t2, t3, [s, #-32]
        stp     t4, t5, [d, #-48]
        ldp     t4, t5, [s, #-48]
        stp     t6, t7, [d, #-64]!
        ldp     t6, t7, [s, #-64]!
        subs    count, count, #8
        bhs     bwd_copy_again                  // no borrow: another full chunk fits

bwd_copy_drain:
        // Flush the 8 words still held in t0-t7.
        stp     t0, t1, [d, #-16]
        stp     t2, t3, [d, #-32]
        stp     t4, t5, [d, #-48]
        stp     t6, t7, [d, #-64]!

        // count is now -8..-1 for 0..7 words to copy
        // Branch target = 0f + count * 32.  The eight slots below are each
        // 32 bytes and end at label 0, so count == -8 selects the first slot
        // and count == -1 the last.  Do not add or remove instructions in a
        // slot without re-checking this layout.
        adr     t0, 0f
        add     t0, t0, count, lsl #5
        br      t0

        .align  5
        ret                                     // -8 == 0 words
        .align  5
        ldr     t0, [s, #-8]                    // -7 == 1 word
        str     t0, [d, #-8]
        ret
        .align  5
        ldp     t0, t1, [s, #-16]               // -6 = 2 words
        stp     t0, t1, [d, #-16]
        ret
        .align  5
        ldp     t0, t1, [s, #-16]               // -5 = 3 words
        ldr     t2, [s, #-24]
        stp     t0, t1, [d, #-16]
        str     t2, [d, #-24]
        ret
        .align  5
        ldp     t0, t1, [s, #-16]               // -4 = 4 words
        ldp     t2, t3, [s, #-32]
        stp     t0, t1, [d, #-16]
        stp     t2, t3, [d, #-32]
        ret
        .align  5
        ldp     t0, t1, [s, #-16]               // -3 = 5 words
        ldp     t2, t3, [s, #-32]
        ldr     t4, [s, #-40]
        stp     t0, t1, [d, #-16]
        stp     t2, t3, [d, #-32]
        str     t4, [d, #-40]
        ret
        .align  5
        ldp     t0, t1, [s, #-16]               // -2 = 6 words
        ldp     t2, t3, [s, #-32]
        ldp     t4, t5, [s, #-48]
        stp     t0, t1, [d, #-16]
        stp     t2, t3, [d, #-32]
        stp     t4, t5, [d, #-48]
        ret
        .align  5
        ldp     t0, t1, [s, #-16]               // -1 = 7 words
        ldp     t2, t3, [s, #-32]
        ldp     t4, t5, [s, #-48]
        ldr     t6, [s, #-56]
        stp     t0, t1, [d, #-16]
        stp     t2, t3, [d, #-32]
        stp     t4, t5, [d, #-48]
        str     t6, [d, #-56]
        // Is always aligned here, code for 7 words is one instruction
        // too large so it just falls through.
        // (The slot above is exactly 8 instructions = 32 bytes, so it has no
        // room for its own ret and uses the one at label 0.)
        .align  5
0:
        ret