author | ngasson |
Mon, 17 Jun 2019 15:31:49 +0800 | |
changeset 55398 | e53ec3b362f4 |
parent 51739 | 7bed934d439e |
permissions | -rw-r--r-- |
50754 | 1 |
/* Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved. |
2 |
* Copyright (c) 2018, Cavium. All rights reserved. (By BELLSOFT) |
|
3 |
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
|
4 |
* |
|
5 |
* This code is free software; you can redistribute it and/or modify it |
|
6 |
* under the terms of the GNU General Public License version 2 only, as |
|
7 |
* published by the Free Software Foundation. |
|
8 |
* |
|
9 |
* This code is distributed in the hope that it will be useful, but WITHOUT |
|
10 |
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
|
11 |
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
|
12 |
* version 2 for more details (a copy is included in the LICENSE file that |
|
13 |
* accompanied this code). |
|
14 |
* |
|
15 |
* You should have received a copy of the GNU General Public License version |
|
16 |
* 2 along with this work; if not, write to the Free Software Foundation, |
|
17 |
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
|
18 |
* |
|
19 |
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
|
20 |
* or visit www.oracle.com if you need additional information or have any |
|
21 |
* questions. |
|
22 |
* |
|
23 |
*/ |
|
24 |
||
25 |
#include "precompiled.hpp" |
|
26 |
#include "asm/assembler.hpp" |
|
27 |
#include "asm/assembler.inline.hpp" |
|
28 |
#include "runtime/stubRoutines.hpp" |
|
29 |
#include "macroAssembler_aarch64.hpp" |
|
30 |
||
31 |
// The following code is a optimized version of fdlibm sin/cos implementation |
|
32 |
// (C code is in share/runtime/sharedRuntimeTrig.cpp) adapted for AARCH64. |
|
33 |
||
34 |
// Please refer to sin/cos approximation via polynomial and |
|
35 |
// trigonometric argument reduction techniques to the following literature: |
|
36 |
// |
|
37 |
// [1] Muller, Jean-Michel, Nicolas Brisebarre, Florent De Dinechin, |
|
38 |
// Claude-Pierre Jeannerod, Vincent Lefevre, Guillaume Melquiond, |
|
39 |
// Nathalie Revol, Damien Stehlé, and Serge Torres: |
|
40 |
// Handbook of floating-point arithmetic. |
|
41 |
// Springer Science & Business Media, 2009. |
|
42 |
// [2] K. C. Ng |
|
43 |
// Argument Reduction for Huge Arguments: Good to the Last Bit |
|
44 |
// July 13, 1992, SunPro |
|
45 |
// |
|
46 |
// HOW TO READ THIS CODE: |
|
47 |
// This code consists of several functions. Each function has following header: |
|
48 |
// 1) Description |
|
49 |
// 2) C-pseudo code with differences from fdlibm marked by comments starting |
|
50 |
// with "NOTE". Check unmodified fdlibm code in |
|
51 |
// share/runtime/SharedRuntimeTrig.cpp |
|
52 |
// 3) Brief textual description of changes between fdlibm and current |
|
53 |
// implementation along with optimization notes (if applicable) |
|
54 |
// 4) Assumptions, input and output |
|
55 |
// 5) (Optional) additional notes about intrinsic implementation |
|
56 |
// Each function is separated in blocks which follow the pseudo-code structure |
|
57 |
// |
|
58 |
// HIGH-LEVEL ALGORITHM DESCRIPTION: |
|
59 |
// - entry point: generate_dsin_dcos(...); |
|
60 |
// - check corner cases: NaN, INF, tiny argument. |
|
61 |
// - check if |x| < Pi/4. Then approximate sin/cos via polynomial (kernel_sin/kernel_cos) |
|
62 |
// -- else proceed to argument reduction routine (__ieee754_rem_pio2) and |
|
63 |
// use reduced argument to get result via kernel_sin/kernel_cos |
|
64 |
// |
|
65 |
// HIGH-LEVEL CHANGES BETWEEN INTRINSICS AND FDLIBM: |
|
66 |
// 1) two_over_pi table fdlibm representation is int[], while intrinsic version |
|
67 |
// has these int values converted to double representation to load converted |
|
68 |
// double values directly (see stubRoutines_aarch4::_two_over_pi) |
|
69 |
// 2) Several loops are unrolled and vectorized: see comments in code after |
|
70 |
// labels: SKIP_F_LOAD, RECOMP_FOR1_CHECK, RECOMP_FOR2 |
|
71 |
// 3) fdlibm npio2_hw table now has "prefix" with constants used in |
|
72 |
// calculation. These constants are loaded from npio2_hw table instead of |
|
73 |
// constructing it in code (see stubRoutines_aarch64.cpp) |
|
74 |
// 4) Polynomial coefficients for sin and cos are moved to table sin_coef |
|
75 |
// and cos_coef to use the same optimization as in 3). It allows to load most of |
|
76 |
// required constants via single instruction |
|
77 |
// |
|
78 |
// |
|
79 |
// |
|
80 |
///* __ieee754_rem_pio2(x,y) |
|
81 |
// * |
|
82 |
// * returns the remainder of x rem pi/2 in y[0]+y[1] (i.e. like x div pi/2) |
|
83 |
// * x is input argument, y[] is hi and low parts of reduced argument (x) |
|
84 |
// * uses __kernel_rem_pio2() |
|
85 |
// */ |
|
86 |
// // use tables(see stubRoutines_aarch64.cpp): two_over_pi and modified npio2_hw |
|
87 |
// |
|
88 |
// BEGIN __ieee754_rem_pio2 PSEUDO CODE |
|
89 |
// |
|
90 |
//static int __ieee754_rem_pio2(double x, double *y) { |
|
91 |
// double z,w,t,r,fn; |
|
92 |
// double tx[3]; |
|
93 |
// int e0,i,j,nx,n,ix,hx,i0; |
|
94 |
// |
|
95 |
// i0 = ((*(int*)&two24A)>>30)^1; /* high word index */ |
|
96 |
// hx = *(i0+(int*)&x); /* high word of x */ |
|
97 |
// ix = hx&0x7fffffff; |
|
98 |
// if(ix<0x4002d97c) { /* |x| < 3pi/4, special case with n=+-1 */ |
|
99 |
// if(hx>0) { |
|
100 |
// z = x - pio2_1; |
|
101 |
// if(ix!=0x3ff921fb) { /* 33+53 bit pi is good enough */ |
|
102 |
// y[0] = z - pio2_1t; |
|
103 |
// y[1] = (z-y[0])-pio2_1t; |
|
104 |
// } else { /* near pi/2, use 33+33+53 bit pi */ |
|
105 |
// z -= pio2_2; |
|
106 |
// y[0] = z - pio2_2t; |
|
107 |
// y[1] = (z-y[0])-pio2_2t; |
|
108 |
// } |
|
109 |
// return 1; |
|
110 |
// } else { /* negative x */ |
|
111 |
// z = x + pio2_1; |
|
112 |
// if(ix!=0x3ff921fb) { /* 33+53 bit pi is good enough */ |
|
113 |
// y[0] = z + pio2_1t; |
|
114 |
// y[1] = (z-y[0])+pio2_1t; |
|
115 |
// } else { /* near pi/2, use 33+33+53 bit pi */ |
|
116 |
// z += pio2_2; |
|
117 |
// y[0] = z + pio2_2t; |
|
118 |
// y[1] = (z-y[0])+pio2_2t; |
|
119 |
// } |
|
120 |
// return -1; |
|
121 |
// } |
|
122 |
// } |
|
123 |
// if(ix<=0x413921fb) { /* |x| ~<= 2^19*(pi/2), medium size */ |
|
124 |
// t = fabsd(x); |
|
125 |
// n = (int) (t*invpio2+half); |
|
126 |
// fn = (double)n; |
|
127 |
// r = t-fn*pio2_1; |
|
128 |
// w = fn*pio2_1t; /* 1st round good to 85 bit */ |
|
129 |
// // NOTE: y[0] = r-w; is moved from if/else below to be before "if" |
|
130 |
// y[0] = r-w; |
|
131 |
// if(n<32&&ix!=npio2_hw[n-1]) { |
|
132 |
// // y[0] = r-w; /* quick check no cancellation */ // NOTE: moved earlier |
|
133 |
// } else { |
|
134 |
// j = ix>>20; |
|
135 |
// // y[0] = r-w; // NOTE: moved earlier |
|
136 |
// i = j-(((*(i0+(int*)&y[0]))>>20)&0x7ff); |
|
137 |
// if(i>16) { /* 2nd iteration needed, good to 118 */ |
|
138 |
// t = r; |
|
139 |
// w = fn*pio2_2; |
|
140 |
// r = t-w; |
|
141 |
// w = fn*pio2_2t-((t-r)-w); |
|
142 |
// y[0] = r-w; |
|
143 |
// i = j-(((*(i0+(int*)&y[0]))>>20)&0x7ff); |
|
144 |
// if(i>49) { /* 3rd iteration need, 151 bits acc */ |
|
145 |
// t = r; /* will cover all possible cases */ |
|
146 |
// w = fn*pio2_3; |
|
147 |
// r = t-w; |
|
148 |
// w = fn*pio2_3t-((t-r)-w); |
|
149 |
// y[0] = r-w; |
|
150 |
// } |
|
151 |
// } |
|
152 |
// } |
|
153 |
// y[1] = (r-y[0])-w; |
|
154 |
// if(hx<0) {y[0] = -y[0]; y[1] = -y[1]; return -n;} |
|
155 |
// else return n; |
|
156 |
// } |
|
157 |
// /* |
|
158 |
// * all other (large) arguments |
|
159 |
// */ |
|
160 |
// // NOTE: this check is removed, because it was checked in dsin/dcos |
|
161 |
// // if(ix>=0x7ff00000) { /* x is inf or NaN */ |
|
162 |
// // y[0]=y[1]=x-x; return 0; |
|
163 |
// // } |
|
164 |
// /* set z = scalbn(|x|,ilogb(x)-23) */ |
|
165 |
// *(1-i0+(int*)&z) = *(1-i0+(int*)&x); |
|
166 |
// e0 = (ix>>20)-1046; /* e0 = ilogb(z)-23; */ |
|
167 |
// *(i0+(int*)&z) = ix - (e0<<20); |
|
168 |
// |
|
169 |
// // NOTE: "for" loop below in unrolled. See comments in asm code |
|
170 |
// for(i=0;i<2;i++) { |
|
171 |
// tx[i] = (double)((int)(z)); |
|
172 |
// z = (z-tx[i])*two24A; |
|
173 |
// } |
|
174 |
// |
|
175 |
// tx[2] = z; |
|
176 |
// nx = 3; |
|
177 |
// |
|
178 |
// // NOTE: while(tx[nx-1]==zeroA) nx--; is unrolled. See comments in asm code |
|
179 |
// while(tx[nx-1]==zeroA) nx--; /* skip zero term */ |
|
180 |
// |
|
181 |
// n = __kernel_rem_pio2(tx,y,e0,nx,2,two_over_pi); |
|
182 |
// if(hx<0) {y[0] = -y[0]; y[1] = -y[1]; return -n;} |
|
183 |
// return n; |
|
184 |
//} |
|
185 |
// |
|
186 |
// END __ieee754_rem_pio2 PSEUDO CODE |
|
187 |
// |
|
188 |
// Changes between fdlibm and intrinsic for __ieee754_rem_pio2: |
|
189 |
// 1. INF/NaN check for huge argument is removed in comparison with fdlibm |
|
190 |
// code, because this check is already done in dcos/dsin code |
|
191 |
// 2. Most constants are now loaded from table instead of direct initialization |
|
192 |
// 3. Two loops are unrolled |
|
193 |
// Assumptions: |
|
194 |
// 1. Assume |X| >= PI/4 |
|
195 |
// 2. Assume rscratch1 = 0x3fe921fb00000000 (~ PI/4) |
|
196 |
// 3. Assume ix = r3 |
|
197 |
// Input and output: |
|
198 |
// 1. Input: X = r0 |
|
199 |
// 2. Return n in r2, y[0] == y0 == v4, y[1] == y1 == v5 |
|
200 |
// NOTE: general purpose register names match local variable names in C code |
|
201 |
// NOTE: fpu registers are actively reused. See comments in code about their usage |
|
202 |
void MacroAssembler::generate__ieee754_rem_pio2(address npio2_hw, |
|
203 |
address two_over_pi, address pio2) { |
|
204 |
const long PIO2_1t = 0x3DD0B4611A626331UL; |
|
205 |
const long PIO2_2 = 0x3DD0B4611A600000UL; |
|
206 |
const long PIO2_2t = 0x3BA3198A2E037073UL; |
|
207 |
Label X_IS_NEGATIVE, X_IS_MEDIUM_OR_LARGE, X_IS_POSITIVE_LONG_PI, LARGE_ELSE, |
|
208 |
REDUCTION_DONE, X_IS_MEDIUM_BRANCH_DONE, X_IS_LARGE, NX_SET, |
|
209 |
X_IS_NEGATIVE_LONG_PI; |
|
210 |
Register X = r0, n = r2, ix = r3, jv = r4, tmp5 = r5, jx = r6, |
|
211 |
tmp3 = r7, iqBase = r10, ih = r11, i = r17; |
|
212 |
// initializing constants first |
|
213 |
// rscratch1 = 0x3fe921fb00000000 (see assumptions) |
|
214 |
movk(rscratch1, 0x3ff9, 48); // was 0x3fe921fb0..0 now it's 0x3ff921fb0..0 |
|
215 |
mov(rscratch2, 0x4002d97c); // 3*PI/4 high word |
|
216 |
movk(rscratch1, 0x5440, 16); // now rscratch1 == PIO2_1 |
|
217 |
fmovd(v1, rscratch1); // v1 = PIO2_1 |
|
218 |
cmp(rscratch2, ix); |
|
219 |
br(LE, X_IS_MEDIUM_OR_LARGE); |
|
220 |
||
221 |
block_comment("if(ix<0x4002d97c) {... /* |x| ~< 3pi/4 */ "); { |
|
222 |
cmp(X, zr); |
|
223 |
br(LT, X_IS_NEGATIVE); |
|
224 |
||
225 |
block_comment("if(hx>0) {"); { |
|
226 |
fsubd(v2, v0, v1); // v2 = z = x - pio2_1 |
|
227 |
cmp(ix, rscratch1, LSR, 32); |
|
228 |
mov(n, 1); |
|
229 |
br(EQ, X_IS_POSITIVE_LONG_PI); |
|
230 |
||
231 |
block_comment("case: hx > 0 && ix!=0x3ff921fb {"); { /* 33+53 bit pi is good enough */ |
|
232 |
mov(rscratch2, PIO2_1t); |
|
233 |
fmovd(v27, rscratch2); |
|
234 |
fsubd(v4, v2, v27); // v4 = y[0] = z - pio2_1t; |
|
235 |
fsubd(v5, v2, v4); |
|
236 |
fsubd(v5, v5, v27); // v5 = y[1] = (z-y[0])-pio2_1t |
|
237 |
b(REDUCTION_DONE); |
|
238 |
} |
|
239 |
||
240 |
block_comment("case: hx > 0 &*& ix==0x3ff921fb {"); { /* near pi/2, use 33+33+53 bit pi */ |
|
241 |
bind(X_IS_POSITIVE_LONG_PI); |
|
242 |
mov(rscratch1, PIO2_2); |
|
243 |
mov(rscratch2, PIO2_2t); |
|
244 |
fmovd(v27, rscratch1); |
|
245 |
fmovd(v6, rscratch2); |
|
246 |
fsubd(v2, v2, v27); // z-= pio2_2 |
|
247 |
fsubd(v4, v2, v6); // y[0] = z - pio2_2t |
|
248 |
fsubd(v5, v2, v4); |
|
249 |
fsubd(v5, v5, v6); // v5 = (z - y[0]) - pio2_2t |
|
250 |
b(REDUCTION_DONE); |
|
251 |
} |
|
252 |
} |
|
253 |
||
254 |
block_comment("case: hx <= 0)"); { |
|
255 |
bind(X_IS_NEGATIVE); |
|
256 |
faddd(v2, v0, v1); // v2 = z = x + pio2_1 |
|
257 |
cmp(ix, rscratch1, LSR, 32); |
|
258 |
mov(n, -1); |
|
259 |
br(EQ, X_IS_NEGATIVE_LONG_PI); |
|
260 |
||
261 |
block_comment("case: hx <= 0 && ix!=0x3ff921fb) {"); { /* 33+53 bit pi is good enough */ |
|
262 |
mov(rscratch2, PIO2_1t); |
|
263 |
fmovd(v27, rscratch2); |
|
264 |
faddd(v4, v2, v27); // v4 = y[0] = z + pio2_1t; |
|
265 |
fsubd(v5, v2, v4); |
|
266 |
faddd(v5, v5, v27); // v5 = y[1] = (z-y[0]) + pio2_1t |
|
267 |
b(REDUCTION_DONE); |
|
268 |
} |
|
269 |
||
270 |
block_comment("case: hx <= 0 && ix==0x3ff921fb"); { /* near pi/2, use 33+33+53 bit pi */ |
|
271 |
bind(X_IS_NEGATIVE_LONG_PI); |
|
272 |
mov(rscratch1, PIO2_2); |
|
273 |
mov(rscratch2, PIO2_2t); |
|
274 |
fmovd(v27, rscratch1); |
|
275 |
fmovd(v6, rscratch2); |
|
276 |
faddd(v2, v2, v27); // z += pio2_2 |
|
277 |
faddd(v4, v2, v6); // y[0] = z + pio2_2t |
|
278 |
fsubd(v5, v2, v4); |
|
279 |
faddd(v5, v5, v6); // v5 = (z - y[0]) + pio2_2t |
|
280 |
b(REDUCTION_DONE); |
|
281 |
} |
|
282 |
} |
|
283 |
} |
|
284 |
bind(X_IS_MEDIUM_OR_LARGE); |
|
285 |
mov(rscratch1, 0x413921fb); |
|
286 |
cmp(ix, rscratch1); // ix < = 0x413921fb ? |
|
287 |
br(GT, X_IS_LARGE); |
|
288 |
||
289 |
block_comment("|x| ~<= 2^19*(pi/2), medium size"); { |
|
290 |
lea(ih, ExternalAddress(npio2_hw)); |
|
291 |
ld1(v4, v5, v6, v7, T1D, ih); |
|
292 |
fabsd(v31, v0); // v31 = t = |x| |
|
293 |
add(ih, ih, 64); |
|
294 |
fmaddd(v2, v31, v5, v4); // v2 = t * invpio2 + half (invpio2 = 53 bits of 2/pi, half = 0.5) |
|
295 |
fcvtzdw(n, v2); // n = (int) v2 |
|
296 |
frintzd(v2, v2); |
|
297 |
fmsubd(v3, v2, v6, v31); // v3 = r = t - fn * pio2_1 |
|
298 |
fmuld(v26, v2, v7); // v26 = w = fn * pio2_1t |
|
299 |
fsubd(v4, v3, v26); // y[0] = r - w. Calculated before branch |
|
51374
7be0084191ed
8206895: aarch64: rework error-prone cmp instuction
bulasevich
parents:
50754
diff
changeset
|
300 |
cmp(n, (u1)32); |
50754 | 301 |
br(GT, LARGE_ELSE); |
302 |
subw(tmp5, n, 1); // tmp5 = n - 1 |
|
303 |
ldrw(jv, Address(ih, tmp5, Address::lsl(2))); |
|
304 |
cmp(ix, jv); |
|
305 |
br(NE, X_IS_MEDIUM_BRANCH_DONE); |
|
306 |
||
307 |
block_comment("else block for if(n<32&&ix!=npio2_hw[n-1])"); { |
|
308 |
bind(LARGE_ELSE); |
|
309 |
fmovd(jx, v4); |
|
310 |
lsr(tmp5, ix, 20); // j = ix >> 20 |
|
311 |
lsl(jx, jx, 1); |
|
312 |
sub(tmp3, tmp5, jx, LSR, 32 + 20 + 1); // r7 = j-(((*(i0+(int*)&y[0]))>>20)&0x7ff); |
|
313 |
||
314 |
block_comment("if(i>16)"); { |
|
51374
7be0084191ed
8206895: aarch64: rework error-prone cmp instuction
bulasevich
parents:
50754
diff
changeset
|
315 |
cmp(tmp3, (u1)16); |
50754 | 316 |
br(LE, X_IS_MEDIUM_BRANCH_DONE); |
317 |
// i > 16. 2nd iteration needed |
|
318 |
ldpd(v6, v7, Address(ih, -32)); |
|
319 |
fmovd(v28, v3); // t = r |
|
320 |
fmuld(v29, v2, v6); // w = v29 = fn * pio2_2 |
|
321 |
fsubd(v3, v28, v29); // r = t - w |
|
322 |
fsubd(v31, v28, v3); // v31 = (t - r) |
|
323 |
fsubd(v31, v29, v31); // v31 = w - (t - r) = - ((t - r) - w) |
|
324 |
fmaddd(v26, v2, v7, v31); // v26 = w = fn*pio2_2t - ((t - r) - w) |
|
325 |
fsubd(v4, v3, v26); // y[0] = r - w |
|
326 |
fmovd(jx, v4); |
|
327 |
lsl(jx, jx, 1); |
|
328 |
sub(tmp3, tmp5, jx, LSR, 32 + 20 + 1); // r7 = j-(((*(i0+(int*)&y[0]))>>20)&0x7ff); |
|
329 |
||
330 |
block_comment("if(i>49)"); { |
|
51374
7be0084191ed
8206895: aarch64: rework error-prone cmp instuction
bulasevich
parents:
50754
diff
changeset
|
331 |
cmp(tmp3, (u1)49); |
50754 | 332 |
br(LE, X_IS_MEDIUM_BRANCH_DONE); |
333 |
// 3rd iteration need, 151 bits acc |
|
334 |
ldpd(v6, v7, Address(ih, -16)); |
|
335 |
fmovd(v28, v3); // save "r" |
|
336 |
fmuld(v29, v2, v6); // v29 = fn * pio2_3 |
|
337 |
fsubd(v3, v28, v29); // r = r - w |
|
338 |
fsubd(v31, v28, v3); // v31 = (t - r) |
|
339 |
fsubd(v31, v29, v31); // v31 = w - (t - r) = - ((t - r) - w) |
|
340 |
fmaddd(v26, v2, v7, v31); // v26 = w = fn*pio2_3t - ((t - r) - w) |
|
341 |
fsubd(v4, v3, v26); // y[0] = r - w |
|
342 |
} |
|
343 |
} |
|
344 |
} |
|
345 |
block_comment("medium x tail"); { |
|
346 |
bind(X_IS_MEDIUM_BRANCH_DONE); |
|
347 |
fsubd(v5, v3, v4); // v5 = y[1] = (r - y[0]) |
|
348 |
fsubd(v5, v5, v26); // v5 = y[1] = (r - y[0]) - w |
|
349 |
cmp(X, zr); |
|
350 |
br(GT, REDUCTION_DONE); |
|
351 |
fnegd(v4, v4); |
|
352 |
negw(n, n); |
|
353 |
fnegd(v5, v5); |
|
354 |
b(REDUCTION_DONE); |
|
355 |
} |
|
356 |
} |
|
357 |
||
358 |
block_comment("all other (large) arguments"); { |
|
359 |
bind(X_IS_LARGE); |
|
360 |
lsr(rscratch1, ix, 20); // ix >> 20 |
|
361 |
movz(tmp5, 0x4170, 48); |
|
362 |
subw(rscratch1, rscratch1, 1046); // e0 |
|
363 |
fmovd(v10, tmp5); // init two24A value |
|
364 |
subw(jv, ix, rscratch1, LSL, 20); // ix - (e0<<20) |
|
365 |
lsl(jv, jv, 32); |
|
366 |
subw(rscratch2, rscratch1, 3); |
|
367 |
bfm(jv, X, 0, 31); // jv = z |
|
368 |
movw(i, 24); |
|
369 |
fmovd(v26, jv); // v26 = z |
|
370 |
||
371 |
block_comment("unrolled for(i=0;i<2;i++) {tx[i] = (double)((int)(z));z = (z-tx[i])*two24A;}"); { |
|
372 |
// tx[0,1,2] = v6,v7,v26 |
|
373 |
frintzd(v6, v26); // v6 = (double)((int)v26) |
|
374 |
sdivw(jv, rscratch2, i); // jv = (e0 - 3)/24 |
|
375 |
fsubd(v26, v26, v6); |
|
376 |
sub(sp, sp, 560); |
|
377 |
fmuld(v26, v26, v10); |
|
378 |
frintzd(v7, v26); // v7 = (double)((int)v26) |
|
379 |
movw(jx, 2); // calculate jx as nx - 1, which is initially 2. Not a part of unrolled loop |
|
380 |
fsubd(v26, v26, v7); |
|
381 |
} |
|
382 |
||
383 |
block_comment("nx calculation with unrolled while(tx[nx-1]==zeroA) nx--;"); { |
|
55398
e53ec3b362f4
8224851: AArch64: fix warnings and errors with Clang and GCC 8.3
ngasson
parents:
51739
diff
changeset
|
384 |
fcmpd(v26, 0.0); // if NE then jx == 2. else it's 1 or 0 |
50754 | 385 |
add(iqBase, sp, 480); // base of iq[] |
386 |
fmuld(v3, v26, v10); |
|
387 |
br(NE, NX_SET); |
|
55398
e53ec3b362f4
8224851: AArch64: fix warnings and errors with Clang and GCC 8.3
ngasson
parents:
51739
diff
changeset
|
388 |
fcmpd(v7, 0.0); // v7 == 0 => jx = 0. Else jx = 1 |
50754 | 389 |
csetw(jx, NE); |
390 |
} |
|
391 |
bind(NX_SET); |
|
392 |
generate__kernel_rem_pio2(two_over_pi, pio2); |
|
393 |
// now we have y[0] = v4, y[1] = v5 and n = r2 |
|
394 |
cmp(X, zr); |
|
395 |
br(GE, REDUCTION_DONE); |
|
396 |
fnegd(v4, v4); |
|
397 |
fnegd(v5, v5); |
|
398 |
negw(n, n); |
|
399 |
} |
|
400 |
bind(REDUCTION_DONE); |
|
401 |
} |
|
402 |
||
403 |
///* |
|
404 |
// * __kernel_rem_pio2(x,y,e0,nx,prec,ipio2) |
|
405 |
// * double x[],y[]; int e0,nx,prec; int ipio2[]; |
|
406 |
// * |
|
407 |
// * __kernel_rem_pio2 return the last three digits of N with |
|
408 |
// * y = x - N*pi/2 |
|
409 |
// * so that |y| < pi/2. |
|
410 |
// * |
|
411 |
// * The method is to compute the integer (mod 8) and fraction parts of |
|
412 |
// * (2/pi)*x without doing the full multiplication. In general we |
|
413 |
// * skip the part of the product that are known to be a huge integer ( |
|
414 |
// * more accurately, = 0 mod 8 ). Thus the number of operations are |
|
415 |
// * independent of the exponent of the input. |
|
416 |
// * |
|
417 |
// * NOTE: 2/pi int representation is converted to double |
|
418 |
// * // (2/pi) is represented by an array of 24-bit integers in ipio2[]. |
|
419 |
// * |
|
420 |
// * Input parameters: |
|
421 |
// * x[] The input value (must be positive) is broken into nx |
|
422 |
// * pieces of 24-bit integers in double precision format. |
|
423 |
// * x[i] will be the i-th 24 bit of x. The scaled exponent |
|
424 |
// * of x[0] is given in input parameter e0 (i.e., x[0]*2^e0 |
|
425 |
// * match x's up to 24 bits. |
|
426 |
// * |
|
427 |
// * Example of breaking a double positive z into x[0]+x[1]+x[2]: |
|
428 |
// * e0 = ilogb(z)-23 |
|
429 |
// * z = scalbn(z,-e0) |
|
430 |
// * for i = 0,1,2 |
|
431 |
// * x[i] = floor(z) |
|
432 |
// * z = (z-x[i])*2**24 |
|
433 |
// * |
|
434 |
// * |
|
435 |
// * y[] ouput result in an array of double precision numbers. |
|
436 |
// * The dimension of y[] is: |
|
437 |
// * 24-bit precision 1 |
|
438 |
// * 53-bit precision 2 |
|
439 |
// * 64-bit precision 2 |
|
440 |
// * 113-bit precision 3 |
|
441 |
// * The actual value is the sum of them. Thus for 113-bit |
|
442 |
// * precsion, one may have to do something like: |
|
443 |
// * |
|
444 |
// * long double t,w,r_head, r_tail; |
|
445 |
// * t = (long double)y[2] + (long double)y[1]; |
|
446 |
// * w = (long double)y[0]; |
|
447 |
// * r_head = t+w; |
|
448 |
// * r_tail = w - (r_head - t); |
|
449 |
// * |
|
450 |
// * e0 The exponent of x[0] |
|
451 |
// * |
|
452 |
// * nx dimension of x[] |
|
453 |
// * |
|
454 |
// * prec an interger indicating the precision: |
|
455 |
// * 0 24 bits (single) |
|
456 |
// * 1 53 bits (double) |
|
457 |
// * 2 64 bits (extended) |
|
458 |
// * 3 113 bits (quad) |
|
459 |
// * |
|
460 |
// * NOTE: ipio2[] array below is converted to double representation |
|
461 |
// * //ipio2[] |
|
462 |
// * // integer array, contains the (24*i)-th to (24*i+23)-th |
|
463 |
// * // bit of 2/pi after binary point. The corresponding |
|
464 |
// * // floating value is |
|
465 |
// * |
|
466 |
// * ipio2[i] * 2^(-24(i+1)). |
|
467 |
// * |
|
468 |
// * Here is the description of some local variables: |
|
469 |
// * |
|
470 |
// * jk jk+1 is the initial number of terms of ipio2[] needed |
|
471 |
// * in the computation. The recommended value is 2,3,4, |
|
472 |
// * 6 for single, double, extended,and quad. |
|
473 |
// * |
|
474 |
// * jz local integer variable indicating the number of |
|
475 |
// * terms of ipio2[] used. |
|
476 |
// * |
|
477 |
// * jx nx - 1 |
|
478 |
// * |
|
479 |
// * jv index for pointing to the suitable ipio2[] for the |
|
480 |
// * computation. In general, we want |
|
481 |
// * ( 2^e0*x[0] * ipio2[jv-1]*2^(-24jv) )/8 |
|
482 |
// * is an integer. Thus |
|
483 |
// * e0-3-24*jv >= 0 or (e0-3)/24 >= jv |
|
484 |
// * Hence jv = max(0,(e0-3)/24). |
|
485 |
// * |
|
486 |
// * jp jp+1 is the number of terms in PIo2[] needed, jp = jk. |
|
487 |
// * |
|
488 |
// * q[] double array with integral value, representing the |
|
489 |
// * 24-bits chunk of the product of x and 2/pi. |
|
490 |
// * |
|
491 |
// * q0 the corresponding exponent of q[0]. Note that the |
|
492 |
// * exponent for q[i] would be q0-24*i. |
|
493 |
// * |
|
494 |
// * PIo2[] double precision array, obtained by cutting pi/2 |
|
495 |
// * into 24 bits chunks. |
|
496 |
// * |
|
497 |
// * f[] ipio2[] in floating point |
|
498 |
// * |
|
499 |
// * iq[] integer array by breaking up q[] in 24-bits chunk. |
|
500 |
// * |
|
501 |
// * fq[] final product of x*(2/pi) in fq[0],..,fq[jk] |
|
502 |
// * |
|
503 |
// * ih integer. If >0 it indicates q[] is >= 0.5, hence |
|
504 |
// * it also indicates the *sign* of the result. |
|
505 |
// * |
|
506 |
// */ |
|
507 |
// |
|
508 |
// Use PIo2 table(see stubRoutines_aarch64.cpp) |
|
509 |
// |
|
510 |
// BEGIN __kernel_rem_pio2 PSEUDO CODE |
|
511 |
// |
|
512 |
//static int __kernel_rem_pio2(double *x, double *y, int e0, int nx, int prec, /* NOTE: converted to double */ const double *ipio2 // const int *ipio2) { |
|
513 |
// int jz,jx,jv,jp,jk,carry,n,iq[20],i,j,k,m,q0,ih; |
|
514 |
// double z,fw,f[20],fq[20],q[20]; |
|
515 |
// |
|
516 |
// /* initialize jk*/ |
|
517 |
// // jk = init_jk[prec]; // NOTE: prec==2 for double. jk is always 4. |
|
518 |
// jp = jk; // NOTE: always 4 |
|
519 |
// |
|
520 |
// /* determine jx,jv,q0, note that 3>q0 */ |
|
521 |
// jx = nx-1; |
|
522 |
// jv = (e0-3)/24; if(jv<0) jv=0; |
|
523 |
// q0 = e0-24*(jv+1); |
|
524 |
// |
|
525 |
// /* set up f[0] to f[jx+jk] where f[jx+jk] = ipio2[jv+jk] */ |
|
526 |
// j = jv-jx; m = jx+jk; |
|
527 |
// |
|
528 |
// // NOTE: split into two for-loops: one with zeroB and one with ipio2[j]. It |
|
529 |
// // allows the use of wider loads/stores |
|
530 |
// for(i=0;i<=m;i++,j++) f[i] = (j<0)? zeroB : /* NOTE: converted to double */ ipio2[j]; //(double) ipio2[j]; |
|
531 |
// |
|
532 |
// // NOTE: unrolled and vectorized "for". See comments in asm code |
|
533 |
// /* compute q[0],q[1],...q[jk] */ |
|
534 |
// for (i=0;i<=jk;i++) { |
|
535 |
// for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j]; q[i] = fw; |
|
536 |
// } |
|
537 |
// |
|
538 |
// jz = jk; |
|
539 |
//recompute: |
|
540 |
// /* distill q[] into iq[] reversingly */ |
|
541 |
// for(i=0,j=jz,z=q[jz];j>0;i++,j--) { |
|
542 |
// fw = (double)((int)(twon24* z)); |
|
543 |
// iq[i] = (int)(z-two24B*fw); |
|
544 |
// z = q[j-1]+fw; |
|
545 |
// } |
|
546 |
// |
|
547 |
// /* compute n */ |
|
548 |
// z = scalbnA(z,q0); /* actual value of z */ |
|
549 |
// z -= 8.0*floor(z*0.125); /* trim off integer >= 8 */ |
|
550 |
// n = (int) z; |
|
551 |
// z -= (double)n; |
|
552 |
// ih = 0; |
|
553 |
// if(q0>0) { /* need iq[jz-1] to determine n */ |
|
554 |
// i = (iq[jz-1]>>(24-q0)); n += i; |
|
555 |
// iq[jz-1] -= i<<(24-q0); |
|
556 |
// ih = iq[jz-1]>>(23-q0); |
|
557 |
// } |
|
558 |
// else if(q0==0) ih = iq[jz-1]>>23; |
|
559 |
// else if(z>=0.5) ih=2; |
|
560 |
// |
|
561 |
// if(ih>0) { /* q > 0.5 */ |
|
562 |
// n += 1; carry = 0; |
|
563 |
// for(i=0;i<jz ;i++) { /* compute 1-q */ |
|
564 |
// j = iq[i]; |
|
565 |
// if(carry==0) { |
|
566 |
// if(j!=0) { |
|
567 |
// carry = 1; iq[i] = 0x1000000- j; |
|
568 |
// } |
|
569 |
// } else iq[i] = 0xffffff - j; |
|
570 |
// } |
|
571 |
// if(q0>0) { /* rare case: chance is 1 in 12 */ |
|
572 |
// switch(q0) { |
|
573 |
// case 1: |
|
574 |
// iq[jz-1] &= 0x7fffff; break; |
|
575 |
// case 2: |
|
576 |
// iq[jz-1] &= 0x3fffff; break; |
|
577 |
// } |
|
578 |
// } |
|
579 |
// if(ih==2) { |
|
580 |
// z = one - z; |
|
581 |
// if(carry!=0) z -= scalbnA(one,q0); |
|
582 |
// } |
|
583 |
// } |
|
584 |
// |
|
585 |
// /* check if recomputation is needed */ |
|
586 |
// if(z==zeroB) { |
|
587 |
// j = 0; |
|
588 |
// for (i=jz-1;i>=jk;i--) j |= iq[i]; |
|
589 |
// if(j==0) { /* need recomputation */ |
|
590 |
// for(k=1;iq[jk-k]==0;k++); /* k = no. of terms needed */ |
|
591 |
// |
|
592 |
// for(i=jz+1;i<=jz+k;i++) { /* add q[jz+1] to q[jz+k] */ |
|
593 |
// f[jx+i] = /* NOTE: converted to double */ ipio2[jv+i]; //(double) ipio2[jv+i]; |
|
594 |
// for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j]; |
|
595 |
// q[i] = fw; |
|
596 |
// } |
|
597 |
// jz += k; |
|
598 |
// goto recompute; |
|
599 |
// } |
|
600 |
// } |
|
601 |
// |
|
602 |
// /* chop off zero terms */ |
|
603 |
// if(z==0.0) { |
|
604 |
// jz -= 1; q0 -= 24; |
|
605 |
// while(iq[jz]==0) { jz--; q0-=24;} |
|
606 |
// } else { /* break z into 24-bit if necessary */ |
|
607 |
// z = scalbnA(z,-q0); |
|
608 |
// if(z>=two24B) { |
|
609 |
// fw = (double)((int)(twon24*z)); |
|
610 |
// iq[jz] = (int)(z-two24B*fw); |
|
611 |
// jz += 1; q0 += 24; |
|
612 |
// iq[jz] = (int) fw; |
|
613 |
// } else iq[jz] = (int) z ; |
|
614 |
// } |
|
615 |
// |
|
616 |
// /* convert integer "bit" chunk to floating-point value */ |
|
617 |
// fw = scalbnA(one,q0); |
|
618 |
// for(i=jz;i>=0;i--) { |
|
619 |
// q[i] = fw*(double)iq[i]; fw*=twon24; |
|
620 |
// } |
|
621 |
// |
|
622 |
// /* compute PIo2[0,...,jp]*q[jz,...,0] */ |
|
623 |
// for(i=jz;i>=0;i--) { |
|
624 |
// for(fw=0.0,k=0;k<=jp&&k<=jz-i;k++) fw += PIo2[k]*q[i+k]; |
|
625 |
// fq[jz-i] = fw; |
|
626 |
// } |
|
627 |
// |
|
628 |
// // NOTE: switch below is eliminated, because prec is always 2 for doubles |
|
629 |
// /* compress fq[] into y[] */ |
|
630 |
// //switch(prec) { |
|
631 |
// //case 0: |
|
632 |
// // fw = 0.0; |
|
633 |
// // for (i=jz;i>=0;i--) fw += fq[i]; |
|
634 |
// // y[0] = (ih==0)? fw: -fw; |
|
635 |
// // break; |
|
636 |
// //case 1: |
|
637 |
// //case 2: |
|
638 |
// fw = 0.0; |
|
639 |
// for (i=jz;i>=0;i--) fw += fq[i]; |
|
640 |
// y[0] = (ih==0)? fw: -fw; |
|
641 |
// fw = fq[0]-fw; |
|
642 |
// for (i=1;i<=jz;i++) fw += fq[i]; |
|
643 |
// y[1] = (ih==0)? fw: -fw; |
|
644 |
// // break; |
|
645 |
// //case 3: /* painful */ |
|
646 |
// // for (i=jz;i>0;i--) { |
|
647 |
// // fw = fq[i-1]+fq[i]; |
|
648 |
// // fq[i] += fq[i-1]-fw; |
|
649 |
// // fq[i-1] = fw; |
|
650 |
// // } |
|
651 |
// // for (i=jz;i>1;i--) { |
|
652 |
// // fw = fq[i-1]+fq[i]; |
|
653 |
// // fq[i] += fq[i-1]-fw; |
|
654 |
// // fq[i-1] = fw; |
|
655 |
// // } |
|
656 |
// // for (fw=0.0,i=jz;i>=2;i--) fw += fq[i]; |
|
657 |
// // if(ih==0) { |
|
658 |
// // y[0] = fq[0]; y[1] = fq[1]; y[2] = fw; |
|
659 |
// // } else { |
|
660 |
// // y[0] = -fq[0]; y[1] = -fq[1]; y[2] = -fw; |
|
661 |
// // } |
|
662 |
// //} |
|
663 |
// return n&7; |
|
664 |
//} |
|
665 |
// |
|
666 |
// END __kernel_rem_pio2 PSEUDO CODE |
|
667 |
// |
|
668 |
// Changes between fdlibm and intrinsic: |
|
669 |
// 1. One loop is unrolled and vectorized (see comments in code) |
|
670 |
// 2. One loop is split into 2 loops (see comments in code) |
|
671 |
// 3. Non-double code is removed(last switch). Sevaral variables became |
|
672 |
// constants because of that (see comments in code) |
|
673 |
// 4. Use of jx, which is nx-1 instead of nx |
|
674 |
// Assumptions: |
|
675 |
// 1. Assume |X| >= PI/4 |
|
676 |
// Input and output: |
|
677 |
// 1. Input: X = r0, jx == nx - 1 == r6, e0 == rscratch1 |
|
678 |
// 2. Return n in r2, y[0] == y0 == v4, y[1] == y1 == v5 |
|
679 |
// NOTE: general purpose register names match local variable names in C code |
|
680 |
// NOTE: fpu registers are actively reused. See comments in code about their usage |
|
681 |
void MacroAssembler::generate__kernel_rem_pio2(address two_over_pi, address pio2) { |
|
682 |
Label Q_DONE, JX_IS_0, JX_IS_2, COMP_INNER_LOOP, RECOMP_FOR2, Q0_ZERO_CMP_LT, |
|
683 |
RECOMP_CHECK_DONE_NOT_ZERO, Q0_ZERO_CMP_DONE, COMP_FOR, Q0_ZERO_CMP_EQ, |
|
684 |
INIT_F_ZERO, RECOMPUTE, IH_FOR_INCREMENT, IH_FOR_STORE, RECOMP_CHECK_DONE, |
|
685 |
Z_IS_LESS_THAN_TWO24B, Z_IS_ZERO, FW_Y1_NO_NEGATION, |
|
686 |
RECOMP_FW_UPDATED, Z_ZERO_CHECK_DONE, FW_FOR1, IH_AFTER_SWITCH, IH_HANDLED, |
|
687 |
CONVERTION_FOR, FW_Y0_NO_NEGATION, FW_FOR1_DONE, FW_FOR2, FW_FOR2_DONE, |
|
688 |
IH_FOR, SKIP_F_LOAD, RECOMP_FOR1, RECOMP_FIRST_FOR, INIT_F_COPY, |
|
689 |
RECOMP_FOR1_CHECK; |
|
690 |
Register tmp2 = r1, n = r2, jv = r4, tmp5 = r5, jx = r6, |
|
691 |
tmp3 = r7, iqBase = r10, ih = r11, tmp4 = r12, tmp1 = r13, |
|
692 |
jz = r14, j = r15, twoOverPiBase = r16, i = r17, qBase = r18; |
|
693 |
// jp = jk == init_jk[prec] = init_jk[2] == {2,3,4,6}[2] == 4 |
|
694 |
// jx = nx - 1 |
|
695 |
lea(twoOverPiBase, ExternalAddress(two_over_pi)); |
|
696 |
cmpw(jv, zr); |
|
697 |
addw(tmp4, jx, 4); // tmp4 = m = jx + jk = jx + 4. jx is in {0,1,2} so m is in [4,5,6] |
|
698 |
cselw(jv, jv, zr, GE); |
|
55398
e53ec3b362f4
8224851: AArch64: fix warnings and errors with Clang and GCC 8.3
ngasson
parents:
51739
diff
changeset
|
699 |
fmovd(v26, 0.0); |
50754 | 700 |
addw(tmp5, jv, 1); // jv+1 |
701 |
subsw(j, jv, jx); |
|
702 |
add(qBase, sp, 320); // base of q[] |
|
703 |
msubw(rscratch1, i, tmp5, rscratch1); // q0 = e0-24*(jv+1) |
|
704 |
// use double f[20], fq[20], q[20], iq[20] on stack, which is |
|
705 |
// (20 + 20 + 20) x 8 + 20 x 4 = 560 bytes. From lower to upper addresses it |
|
706 |
// will contain f[20], fq[20], q[20], iq[20] |
|
707 |
// now initialize f[20] indexes 0..m (inclusive) |
|
708 |
// for(i=0;i<=m;i++,j++) f[i] = (j<0)? zeroB : /* NOTE: converted to double */ ipio2[j]; // (double) ipio2[j]; |
|
709 |
mov(tmp5, sp); |
|
710 |
||
711 |
block_comment("for(i=0;i<=m;i++,j++) f[i] = (j<0)? zeroB : /* NOTE: converted to double */ ipio2[j]; // (double) ipio2[j];"); { |
|
712 |
eorw(i, i, i); |
|
713 |
br(GE, INIT_F_COPY); |
|
714 |
bind(INIT_F_ZERO); |
|
715 |
stpq(v26, v26, Address(post(tmp5, 32))); |
|
716 |
addw(i, i, 4); |
|
717 |
addsw(j, j, 4); |
|
718 |
br(LT, INIT_F_ZERO); |
|
719 |
subw(i, i, j); |
|
720 |
movw(j, zr); |
|
721 |
bind(INIT_F_COPY); |
|
722 |
add(tmp1, twoOverPiBase, j, LSL, 3); // ipio2[j] start address |
|
723 |
ld1(v18, v19, v20, v21, T16B, tmp1); |
|
724 |
add(tmp5, sp, i, ext::uxtx, 3); |
|
725 |
st1(v18, v19, v20, v21, T16B, tmp5); |
|
726 |
} |
|
727 |
// v18..v21 can actually contain f[0..7] |
|
728 |
cbz(i, SKIP_F_LOAD); // i == 0 => f[i] == f[0] => already loaded |
|
729 |
ld1(v18, v19, v20, v21, T2D, Address(sp)); // load f[0..7] |
|
730 |
bind(SKIP_F_LOAD); |
|
731 |
// calculate 2^q0 and 2^-q0, which we'll need further. |
|
732 |
// q0 is exponent. So, calculate biased exponent(q0+1023) |
|
733 |
negw(tmp4, rscratch1); |
|
734 |
addw(tmp5, rscratch1, 1023); |
|
735 |
addw(tmp4, tmp4, 1023); |
|
736 |
// Unroll following for(s) depending on jx in [0,1,2] |
|
737 |
// for (i=0;i<=jk;i++) { |
|
738 |
// for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j]; q[i] = fw; |
|
739 |
// } |
|
740 |
// Unrolling for jx == 0 case: |
|
741 |
// q[0] = x[0] * f[0] |
|
742 |
// q[1] = x[0] * f[1] |
|
743 |
// q[2] = x[0] * f[2] |
|
744 |
// q[3] = x[0] * f[3] |
|
745 |
// q[4] = x[0] * f[4] |
|
746 |
// |
|
747 |
// Vectorization for unrolled jx == 0 case: |
|
748 |
// {q[0], q[1]} = {f[0], f[1]} * x[0] |
|
749 |
// {q[2], q[3]} = {f[2], f[3]} * x[0] |
|
750 |
// q[4] = f[4] * x[0] |
|
751 |
// |
|
752 |
// Unrolling for jx == 1 case: |
|
753 |
// q[0] = x[0] * f[1] + x[1] * f[0] |
|
754 |
// q[1] = x[0] * f[2] + x[1] * f[1] |
|
755 |
// q[2] = x[0] * f[3] + x[1] * f[2] |
|
756 |
// q[3] = x[0] * f[4] + x[1] * f[3] |
|
757 |
// q[4] = x[0] * f[5] + x[1] * f[4] |
|
758 |
// |
|
759 |
// Vectorization for unrolled jx == 1 case: |
|
760 |
// {q[0], q[1]} = {f[0], f[1]} * x[1] |
|
761 |
// {q[2], q[3]} = {f[2], f[3]} * x[1] |
|
762 |
// q[4] = f[4] * x[1] |
|
763 |
// {q[0], q[1]} += {f[1], f[2]} * x[0] |
|
764 |
// {q[2], q[3]} += {f[3], f[4]} * x[0] |
|
765 |
// q[4] += f[5] * x[0] |
|
766 |
// |
|
767 |
// Unrolling for jx == 2 case: |
|
768 |
// q[0] = x[0] * f[2] + x[1] * f[1] + x[2] * f[0] |
|
769 |
// q[1] = x[0] * f[3] + x[1] * f[2] + x[2] * f[1] |
|
770 |
// q[2] = x[0] * f[4] + x[1] * f[3] + x[2] * f[2] |
|
771 |
// q[3] = x[0] * f[5] + x[1] * f[4] + x[2] * f[3] |
|
772 |
// q[4] = x[0] * f[6] + x[1] * f[5] + x[2] * f[4] |
|
773 |
// |
|
774 |
// Vectorization for unrolled jx == 2 case: |
|
775 |
// {q[0], q[1]} = {f[0], f[1]} * x[2] |
|
776 |
// {q[2], q[3]} = {f[2], f[3]} * x[2] |
|
777 |
// q[4] = f[4] * x[2] |
|
778 |
// {q[0], q[1]} += {f[1], f[2]} * x[1] |
|
779 |
// {q[2], q[3]} += {f[3], f[4]} * x[1] |
|
780 |
// q[4] += f[5] * x[1] |
|
781 |
// {q[0], q[1]} += {f[2], f[3]} * x[0] |
|
782 |
// {q[2], q[3]} += {f[4], f[5]} * x[0] |
|
783 |
// q[4] += f[6] * x[0] |
|
784 |
block_comment("unrolled and vectorized computation of q[0]..q[jk]"); { |
|
785 |
cmpw(jx, 1); |
|
786 |
lsl(tmp5, tmp5, 52); // now it's 2^q0 double value |
|
787 |
lsl(tmp4, tmp4, 52); // now it's 2^-q0 double value |
|
788 |
br(LT, JX_IS_0); |
|
789 |
add(i, sp, 8); |
|
790 |
ldpq(v26, v27, i); // load f[1..4] |
|
791 |
br(GT, JX_IS_2); |
|
792 |
// jx == 1 |
|
793 |
fmulxvs(v28, T2D, v18, v7); // f[0,1] * x[1] |
|
794 |
fmulxvs(v29, T2D, v19, v7); // f[2,3] * x[1] |
|
795 |
fmuld(v30, v20, v7); // f[4] * x[1] |
|
796 |
fmlavs(v28, T2D, v26, v6, 0); |
|
797 |
fmlavs(v29, T2D, v27, v6, 0); |
|
798 |
fmlavs(v30, T2D, v6, v20, 1); // v30 += f[5] * x[0] |
|
799 |
b(Q_DONE); |
|
800 |
bind(JX_IS_2); |
|
801 |
fmulxvs(v28, T2D, v18, v3); // f[0,1] * x[2] |
|
802 |
fmulxvs(v29, T2D, v19, v3); // f[2,3] * x[2] |
|
803 |
fmuld(v30, v20, v3); // f[4] * x[2] |
|
804 |
fmlavs(v28, T2D, v26, v7, 0); |
|
805 |
fmlavs(v29, T2D, v27, v7, 0); |
|
806 |
fmlavs(v30, T2D, v7, v20, 1); // v30 += f[5] * x[1] |
|
807 |
fmlavs(v28, T2D, v19, v6, 0); |
|
808 |
fmlavs(v29, T2D, v20, v6, 0); |
|
809 |
fmlavs(v30, T2D, v6, v21, 0); // v30 += f[6] * x[0] |
|
810 |
b(Q_DONE); |
|
811 |
bind(JX_IS_0); |
|
812 |
fmulxvs(v28, T2D, v18, v6); // f[0,1] * x[0] |
|
813 |
fmulxvs(v29, T2D, v19, v6); // f[2,3] * x[0] |
|
814 |
fmuld(v30, v20, v6); // f[4] * x[0] |
|
815 |
bind(Q_DONE); |
|
816 |
st1(v28, v29, v30, T2D, Address(qBase)); // save calculated q[0]...q[jk] |
|
817 |
} |
|
818 |
movz(i, 0x3E70, 48); |
|
819 |
movw(jz, 4); |
|
820 |
fmovd(v17, i); // v17 = twon24 |
|
821 |
fmovd(v30, tmp5); // 2^q0 |
|
55398
e53ec3b362f4
8224851: AArch64: fix warnings and errors with Clang and GCC 8.3
ngasson
parents:
51739
diff
changeset
|
822 |
fmovd(v21, 0.125); |
e53ec3b362f4
8224851: AArch64: fix warnings and errors with Clang and GCC 8.3
ngasson
parents:
51739
diff
changeset
|
823 |
fmovd(v20, 8.0); |
50754 | 824 |
fmovd(v22, tmp4); // 2^-q0 |
825 |
||
826 |
block_comment("recompute loop"); { |
|
827 |
bind(RECOMPUTE); |
|
828 |
// for(i=0,j=jz,z=q[jz];j>0;i++,j--) { |
|
829 |
// fw = (double)((int)(twon24* z)); |
|
830 |
// iq[i] = (int)(z-two24A*fw); |
|
831 |
// z = q[j-1]+fw; |
|
832 |
// } |
|
833 |
block_comment("distill q[] into iq[] reversingly"); { |
|
834 |
eorw(i, i, i); |
|
835 |
movw(j, jz); |
|
836 |
add(tmp2, qBase, jz, LSL, 3); // q[jz] address |
|
837 |
ldrd(v18, post(tmp2, -8)); // z = q[j] and moving address to q[j-1] |
|
838 |
bind(RECOMP_FIRST_FOR); |
|
839 |
ldrd(v27, post(tmp2, -8)); |
|
840 |
fmuld(v29, v17, v18); // twon24*z |
|
841 |
frintzd(v29, v29); // (double)(int) |
|
842 |
fmsubd(v28, v10, v29, v18); // v28 = z-two24A*fw |
|
843 |
fcvtzdw(tmp1, v28); // (int)(z-two24A*fw) |
|
844 |
strw(tmp1, Address(iqBase, i, Address::lsl(2))); |
|
845 |
faddd(v18, v27, v29); |
|
846 |
add(i, i, 1); |
|
847 |
subs(j, j, 1); |
|
848 |
br(GT, RECOMP_FIRST_FOR); |
|
849 |
} |
|
850 |
// compute n |
|
851 |
fmuld(v18, v18, v30); |
|
852 |
fmuld(v2, v18, v21); |
|
853 |
frintmd(v2, v2); // v2 = floor(v2) == rounding towards -inf |
|
854 |
fmsubd(v18, v2, v20, v18); // z -= 8.0*floor(z*0.125); |
|
855 |
movw(ih, 2); |
|
856 |
frintzd(v2, v18); // v2 = (double)((int)z) |
|
857 |
fcvtzdw(n, v18); // n = (int) z; |
|
858 |
fsubd(v18, v18, v2); // z -= (double)n; |
|
859 |
||
860 |
block_comment("q0-dependent initialization"); { |
|
861 |
cmpw(rscratch1, 0); // if (q0 > 0) |
|
862 |
br(LT, Q0_ZERO_CMP_LT); |
|
863 |
subw(j, jz, 1); // j = jz - 1 |
|
864 |
ldrw(tmp2, Address(iqBase, j, Address::lsl(2))); // tmp2 = iq[jz-1] |
|
865 |
br(EQ, Q0_ZERO_CMP_EQ); |
|
866 |
movw(tmp4, 24); |
|
867 |
subw(tmp4, tmp4, rscratch1); // == 24 - q0 |
|
868 |
lsrvw(i, tmp2, tmp4); // i = iq[jz-1] >> (24-q0) |
|
869 |
lslvw(tmp5, i, tmp4); |
|
870 |
subw(tmp2, tmp2, tmp5); // iq[jz-1] -= i<<(24-q0); |
|
871 |
strw(tmp2, Address(iqBase, j, Address::lsl(2))); // store iq[jz-1] |
|
872 |
subw(rscratch2, tmp4, 1); // == 23 - q0 |
|
873 |
addw(n, n, i); // n+=i |
|
874 |
lsrvw(ih, tmp2, rscratch2); // ih = iq[jz-1] >> (23-q0) |
|
875 |
b(Q0_ZERO_CMP_DONE); |
|
876 |
bind(Q0_ZERO_CMP_EQ); |
|
877 |
lsr(ih, tmp2, 23); // ih = iq[z-1] >> 23 |
|
878 |
b(Q0_ZERO_CMP_DONE); |
|
879 |
bind(Q0_ZERO_CMP_LT); |
|
55398
e53ec3b362f4
8224851: AArch64: fix warnings and errors with Clang and GCC 8.3
ngasson
parents:
51739
diff
changeset
|
880 |
fmovd(v4, 0.5); |
50754 | 881 |
fcmpd(v18, v4); |
882 |
cselw(ih, zr, ih, LT); // if (z<0.5) ih = 0 |
|
883 |
} |
|
884 |
bind(Q0_ZERO_CMP_DONE); |
|
885 |
cmpw(ih, zr); |
|
886 |
br(LE, IH_HANDLED); |
|
887 |
||
888 |
block_comment("if(ih>) {"); { |
|
889 |
// use rscratch2 as carry |
|
890 |
||
891 |
block_comment("for(i=0;i<jz ;i++) {...}"); { |
|
892 |
addw(n, n, 1); |
|
893 |
eorw(i, i, i); |
|
894 |
eorw(rscratch2, rscratch2, rscratch2); |
|
895 |
bind(IH_FOR); |
|
896 |
ldrw(j, Address(iqBase, i, Address::lsl(2))); // j = iq[i] |
|
897 |
movw(tmp3, 0x1000000); |
|
898 |
subw(tmp3, tmp3, rscratch2); |
|
899 |
cbnzw(rscratch2, IH_FOR_STORE); |
|
900 |
cbzw(j, IH_FOR_INCREMENT); |
|
901 |
movw(rscratch2, 1); |
|
902 |
bind(IH_FOR_STORE); |
|
903 |
subw(tmp3, tmp3, j); |
|
904 |
strw(tmp3, Address(iqBase, i, Address::lsl(2))); // iq[i] = 0xffffff - j |
|
905 |
bind(IH_FOR_INCREMENT); |
|
906 |
addw(i, i, 1); |
|
907 |
cmpw(i, jz); |
|
908 |
br(LT, IH_FOR); |
|
909 |
} |
|
910 |
||
911 |
block_comment("if(q0>0) {"); { |
|
912 |
cmpw(rscratch1, zr); |
|
913 |
br(LE, IH_AFTER_SWITCH); |
|
914 |
// tmp3 still has iq[jz-1] value. no need to reload |
|
915 |
// now, zero high tmp3 bits (rscratch1 number of bits) |
|
916 |
movw(j, -1); |
|
917 |
subw(i, jz, 1); // set i to jz-1 |
|
918 |
lsrv(j, j, rscratch1); |
|
919 |
andw(tmp3, tmp3, j, LSR, 8); // we have 24-bit-based constants |
|
920 |
strw(tmp3, Address(iqBase, i, Address::lsl(2))); // save iq[jz-1] |
|
921 |
} |
|
922 |
bind(IH_AFTER_SWITCH); |
|
923 |
cmpw(ih, 2); |
|
924 |
br(NE, IH_HANDLED); |
|
925 |
||
926 |
block_comment("if(ih==2) {"); { |
|
55398
e53ec3b362f4
8224851: AArch64: fix warnings and errors with Clang and GCC 8.3
ngasson
parents:
51739
diff
changeset
|
927 |
fmovd(v25, 1.0); |
50754 | 928 |
fsubd(v18, v25, v18); // z = one - z; |
929 |
cbzw(rscratch2, IH_HANDLED); |
|
930 |
fsubd(v18, v18, v30); // z -= scalbnA(one,q0); |
|
931 |
} |
|
932 |
} |
|
933 |
bind(IH_HANDLED); |
|
934 |
// check if recomputation is needed |
|
55398
e53ec3b362f4
8224851: AArch64: fix warnings and errors with Clang and GCC 8.3
ngasson
parents:
51739
diff
changeset
|
935 |
fcmpd(v18, 0.0); |
50754 | 936 |
br(NE, RECOMP_CHECK_DONE_NOT_ZERO); |
937 |
||
938 |
block_comment("if(z==zeroB) {"); { |
|
939 |
||
940 |
block_comment("for (i=jz-1;i>=jk;i--) j |= iq[i];"); { |
|
941 |
subw(i, jz, 1); |
|
942 |
eorw(j, j, j); |
|
943 |
b(RECOMP_FOR1_CHECK); |
|
944 |
bind(RECOMP_FOR1); |
|
945 |
ldrw(tmp1, Address(iqBase, i, Address::lsl(2))); |
|
946 |
orrw(j, j, tmp1); |
|
947 |
subw(i, i, 1); |
|
948 |
bind(RECOMP_FOR1_CHECK); |
|
949 |
cmpw(i, 4); |
|
950 |
br(GE, RECOMP_FOR1); |
|
951 |
} |
|
952 |
cbnzw(j, RECOMP_CHECK_DONE); |
|
953 |
||
954 |
block_comment("if(j==0) {"); { |
|
955 |
// for(k=1;iq[jk-k]==0;k++); // let's unroll it. jk == 4. So, read |
|
956 |
// iq[3], iq[2], iq[1], iq[0] until non-zero value |
|
957 |
ldp(tmp1, tmp3, iqBase); // iq[0..3] |
|
958 |
movw(j, 2); |
|
959 |
cmp(tmp3, zr); |
|
960 |
csel(tmp1, tmp1, tmp3, EQ); // set register for further consideration |
|
961 |
cselw(j, j, zr, EQ); // set initial k. Use j as k |
|
962 |
cmp(zr, tmp1, LSR, 32); |
|
963 |
addw(i, jz, 1); |
|
964 |
csincw(j, j, j, NE); |
|
965 |
||
966 |
block_comment("for(i=jz+1;i<=jz+k;i++) {...}"); { |
|
967 |
addw(jz, i, j); // i = jz+1, j = k-1. j+i = jz+k (which is a new jz) |
|
968 |
bind(RECOMP_FOR2); |
|
969 |
addw(tmp1, jv, i); |
|
970 |
ldrd(v29, Address(twoOverPiBase, tmp1, Address::lsl(3))); |
|
971 |
addw(tmp2, jx, i); |
|
972 |
strd(v29, Address(sp, tmp2, Address::lsl(3))); |
|
973 |
// f[jx+i] = /* NOTE: converted to double */ ipio2[jv+i]; //(double) ipio2[jv+i]; |
|
974 |
// since jx = 0, 1 or 2 we can unroll it: |
|
975 |
// for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j]; |
|
976 |
// f[jx+i-j] == (for first iteration) f[jx+i], which is already v29 |
|
977 |
add(tmp2, sp, tmp2, ext::uxtx, 3); // address of f[jx+i] |
|
978 |
ldpd(v4, v5, Address(tmp2, -16)); // load f[jx+i-2] and f[jx+i-1] |
|
979 |
fmuld(v26, v6, v29); // initial fw |
|
980 |
cbzw(jx, RECOMP_FW_UPDATED); |
|
981 |
fmaddd(v26, v7, v5, v26); |
|
982 |
cmpw(jx, 1); |
|
983 |
br(EQ, RECOMP_FW_UPDATED); |
|
984 |
fmaddd(v26, v3, v4, v26); |
|
985 |
bind(RECOMP_FW_UPDATED); |
|
986 |
strd(v26, Address(qBase, i, Address::lsl(3))); // q[i] = fw; |
|
987 |
addw(i, i, 1); |
|
988 |
cmpw(i, jz); // jz here is "old jz" + k |
|
989 |
br(LE, RECOMP_FOR2); |
|
990 |
} |
|
991 |
b(RECOMPUTE); |
|
992 |
} |
|
993 |
} |
|
994 |
} |
|
995 |
bind(RECOMP_CHECK_DONE); |
|
996 |
// chop off zero terms |
|
55398
e53ec3b362f4
8224851: AArch64: fix warnings and errors with Clang and GCC 8.3
ngasson
parents:
51739
diff
changeset
|
997 |
fcmpd(v18, 0.0); |
50754 | 998 |
br(EQ, Z_IS_ZERO); |
999 |
||
1000 |
block_comment("else block of if(z==0.0) {"); { |
|
1001 |
bind(RECOMP_CHECK_DONE_NOT_ZERO); |
|
1002 |
fmuld(v18, v18, v22); |
|
1003 |
fcmpd(v18, v10); // v10 is stil two24A |
|
1004 |
br(LT, Z_IS_LESS_THAN_TWO24B); |
|
1005 |
fmuld(v1, v18, v17); // twon24*z |
|
1006 |
frintzd(v1, v1); // v1 = (double)(int)(v1) |
|
51739
7bed934d439e
8210461: AArch64: Math.cos intrinsic gives incorrect results
dpochepk
parents:
51374
diff
changeset
|
1007 |
fmsubd(v2, v10, v1, v18); |
50754 | 1008 |
fcvtzdw(tmp3, v1); // (int)fw |
1009 |
fcvtzdw(tmp2, v2); // double to int |
|
1010 |
strw(tmp2, Address(iqBase, jz, Address::lsl(2))); |
|
1011 |
addw(rscratch1, rscratch1, 24); |
|
1012 |
addw(jz, jz, 1); |
|
1013 |
strw(tmp3, Address(iqBase, jz, Address::lsl(2))); // iq[jz] = (int) fw |
|
1014 |
b(Z_ZERO_CHECK_DONE); |
|
1015 |
bind(Z_IS_LESS_THAN_TWO24B); |
|
1016 |
fcvtzdw(tmp3, v18); // (int)z |
|
1017 |
strw(tmp3, Address(iqBase, jz, Address::lsl(2))); // iq[jz] = (int) z |
|
1018 |
b(Z_ZERO_CHECK_DONE); |
|
1019 |
} |
|
1020 |
||
1021 |
block_comment("if(z==0.0) {"); { |
|
1022 |
bind(Z_IS_ZERO); |
|
1023 |
subw(jz, jz, 1); |
|
1024 |
ldrw(tmp1, Address(iqBase, jz, Address::lsl(2))); |
|
1025 |
subw(rscratch1, rscratch1, 24); |
|
1026 |
cbz(tmp1, Z_IS_ZERO); |
|
1027 |
} |
|
1028 |
bind(Z_ZERO_CHECK_DONE); |
|
1029 |
// convert integer "bit" chunk to floating-point value |
|
1030 |
// v17 = twon24 |
|
1031 |
// update v30, which was scalbnA(1.0, <old q0>); |
|
1032 |
addw(tmp2, rscratch1, 1023); // biased exponent |
|
1033 |
lsl(tmp2, tmp2, 52); // put at correct position |
|
1034 |
mov(i, jz); |
|
1035 |
fmovd(v30, tmp2); |
|
1036 |
||
1037 |
block_comment("for(i=jz;i>=0;i--) {q[i] = fw*(double)iq[i]; fw*=twon24;}"); { |
|
1038 |
bind(CONVERTION_FOR); |
|
1039 |
ldrw(tmp1, Address(iqBase, i, Address::lsl(2))); |
|
1040 |
scvtfwd(v31, tmp1); |
|
1041 |
fmuld(v31, v31, v30); |
|
1042 |
strd(v31, Address(qBase, i, Address::lsl(3))); |
|
1043 |
fmuld(v30, v30, v17); |
|
1044 |
subsw(i, i, 1); |
|
1045 |
br(GE, CONVERTION_FOR); |
|
1046 |
} |
|
1047 |
add(rscratch2, sp, 160); // base for fq |
|
1048 |
// reusing twoOverPiBase |
|
1049 |
lea(twoOverPiBase, ExternalAddress(pio2)); |
|
1050 |
||
1051 |
block_comment("compute PIo2[0,...,jp]*q[jz,...,0]. for(i=jz;i>=0;i--) {...}"); { |
|
1052 |
movw(i, jz); |
|
1053 |
movw(tmp2, zr); // tmp2 will keep jz - i == 0 at start |
|
1054 |
bind(COMP_FOR); |
|
1055 |
// for(fw=0.0,k=0;k<=jp&&k<=jz-i;k++) fw += PIo2[k]*q[i+k]; |
|
55398
e53ec3b362f4
8224851: AArch64: fix warnings and errors with Clang and GCC 8.3
ngasson
parents:
51739
diff
changeset
|
1056 |
fmovd(v30, 0.0); |
50754 | 1057 |
add(tmp5, qBase, i, LSL, 3); // address of q[i+k] for k==0 |
1058 |
movw(tmp3, 4); |
|
1059 |
movw(tmp4, zr); // used as k |
|
1060 |
cmpw(tmp2, 4); |
|
1061 |
add(tmp1, qBase, i, LSL, 3); // used as q[i] address |
|
1062 |
cselw(tmp3, tmp2, tmp3, LE); // min(jz - i, jp) |
|
1063 |
||
1064 |
block_comment("for(fw=0.0,k=0;k<=jp&&k<=jz-i;k++) fw += PIo2[k]*q[i+k];"); { |
|
1065 |
bind(COMP_INNER_LOOP); |
|
1066 |
ldrd(v18, Address(tmp1, tmp4, Address::lsl(3))); // q[i+k] |
|
1067 |
ldrd(v19, Address(twoOverPiBase, tmp4, Address::lsl(3))); // PIo2[k] |
|
1068 |
fmaddd(v30, v18, v19, v30); // fw += PIo2[k]*q[i+k]; |
|
1069 |
addw(tmp4, tmp4, 1); // k++ |
|
1070 |
cmpw(tmp4, tmp3); |
|
1071 |
br(LE, COMP_INNER_LOOP); |
|
1072 |
} |
|
1073 |
strd(v30, Address(rscratch2, tmp2, Address::lsl(3))); // fq[jz-i] |
|
1074 |
add(tmp2, tmp2, 1); |
|
1075 |
subsw(i, i, 1); |
|
1076 |
br(GE, COMP_FOR); |
|
1077 |
} |
|
1078 |
||
1079 |
block_comment("switch(prec) {...}. case 2:"); { |
|
1080 |
// compress fq into y[] |
|
1081 |
// remember prec == 2 |
|
1082 |
||
1083 |
block_comment("for (i=jz;i>=0;i--) fw += fq[i];"); { |
|
55398
e53ec3b362f4
8224851: AArch64: fix warnings and errors with Clang and GCC 8.3
ngasson
parents:
51739
diff
changeset
|
1084 |
fmovd(v4, 0.0); |
50754 | 1085 |
mov(i, jz); |
1086 |
bind(FW_FOR1); |
|
1087 |
ldrd(v1, Address(rscratch2, i, Address::lsl(3))); |
|
1088 |
subsw(i, i, 1); |
|
1089 |
faddd(v4, v4, v1); |
|
1090 |
br(GE, FW_FOR1); |
|
1091 |
} |
|
1092 |
bind(FW_FOR1_DONE); |
|
1093 |
// v1 contains fq[0]. so, keep it so far |
|
1094 |
fsubd(v5, v1, v4); // fw = fq[0] - fw |
|
1095 |
cbzw(ih, FW_Y0_NO_NEGATION); |
|
1096 |
fnegd(v4, v4); |
|
1097 |
bind(FW_Y0_NO_NEGATION); |
|
1098 |
||
1099 |
block_comment("for (i=1;i<=jz;i++) fw += fq[i];"); { |
|
1100 |
movw(i, 1); |
|
1101 |
cmpw(jz, 1); |
|
1102 |
br(LT, FW_FOR2_DONE); |
|
1103 |
bind(FW_FOR2); |
|
1104 |
ldrd(v1, Address(rscratch2, i, Address::lsl(3))); |
|
1105 |
addw(i, i, 1); |
|
1106 |
cmp(i, jz); |
|
1107 |
faddd(v5, v5, v1); |
|
1108 |
br(LE, FW_FOR2); |
|
1109 |
} |
|
1110 |
bind(FW_FOR2_DONE); |
|
1111 |
cbz(ih, FW_Y1_NO_NEGATION); |
|
1112 |
fnegd(v5, v5); |
|
1113 |
bind(FW_Y1_NO_NEGATION); |
|
1114 |
add(sp, sp, 560); |
|
1115 |
} |
|
1116 |
} |
|
1117 |
||
1118 |
///* __kernel_sin( x, y, iy) |
|
1119 |
// * kernel sin function on [-pi/4, pi/4], pi/4 ~ 0.7854 |
|
1120 |
// * Input x is assumed to be bounded by ~pi/4 in magnitude. |
|
1121 |
// * Input y is the tail of x. |
|
1122 |
// * Input iy indicates whether y is 0. (if iy=0, y assume to be 0). |
|
1123 |
// * |
|
1124 |
// * Algorithm |
|
1125 |
// * 1. Since sin(-x) = -sin(x), we need only to consider positive x. |
|
1126 |
// * 2. if x < 2^-27 (hx<0x3e400000 0), return x with inexact if x!=0. |
|
1127 |
// * 3. sin(x) is approximated by a polynomial of degree 13 on |
|
1128 |
// * [0,pi/4] |
|
1129 |
// * 3 13 |
|
1130 |
// * sin(x) ~ x + S1*x + ... + S6*x |
|
1131 |
// * where |
|
1132 |
// * |
|
1133 |
// * |sin(x) 2 4 6 8 10 12 | -58 |
|
1134 |
// * |----- - (1+S1*x +S2*x +S3*x +S4*x +S5*x +S6*x )| <= 2 |
|
1135 |
// * | x | |
|
1136 |
// * |
|
1137 |
// * 4. sin(x+y) = sin(x) + sin'(x')*y |
|
1138 |
// * ~ sin(x) + (1-x*x/2)*y |
|
1139 |
// * For better accuracy, let |
|
1140 |
// * 3 2 2 2 2 |
|
1141 |
// * r = x *(S2+x *(S3+x *(S4+x *(S5+x *S6)))) |
|
1142 |
// * then 3 2 |
|
1143 |
// * sin(x) = x + (S1*x + (x *(r-y/2)+y)) |
|
1144 |
// */ |
|
1145 |
//static const double |
|
1146 |
//S1 = -1.66666666666666324348e-01, /* 0xBFC55555, 0x55555549 */ |
|
1147 |
//S2 = 8.33333333332248946124e-03, /* 0x3F811111, 0x1110F8A6 */ |
|
1148 |
//S3 = -1.98412698298579493134e-04, /* 0xBF2A01A0, 0x19C161D5 */ |
|
1149 |
//S4 = 2.75573137070700676789e-06, /* 0x3EC71DE3, 0x57B1FE7D */ |
|
1150 |
//S5 = -2.50507602534068634195e-08, /* 0xBE5AE5E6, 0x8A2B9CEB */ |
|
1151 |
//S6 = 1.58969099521155010221e-10; /* 0x3DE5D93A, 0x5ACFD57C */ |
|
1152 |
// |
|
1153 |
// NOTE: S1..S6 were moved into a table: StubRoutines::aarch64::_dsin_coef |
|
1154 |
// |
|
1155 |
// BEGIN __kernel_sin PSEUDO CODE |
|
1156 |
// |
|
1157 |
//static double __kernel_sin(double x, double y, bool iy) |
|
1158 |
//{ |
|
1159 |
// double z,r,v; |
|
1160 |
// |
|
1161 |
// // NOTE: not needed. moved to dsin/dcos |
|
1162 |
// //int ix; |
|
1163 |
// //ix = high(x)&0x7fffffff; /* high word of x */ |
|
1164 |
// |
|
1165 |
// // NOTE: moved to dsin/dcos |
|
1166 |
// //if(ix<0x3e400000) /* |x| < 2**-27 */ |
|
1167 |
// // {if((int)x==0) return x;} /* generate inexact */ |
|
1168 |
// |
|
1169 |
// z = x*x; |
|
1170 |
// v = z*x; |
|
1171 |
// r = S2+z*(S3+z*(S4+z*(S5+z*S6))); |
|
1172 |
// if(iy==0) return x+v*(S1+z*r); |
|
1173 |
// else return x-((z*(half*y-v*r)-y)-v*S1); |
|
1174 |
//} |
|
1175 |
// |
|
1176 |
// END __kernel_sin PSEUDO CODE |
|
1177 |
// |
|
1178 |
// Changes between fdlibm and intrinsic: |
|
1179 |
// 1. Removed |x| < 2**-27 check, because if was done earlier in dsin/dcos |
|
1180 |
// 2. Constants are now loaded from table dsin_coef |
|
1181 |
// 3. C code parameter "int iy" was modified to "bool iyIsOne", because |
|
1182 |
// iy is always 0 or 1. Also, iyIsOne branch was moved into |
|
1183 |
// generation phase instead of taking it during code execution |
|
1184 |
// Input ans output: |
|
1185 |
// 1. Input for generated function: X argument = x |
|
1186 |
// 2. Input for generator: x = register to read argument from, iyIsOne |
|
1187 |
// = flag to use low argument low part or not, dsin_coef = coefficients |
|
1188 |
// table address |
|
1189 |
// 3. Return sin(x) value in v0 |
|
1190 |
void MacroAssembler::generate_kernel_sin(FloatRegister x, bool iyIsOne, |
|
1191 |
address dsin_coef) { |
|
1192 |
FloatRegister y = v5, z = v6, v = v7, r = v16, S1 = v17, S2 = v18, |
|
1193 |
S3 = v19, S4 = v20, S5 = v21, S6 = v22, half = v23; |
|
1194 |
lea(rscratch2, ExternalAddress(dsin_coef)); |
|
1195 |
ldpd(S5, S6, Address(rscratch2, 32)); |
|
1196 |
fmuld(z, x, x); // z = x*x; |
|
1197 |
ld1(S1, S2, S3, S4, T1D, Address(rscratch2)); |
|
1198 |
fmuld(v, z, x); // v = z*x; |
|
1199 |
||
1200 |
block_comment("calculate r = S2+z*(S3+z*(S4+z*(S5+z*S6)))"); { |
|
1201 |
fmaddd(r, z, S6, S5); |
|
1202 |
// initialize "half" in current block to utilize 2nd FPU. However, it's |
|
1203 |
// not a part of this block |
|
1204 |
fmovd(half, 0.5); |
|
1205 |
fmaddd(r, z, r, S4); |
|
1206 |
fmaddd(r, z, r, S3); |
|
1207 |
fmaddd(r, z, r, S2); |
|
1208 |
} |
|
1209 |
||
1210 |
if (!iyIsOne) { |
|
1211 |
// return x+v*(S1+z*r); |
|
1212 |
fmaddd(S1, z, r, S1); |
|
1213 |
fmaddd(v0, v, S1, x); |
|
1214 |
} else { |
|
1215 |
// return x-((z*(half*y-v*r)-y)-v*S1); |
|
1216 |
fmuld(S6, half, y); // half*y |
|
1217 |
fmsubd(S6, v, r, S6); // half*y-v*r |
|
1218 |
fmsubd(S6, z, S6, y); // y - z*(half*y-v*r) = - (z*(half*y-v*r)-y) |
|
1219 |
fmaddd(S6, v, S1, S6); // - (z*(half*y-v*r)-y) + v*S1 == -((z*(half*y-v*r)-y)-v*S1) |
|
1220 |
faddd(v0, x, S6); |
|
1221 |
} |
|
1222 |
} |
|
1223 |
||
1224 |
///* |
|
1225 |
// * __kernel_cos( x, y ) |
|
1226 |
// * kernel cos function on [-pi/4, pi/4], pi/4 ~ 0.785398164 |
|
1227 |
// * Input x is assumed to be bounded by ~pi/4 in magnitude. |
|
1228 |
// * Input y is the tail of x. |
|
1229 |
// * |
|
1230 |
// * Algorithm |
|
1231 |
// * 1. Since cos(-x) = cos(x), we need only to consider positive x. |
|
1232 |
// * 2. if x < 2^-27 (hx<0x3e400000 0), return 1 with inexact if x!=0. |
|
1233 |
// * 3. cos(x) is approximated by a polynomial of degree 14 on |
|
1234 |
// * [0,pi/4] |
|
1235 |
// * 4 14 |
|
1236 |
// * cos(x) ~ 1 - x*x/2 + C1*x + ... + C6*x |
|
1237 |
// * where the remez error is |
|
1238 |
// * |
|
1239 |
// * | 2 4 6 8 10 12 14 | -58 |
|
1240 |
// * |cos(x)-(1-.5*x +C1*x +C2*x +C3*x +C4*x +C5*x +C6*x )| <= 2 |
|
1241 |
// * | | |
|
1242 |
// * |
|
1243 |
// * 4 6 8 10 12 14 |
|
1244 |
// * 4. let r = C1*x +C2*x +C3*x +C4*x +C5*x +C6*x , then |
|
1245 |
// * cos(x) = 1 - x*x/2 + r |
|
1246 |
// * since cos(x+y) ~ cos(x) - sin(x)*y |
|
1247 |
// * ~ cos(x) - x*y, |
|
1248 |
// * a correction term is necessary in cos(x) and hence |
|
1249 |
// * cos(x+y) = 1 - (x*x/2 - (r - x*y)) |
|
1250 |
// * For better accuracy when x > 0.3, let qx = |x|/4 with |
|
1251 |
// * the last 32 bits mask off, and if x > 0.78125, let qx = 0.28125. |
|
1252 |
// * Then |
|
1253 |
// * cos(x+y) = (1-qx) - ((x*x/2-qx) - (r-x*y)). |
|
1254 |
// * Note that 1-qx and (x*x/2-qx) is EXACT here, and the |
|
1255 |
// * magnitude of the latter is at least a quarter of x*x/2, |
|
1256 |
// * thus, reducing the rounding error in the subtraction. |
|
1257 |
// */ |
|
1258 |
// |
|
1259 |
//static const double |
|
1260 |
//C1 = 4.16666666666666019037e-02, /* 0x3FA55555, 0x5555554C */ |
|
1261 |
//C2 = -1.38888888888741095749e-03, /* 0xBF56C16C, 0x16C15177 */ |
|
1262 |
//C3 = 2.48015872894767294178e-05, /* 0x3EFA01A0, 0x19CB1590 */ |
|
1263 |
//C4 = -2.75573143513906633035e-07, /* 0xBE927E4F, 0x809C52AD */ |
|
1264 |
//C5 = 2.08757232129817482790e-09, /* 0x3E21EE9E, 0xBDB4B1C4 */ |
|
1265 |
//C6 = -1.13596475577881948265e-11; /* 0xBDA8FAE9, 0xBE8838D4 */ |
|
1266 |
// |
|
1267 |
// NOTE: C1..C6 were moved into a table: StubRoutines::aarch64::_dcos_coef |
|
1268 |
// |
|
1269 |
// BEGIN __kernel_cos PSEUDO CODE |
|
1270 |
// |
|
1271 |
//static double __kernel_cos(double x, double y) |
|
1272 |
//{ |
|
1273 |
// double a,h,z,r,qx=0; |
|
1274 |
// |
|
1275 |
// // NOTE: ix is already initialized in dsin/dcos. Reuse value from register |
|
1276 |
// //int ix; |
|
1277 |
// //ix = high(x)&0x7fffffff; /* ix = |x|'s high word*/ |
|
1278 |
// |
|
1279 |
// // NOTE: moved to dsin/dcos |
|
1280 |
// //if(ix<0x3e400000) { /* if x < 2**27 */ |
|
1281 |
// // if(((int)x)==0) return one; /* generate inexact */ |
|
1282 |
// //} |
|
1283 |
// |
|
1284 |
// z = x*x; |
|
1285 |
// r = z*(C1+z*(C2+z*(C3+z*(C4+z*(C5+z*C6))))); |
|
1286 |
// if(ix < 0x3FD33333) /* if |x| < 0.3 */ |
|
1287 |
// return one - (0.5*z - (z*r - x*y)); |
|
1288 |
// else { |
|
1289 |
// if(ix > 0x3fe90000) { /* x > 0.78125 */ |
|
1290 |
// qx = 0.28125; |
|
1291 |
// } else { |
|
1292 |
// set_high(&qx, ix-0x00200000); /* x/4 */ |
|
1293 |
// set_low(&qx, 0); |
|
1294 |
// } |
|
1295 |
// h = 0.5*z-qx; |
|
1296 |
// a = one-qx; |
|
1297 |
// return a - (h - (z*r-x*y)); |
|
1298 |
// } |
|
1299 |
//} |
|
1300 |
// |
|
1301 |
// END __kernel_cos PSEUDO CODE |
|
1302 |
// |
|
1303 |
// Changes between fdlibm and intrinsic: |
|
1304 |
// 1. Removed |x| < 2**-27 check, because if was done earlier in dsin/dcos |
|
1305 |
// 2. Constants are now loaded from table dcos_coef |
|
1306 |
// Input and output: |
|
1307 |
// 1. Input for generated function: X argument = x |
|
1308 |
// 2. Input for generator: x = register to read argument from, dcos_coef |
|
1309 |
// = coefficients table address |
|
1310 |
// 2. Return cos(x) value in v0 |
|
1311 |
void MacroAssembler::generate_kernel_cos(FloatRegister x, address dcos_coef) { |
|
1312 |
Register ix = r3; |
|
1313 |
FloatRegister qx = v1, h = v2, a = v3, y = v5, z = v6, r = v7, C1 = v18, |
|
1314 |
C2 = v19, C3 = v20, C4 = v21, C5 = v22, C6 = v23, one = v25, half = v26; |
|
1315 |
Label IX_IS_LARGE, SET_QX_CONST, DONE, QX_SET; |
|
1316 |
lea(rscratch2, ExternalAddress(dcos_coef)); |
|
1317 |
ldpd(C5, C6, Address(rscratch2, 32)); // load C5, C6 |
|
1318 |
fmuld(z, x, x); // z=x^2 |
|
1319 |
ld1(C1, C2, C3, C4, T1D, Address(rscratch2)); // load C1..C3\4 |
|
1320 |
block_comment("calculate r = z*(C1+z*(C2+z*(C3+z*(C4+z*(C5+z*C6)))))"); { |
|
1321 |
fmaddd(r, z, C6, C5); |
|
55398
e53ec3b362f4
8224851: AArch64: fix warnings and errors with Clang and GCC 8.3
ngasson
parents:
51739
diff
changeset
|
1322 |
fmovd(half, 0.5); |
50754 | 1323 |
fmaddd(r, z, r, C4); |
1324 |
fmuld(y, x, y); |
|
1325 |
fmaddd(r, z, r, C3); |
|
1326 |
mov(rscratch1, 0x3FD33333); |
|
1327 |
fmaddd(r, z, r, C2); |
|
1328 |
fmuld(x, z, z); // x = z^2 |
|
1329 |
fmaddd(r, z, r, C1); // r = C1+z(C2+z(C4+z(C5+z*C6))) |
|
1330 |
} |
|
1331 |
// need to multiply r by z to have "final" r value |
|
55398
e53ec3b362f4
8224851: AArch64: fix warnings and errors with Clang and GCC 8.3
ngasson
parents:
51739
diff
changeset
|
1332 |
fmovd(one, 1.0); |
50754 | 1333 |
cmp(ix, rscratch1); |
1334 |
br(GT, IX_IS_LARGE); |
|
1335 |
block_comment("if(ix < 0x3FD33333) return one - (0.5*z - (z*r - x*y))"); { |
|
1336 |
// return 1.0 - (0.5*z - (z*r - x*y)) = 1.0 - (0.5*z + (x*y - z*r)) |
|
1337 |
fmsubd(v0, x, r, y); |
|
1338 |
fmaddd(v0, half, z, v0); |
|
1339 |
fsubd(v0, one, v0); |
|
1340 |
b(DONE); |
|
1341 |
} |
|
1342 |
block_comment("if(ix >= 0x3FD33333)"); { |
|
1343 |
bind(IX_IS_LARGE); |
|
1344 |
movz(rscratch2, 0x3FE9, 16); |
|
1345 |
cmp(ix, rscratch2); |
|
1346 |
br(GT, SET_QX_CONST); |
|
1347 |
block_comment("set_high(&qx, ix-0x00200000); set_low(&qx, 0);"); { |
|
1348 |
subw(rscratch2, ix, 0x00200000); |
|
1349 |
lsl(rscratch2, rscratch2, 32); |
|
1350 |
fmovd(qx, rscratch2); |
|
1351 |
} |
|
1352 |
b(QX_SET); |
|
1353 |
bind(SET_QX_CONST); |
|
1354 |
block_comment("if(ix > 0x3fe90000) qx = 0.28125;"); { |
|
55398
e53ec3b362f4
8224851: AArch64: fix warnings and errors with Clang and GCC 8.3
ngasson
parents:
51739
diff
changeset
|
1355 |
fmovd(qx, 0.28125); |
50754 | 1356 |
} |
1357 |
bind(QX_SET); |
|
1358 |
fnmsub(C6, x, r, y); // z*r - xy |
|
1359 |
fnmsub(h, half, z, qx); // h = 0.5*z - qx |
|
1360 |
fsubd(a, one, qx); // a = 1-qx |
|
1361 |
fsubd(C6, h, C6); // = h - (z*r - x*y) |
|
1362 |
fsubd(v0, a, C6); |
|
1363 |
} |
|
1364 |
bind(DONE); |
|
1365 |
} |
|
1366 |
||
1367 |
// generate_dsin_dcos creates stub for dsin and dcos |
|
1368 |
// Generation is done via single call because dsin and dcos code is almost the |
|
1369 |
// same(see C code below). These functions work as follows: |
|
1370 |
// 1) handle corner cases: |x| ~< pi/4, x is NaN or INF, |x| < 2**-27 |
|
1371 |
// 2) perform argument reduction if required |
|
1372 |
// 3) call kernel_sin or kernel_cos which approximate sin/cos via polynomial |
|
1373 |
// |
|
1374 |
// BEGIN dsin/dcos PSEUDO CODE |
|
1375 |
// |
|
1376 |
//dsin_dcos(jdouble x, bool isCos) { |
|
1377 |
// double y[2],z=0.0; |
|
1378 |
// int n, ix; |
|
1379 |
// |
|
1380 |
// /* High word of x. */ |
|
1381 |
// ix = high(x); |
|
1382 |
// |
|
1383 |
// /* |x| ~< pi/4 */ |
|
1384 |
// ix &= 0x7fffffff; |
|
1385 |
// if(ix <= 0x3fe921fb) return isCos ? __kernel_cos : __kernel_sin(x,z,0); |
|
1386 |
// |
|
1387 |
// /* sin/cos(Inf or NaN) is NaN */ |
|
1388 |
// else if (ix>=0x7ff00000) return x-x; |
|
1389 |
// else if (ix<0x3e400000) { /* if ix < 2**27 */ |
|
1390 |
// if(((int)x)==0) return isCos ? one : x; /* generate inexact */ |
|
1391 |
// } |
|
1392 |
// /* argument reduction needed */ |
|
1393 |
// else { |
|
1394 |
// n = __ieee754_rem_pio2(x,y); |
|
1395 |
// switch(n&3) { |
|
1396 |
// case 0: return isCos ? __kernel_cos(y[0],y[1]) : __kernel_sin(y[0],y[1], true); |
|
1397 |
// case 1: return isCos ? -__kernel_sin(y[0],y[1],true) : __kernel_cos(y[0],y[1]); |
|
1398 |
// case 2: return isCos ? -__kernel_cos(y[0],y[1]) : -__kernel_sin(y[0],y[1], true); |
|
1399 |
// default: |
|
1400 |
// return isCos ? __kernel_sin(y[0],y[1],1) : -__kernel_cos(y[0],y[1]); |
|
1401 |
// } |
|
1402 |
// } |
|
1403 |
//} |
|
1404 |
// END dsin/dcos PSEUDO CODE |
|
1405 |
// |
|
1406 |
// Changes between fdlibm and intrinsic: |
|
1407 |
// 1. Moved ix < 2**27 from kernel_sin/kernel_cos into dsin/dcos |
|
1408 |
// 2. Final switch use equivalent bit checks(tbz/tbnz) |
|
1409 |
// Input ans output: |
|
1410 |
// 1. Input for generated function: X = r0 |
|
1411 |
// 2. Input for generator: isCos = generate sin or cos, npio2_hw = address |
|
1412 |
// of npio2_hw table, two_over_pi = address of two_over_pi table, |
|
1413 |
// pio2 = address if pio2 table, dsin_coef = address if dsin_coef table, |
|
1414 |
// dcos_coef = address of dcos_coef table |
|
1415 |
// 3. Return result in v0 |
|
1416 |
// NOTE: general purpose register names match local variable names in C code |
|
1417 |
void MacroAssembler::generate_dsin_dcos(bool isCos, address npio2_hw, |
|
1418 |
address two_over_pi, address pio2, address dsin_coef, address dcos_coef) { |
|
1419 |
const int POSITIVE_INFINITY_OR_NAN_PREFIX = 0x7FF0; |
|
1420 |
||
1421 |
Label DONE, ARG_REDUCTION, TINY_X, RETURN_SIN, EARLY_CASE; |
|
1422 |
Register X = r0, absX = r1, n = r2, ix = r3; |
|
1423 |
FloatRegister y0 = v4, y1 = v5; |
|
1424 |
block_comment("check |x| ~< pi/4, NaN, Inf and |x| < 2**-27 cases"); { |
|
1425 |
fmovd(X, v0); |
|
1426 |
mov(rscratch2, 0x3e400000); |
|
1427 |
mov(rscratch1, 0x3fe921fb00000000); // pi/4. shifted to reuse later |
|
1428 |
ubfm(absX, X, 0, 62); // absX |
|
1429 |
movz(r10, POSITIVE_INFINITY_OR_NAN_PREFIX, 48); |
|
1430 |
cmp(rscratch2, absX, LSR, 32); |
|
1431 |
lsr(ix, absX, 32); // set ix |
|
1432 |
br(GT, TINY_X); // handle tiny x (|x| < 2^-27) |
|
1433 |
cmp(ix, rscratch1, LSR, 32); |
|
1434 |
br(LE, EARLY_CASE); // if(ix <= 0x3fe921fb) return |
|
1435 |
cmp(absX, r10); |
|
1436 |
br(LT, ARG_REDUCTION); |
|
1437 |
// X is NaN or INF(i.e. 0x7FF* or 0xFFF*). Return NaN (mantissa != 0). |
|
1438 |
// Set last bit unconditionally to make it NaN |
|
1439 |
orr(r10, r10, 1); |
|
1440 |
fmovd(v0, r10); |
|
1441 |
ret(lr); |
|
1442 |
} |
|
1443 |
block_comment("kernel_sin/kernel_cos: if(ix<0x3e400000) {<fast return>}"); { |
|
1444 |
bind(TINY_X); |
|
1445 |
if (isCos) { |
|
55398
e53ec3b362f4
8224851: AArch64: fix warnings and errors with Clang and GCC 8.3
ngasson
parents:
51739
diff
changeset
|
1446 |
fmovd(v0, 1.0); |
50754 | 1447 |
} |
1448 |
ret(lr); |
|
1449 |
} |
|
1450 |
bind(ARG_REDUCTION); /* argument reduction needed */ |
|
1451 |
block_comment("n = __ieee754_rem_pio2(x,y);"); { |
|
1452 |
generate__ieee754_rem_pio2(npio2_hw, two_over_pi, pio2); |
|
1453 |
} |
|
1454 |
block_comment("switch(n&3) {case ... }"); { |
|
1455 |
if (isCos) { |
|
1456 |
eorw(absX, n, n, LSR, 1); |
|
1457 |
tbnz(n, 0, RETURN_SIN); |
|
1458 |
} else { |
|
1459 |
tbz(n, 0, RETURN_SIN); |
|
1460 |
} |
|
1461 |
generate_kernel_cos(y0, dcos_coef); |
|
1462 |
if (isCos) { |
|
1463 |
tbz(absX, 0, DONE); |
|
1464 |
} else { |
|
1465 |
tbz(n, 1, DONE); |
|
1466 |
} |
|
1467 |
fnegd(v0, v0); |
|
1468 |
ret(lr); |
|
1469 |
bind(RETURN_SIN); |
|
1470 |
generate_kernel_sin(y0, true, dsin_coef); |
|
1471 |
if (isCos) { |
|
1472 |
tbz(absX, 0, DONE); |
|
1473 |
} else { |
|
1474 |
tbz(n, 1, DONE); |
|
1475 |
} |
|
1476 |
fnegd(v0, v0); |
|
1477 |
ret(lr); |
|
1478 |
} |
|
1479 |
bind(EARLY_CASE); |
|
1480 |
eor(y1, T8B, y1, y1); |
|
1481 |
if (isCos) { |
|
1482 |
generate_kernel_cos(v0, dcos_coef); |
|
1483 |
} else { |
|
1484 |
generate_kernel_sin(v0, false, dsin_coef); |
|
1485 |
} |
|
1486 |
bind(DONE); |
|
1487 |
ret(lr); |
|
1488 |
} |