54084
|
1 |
/*
|
58299
|
2 |
* Copyright (c) 2018, 2019, Oracle and/or its affiliates. All rights reserved.
|
54084
|
3 |
* Copyright (c) 2016, Intel Corporation. All rights reserved.
|
|
4 |
* Intel Math Library (LIBM) Source Code
|
|
5 |
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
|
6 |
*
|
|
7 |
* This code is free software; you can redistribute it and/or modify it
|
|
8 |
* under the terms of the GNU General Public License version 2 only, as
|
|
9 |
* published by the Free Software Foundation.
|
|
10 |
*
|
|
11 |
* This code is distributed in the hope that it will be useful, but WITHOUT
|
|
12 |
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
|
13 |
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
14 |
* version 2 for more details (a copy is included in the LICENSE file that
|
|
15 |
* accompanied this code).
|
|
16 |
*
|
|
17 |
* You should have received a copy of the GNU General Public License version
|
|
18 |
* 2 along with this work; if not, write to the Free Software Foundation,
|
|
19 |
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
20 |
*
|
|
21 |
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
|
22 |
* or visit www.oracle.com if you need additional information or have any
|
|
23 |
* questions.
|
|
24 |
*/
|
|
25 |
|
|
26 |
|
|
27 |
package org.graalvm.compiler.lir.amd64;
|
|
28 |
|
|
29 |
import static jdk.vm.ci.amd64.AMD64.r10;
|
|
30 |
import static jdk.vm.ci.amd64.AMD64.r11;
|
|
31 |
import static jdk.vm.ci.amd64.AMD64.r8;
|
|
32 |
import static jdk.vm.ci.amd64.AMD64.r9;
|
|
33 |
import static jdk.vm.ci.amd64.AMD64.rax;
|
|
34 |
import static jdk.vm.ci.amd64.AMD64.rbx;
|
|
35 |
import static jdk.vm.ci.amd64.AMD64.rcx;
|
|
36 |
import static jdk.vm.ci.amd64.AMD64.rdi;
|
|
37 |
import static jdk.vm.ci.amd64.AMD64.rdx;
|
|
38 |
import static jdk.vm.ci.amd64.AMD64.rsi;
|
|
39 |
import static jdk.vm.ci.amd64.AMD64.rsp;
|
|
40 |
import static jdk.vm.ci.amd64.AMD64.xmm0;
|
|
41 |
import static jdk.vm.ci.amd64.AMD64.xmm1;
|
|
42 |
import static jdk.vm.ci.amd64.AMD64.xmm2;
|
|
43 |
import static jdk.vm.ci.amd64.AMD64.xmm3;
|
|
44 |
import static jdk.vm.ci.amd64.AMD64.xmm4;
|
|
45 |
import static jdk.vm.ci.amd64.AMD64.xmm5;
|
|
46 |
import static jdk.vm.ci.amd64.AMD64.xmm6;
|
|
47 |
import static jdk.vm.ci.amd64.AMD64.xmm7;
|
|
48 |
import static org.graalvm.compiler.lir.amd64.AMD64HotSpotHelper.pointerConstant;
|
|
49 |
import static org.graalvm.compiler.lir.amd64.AMD64HotSpotHelper.recordExternalAddress;
|
|
50 |
|
|
51 |
import org.graalvm.compiler.asm.Label;
|
|
52 |
import org.graalvm.compiler.asm.amd64.AMD64Address;
|
|
53 |
import org.graalvm.compiler.asm.amd64.AMD64Assembler.ConditionFlag;
|
|
54 |
import org.graalvm.compiler.asm.amd64.AMD64MacroAssembler;
|
|
55 |
import org.graalvm.compiler.lir.LIRInstructionClass;
|
|
56 |
import org.graalvm.compiler.lir.asm.ArrayDataPointerConstant;
|
|
57 |
import org.graalvm.compiler.lir.asm.CompilationResultBuilder;
|
|
58 |
|
|
59 |
import jdk.vm.ci.amd64.AMD64;
|
|
60 |
|
|
61 |
/**
|
|
62 |
* <pre>
|
|
63 |
* ALGORITHM DESCRIPTION - SIN()
|
|
64 |
* ---------------------
|
|
65 |
*
|
|
66 |
* 1. RANGE REDUCTION
|
|
67 |
*
|
|
68 |
* We perform an initial range reduction from X to r with
|
|
69 |
*
|
|
70 |
* X =~= N * pi/32 + r
|
|
71 |
*
|
|
72 |
* so that |r| <= pi/64 + epsilon. We restrict inputs to those
|
|
73 |
* where |N| <= 932560. Beyond this, the range reduction is
|
|
74 |
* insufficiently accurate. For extremely small inputs,
|
|
75 |
* denormalization can occur internally, impacting performance.
|
|
76 |
* This means that the main path is actually only taken for
|
|
77 |
* 2^-252 <= |X| < 90112.
|
|
78 |
*
|
|
79 |
* To avoid branches, we perform the range reduction to full
|
|
80 |
* accuracy each time.
|
|
81 |
*
|
|
82 |
* X - N * (P_1 + P_2 + P_3)
|
|
83 |
*
|
|
84 |
* where P_1 and P_2 are 32-bit numbers (so multiplication by N
|
|
85 |
* is exact) and P_3 is a 53-bit number. Together, these
|
|
86 |
* approximate pi well enough for all cases in the restricted
|
|
87 |
* range.
|
|
88 |
*
|
|
89 |
* The main reduction sequence is:
|
|
90 |
*
|
|
91 |
* y = 32/pi * x
|
|
92 |
* N = integer(y)
|
|
93 |
* (computed by adding and subtracting off SHIFTER)
|
|
94 |
*
|
|
95 |
* m_1 = N * P_1
|
|
96 |
* m_2 = N * P_2
|
|
97 |
* r_1 = x - m_1
|
|
98 |
* r = r_1 - m_2
|
|
99 |
* (this r can be used for most of the calculation)
|
|
100 |
*
|
|
101 |
* c_1 = r_1 - r
|
|
102 |
* m_3 = N * P_3
|
|
103 |
* c_2 = c_1 - m_2
|
|
104 |
* c = c_2 - m_3
|
|
105 |
*
|
|
106 |
* 2. MAIN ALGORITHM
|
|
107 |
*
|
|
108 |
* The algorithm uses a table lookup based on B = M * pi / 32
|
|
109 |
* where M = N mod 64. The stored values are:
|
|
110 |
* sigma closest power of 2 to cos(B)
|
|
111 |
* C_hl 53-bit cos(B) - sigma
|
|
112 |
* S_hi + S_lo 2 * 53-bit sin(B)
|
|
113 |
*
|
|
114 |
* The computation is organized as follows:
|
|
115 |
*
|
|
116 |
* sin(B + r + c) = [sin(B) + sigma * r] +
|
|
117 |
* r * (cos(B) - sigma) +
|
|
118 |
* sin(B) * [cos(r + c) - 1] +
|
|
119 |
* cos(B) * [sin(r + c) - r]
|
|
120 |
*
|
|
121 |
* which is approximately:
|
|
122 |
*
|
|
123 |
* [S_hi + sigma * r] +
|
|
124 |
* C_hl * r +
|
|
125 |
* S_lo + S_hi * [(cos(r) - 1) - r * c] +
|
|
126 |
* (C_hl + sigma) * [(sin(r) - r) + c]
|
|
127 |
*
|
|
128 |
* and this is what is actually computed. We separate this sum
|
|
129 |
* into four parts:
|
|
130 |
*
|
|
131 |
* hi + med + pols + corr
|
|
132 |
*
|
|
133 |
* where
|
|
134 |
*
|
|
135 |
* hi = S_hi + sigma r
|
|
136 |
* med = C_hl * r
|
|
137 |
* pols = S_hi * (cos(r) - 1) + (C_hl + sigma) * (sin(r) - r)
|
|
138 |
* corr = S_lo + c * ((C_hl + sigma) - S_hi * r)
|
|
139 |
*
|
|
140 |
* 3. POLYNOMIAL
|
|
141 |
*
|
|
142 |
* The polynomial S_hi * (cos(r) - 1) + (C_hl + sigma) *
|
|
143 |
* (sin(r) - r) can be rearranged freely, since it is quite
|
|
144 |
* small, so we exploit parallelism to the fullest.
|
|
145 |
*
|
|
146 |
* psc4 = SC_4 * r_1
|
|
147 |
* msc4 = psc4 * r
|
|
148 |
* r2 = r * r
|
|
149 |
* msc2 = SC_2 * r2
|
|
150 |
* r4 = r2 * r2
|
|
151 |
* psc3 = SC_3 + msc4
|
|
152 |
* psc1 = SC_1 + msc2
|
|
153 |
* msc3 = r4 * psc3
|
|
154 |
* sincospols = psc1 + msc3
|
|
155 |
* pols = sincospols *
|
|
156 |
* <S_hi * r^2 | (C_hl + sigma) * r^3>
|
|
157 |
*
|
|
158 |
* 4. CORRECTION TERM
|
|
159 |
*
|
|
160 |
* This is where the "c" component of the range reduction is
|
|
161 |
* taken into account; recall that just "r" is used for most of
|
|
162 |
* the calculation.
|
|
163 |
*
|
|
164 |
* -c = m_3 - c_2
|
|
165 |
* -d = S_hi * r - (C_hl + sigma)
|
|
166 |
* corr = -c * -d + S_lo
|
|
167 |
*
|
|
168 |
* 5. COMPENSATED SUMMATIONS
|
|
169 |
*
|
|
170 |
* The two successive compensated summations add up the high
|
|
171 |
* and medium parts, leaving just the low parts to add up at
|
|
172 |
* the end.
|
|
173 |
*
|
|
174 |
* rs = sigma * r
|
|
175 |
* res_int = S_hi + rs
|
|
176 |
* k_0 = S_hi - res_int
|
|
177 |
* k_2 = k_0 + rs
|
|
178 |
* med = C_hl * r
|
|
179 |
* res_hi = res_int + med
|
|
180 |
* k_1 = res_int - res_hi
|
|
181 |
* k_3 = k_1 + med
|
|
182 |
*
|
|
183 |
* 6. FINAL SUMMATION
|
|
184 |
*
|
|
185 |
* We now add up all the small parts:
|
|
186 |
*
|
|
187 |
* res_lo = pols(hi) + pols(lo) + corr + k_1 + k_3
|
|
188 |
*
|
|
189 |
* Now the overall result is just:
|
|
190 |
*
|
|
191 |
* res_hi + res_lo
|
|
192 |
*
|
|
193 |
* 7. SMALL ARGUMENTS
|
|
194 |
*
|
|
195 |
* If |x| < SNN (SNN meaning the smallest normal number), we
|
|
196 |
* simply perform 0.1111111...1111 (binary) * x. For SNN <= |x|, we
|
|
197 |
* do 2^-55 * (2^55 * x - x).
|
|
198 |
*
|
|
199 |
* Special cases:
|
|
200 |
* sin(NaN) = quiet NaN, and raise invalid exception
|
|
201 |
* sin(INF) = NaN and raise invalid exception
|
|
202 |
* sin(+/-0) = +/-0
|
|
203 |
* </pre>
|
|
204 |
*/
|
|
205 |
public final class AMD64MathSinOp extends AMD64MathIntrinsicUnaryOp {
|
|
206 |
|
|
207 |
// LIR instruction class descriptor for this op, created from the AMD64MathSinOp class literal;
// the constructor passes it to the superclass as its first argument.
public static final LIRInstructionClass<AMD64MathSinOp> TYPE = LIRInstructionClass.create(AMD64MathSinOp.class);
|
|
208 |
|
|
209 |
/**
 * Creates the sine intrinsic op.
 *
 * Hands {@link #TYPE} to the superclass together with the general-purpose and XMM registers
 * that {@code emitCode} uses as scratch space. NOTE(review): presumably the superclass
 * reserves them as temporaries - confirm against AMD64MathIntrinsicUnaryOp.
 */
public AMD64MathSinOp() {
    super(TYPE,
                    /* GPR */ rax, rcx, rdx, rbx,
                    rsi, rdi, r8, r9, r10, r11,
                    /* XMM */ xmm1, xmm2, xmm3, xmm4,
                    xmm5, xmm6, xmm7);
}
|
|
213 |
|
|
214 |
// Two packed doubles, each 0x3fe0000000000000 (0.5); the ints are stored low word first.
// emitCode ORs the sign bit of x into this value and adds it to x * pi32Inv right before the
// truncating conversion (cvttsd2sil) that yields the integer N.
private ArrayDataPointerConstant onehalf = pointerConstant(16, new int[]{
|
|
215 |
// @formatter:off
|
|
216 |
0x00000000, 0x3fe00000, 0x00000000, 0x3fe00000
|
|
217 |
// @formatter:on
|
|
218 |
});
|
|
219 |
|
|
220 |
// Two packed copies of the double 0x3d90b4611a600000 (ints are stored low word first).
// emitCode multiplies it by the duplicated N (mulpd) and subtracts the product from x - N * p1,
// which corresponds to P_2 / m_2 in the range-reduction description of the class comment.
private ArrayDataPointerConstant p2 = pointerConstant(16, new int[]{
|
|
221 |
// @formatter:off
|
|
222 |
0x1a600000, 0x3d90b461, 0x1a600000, 0x3d90b461
|
|
223 |
// @formatter:on
|
|
224 |
});
|
|
225 |
|
|
226 |
// Two packed doubles (low word first): 0x3ec71de3a556c734 and 0x3efa01a01a01a01a,
// i.e. roughly 1/9! and 1/8!. emitCode loads them into xmm5 as the first factor of the packed
// polynomial evaluation (SC_4 in the class comment).
private ArrayDataPointerConstant sc4 = pointerConstant(16, new int[]{
|
|
227 |
// @formatter:off
|
|
228 |
0xa556c734, 0x3ec71de3, 0x1a01a01a, 0x3efa01a0
|
|
229 |
// @formatter:on
|
|
230 |
});
|
|
231 |
|
|
232 |
// Lookup table of 64 entries, each four doubles (eight ints, low word first; 512 ints in total).
// emitCode addresses an entry as ctable + ((N & 63) << 5) and reads these byte offsets:
//   +0  multiplied by r               (C_hl in the class comment)
//   +8  added to sigma * r            (S_hi)
//   +16 added into the low-order sum  (S_lo)
//   +24 multiplied by r               (sigma)
// Example: the second entry decodes to about cos(pi/32) - 1, sin(pi/32), a small correction
// term, and 1.0.
private ArrayDataPointerConstant ctable = pointerConstant(16, new int[]{
|
|
233 |
// @formatter:off
|
|
234 |
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
|
235 |
0x00000000, 0x00000000, 0x3ff00000, 0x176d6d31, 0xbf73b92e,
|
|
236 |
0xbc29b42c, 0x3fb917a6, 0xe0000000, 0xbc3e2718, 0x00000000,
|
|
237 |
0x3ff00000, 0x011469fb, 0xbf93ad06, 0x3c69a60b, 0x3fc8f8b8,
|
|
238 |
0xc0000000, 0xbc626d19, 0x00000000, 0x3ff00000, 0x939d225a,
|
|
239 |
0xbfa60bea, 0x2ed59f06, 0x3fd29406, 0xa0000000, 0xbc75d28d,
|
|
240 |
0x00000000, 0x3ff00000, 0x866b95cf, 0xbfb37ca1, 0xa6aea963,
|
|
241 |
0x3fd87de2, 0xe0000000, 0xbc672ced, 0x00000000, 0x3ff00000,
|
|
242 |
0x73fa1279, 0xbfbe3a68, 0x3806f63b, 0x3fde2b5d, 0x20000000,
|
|
243 |
0x3c5e0d89, 0x00000000, 0x3ff00000, 0x5bc57974, 0xbfc59267,
|
|
244 |
0x39ae68c8, 0x3fe1c73b, 0x20000000, 0x3c8b25dd, 0x00000000,
|
|
245 |
0x3ff00000, 0x53aba2fd, 0xbfcd0dfe, 0x25091dd6, 0x3fe44cf3,
|
|
246 |
0x20000000, 0x3c68076a, 0x00000000, 0x3ff00000, 0x99fcef32,
|
|
247 |
0x3fca8279, 0x667f3bcd, 0x3fe6a09e, 0x20000000, 0xbc8bdd34,
|
|
248 |
0x00000000, 0x3fe00000, 0x94247758, 0x3fc133cc, 0x6b151741,
|
|
249 |
0x3fe8bc80, 0x20000000, 0xbc82c5e1, 0x00000000, 0x3fe00000,
|
|
250 |
0x9ae68c87, 0x3fac73b3, 0x290ea1a3, 0x3fea9b66, 0xe0000000,
|
|
251 |
0x3c39f630, 0x00000000, 0x3fe00000, 0x7f909c4e, 0xbf9d4a2c,
|
|
252 |
0xf180bdb1, 0x3fec38b2, 0x80000000, 0xbc76e0b1, 0x00000000,
|
|
253 |
0x3fe00000, 0x65455a75, 0xbfbe0875, 0xcf328d46, 0x3fed906b,
|
|
254 |
0x20000000, 0x3c7457e6, 0x00000000, 0x3fe00000, 0x76acf82d,
|
|
255 |
0x3fa4a031, 0x56c62dda, 0x3fee9f41, 0xe0000000, 0x3c8760b1,
|
|
256 |
0x00000000, 0x3fd00000, 0x0e5967d5, 0xbfac1d1f, 0xcff75cb0,
|
|
257 |
0x3fef6297, 0x20000000, 0x3c756217, 0x00000000, 0x3fd00000,
|
|
258 |
0x0f592f50, 0xbf9ba165, 0xa3d12526, 0x3fefd88d, 0x40000000,
|
|
259 |
0xbc887df6, 0x00000000, 0x3fc00000, 0x00000000, 0x00000000,
|
|
260 |
0x00000000, 0x3ff00000, 0x00000000, 0x00000000, 0x00000000,
|
|
261 |
0x00000000, 0x0f592f50, 0x3f9ba165, 0xa3d12526, 0x3fefd88d,
|
|
262 |
0x40000000, 0xbc887df6, 0x00000000, 0xbfc00000, 0x0e5967d5,
|
|
263 |
0x3fac1d1f, 0xcff75cb0, 0x3fef6297, 0x20000000, 0x3c756217,
|
|
264 |
0x00000000, 0xbfd00000, 0x76acf82d, 0xbfa4a031, 0x56c62dda,
|
|
265 |
0x3fee9f41, 0xe0000000, 0x3c8760b1, 0x00000000, 0xbfd00000,
|
|
266 |
0x65455a75, 0x3fbe0875, 0xcf328d46, 0x3fed906b, 0x20000000,
|
|
267 |
0x3c7457e6, 0x00000000, 0xbfe00000, 0x7f909c4e, 0x3f9d4a2c,
|
|
268 |
0xf180bdb1, 0x3fec38b2, 0x80000000, 0xbc76e0b1, 0x00000000,
|
|
269 |
0xbfe00000, 0x9ae68c87, 0xbfac73b3, 0x290ea1a3, 0x3fea9b66,
|
|
270 |
0xe0000000, 0x3c39f630, 0x00000000, 0xbfe00000, 0x94247758,
|
|
271 |
0xbfc133cc, 0x6b151741, 0x3fe8bc80, 0x20000000, 0xbc82c5e1,
|
|
272 |
0x00000000, 0xbfe00000, 0x99fcef32, 0xbfca8279, 0x667f3bcd,
|
|
273 |
0x3fe6a09e, 0x20000000, 0xbc8bdd34, 0x00000000, 0xbfe00000,
|
|
274 |
0x53aba2fd, 0x3fcd0dfe, 0x25091dd6, 0x3fe44cf3, 0x20000000,
|
|
275 |
0x3c68076a, 0x00000000, 0xbff00000, 0x5bc57974, 0x3fc59267,
|
|
276 |
0x39ae68c8, 0x3fe1c73b, 0x20000000, 0x3c8b25dd, 0x00000000,
|
|
277 |
0xbff00000, 0x73fa1279, 0x3fbe3a68, 0x3806f63b, 0x3fde2b5d,
|
|
278 |
0x20000000, 0x3c5e0d89, 0x00000000, 0xbff00000, 0x866b95cf,
|
|
279 |
0x3fb37ca1, 0xa6aea963, 0x3fd87de2, 0xe0000000, 0xbc672ced,
|
|
280 |
0x00000000, 0xbff00000, 0x939d225a, 0x3fa60bea, 0x2ed59f06,
|
|
281 |
0x3fd29406, 0xa0000000, 0xbc75d28d, 0x00000000, 0xbff00000,
|
|
282 |
0x011469fb, 0x3f93ad06, 0x3c69a60b, 0x3fc8f8b8, 0xc0000000,
|
|
283 |
0xbc626d19, 0x00000000, 0xbff00000, 0x176d6d31, 0x3f73b92e,
|
|
284 |
0xbc29b42c, 0x3fb917a6, 0xe0000000, 0xbc3e2718, 0x00000000,
|
|
285 |
0xbff00000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
|
286 |
0x00000000, 0x00000000, 0x00000000, 0xbff00000, 0x176d6d31,
|
|
287 |
0x3f73b92e, 0xbc29b42c, 0xbfb917a6, 0xe0000000, 0x3c3e2718,
|
|
288 |
0x00000000, 0xbff00000, 0x011469fb, 0x3f93ad06, 0x3c69a60b,
|
|
289 |
0xbfc8f8b8, 0xc0000000, 0x3c626d19, 0x00000000, 0xbff00000,
|
|
290 |
0x939d225a, 0x3fa60bea, 0x2ed59f06, 0xbfd29406, 0xa0000000,
|
|
291 |
0x3c75d28d, 0x00000000, 0xbff00000, 0x866b95cf, 0x3fb37ca1,
|
|
292 |
0xa6aea963, 0xbfd87de2, 0xe0000000, 0x3c672ced, 0x00000000,
|
|
293 |
0xbff00000, 0x73fa1279, 0x3fbe3a68, 0x3806f63b, 0xbfde2b5d,
|
|
294 |
0x20000000, 0xbc5e0d89, 0x00000000, 0xbff00000, 0x5bc57974,
|
|
295 |
0x3fc59267, 0x39ae68c8, 0xbfe1c73b, 0x20000000, 0xbc8b25dd,
|
|
296 |
0x00000000, 0xbff00000, 0x53aba2fd, 0x3fcd0dfe, 0x25091dd6,
|
|
297 |
0xbfe44cf3, 0x20000000, 0xbc68076a, 0x00000000, 0xbff00000,
|
|
298 |
0x99fcef32, 0xbfca8279, 0x667f3bcd, 0xbfe6a09e, 0x20000000,
|
|
299 |
0x3c8bdd34, 0x00000000, 0xbfe00000, 0x94247758, 0xbfc133cc,
|
|
300 |
0x6b151741, 0xbfe8bc80, 0x20000000, 0x3c82c5e1, 0x00000000,
|
|
301 |
0xbfe00000, 0x9ae68c87, 0xbfac73b3, 0x290ea1a3, 0xbfea9b66,
|
|
302 |
0xe0000000, 0xbc39f630, 0x00000000, 0xbfe00000, 0x7f909c4e,
|
|
303 |
0x3f9d4a2c, 0xf180bdb1, 0xbfec38b2, 0x80000000, 0x3c76e0b1,
|
|
304 |
0x00000000, 0xbfe00000, 0x65455a75, 0x3fbe0875, 0xcf328d46,
|
|
305 |
0xbfed906b, 0x20000000, 0xbc7457e6, 0x00000000, 0xbfe00000,
|
|
306 |
0x76acf82d, 0xbfa4a031, 0x56c62dda, 0xbfee9f41, 0xe0000000,
|
|
307 |
0xbc8760b1, 0x00000000, 0xbfd00000, 0x0e5967d5, 0x3fac1d1f,
|
|
308 |
0xcff75cb0, 0xbfef6297, 0x20000000, 0xbc756217, 0x00000000,
|
|
309 |
0xbfd00000, 0x0f592f50, 0x3f9ba165, 0xa3d12526, 0xbfefd88d,
|
|
310 |
0x40000000, 0x3c887df6, 0x00000000, 0xbfc00000, 0x00000000,
|
|
311 |
0x00000000, 0x00000000, 0xbff00000, 0x00000000, 0x00000000,
|
|
312 |
0x00000000, 0x00000000, 0x0f592f50, 0xbf9ba165, 0xa3d12526,
|
|
313 |
0xbfefd88d, 0x40000000, 0x3c887df6, 0x00000000, 0x3fc00000,
|
|
314 |
0x0e5967d5, 0xbfac1d1f, 0xcff75cb0, 0xbfef6297, 0x20000000,
|
|
315 |
0xbc756217, 0x00000000, 0x3fd00000, 0x76acf82d, 0x3fa4a031,
|
|
316 |
0x56c62dda, 0xbfee9f41, 0xe0000000, 0xbc8760b1, 0x00000000,
|
|
317 |
0x3fd00000, 0x65455a75, 0xbfbe0875, 0xcf328d46, 0xbfed906b,
|
|
318 |
0x20000000, 0xbc7457e6, 0x00000000, 0x3fe00000, 0x7f909c4e,
|
|
319 |
0xbf9d4a2c, 0xf180bdb1, 0xbfec38b2, 0x80000000, 0x3c76e0b1,
|
|
320 |
0x00000000, 0x3fe00000, 0x9ae68c87, 0x3fac73b3, 0x290ea1a3,
|
|
321 |
0xbfea9b66, 0xe0000000, 0xbc39f630, 0x00000000, 0x3fe00000,
|
|
322 |
0x94247758, 0x3fc133cc, 0x6b151741, 0xbfe8bc80, 0x20000000,
|
|
323 |
0x3c82c5e1, 0x00000000, 0x3fe00000, 0x99fcef32, 0x3fca8279,
|
|
324 |
0x667f3bcd, 0xbfe6a09e, 0x20000000, 0x3c8bdd34, 0x00000000,
|
|
325 |
0x3fe00000, 0x53aba2fd, 0xbfcd0dfe, 0x25091dd6, 0xbfe44cf3,
|
|
326 |
0x20000000, 0xbc68076a, 0x00000000, 0x3ff00000, 0x5bc57974,
|
|
327 |
0xbfc59267, 0x39ae68c8, 0xbfe1c73b, 0x20000000, 0xbc8b25dd,
|
|
328 |
0x00000000, 0x3ff00000, 0x73fa1279, 0xbfbe3a68, 0x3806f63b,
|
|
329 |
0xbfde2b5d, 0x20000000, 0xbc5e0d89, 0x00000000, 0x3ff00000,
|
|
330 |
0x866b95cf, 0xbfb37ca1, 0xa6aea963, 0xbfd87de2, 0xe0000000,
|
|
331 |
0x3c672ced, 0x00000000, 0x3ff00000, 0x939d225a, 0xbfa60bea,
|
|
332 |
0x2ed59f06, 0xbfd29406, 0xa0000000, 0x3c75d28d, 0x00000000,
|
|
333 |
0x3ff00000, 0x011469fb, 0xbf93ad06, 0x3c69a60b, 0xbfc8f8b8,
|
|
334 |
0xc0000000, 0x3c626d19, 0x00000000, 0x3ff00000, 0x176d6d31,
|
|
335 |
0xbf73b92e, 0xbc29b42c, 0xbfb917a6, 0xe0000000, 0x3c3e2718,
|
|
336 |
0x00000000, 0x3ff00000
|
|
337 |
// @formatter:on
|
|
338 |
});
|
|
339 |
|
|
340 |
// Two packed doubles (low word first): 0x3f81111111111111 (about 1/120) and
// 0x3fa5555555555555 (about 1/24). emitCode multiplies them by the packed square of the reduced
// argument (msc2 = SC_2 * r2 in the class comment).
private ArrayDataPointerConstant sc2 = pointerConstant(16, new int[]{
|
|
341 |
// @formatter:off
|
|
342 |
0x11111111, 0x3f811111, 0x55555555, 0x3fa55555
|
|
343 |
// @formatter:on
|
|
344 |
});
|
|
345 |
|
|
346 |
// Two packed doubles (low word first): 0xbf2a01a01a01a01a (about -1/5040) and
// 0xbf56c16c16c16c17 (about -1/720). emitCode adds them to the packed product held in xmm5
// (psc3 = SC_3 + msc4 in the class comment).
private ArrayDataPointerConstant sc3 = pointerConstant(16, new int[]{
|
|
347 |
// @formatter:off
|
|
348 |
0x1a01a01a, 0xbf2a01a0, 0x16c16c17, 0xbf56c16c
|
|
349 |
// @formatter:on
|
|
350 |
});
|
|
351 |
|
|
352 |
// Two packed doubles (low word first): 0xbfc5555555555555 (about -1/6) and
// 0xbfe0000000000000 (-0.5). emitCode adds them to the packed product held in xmm6
// (psc1 = SC_1 + msc2 in the class comment).
private ArrayDataPointerConstant sc1 = pointerConstant(16, new int[]{
|
|
353 |
// @formatter:off
|
|
354 |
0x55555555, 0xbfc55555, 0x00000000, 0xbfe00000
|
|
355 |
// @formatter:on
|
|
356 |
});
|
|
357 |
|
|
358 |
// 41 ints: two zero words followed by 0xa2f9836e, 0x4e441529, ... - these match the leading
// hexadecimal digits of 2/pi (0.A2F9836E4E44...). On emitCode's block1 path (reached when the
// exponent test at block0 is "greater" and x is not Inf/NaN) a byte offset derived from the
// exponent of x is added to this table's address, and seven consecutive 32-bit words
// (offsets 0 through 24) are read and multiplied with the mantissa of x.
private ArrayDataPointerConstant piInvTable = pointerConstant(16, new int[]{
|
|
359 |
// @formatter:off
|
|
360 |
0x00000000, 0x00000000, 0xa2f9836e, 0x4e441529, 0xfc2757d1,
|
|
361 |
0xf534ddc0, 0xdb629599, 0x3c439041, 0xfe5163ab, 0xdebbc561,
|
|
362 |
0xb7246e3a, 0x424dd2e0, 0x06492eea, 0x09d1921c, 0xfe1deb1c,
|
|
363 |
0xb129a73e, 0xe88235f5, 0x2ebb4484, 0xe99c7026, 0xb45f7e41,
|
|
364 |
0x3991d639, 0x835339f4, 0x9c845f8b, 0xbdf9283b, 0x1ff897ff,
|
|
365 |
0xde05980f, 0xef2f118b, 0x5a0a6d1f, 0x6d367ecf, 0x27cb09b7,
|
|
366 |
0x4f463f66, 0x9e5fea2d, 0x7527bac7, 0xebe5f17b, 0x3d0739f7,
|
|
367 |
0x8a5292ea, 0x6bfb5fb1, 0x1f8d5d08, 0x56033046, 0xfc7b6bab,
|
|
368 |
0xf0cfbc21
|
|
369 |
// @formatter:on
|
|
370 |
});
|
|
371 |
|
|
372 |
// One double, 0x3fe921fb40000000 (low word first): pi/4 with the low 30 mantissa bits cleared.
// emitCode loads it into xmm2 in block11 and multiplies the reduced value by it.
// Note: the "@formatter:off" opened here is only closed inside the pi48 initializer below.
private ArrayDataPointerConstant pi4 = pointerConstant(8, new int[]{
|
|
373 |
// @formatter:off
|
|
374 |
0x40000000, 0x3fe921fb,
|
|
375 |
});
|
|
376 |
// One double, 0x3e64442d18469899 (low word first), about 1.27 * 2^-25; it appears to be the
// low-order remainder pi/4 - pi4. emitCode loads it into xmm6 in block11.
private ArrayDataPointerConstant pi48 = pointerConstant(8, new int[]{
|
|
377 |
0x18469899, 0x3e64442d
|
|
378 |
// @formatter:on
|
|
379 |
});
|
|
380 |
|
|
381 |
// One double, 0x40245f306dc9c883 (low word first), about 10.1859 = 32/pi. emitCode multiplies x
// by it as the first step of range reduction (y = 32/pi * x in the class comment).
private ArrayDataPointerConstant pi32Inv = pointerConstant(8, new int[]{
|
|
382 |
// @formatter:off
|
|
383 |
0x6dc9c883, 0x40245f30
|
|
384 |
// @formatter:on
|
|
385 |
});
|
|
386 |
|
|
387 |
// One double, 0x4338000000000000 (low word first) = 1.5 * 2^52. emitCode loads it into xmm2 on
// entry; the class comment refers to it as SHIFTER in the computation of N.
private ArrayDataPointerConstant shifter = pointerConstant(8, new int[]{
|
|
388 |
// @formatter:off
|
|
389 |
0x00000000, 0x43380000
|
|
390 |
// @formatter:on
|
|
391 |
});
|
|
392 |
|
|
393 |
// 64-bit pattern 0x8000000000000000 (low word first): only the IEEE-754 sign bit is set.
// emitCode ANDs it with x (pand) to isolate the sign, then ORs that sign into onehalf.
private ArrayDataPointerConstant signMask = pointerConstant(8, new int[]{
|
|
394 |
// @formatter:off
|
|
395 |
0x00000000, 0x80000000
|
|
396 |
// @formatter:on
|
|
397 |
});
|
|
398 |
|
|
399 |
// One double, 0x3b63198a2e037073 (low word first). emitCode multiplies N by it (mulsd), which
// corresponds to m_3 = N * P_3 in the class comment.
private ArrayDataPointerConstant p3 = pointerConstant(8, new int[]{
|
|
400 |
// @formatter:off
|
|
401 |
0x2e037073, 0x3b63198a
|
|
402 |
// @formatter:on
|
|
403 |
});
|
|
404 |
|
|
405 |
// One double, 0x3fefffffffffffff (low word first): exponent of 0.5 with an all-ones mantissa,
// i.e. the largest double below 1.0. emitCode multiplies x by it on the block0 path where the
// shifted exponent word compares equal to 3325.
private ArrayDataPointerConstant allOnes = pointerConstant(8, new int[]{
|
|
406 |
// @formatter:off
|
|
407 |
0xffffffff, 0x3fefffff
|
|
408 |
// @formatter:on
|
|
409 |
});
|
|
410 |
|
|
411 |
// One double, 0x4360000000000000 (low word first) = 2^55. Used in emitCode's block2 as the
// scale factor in 2^55 * x - x.
private ArrayDataPointerConstant twoPow55 = pointerConstant(8, new int[]{
|
|
412 |
// @formatter:off
|
|
413 |
0x00000000, 0x43600000
|
|
414 |
// @formatter:on
|
|
415 |
});
|
|
416 |
|
|
417 |
// One double, 0x3c80000000000000 (low word first) = 2^-55. Used in emitCode's block2 to scale
// (2^55 * x - x) back down.
private ArrayDataPointerConstant twoPowM55 = pointerConstant(8, new int[]{
|
|
418 |
// @formatter:off
|
|
419 |
0x00000000, 0x3c800000
|
|
420 |
// @formatter:on
|
|
421 |
});
|
|
422 |
|
|
423 |
// One double, 0x3fb921fb54400000 (low word first), about 0.0981747 (the leading bits of pi/32);
// P_1 in the class comment. emitCode loads it from here in block12, while the main path
// materializes the same bit pattern as an immediate through r8.
private ArrayDataPointerConstant p1 = pointerConstant(8, new int[]{
|
|
424 |
// @formatter:off
|
|
425 |
0x54400000, 0x3fb921fb
|
|
426 |
// @formatter:on
|
|
427 |
});
|
|
428 |
|
|
429 |
// 64-bit pattern 0x8000000000000000 (low word first), i.e. the double -0.0; the words are
// identical to signMask. NOTE(review): not referenced in the part of emitCode visible in this
// chunk - confirm its use further down before touching it.
private ArrayDataPointerConstant negZero = pointerConstant(8, new int[]{
|
|
430 |
// @formatter:off
|
|
431 |
0x00000000, 0x80000000
|
|
432 |
// @formatter:on
|
|
433 |
});
|
|
434 |
|
|
435 |
@Override
|
|
436 |
public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler masm) {
|
|
437 |
Label block0 = new Label();
|
|
438 |
Label block1 = new Label();
|
|
439 |
Label block2 = new Label();
|
|
440 |
Label block3 = new Label();
|
|
441 |
Label block4 = new Label();
|
|
442 |
Label block5 = new Label();
|
|
443 |
Label block6 = new Label();
|
|
444 |
Label block7 = new Label();
|
|
445 |
Label block8 = new Label();
|
|
446 |
Label block9 = new Label();
|
|
447 |
Label block10 = new Label();
|
|
448 |
Label block11 = new Label();
|
|
449 |
Label block12 = new Label();
|
|
450 |
Label block13 = new Label();
|
|
451 |
Label block14 = new Label();
|
|
452 |
|
|
453 |
masm.push(AMD64.rbx);
|
|
454 |
masm.subq(rsp, 16);
|
|
455 |
masm.movsd(new AMD64Address(rsp, 8), xmm0);
|
|
456 |
masm.movl(rax, new AMD64Address(rsp, 12));
|
|
457 |
masm.movq(xmm1, recordExternalAddress(crb, pi32Inv)); // 0x6dc9c883, 0x40245f30
|
|
458 |
masm.movq(xmm2, recordExternalAddress(crb, shifter)); // 0x00000000, 0x43380000
|
|
459 |
masm.andl(rax, 2147418112);
|
|
460 |
masm.subl(rax, 808452096);
|
|
461 |
masm.cmpl(rax, 281346048);
|
|
462 |
masm.jcc(ConditionFlag.Above, block0);
|
|
463 |
masm.mulsd(xmm1, xmm0);
|
|
464 |
masm.movdqu(xmm5, recordExternalAddress(crb, onehalf)); // 0x00000000, 0x3fe00000,
|
|
465 |
// 0x00000000, 0x3fe00000
|
|
466 |
masm.movq(xmm4, recordExternalAddress(crb, signMask)); // 0x00000000, 0x80000000
|
|
467 |
masm.pand(xmm4, xmm0);
|
|
468 |
masm.por(xmm5, xmm4);
|
|
469 |
masm.addpd(xmm1, xmm5);
|
|
470 |
masm.cvttsd2sil(rdx, xmm1);
|
|
471 |
masm.cvtsi2sdl(xmm1, rdx);
|
|
472 |
masm.movdqu(xmm6, recordExternalAddress(crb, p2)); // 0x1a600000, 0x3d90b461,
|
|
473 |
// 0x1a600000, 0x3d90b461
|
|
474 |
masm.movq(r8, 0x3fb921fb54400000L);
|
|
475 |
masm.movdq(xmm3, r8);
|
|
476 |
masm.movdqu(xmm5, recordExternalAddress(crb, sc4)); // 0xa556c734, 0x3ec71de3,
|
|
477 |
// 0x1a01a01a, 0x3efa01a0
|
|
478 |
masm.pshufd(xmm4, xmm0, 68);
|
|
479 |
masm.mulsd(xmm3, xmm1);
|
|
480 |
if (masm.supports(AMD64.CPUFeature.SSE3)) {
|
|
481 |
masm.movddup(xmm1, xmm1);
|
|
482 |
} else {
|
|
483 |
masm.movlhps(xmm1, xmm1);
|
|
484 |
}
|
|
485 |
masm.andl(rdx, 63);
|
|
486 |
masm.shll(rdx, 5);
|
|
487 |
masm.leaq(AMD64.rax, recordExternalAddress(crb, ctable));
|
|
488 |
masm.addq(AMD64.rax, AMD64.rdx);
|
|
489 |
masm.mulpd(xmm6, xmm1);
|
|
490 |
masm.mulsd(xmm1, recordExternalAddress(crb, p3)); // 0x2e037073, 0x3b63198a
|
|
491 |
masm.subsd(xmm4, xmm3);
|
|
492 |
masm.movq(xmm7, new AMD64Address(AMD64.rax, 8));
|
|
493 |
masm.subsd(xmm0, xmm3);
|
|
494 |
if (masm.supports(AMD64.CPUFeature.SSE3)) {
|
|
495 |
masm.movddup(xmm3, xmm4);
|
|
496 |
} else {
|
|
497 |
masm.movdqu(xmm3, xmm4);
|
|
498 |
masm.movlhps(xmm3, xmm3);
|
|
499 |
}
|
|
500 |
masm.subsd(xmm4, xmm6);
|
|
501 |
masm.pshufd(xmm0, xmm0, 68);
|
|
502 |
masm.movdqu(xmm2, new AMD64Address(AMD64.rax, 0));
|
|
503 |
masm.mulpd(xmm5, xmm0);
|
|
504 |
masm.subpd(xmm0, xmm6);
|
|
505 |
masm.mulsd(xmm7, xmm4);
|
|
506 |
masm.subsd(xmm3, xmm4);
|
|
507 |
masm.mulpd(xmm5, xmm0);
|
|
508 |
masm.mulpd(xmm0, xmm0);
|
|
509 |
masm.subsd(xmm3, xmm6);
|
|
510 |
masm.movdqu(xmm6, recordExternalAddress(crb, sc2)); // 0x11111111, 0x3f811111,
|
|
511 |
// 0x55555555, 0x3fa55555
|
|
512 |
masm.subsd(xmm1, xmm3);
|
|
513 |
masm.movq(xmm3, new AMD64Address(AMD64.rax, 24));
|
|
514 |
masm.addsd(xmm2, xmm3);
|
|
515 |
masm.subsd(xmm7, xmm2);
|
|
516 |
masm.mulsd(xmm2, xmm4);
|
|
517 |
masm.mulpd(xmm6, xmm0);
|
|
518 |
masm.mulsd(xmm3, xmm4);
|
|
519 |
masm.mulpd(xmm2, xmm0);
|
|
520 |
masm.mulpd(xmm0, xmm0);
|
|
521 |
masm.addpd(xmm5, recordExternalAddress(crb, sc3)); // 0x1a01a01a, 0xbf2a01a0,
|
|
522 |
// 0x16c16c17, 0xbf56c16c
|
|
523 |
masm.mulsd(xmm4, new AMD64Address(AMD64.rax, 0));
|
|
524 |
masm.addpd(xmm6, recordExternalAddress(crb, sc1)); // 0x55555555, 0xbfc55555,
|
|
525 |
// 0x00000000, 0xbfe00000
|
|
526 |
masm.mulpd(xmm5, xmm0);
|
|
527 |
masm.movdqu(xmm0, xmm3);
|
|
528 |
masm.addsd(xmm3, new AMD64Address(AMD64.rax, 8));
|
|
529 |
masm.mulpd(xmm1, xmm7);
|
|
530 |
masm.movdqu(xmm7, xmm4);
|
|
531 |
masm.addsd(xmm4, xmm3);
|
|
532 |
masm.addpd(xmm6, xmm5);
|
|
533 |
masm.movq(xmm5, new AMD64Address(AMD64.rax, 8));
|
|
534 |
masm.subsd(xmm5, xmm3);
|
|
535 |
masm.subsd(xmm3, xmm4);
|
|
536 |
masm.addsd(xmm1, new AMD64Address(AMD64.rax, 16));
|
|
537 |
masm.mulpd(xmm6, xmm2);
|
|
538 |
masm.addsd(xmm5, xmm0);
|
|
539 |
masm.addsd(xmm3, xmm7);
|
|
540 |
masm.addsd(xmm1, xmm5);
|
|
541 |
masm.addsd(xmm1, xmm3);
|
|
542 |
masm.addsd(xmm1, xmm6);
|
|
543 |
masm.unpckhpd(xmm6, xmm6);
|
|
544 |
masm.movdqu(xmm0, xmm4);
|
|
545 |
masm.addsd(xmm1, xmm6);
|
|
546 |
masm.addsd(xmm0, xmm1);
|
|
547 |
masm.jmp(block14);
|
|
548 |
|
|
549 |
masm.bind(block0);
|
|
550 |
masm.jcc(ConditionFlag.Greater, block1);
|
|
551 |
masm.shrl(rax, 20);
|
|
552 |
masm.cmpl(rax, 3325);
|
|
553 |
masm.jcc(ConditionFlag.NotEqual, block2);
|
|
554 |
masm.mulsd(xmm0, recordExternalAddress(crb, allOnes)); // 0xffffffff, 0x3fefffff
|
|
555 |
masm.jmp(block14);
|
|
556 |
|
|
557 |
masm.bind(block2);
|
|
558 |
masm.movq(xmm3, recordExternalAddress(crb, twoPow55)); // 0x00000000, 0x43600000
|
|
559 |
masm.mulsd(xmm3, xmm0);
|
|
560 |
masm.subsd(xmm3, xmm0);
|
|
561 |
masm.mulsd(xmm3, recordExternalAddress(crb, twoPowM55)); // 0x00000000, 0x3c800000
|
|
562 |
masm.jmp(block14);
|
|
563 |
|
|
564 |
masm.bind(block1);
|
|
565 |
masm.pextrw(rax, xmm0, 3);
|
|
566 |
masm.andl(rax, 32752);
|
|
567 |
masm.cmpl(rax, 32752);
|
|
568 |
masm.jcc(ConditionFlag.Equal, block3);
|
|
569 |
masm.pextrw(rcx, xmm0, 3);
|
|
570 |
masm.andl(rcx, 32752);
|
|
571 |
masm.subl(rcx, 16224);
|
|
572 |
masm.shrl(rcx, 7);
|
|
573 |
masm.andl(rcx, 65532);
|
|
574 |
masm.leaq(r11, recordExternalAddress(crb, piInvTable));
|
|
575 |
masm.addq(AMD64.rcx, r11);
|
|
576 |
masm.movdq(AMD64.rax, xmm0);
|
|
577 |
masm.movl(r10, new AMD64Address(AMD64.rcx, 20));
|
|
578 |
masm.movl(r8, new AMD64Address(AMD64.rcx, 24));
|
|
579 |
masm.movl(rdx, rax);
|
|
580 |
masm.shrq(AMD64.rax, 21);
|
|
581 |
masm.orl(rax, Integer.MIN_VALUE);
|
|
582 |
masm.shrl(rax, 11);
|
|
583 |
masm.movl(r9, r10);
|
|
584 |
masm.imulq(r10, AMD64.rdx);
|
|
585 |
masm.imulq(r9, AMD64.rax);
|
|
586 |
masm.imulq(r8, AMD64.rax);
|
|
587 |
masm.movl(rsi, new AMD64Address(AMD64.rcx, 16));
|
|
588 |
masm.movl(rdi, new AMD64Address(AMD64.rcx, 12));
|
|
589 |
masm.movl(r11, r10);
|
|
590 |
masm.shrq(r10, 32);
|
|
591 |
masm.addq(r9, r10);
|
|
592 |
masm.addq(r11, r8);
|
|
593 |
masm.movl(r8, r11);
|
|
594 |
masm.shrq(r11, 32);
|
|
595 |
masm.addq(r9, r11);
|
|
596 |
masm.movl(r10, rsi);
|
|
597 |
masm.imulq(rsi, AMD64.rdx);
|
|
598 |
masm.imulq(r10, AMD64.rax);
|
|
599 |
masm.movl(r11, rdi);
|
|
600 |
masm.imulq(rdi, AMD64.rdx);
|
|
601 |
masm.movl(rbx, rsi);
|
|
602 |
masm.shrq(rsi, 32);
|
|
603 |
masm.addq(r9, AMD64.rbx);
|
|
604 |
masm.movl(rbx, r9);
|
|
605 |
masm.shrq(r9, 32);
|
|
606 |
masm.addq(r10, rsi);
|
|
607 |
masm.addq(r10, r9);
|
|
608 |
masm.shlq(AMD64.rbx, 32);
|
|
609 |
masm.orq(r8, AMD64.rbx);
|
|
610 |
masm.imulq(r11, AMD64.rax);
|
|
611 |
masm.movl(r9, new AMD64Address(AMD64.rcx, 8));
|
|
612 |
masm.movl(rsi, new AMD64Address(AMD64.rcx, 4));
|
|
613 |
masm.movl(rbx, rdi);
|
|
614 |
masm.shrq(rdi, 32);
|
|
615 |
masm.addq(r10, AMD64.rbx);
|
|
616 |
masm.movl(rbx, r10);
|
|
617 |
masm.shrq(r10, 32);
|
|
618 |
masm.addq(r11, rdi);
|
|
619 |
masm.addq(r11, r10);
|
|
620 |
masm.movq(rdi, r9);
|
|
621 |
masm.imulq(r9, AMD64.rdx);
|
|
622 |
masm.imulq(rdi, AMD64.rax);
|
|
623 |
masm.movl(r10, r9);
|
|
624 |
masm.shrq(r9, 32);
|
|
625 |
masm.addq(r11, r10);
|
|
626 |
masm.movl(r10, r11);
|
|
627 |
masm.shrq(r11, 32);
|
|
628 |
masm.addq(rdi, r9);
|
|
629 |
masm.addq(rdi, r11);
|
|
630 |
masm.movq(r9, rsi);
|
|
631 |
masm.imulq(rsi, AMD64.rdx);
|
|
632 |
masm.imulq(r9, AMD64.rax);
|
|
633 |
masm.shlq(r10, 32);
|
|
634 |
masm.orq(r10, AMD64.rbx);
|
|
635 |
masm.movl(rax, new AMD64Address(AMD64.rcx, 0));
|
|
636 |
masm.movl(r11, rsi);
|
|
637 |
masm.shrq(rsi, 32);
|
|
638 |
masm.addq(rdi, r11);
|
|
639 |
masm.movl(r11, rdi);
|
|
640 |
masm.shrq(rdi, 32);
|
|
641 |
masm.addq(r9, rsi);
|
|
642 |
masm.addq(r9, rdi);
|
|
643 |
masm.imulq(AMD64.rdx, AMD64.rax);
|
|
644 |
masm.pextrw(rbx, xmm0, 3);
|
|
645 |
masm.leaq(rdi, recordExternalAddress(crb, piInvTable));
|
|
646 |
masm.subq(AMD64.rcx, rdi);
|
|
647 |
masm.addl(rcx, rcx);
|
|
648 |
masm.addl(rcx, rcx);
|
|
649 |
masm.addl(rcx, rcx);
|
|
650 |
masm.addl(rcx, 19);
|
|
651 |
masm.movl(rsi, 32768);
|
|
652 |
masm.andl(rsi, rbx);
|
|
653 |
masm.shrl(rbx, 4);
|
|
654 |
masm.andl(rbx, 2047);
|
|
655 |
masm.subl(rbx, 1023);
|
|
656 |
masm.subl(rcx, rbx);
|
|
657 |
masm.addq(r9, AMD64.rdx);
|
|
658 |
masm.movl(rdx, rcx);
|
|
659 |
masm.addl(rdx, 32);
|
|
660 |
masm.cmpl(rcx, 1);
|
|
661 |
masm.jcc(ConditionFlag.Less, block4);
|
|
662 |
masm.negl(rcx);
|
|
663 |
masm.addl(rcx, 29);
|
|
664 |
masm.shll(r9);
|
|
665 |
masm.movl(rdi, r9);
|
|
666 |
masm.andl(r9, 536870911);
|
|
667 |
masm.testl(r9, 268435456);
|
|
668 |
masm.jcc(ConditionFlag.NotEqual, block5);
|
|
669 |
masm.shrl(r9);
|
|
670 |
masm.movl(rbx, 0);
|
|
671 |
masm.shlq(r9, 32);
|
|
672 |
masm.orq(r9, r11);
|
|
673 |
|
|
674 |
masm.bind(block6);
|
|
675 |
|
|
676 |
masm.bind(block7);
|
|
677 |
|
|
678 |
masm.cmpq(r9, 0);
|
|
679 |
masm.jcc(ConditionFlag.Equal, block8);
|
|
680 |
|
|
681 |
masm.bind(block9);
|
|
682 |
masm.bsrq(r11, r9);
|
|
683 |
masm.movl(rcx, 29);
|
|
684 |
masm.subl(rcx, r11);
|
|
685 |
masm.jcc(ConditionFlag.LessEqual, block10);
|
|
686 |
masm.shlq(r9);
|
|
687 |
masm.movq(AMD64.rax, r10);
|
|
688 |
masm.shlq(r10);
|
|
689 |
masm.addl(rdx, rcx);
|
|
690 |
masm.negl(rcx);
|
|
691 |
masm.addl(rcx, 64);
|
|
692 |
masm.shrq(AMD64.rax);
|
|
693 |
masm.shrq(r8);
|
|
694 |
masm.orq(r9, AMD64.rax);
|
|
695 |
masm.orq(r10, r8);
|
|
696 |
|
|
697 |
masm.bind(block11);
|
|
698 |
masm.cvtsi2sdq(xmm0, r9);
|
|
699 |
masm.shrq(r10, 1);
|
|
700 |
masm.cvtsi2sdq(xmm3, r10);
|
|
701 |
masm.xorpd(xmm4, xmm4);
|
|
702 |
masm.shll(rdx, 4);
|
|
703 |
masm.negl(rdx);
|
|
704 |
masm.addl(rdx, 16368);
|
|
705 |
masm.orl(rdx, rsi);
|
|
706 |
masm.xorl(rdx, rbx);
|
|
707 |
masm.pinsrw(xmm4, rdx, 3);
|
|
708 |
masm.movq(xmm2, recordExternalAddress(crb, pi4)); // 0x40000000, 0x3fe921fb,
|
|
709 |
// 0x18469899, 0x3e64442d
|
|
710 |
masm.movq(xmm6, recordExternalAddress(crb, pi48)); // 0x3fe921fb, 0x18469899,
|
|
711 |
// 0x3e64442d
|
|
712 |
masm.xorpd(xmm5, xmm5);
|
|
713 |
masm.subl(rdx, 1008);
|
|
714 |
masm.pinsrw(xmm5, rdx, 3);
|
|
715 |
masm.mulsd(xmm0, xmm4);
|
|
716 |
masm.shll(rsi, 16);
|
|
717 |
masm.sarl(rsi, 31);
|
|
718 |
masm.mulsd(xmm3, xmm5);
|
|
719 |
masm.movdqu(xmm1, xmm0);
|
|
720 |
masm.mulsd(xmm0, xmm2);
|
|
721 |
masm.shrl(rdi, 29);
|
|
722 |
masm.addsd(xmm1, xmm3);
|
|
723 |
masm.mulsd(xmm3, xmm2);
|
|
724 |
masm.addl(rdi, rsi);
|
|
725 |
masm.xorl(rdi, rsi);
|
|
726 |
masm.mulsd(xmm6, xmm1);
|
|
727 |
masm.movl(rax, rdi);
|
|
728 |
masm.addsd(xmm6, xmm3);
|
|
729 |
masm.movdqu(xmm2, xmm0);
|
|
730 |
masm.addsd(xmm0, xmm6);
|
|
731 |
masm.subsd(xmm2, xmm0);
|
|
732 |
masm.addsd(xmm6, xmm2);
|
|
733 |
|
|
734 |
masm.bind(block12);
|
|
735 |
masm.movq(xmm1, recordExternalAddress(crb, pi32Inv)); // 0x6dc9c883, 0x40245f30
|
|
736 |
masm.mulsd(xmm1, xmm0);
|
|
737 |
masm.movq(xmm5, recordExternalAddress(crb, onehalf)); // 0x00000000, 0x3fe00000,
|
|
738 |
// 0x00000000, 0x3fe00000
|
|
739 |
masm.movq(xmm4, recordExternalAddress(crb, signMask)); // 0x00000000, 0x80000000
|
|
740 |
masm.pand(xmm4, xmm0);
|
|
741 |
masm.por(xmm5, xmm4);
|
|
742 |
masm.addpd(xmm1, xmm5);
|
|
743 |
masm.cvttsd2sil(rdx, xmm1);
|
|
744 |
masm.cvtsi2sdl(xmm1, rdx);
|
|
745 |
masm.movq(xmm3, recordExternalAddress(crb, p1)); // 0x54400000, 0x3fb921fb
|
|
746 |
masm.movdqu(xmm2, recordExternalAddress(crb, p2)); // 0x1a600000, 0x3d90b461,
|
|
747 |
// 0x1a600000, 0x3d90b461
|
|
748 |
masm.mulsd(xmm3, xmm1);
|
|
749 |
masm.unpcklpd(xmm1, xmm1);
|
|
750 |
masm.shll(rax, 3);
|
|
751 |
masm.addl(rdx, 1865216);
|
|
752 |
masm.movdqu(xmm4, xmm0);
|
|
753 |
masm.addl(rdx, rax);
|
|
754 |
masm.andl(rdx, 63);
|
|
755 |
masm.movdqu(xmm5, recordExternalAddress(crb, sc4)); // 0x54400000, 0x3fb921fb
|
|
756 |
masm.leaq(AMD64.rax, recordExternalAddress(crb, ctable));
|
|
757 |
masm.shll(rdx, 5);
|
|
758 |
masm.addq(AMD64.rax, AMD64.rdx);
|
|
759 |
masm.mulpd(xmm2, xmm1);
|
|
760 |
masm.subsd(xmm0, xmm3);
|
|
761 |
masm.mulsd(xmm1, recordExternalAddress(crb, p3)); // 0x2e037073, 0x3b63198a
|
|
762 |
masm.subsd(xmm4, xmm3);
|
|
763 |
masm.movq(xmm7, new AMD64Address(AMD64.rax, 8));
|
|
764 |
masm.unpcklpd(xmm0, xmm0);
|
|
765 |
masm.movdqu(xmm3, xmm4);
|
|
766 |
masm.subsd(xmm4, xmm2);
|
|
767 |
masm.mulpd(xmm5, xmm0);
|
|
768 |
masm.subpd(xmm0, xmm2);
|
|
769 |
masm.mulsd(xmm7, xmm4);
|
|
770 |
masm.subsd(xmm3, xmm4);
|
|
771 |
masm.mulpd(xmm5, xmm0);
|
|
772 |
masm.mulpd(xmm0, xmm0);
|
|
773 |
masm.subsd(xmm3, xmm2);
|
|
774 |
masm.movdqu(xmm2, new AMD64Address(AMD64.rax, 0));
|
|
775 |
masm.subsd(xmm1, xmm3);
|
|
776 |
masm.movq(xmm3, new AMD64Address(AMD64.rax, 24));
|
|
777 |
masm.addsd(xmm2, xmm3);
|
|
778 |
masm.subsd(xmm7, xmm2);
|
|
779 |
masm.subsd(xmm1, xmm6);
|
|
780 |
masm.movdqu(xmm6, recordExternalAddress(crb, sc2)); // 0x11111111, 0x3f811111,
|
|
781 |
// 0x55555555, 0x3fa55555
|
|
782 |
masm.mulsd(xmm2, xmm4);
|
|
783 |
masm.mulpd(xmm6, xmm0);
|
|
784 |
masm.mulsd(xmm3, xmm4);
|
|
785 |
masm.mulpd(xmm2, xmm0);
|
|
786 |
masm.mulpd(xmm0, xmm0);
|
|
787 |
masm.addpd(xmm5, recordExternalAddress(crb, sc3)); // 0x1a01a01a, 0xbf2a01a0,
|
|
788 |
// 0x16c16c17, 0xbf56c16c
|
|
789 |
masm.mulsd(xmm4, new AMD64Address(AMD64.rax, 0));
|
|
790 |
masm.addpd(xmm6, recordExternalAddress(crb, sc1)); // 0x55555555, 0xbfc55555,
|
|
791 |
// 0x00000000, 0xbfe00000
|
|
792 |
masm.mulpd(xmm5, xmm0);
|
|
793 |
masm.movdqu(xmm0, xmm3);
|
|
794 |
masm.addsd(xmm3, new AMD64Address(AMD64.rax, 8));
|
|
795 |
masm.mulpd(xmm1, xmm7);
|
|
796 |
masm.movdqu(xmm7, xmm4);
|
|
797 |
masm.addsd(xmm4, xmm3);
|
|
798 |
masm.addpd(xmm6, xmm5);
|
|
799 |
masm.movq(xmm5, new AMD64Address(AMD64.rax, 8));
|
|
800 |
masm.subsd(xmm5, xmm3);
|
|
801 |
masm.subsd(xmm3, xmm4);
|
|
802 |
masm.addsd(xmm1, new AMD64Address(AMD64.rax, 16));
|
|
803 |
masm.mulpd(xmm6, xmm2);
|
|
804 |
masm.addsd(xmm5, xmm0);
|
|
805 |
masm.addsd(xmm3, xmm7);
|
|
806 |
masm.addsd(xmm1, xmm5);
|
|
807 |
masm.addsd(xmm1, xmm3);
|
|
808 |
masm.addsd(xmm1, xmm6);
|
|
809 |
masm.unpckhpd(xmm6, xmm6);
|
|
810 |
masm.movdqu(xmm0, xmm4);
|
|
811 |
masm.addsd(xmm1, xmm6);
|
|
812 |
masm.addsd(xmm0, xmm1);
|
|
813 |
masm.jmp(block14);
|
|
814 |
|
|
815 |
masm.bind(block8);
|
|
816 |
masm.addl(rdx, 64);
|
|
817 |
masm.movq(r9, r10);
|
|
818 |
masm.movq(r10, r8);
|
|
819 |
masm.movl(r8, 0);
|
|
820 |
masm.cmpq(r9, 0);
|
|
821 |
masm.jcc(ConditionFlag.NotEqual, block9);
|
|
822 |
masm.addl(rdx, 64);
|
|
823 |
masm.movq(r9, r10);
|
|
824 |
masm.movq(r10, r8);
|
|
825 |
masm.cmpq(r9, 0);
|
|
826 |
masm.jcc(ConditionFlag.NotEqual, block9);
|
|
827 |
masm.xorpd(xmm0, xmm0);
|
|
828 |
masm.xorpd(xmm6, xmm6);
|
|
829 |
masm.jmp(block12);
|
|
830 |
|
|
831 |
masm.bind(block10);
|
|
832 |
masm.jcc(ConditionFlag.Equal, block11);
|
|
833 |
masm.negl(rcx);
|
|
834 |
masm.shrq(r10);
|
|
835 |
masm.movq(AMD64.rax, r9);
|
|
836 |
masm.shrq(r9);
|
|
837 |
masm.subl(rdx, rcx);
|
|
838 |
masm.negl(rcx);
|
|
839 |
masm.addl(rcx, 64);
|
|
840 |
masm.shlq(AMD64.rax);
|
|
841 |
masm.orq(r10, AMD64.rax);
|
|
842 |
masm.jmp(block11);
|
|
843 |
|
|
844 |
masm.bind(block4);
|
|
845 |
masm.negl(rcx);
|
|
846 |
masm.shlq(r9, 32);
|
|
847 |
masm.orq(r9, r11);
|
|
848 |
masm.shlq(r9);
|
|
849 |
masm.movq(rdi, r9);
|
|
850 |
masm.testl(r9, Integer.MIN_VALUE);
|
|
851 |
masm.jcc(ConditionFlag.NotEqual, block13);
|
|
852 |
masm.shrl(r9);
|
|
853 |
masm.movl(rbx, 0);
|
|
854 |
masm.shrq(rdi, 3);
|
|
855 |
masm.jmp(block7);
|
|
856 |
|
|
857 |
masm.bind(block5);
|
|
858 |
masm.shrl(r9);
|
|
859 |
masm.movl(rbx, 536870912);
|
|
860 |
masm.shrl(rbx);
|
|
861 |
masm.shlq(r9, 32);
|
|
862 |
masm.orq(r9, r11);
|
|
863 |
masm.shlq(AMD64.rbx, 32);
|
|
864 |
masm.addl(rdi, 536870912);
|
|
865 |
masm.movl(AMD64.rcx, 0);
|
|
866 |
masm.movl(r11, 0);
|
|
867 |
masm.subq(AMD64.rcx, r8);
|
|
868 |
masm.sbbq(r11, r10);
|
|
869 |
masm.sbbq(AMD64.rbx, r9);
|
|
870 |
masm.movq(r8, AMD64.rcx);
|
|
871 |
masm.movq(r10, r11);
|
|
872 |
masm.movq(r9, AMD64.rbx);
|
|
873 |
masm.movl(rbx, 32768);
|
|
874 |
masm.jmp(block6);
|
|
875 |
|
|
876 |
masm.bind(block13);
|
|
877 |
masm.shrl(r9);
|
|
878 |
masm.movq(AMD64.rbx, 0x100000000L);
|
|
879 |
masm.shrq(AMD64.rbx);
|
|
880 |
masm.movl(AMD64.rcx, 0);
|
|
881 |
masm.movl(r11, 0);
|
|
882 |
masm.subq(AMD64.rcx, r8);
|
|
883 |
masm.sbbq(r11, r10);
|
|
884 |
masm.sbbq(AMD64.rbx, r9);
|
|
885 |
masm.movq(r8, AMD64.rcx);
|
|
886 |
masm.movq(r10, r11);
|
|
887 |
masm.movq(r9, AMD64.rbx);
|
|
888 |
masm.movl(rbx, 32768);
|
|
889 |
masm.shrq(rdi, 3);
|
|
890 |
masm.addl(rdi, 536870912);
|
|
891 |
masm.jmp(block7);
|
|
892 |
|
|
893 |
masm.bind(block3);
|
|
894 |
masm.movq(xmm0, new AMD64Address(rsp, 8));
|
|
895 |
masm.mulsd(xmm0, recordExternalAddress(crb, negZero)); // 0x00000000, 0x80000000
|
|
896 |
masm.movq(new AMD64Address(rsp, 0), xmm0);
|
|
897 |
|
|
898 |
masm.bind(block14);
|
|
899 |
masm.addq(rsp, 16);
|
|
900 |
masm.pop(AMD64.rbx);
|
|
901 |
}
|
|
902 |
}
|