33089
|
1 |
/*
|
|
2 |
* Copyright (c) 2015, Intel Corporation.
|
|
3 |
* Intel Math Library (LIBM) Source Code
|
|
4 |
*
|
|
5 |
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
|
6 |
*
|
|
7 |
* This code is free software; you can redistribute it and/or modify it
|
|
8 |
* under the terms of the GNU General Public License version 2 only, as
|
|
9 |
* published by the Free Software Foundation.
|
|
10 |
*
|
|
11 |
* This code is distributed in the hope that it will be useful, but WITHOUT
|
|
12 |
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
|
13 |
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
14 |
* version 2 for more details (a copy is included in the LICENSE file that
|
|
15 |
* accompanied this code).
|
|
16 |
*
|
|
17 |
* You should have received a copy of the GNU General Public License version
|
|
18 |
* 2 along with this work; if not, write to the Free Software Foundation,
|
|
19 |
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
20 |
*
|
|
21 |
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
|
22 |
* or visit www.oracle.com if you need additional information or have any
|
|
23 |
* questions.
|
|
24 |
*
|
|
25 |
*/
|
|
26 |
|
33465
|
27 |
#include "precompiled.hpp"
|
|
28 |
#include "asm/assembler.hpp"
|
|
29 |
#include "asm/assembler.inline.hpp"
|
|
30 |
#include "macroAssembler_x86.hpp"
|
|
31 |
|
|
32 |
// ALIGNED_(x): compiler-specific spelling of "align this declaration to x
// bytes". It prefixes the constant tables below.
#ifdef _MSC_VER
#define ALIGNED_(x) __declspec(align(x))
#else
#define ALIGNED_(x) __attribute__ ((aligned(x)))
#endif
|
|
37 |
|
33089
|
38 |
/******************************************************************************/
|
33465
|
39 |
// ALGORITHM DESCRIPTION - EXP()
|
33089
|
40 |
// ---------------------
|
|
41 |
//
|
|
42 |
// Description:
|
|
43 |
// Let K = 64 (table size).
|
|
44 |
//   e^x = 2^(x/log(2)) = 2^n * T[j] * (1 + P(y))
|
|
46 |
// where
|
|
47 |
// x = m*log(2)/K + y, y in [-log(2)/K..log(2)/K]
|
|
48 |
// m = n*K + j, m,n,j - signed integer, j in [-K/2..K/2]
|
|
49 |
// values of 2^(j/K) are tabulated as T[j] = T_hi[j] * (1 + T_lo[j]).
|
|
51 |
//
|
|
52 |
// P(y) is a minimax polynomial approximation of exp(y)-1
// on the small interval [-log(2)/K..log(2)/K] (coefficients were calculated with Maple V).
|
|
54 |
//
|
|
55 |
// To avoid problems with arithmetic overflow and underflow,
|
|
56 |
// the value of 2^n is safely computed as 2^n1 * 2^n2 with n1 in [-BIAS/2..BIAS/2]
|
|
58 |
// where BIAS is a value of exponent bias.
|
|
59 |
//
|
|
60 |
// Special cases:
|
|
61 |
// exp(NaN) = NaN
|
|
62 |
// exp(+INF) = +INF
|
|
63 |
// exp(-INF) = 0
|
|
64 |
// exp(x) = 1 for subnormals
|
|
65 |
// for finite argument, only exp(0)=1 is exact
|
|
66 |
// For IEEE double
|
|
67 |
// if x > 709.782712893383973096 then exp(x) overflow
|
|
68 |
// if x < -745.133219101941108420 then exp(x) underflow
|
|
69 |
//
|
|
70 |
/******************************************************************************/
|
|
71 |
|
|
72 |
#ifdef _LP64
|
|
73 |
|
|
74 |
// Packed-double constants for the 64-bit fast_exp. Each double is stored as
// two juints (low word first). Byte offsets as used by the code:
//    0: 0x40571547652b82fe x2   K/log(2) with K = 64
//   16: 0x3f862e42fefa0000 x2   high part of log(2)/K
//   32: 0x3d1cf79abc9e3b3a x2   low part of log(2)/K
//   48: 0x3fdffffffffffffe x2   polynomial coefficient (~1/2)
//   64: 0x3f56c15ce3289860, 0x3fa55555555b9e25   coefficients (~1/720, ~1/24)
//   80: 0x3f811115c090cf0f, 0x3fc5555555548ba1   coefficients (~1/120, ~1/6)
ALIGNED_(16) juint _cv[] =
{
  0x652b82feUL, 0x40571547UL, 0x652b82feUL, 0x40571547UL, 0xfefa0000UL,
  0x3f862e42UL, 0xfefa0000UL, 0x3f862e42UL, 0xbc9e3b3aUL, 0x3d1cf79aUL,
  0xbc9e3b3aUL, 0x3d1cf79aUL, 0xfffffffeUL, 0x3fdfffffUL, 0xfffffffeUL,
  0x3fdfffffUL, 0xe3289860UL, 0x3f56c15cUL, 0x555b9e25UL, 0x3fa55555UL,
  0xc090cf0fUL, 0x3f811115UL, 0x55548ba1UL, 0x3fc55555UL
};
|
|
82 |
|
|
83 |
// 0x4338000000000000 (1.5 * 2^52) in both lanes. fast_exp adds it to
// x*K/log(2) and then subtracts it again; the integer m is read from the low
// bits of the intermediate sum.
ALIGNED_(16) juint _shifter[] =
{
  0x00000000UL, 0x43380000UL, 0x00000000UL, 0x43380000UL
};
|
|
87 |
|
|
88 |
// Per-64-bit-lane mask 0x00000000ffffffc0: keeps the low 32 bits of the lane
// with the 6 table-index bits (m & 63) cleared.
ALIGNED_(16) juint _mmask[] =
{
  0xffffffc0UL, 0x00000000UL, 0xffffffc0UL, 0x00000000UL
};
|
|
92 |
|
|
93 |
// Per-lane value 0xffc0 (= 1023 * 64). It is added to the masked m before the
// left shift by 46, so the sum lands in the exponent field as n + 1023.
ALIGNED_(16) juint _bias[] =
{
  0x0000ffc0UL, 0x00000000UL, 0x0000ffc0UL, 0x00000000UL
};
|
|
97 |
|
|
98 |
// Table of 2^(j/K) with K = 64: 64 entries of 16 bytes each, indexed by
// (m & 63) << 4. In fast_exp the low 8 bytes of an entry are added to the
// reduced argument as a double, and the high 8 bytes are OR-ed with the
// exponent bits built from n. Entry 0 is all zeros.
ALIGNED_(16) juint _Tbl_addr[] =
{
  0x00000000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL, 0x0e03754dUL,
  0x3cad7bbfUL, 0x3e778060UL, 0x00002c9aUL, 0x3567f613UL, 0x3c8cd252UL,
  0xd3158574UL, 0x000059b0UL, 0x61e6c861UL, 0x3c60f74eUL, 0x18759bc8UL,
  0x00008745UL, 0x5d837b6cUL, 0x3c979aa6UL, 0x6cf9890fUL, 0x0000b558UL,
  0x702f9cd1UL, 0x3c3ebe3dUL, 0x32d3d1a2UL, 0x0000e3ecUL, 0x1e63bcd8UL,
  0x3ca3516eUL, 0xd0125b50UL, 0x00011301UL, 0x26f0387bUL, 0x3ca4c554UL,
  0xaea92ddfUL, 0x0001429aUL, 0x62523fb6UL, 0x3ca95153UL, 0x3c7d517aUL,
  0x000172b8UL, 0x3f1353bfUL, 0x3c8b898cUL, 0xeb6fcb75UL, 0x0001a35bUL,
  0x3e3a2f5fUL, 0x3c9aecf7UL, 0x3168b9aaUL, 0x0001d487UL, 0x44a6c38dUL,
  0x3c8a6f41UL, 0x88628cd6UL, 0x0002063bUL, 0xe3a8a894UL, 0x3c968efdUL,
  0x6e756238UL, 0x0002387aUL, 0x981fe7f2UL, 0x3c80472bUL, 0x65e27cddUL,
  0x00026b45UL, 0x6d09ab31UL, 0x3c82f7e1UL, 0xf51fdee1UL, 0x00029e9dUL,
  0x720c0ab3UL, 0x3c8b3782UL, 0xa6e4030bUL, 0x0002d285UL, 0x4db0abb6UL,
  0x3c834d75UL, 0x0a31b715UL, 0x000306feUL, 0x5dd3f84aUL, 0x3c8fdd39UL,
  0xb26416ffUL, 0x00033c08UL, 0xcc187d29UL, 0x3ca12f8cUL, 0x373aa9caUL,
  0x000371a7UL, 0x738b5e8bUL, 0x3ca7d229UL, 0x34e59ff6UL, 0x0003a7dbUL,
  0xa72a4c6dUL, 0x3c859f48UL, 0x4c123422UL, 0x0003dea6UL, 0x259d9205UL,
  0x3ca8b846UL, 0x21f72e29UL, 0x0004160aUL, 0x60c2ac12UL, 0x3c4363edUL,
  0x6061892dUL, 0x00044e08UL, 0xdaa10379UL, 0x3c6ecce1UL, 0xb5c13cd0UL,
  0x000486a2UL, 0xbb7aafb0UL, 0x3c7690ceUL, 0xd5362a27UL, 0x0004bfdaUL,
  0x9b282a09UL, 0x3ca083ccUL, 0x769d2ca6UL, 0x0004f9b2UL, 0xc1aae707UL,
  0x3ca509b0UL, 0x569d4f81UL, 0x0005342bUL, 0x18fdd78eUL, 0x3c933505UL,
  0x36b527daUL, 0x00056f47UL, 0xe21c5409UL, 0x3c9063e1UL, 0xdd485429UL,
  0x0005ab07UL, 0x2b64c035UL, 0x3c9432e6UL, 0x15ad2148UL, 0x0005e76fUL,
  0x99f08c0aUL, 0x3ca01284UL, 0xb03a5584UL, 0x0006247eUL, 0x0073dc06UL,
  0x3c99f087UL, 0x82552224UL, 0x00066238UL, 0x0da05571UL, 0x3c998d4dUL,
  0x667f3bccUL, 0x0006a09eUL, 0x86ce4786UL, 0x3ca52bb9UL, 0x3c651a2eUL,
  0x0006dfb2UL, 0x206f0dabUL, 0x3ca32092UL, 0xe8ec5f73UL, 0x00071f75UL,
  0x8e17a7a6UL, 0x3ca06122UL, 0x564267c8UL, 0x00075febUL, 0x461e9f86UL,
  0x3ca244acUL, 0x73eb0186UL, 0x0007a114UL, 0xabd66c55UL, 0x3c65ebe1UL,
  0x36cf4e62UL, 0x0007e2f3UL, 0xbbff67d0UL, 0x3c96fe9fUL, 0x994cce12UL,
  0x00082589UL, 0x14c801dfUL, 0x3c951f14UL, 0x9b4492ecUL, 0x000868d9UL,
  0xc1f0eab4UL, 0x3c8db72fUL, 0x422aa0dbUL, 0x0008ace5UL, 0x59f35f44UL,
  0x3c7bf683UL, 0x99157736UL, 0x0008f1aeUL, 0x9c06283cUL, 0x3ca360baUL,
  0xb0cdc5e4UL, 0x00093737UL, 0x20f962aaUL, 0x3c95e8d1UL, 0x9fde4e4fUL,
  0x00097d82UL, 0x2b91ce27UL, 0x3c71affcUL, 0x82a3f090UL, 0x0009c491UL,
  0x589a2ebdUL, 0x3c9b6d34UL, 0x7b5de564UL, 0x000a0c66UL, 0x9ab89880UL,
  0x3c95277cUL, 0xb23e255cUL, 0x000a5503UL, 0x6e735ab3UL, 0x3c846984UL,
  0x5579fdbfUL, 0x000a9e6bUL, 0x92cb3387UL, 0x3c8c1a77UL, 0x995ad3adUL,
  0x000ae89fUL, 0xdc2d1d96UL, 0x3ca22466UL, 0xb84f15faUL, 0x000b33a2UL,
  0xb19505aeUL, 0x3ca1112eUL, 0xf2fb5e46UL, 0x000b7f76UL, 0x0a5fddcdUL,
  0x3c74ffd7UL, 0x904bc1d2UL, 0x000bcc1eUL, 0x30af0cb3UL, 0x3c736eaeUL,
  0xdd85529cUL, 0x000c199bUL, 0xd10959acUL, 0x3c84e08fUL, 0x2e57d14bUL,
  0x000c67f1UL, 0x6c921968UL, 0x3c676b2cUL, 0xdcef9069UL, 0x000cb720UL,
  0x36df99b3UL, 0x3c937009UL, 0x4a07897bUL, 0x000d072dUL, 0xa63d07a7UL,
  0x3c74a385UL, 0xdcfba487UL, 0x000d5818UL, 0xd5c192acUL, 0x3c8e5a50UL,
  0x03db3285UL, 0x000da9e6UL, 0x1c4a9792UL, 0x3c98bb73UL, 0x337b9b5eUL,
  0x000dfc97UL, 0x603a88d3UL, 0x3c74b604UL, 0xe78b3ff6UL, 0x000e502eUL,
  0x92094926UL, 0x3c916f27UL, 0xa2a490d9UL, 0x000ea4afUL, 0x41aa2008UL,
  0x3c8ec3bcUL, 0xee615a27UL, 0x000efa1bUL, 0x31d185eeUL, 0x3c8a64a9UL,
  0x5b6e4540UL, 0x000f5076UL, 0x4d91cd9dUL, 0x3c77893bUL, 0x819e90d8UL,
  0x000fa7c1UL
};
|
|
153 |
|
|
154 |
// 128 bits of ones. The out-of-range-exponent path of fast_exp shifts this
// left by a computed count to build a bit mask.
ALIGNED_(16) juint _ALLONES[] =
{
  0xffffffffUL, 0xffffffffUL, 0xffffffffUL, 0xffffffffUL
};
|
|
158 |
|
|
159 |
// 0x3ff0000000000000 in both lanes (the bit pattern of 1.0, i.e. exponent
// field = bias). It is added with paddd to turn a raw exponent offset into a
// scale factor.
ALIGNED_(16) juint _ebias[] =
{
  0x00000000UL, 0x3ff00000UL, 0x00000000UL, 0x3ff00000UL
};
|
|
163 |
|
|
164 |
// 0x7fefffffffffffff: largest finite double. It is squared on the overflow
// path.
ALIGNED_(4) juint _XMAX[] =
{
  0xffffffffUL, 0x7fefffffUL
};
|
|
168 |
|
|
169 |
// 0x0010000000000000: smallest positive normal double. It is squared on the
// underflow path.
ALIGNED_(4) juint _XMIN[] =
{
  0x00000000UL, 0x00100000UL
};
|
|
173 |
|
|
174 |
// 0x7ff0000000000000: +infinity, returned for exp(+INF).
ALIGNED_(4) juint _INF[] =
{
  0x00000000UL, 0x7ff00000UL
};
|
|
178 |
|
|
179 |
// +0.0, returned for exp(-INF).
ALIGNED_(4) juint _ZERO[] =
{
  0x00000000UL, 0x00000000UL
};
|
|
183 |
|
|
184 |
// 0x3ff0000000000000: 1.0. It is added to x on the path that bypasses the
// polynomial when the argument check sends a small |x| to the special-case
// branch.
ALIGNED_(4) juint _ONE_val[] =
{
  0x00000000UL, 0x3ff00000UL
};
|
|
188 |
|
|
189 |
|
|
190 |
// Registers:
|
|
191 |
// input: xmm0
|
|
192 |
// scratch: xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
|
|
193 |
// rax, rdx, rcx, tmp - r11
|
|
194 |
|
|
195 |
// Code generated by Intel C compiler for LIBM library
|
|
196 |
|
|
197 |
void MacroAssembler::fast_exp(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register eax, Register ecx, Register edx, Register tmp) {
|
|
198 |
Label L_2TAG_PACKET_0_0_2, L_2TAG_PACKET_1_0_2, L_2TAG_PACKET_2_0_2, L_2TAG_PACKET_3_0_2;
|
|
199 |
Label L_2TAG_PACKET_4_0_2, L_2TAG_PACKET_5_0_2, L_2TAG_PACKET_6_0_2, L_2TAG_PACKET_7_0_2;
|
|
200 |
Label L_2TAG_PACKET_8_0_2, L_2TAG_PACKET_9_0_2, L_2TAG_PACKET_10_0_2, L_2TAG_PACKET_11_0_2;
|
|
201 |
Label L_2TAG_PACKET_12_0_2, B1_3, B1_5, start;
|
|
202 |
|
|
203 |
assert_different_registers(tmp, eax, ecx, edx);
|
|
204 |
jmp(start);
|
|
205 |
address cv = (address)_cv;
|
|
206 |
address Shifter = (address)_shifter;
|
|
207 |
address mmask = (address)_mmask;
|
|
208 |
address bias = (address)_bias;
|
|
209 |
address Tbl_addr = (address)_Tbl_addr;
|
|
210 |
address ALLONES = (address)_ALLONES;
|
|
211 |
address ebias = (address)_ebias;
|
|
212 |
address XMAX = (address)_XMAX;
|
|
213 |
address XMIN = (address)_XMIN;
|
|
214 |
address INF = (address)_INF;
|
|
215 |
address ZERO = (address)_ZERO;
|
|
216 |
address ONE_val = (address)_ONE_val;
|
|
217 |
|
|
218 |
bind(start);
|
|
219 |
subq(rsp, 24);
|
|
220 |
movsd(Address(rsp, 8), xmm0);
|
|
221 |
unpcklpd(xmm0, xmm0);
|
|
222 |
movdqu(xmm1, ExternalAddress(cv)); // 0x652b82feUL, 0x40571547UL, 0x652b82feUL, 0x40571547UL
|
|
223 |
movdqu(xmm6, ExternalAddress(Shifter)); // 0x00000000UL, 0x43380000UL, 0x00000000UL, 0x43380000UL
|
|
224 |
movdqu(xmm2, ExternalAddress(16+cv)); // 0xfefa0000UL, 0x3f862e42UL, 0xfefa0000UL, 0x3f862e42UL
|
|
225 |
movdqu(xmm3, ExternalAddress(32+cv)); // 0xbc9e3b3aUL, 0x3d1cf79aUL, 0xbc9e3b3aUL, 0x3d1cf79aUL
|
|
226 |
pextrw(eax, xmm0, 3);
|
|
227 |
andl(eax, 32767);
|
|
228 |
movl(edx, 16527);
|
|
229 |
subl(edx, eax);
|
|
230 |
subl(eax, 15504);
|
|
231 |
orl(edx, eax);
|
|
232 |
cmpl(edx, INT_MIN);
|
|
233 |
jcc(Assembler::aboveEqual, L_2TAG_PACKET_0_0_2);
|
|
234 |
mulpd(xmm1, xmm0);
|
|
235 |
addpd(xmm1, xmm6);
|
|
236 |
movapd(xmm7, xmm1);
|
|
237 |
subpd(xmm1, xmm6);
|
|
238 |
mulpd(xmm2, xmm1);
|
|
239 |
movdqu(xmm4, ExternalAddress(64+cv)); // 0xe3289860UL, 0x3f56c15cUL, 0x555b9e25UL, 0x3fa55555UL
|
|
240 |
mulpd(xmm3, xmm1);
|
|
241 |
movdqu(xmm5, ExternalAddress(80+cv)); // 0xc090cf0fUL, 0x3f811115UL, 0x55548ba1UL, 0x3fc55555UL
|
|
242 |
subpd(xmm0, xmm2);
|
|
243 |
movdl(eax, xmm7);
|
|
244 |
movl(ecx, eax);
|
|
245 |
andl(ecx, 63);
|
|
246 |
shll(ecx, 4);
|
|
247 |
sarl(eax, 6);
|
|
248 |
movl(edx, eax);
|
|
249 |
movdqu(xmm6, ExternalAddress(mmask)); // 0xffffffc0UL, 0x00000000UL, 0xffffffc0UL, 0x00000000UL
|
|
250 |
pand(xmm7, xmm6);
|
|
251 |
movdqu(xmm6, ExternalAddress(bias)); // 0x0000ffc0UL, 0x00000000UL, 0x0000ffc0UL, 0x00000000UL
|
|
252 |
paddq(xmm7, xmm6);
|
|
253 |
psllq(xmm7, 46);
|
|
254 |
subpd(xmm0, xmm3);
|
|
255 |
lea(tmp, ExternalAddress(Tbl_addr));
|
|
256 |
movdqu(xmm2, Address(ecx,tmp));
|
|
257 |
mulpd(xmm4, xmm0);
|
|
258 |
movapd(xmm6, xmm0);
|
|
259 |
movapd(xmm1, xmm0);
|
|
260 |
mulpd(xmm6, xmm6);
|
|
261 |
mulpd(xmm0, xmm6);
|
|
262 |
addpd(xmm5, xmm4);
|
|
263 |
mulsd(xmm0, xmm6);
|
|
264 |
mulpd(xmm6, ExternalAddress(48+cv)); // 0xfffffffeUL, 0x3fdfffffUL, 0xfffffffeUL, 0x3fdfffffUL
|
|
265 |
addsd(xmm1, xmm2);
|
|
266 |
unpckhpd(xmm2, xmm2);
|
|
267 |
mulpd(xmm0, xmm5);
|
|
268 |
addsd(xmm1, xmm0);
|
|
269 |
por(xmm2, xmm7);
|
|
270 |
unpckhpd(xmm0, xmm0);
|
|
271 |
addsd(xmm0, xmm1);
|
|
272 |
addsd(xmm0, xmm6);
|
|
273 |
addl(edx, 894);
|
|
274 |
cmpl(edx, 1916);
|
|
275 |
jcc (Assembler::above, L_2TAG_PACKET_1_0_2);
|
|
276 |
mulsd(xmm0, xmm2);
|
|
277 |
addsd(xmm0, xmm2);
|
|
278 |
jmp (B1_5);
|
|
279 |
|
|
280 |
bind(L_2TAG_PACKET_1_0_2);
|
|
281 |
xorpd(xmm3, xmm3);
|
|
282 |
movdqu(xmm4, ExternalAddress(ALLONES)); // 0xffffffffUL, 0xffffffffUL, 0xffffffffUL, 0xffffffffUL
|
|
283 |
movl(edx, -1022);
|
|
284 |
subl(edx, eax);
|
|
285 |
movdl(xmm5, edx);
|
|
286 |
psllq(xmm4, xmm5);
|
|
287 |
movl(ecx, eax);
|
|
288 |
sarl(eax, 1);
|
|
289 |
pinsrw(xmm3, eax, 3);
|
|
290 |
movdqu(xmm6, ExternalAddress(ebias)); // 0x00000000UL, 0x3ff00000UL, 0x00000000UL, 0x3ff00000UL
|
|
291 |
psllq(xmm3, 4);
|
|
292 |
psubd(xmm2, xmm3);
|
|
293 |
mulsd(xmm0, xmm2);
|
|
294 |
cmpl(edx, 52);
|
|
295 |
jcc(Assembler::greater, L_2TAG_PACKET_2_0_2);
|
|
296 |
pand(xmm4, xmm2);
|
|
297 |
paddd(xmm3, xmm6);
|
|
298 |
subsd(xmm2, xmm4);
|
|
299 |
addsd(xmm0, xmm2);
|
|
300 |
cmpl(ecx, 1023);
|
|
301 |
jcc(Assembler::greaterEqual, L_2TAG_PACKET_3_0_2);
|
|
302 |
pextrw(ecx, xmm0, 3);
|
|
303 |
andl(ecx, 32768);
|
|
304 |
orl(edx, ecx);
|
|
305 |
cmpl(edx, 0);
|
|
306 |
jcc(Assembler::equal, L_2TAG_PACKET_4_0_2);
|
|
307 |
movapd(xmm6, xmm0);
|
|
308 |
addsd(xmm0, xmm4);
|
|
309 |
mulsd(xmm0, xmm3);
|
|
310 |
pextrw(ecx, xmm0, 3);
|
|
311 |
andl(ecx, 32752);
|
|
312 |
cmpl(ecx, 0);
|
|
313 |
jcc(Assembler::equal, L_2TAG_PACKET_5_0_2);
|
|
314 |
jmp(B1_5);
|
|
315 |
|
|
316 |
bind(L_2TAG_PACKET_5_0_2);
|
|
317 |
mulsd(xmm6, xmm3);
|
|
318 |
mulsd(xmm4, xmm3);
|
|
319 |
movdqu(xmm0, xmm6);
|
|
320 |
pxor(xmm6, xmm4);
|
|
321 |
psrad(xmm6, 31);
|
|
322 |
pshufd(xmm6, xmm6, 85);
|
|
323 |
psllq(xmm0, 1);
|
|
324 |
psrlq(xmm0, 1);
|
|
325 |
pxor(xmm0, xmm6);
|
|
326 |
psrlq(xmm6, 63);
|
|
327 |
paddq(xmm0, xmm6);
|
|
328 |
paddq(xmm0, xmm4);
|
|
329 |
movl(Address(rsp,0), 15);
|
|
330 |
jmp(L_2TAG_PACKET_6_0_2);
|
|
331 |
|
|
332 |
bind(L_2TAG_PACKET_4_0_2);
|
|
333 |
addsd(xmm0, xmm4);
|
|
334 |
mulsd(xmm0, xmm3);
|
|
335 |
jmp(B1_5);
|
|
336 |
|
|
337 |
bind(L_2TAG_PACKET_3_0_2);
|
|
338 |
addsd(xmm0, xmm4);
|
|
339 |
mulsd(xmm0, xmm3);
|
|
340 |
pextrw(ecx, xmm0, 3);
|
|
341 |
andl(ecx, 32752);
|
|
342 |
cmpl(ecx, 32752);
|
|
343 |
jcc(Assembler::aboveEqual, L_2TAG_PACKET_7_0_2);
|
|
344 |
jmp(B1_5);
|
|
345 |
|
|
346 |
bind(L_2TAG_PACKET_2_0_2);
|
|
347 |
paddd(xmm3, xmm6);
|
|
348 |
addpd(xmm0, xmm2);
|
|
349 |
mulsd(xmm0, xmm3);
|
|
350 |
movl(Address(rsp,0), 15);
|
|
351 |
jmp(L_2TAG_PACKET_6_0_2);
|
|
352 |
|
|
353 |
bind(L_2TAG_PACKET_8_0_2);
|
|
354 |
cmpl(eax, 2146435072);
|
|
355 |
jcc(Assembler::aboveEqual, L_2TAG_PACKET_9_0_2);
|
|
356 |
movl(eax, Address(rsp,12));
|
|
357 |
cmpl(eax, INT_MIN);
|
|
358 |
jcc(Assembler::aboveEqual, L_2TAG_PACKET_10_0_2);
|
|
359 |
movsd(xmm0, ExternalAddress(XMAX)); // 0xffffffffUL, 0x7fefffffUL
|
|
360 |
mulsd(xmm0, xmm0);
|
|
361 |
|
|
362 |
bind(L_2TAG_PACKET_7_0_2);
|
|
363 |
movl(Address(rsp,0), 14);
|
|
364 |
jmp(L_2TAG_PACKET_6_0_2);
|
|
365 |
|
|
366 |
bind(L_2TAG_PACKET_10_0_2);
|
|
367 |
movsd(xmm0, ExternalAddress(XMIN)); // 0x00000000UL, 0x00100000UL
|
|
368 |
mulsd(xmm0, xmm0);
|
|
369 |
movl(Address(rsp,0), 15);
|
|
370 |
jmp(L_2TAG_PACKET_6_0_2);
|
|
371 |
|
|
372 |
bind(L_2TAG_PACKET_9_0_2);
|
|
373 |
movl(edx, Address(rsp,8));
|
|
374 |
cmpl(eax, 2146435072);
|
|
375 |
jcc(Assembler::above, L_2TAG_PACKET_11_0_2);
|
|
376 |
cmpl(edx, 0);
|
|
377 |
jcc(Assembler::notEqual, L_2TAG_PACKET_11_0_2);
|
|
378 |
movl(eax, Address(rsp,12));
|
|
379 |
cmpl(eax, 2146435072);
|
|
380 |
jcc(Assembler::notEqual, L_2TAG_PACKET_12_0_2);
|
|
381 |
movsd(xmm0, ExternalAddress(INF)); // 0x00000000UL, 0x7ff00000UL
|
|
382 |
jmp(B1_5);
|
|
383 |
|
|
384 |
bind(L_2TAG_PACKET_12_0_2);
|
|
385 |
movsd(xmm0, ExternalAddress(ZERO)); // 0x00000000UL, 0x00000000UL
|
|
386 |
jmp(B1_5);
|
|
387 |
|
|
388 |
bind(L_2TAG_PACKET_11_0_2);
|
|
389 |
movsd(xmm0, Address(rsp, 8));
|
|
390 |
addsd(xmm0, xmm0);
|
|
391 |
jmp(B1_5);
|
|
392 |
|
|
393 |
bind(L_2TAG_PACKET_0_0_2);
|
|
394 |
movl(eax, Address(rsp, 12));
|
|
395 |
andl(eax, 2147483647);
|
|
396 |
cmpl(eax, 1083179008);
|
|
397 |
jcc(Assembler::aboveEqual, L_2TAG_PACKET_8_0_2);
|
|
398 |
movsd(Address(rsp, 8), xmm0);
|
|
399 |
addsd(xmm0, ExternalAddress(ONE_val)); // 0x00000000UL, 0x3ff00000UL
|
|
400 |
jmp(B1_5);
|
|
401 |
|
|
402 |
bind(L_2TAG_PACKET_6_0_2);
|
|
403 |
movq(Address(rsp, 16), xmm0);
|
|
404 |
|
|
405 |
bind(B1_3);
|
|
406 |
movq(xmm0, Address(rsp, 16));
|
|
407 |
|
|
408 |
bind(B1_5);
|
|
409 |
addq(rsp, 24);
|
|
410 |
}
|
33465
|
411 |
|
33089
|
412 |
#endif
|
|
413 |
|
|
414 |
#ifndef _LP64
|
|
415 |
|
|
416 |
// All constants for the 32-bit fast_exp in one table, addressed relative to
// 'tmp'. Byte offsets as used by the code:
//      0: 0xfff0000000000000 x2   sign/exponent mask
//     16: 0x00000000ffffffc0 x2   mask clearing the 6 table-index bits of m
//     32: 0x000000000000ffc0 x2   exponent bias term (1023 * 64)
//     48: 0x4338000000000000 x2   rounding shifter
//     64: 0x40571547652b82fe x2   K/log(2) with K = 64
//     80: 0x3f862e42fefa0000 x2   high part of log(2)/K
//     96: 0x3d1cf79abc9e3b3a x2   low part of log(2)/K
//    112..159                     polynomial coefficients
//    160..1183                    2^(j/K) table, 64 entries of 16 bytes
//   1184: 1.0    1192: +INF    1200: 0.0
//   1208: largest finite double    1216: smallest positive normal double
ALIGNED_(16) juint _static_const_table[] =
{
  0x00000000UL, 0xfff00000UL, 0x00000000UL, 0xfff00000UL, 0xffffffc0UL,
  0x00000000UL, 0xffffffc0UL, 0x00000000UL, 0x0000ffc0UL, 0x00000000UL,
  0x0000ffc0UL, 0x00000000UL, 0x00000000UL, 0x43380000UL, 0x00000000UL,
  0x43380000UL, 0x652b82feUL, 0x40571547UL, 0x652b82feUL, 0x40571547UL,
  0xfefa0000UL, 0x3f862e42UL, 0xfefa0000UL, 0x3f862e42UL, 0xbc9e3b3aUL,
  0x3d1cf79aUL, 0xbc9e3b3aUL, 0x3d1cf79aUL, 0xfffffffeUL, 0x3fdfffffUL,
  0xfffffffeUL, 0x3fdfffffUL, 0xe3289860UL, 0x3f56c15cUL, 0x555b9e25UL,
  0x3fa55555UL, 0xc090cf0fUL, 0x3f811115UL, 0x55548ba1UL, 0x3fc55555UL,
  0x00000000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL, 0x0e03754dUL,
  0x3cad7bbfUL, 0x3e778060UL, 0x00002c9aUL, 0x3567f613UL, 0x3c8cd252UL,
  0xd3158574UL, 0x000059b0UL, 0x61e6c861UL, 0x3c60f74eUL, 0x18759bc8UL,
  0x00008745UL, 0x5d837b6cUL, 0x3c979aa6UL, 0x6cf9890fUL, 0x0000b558UL,
  0x702f9cd1UL, 0x3c3ebe3dUL, 0x32d3d1a2UL, 0x0000e3ecUL, 0x1e63bcd8UL,
  0x3ca3516eUL, 0xd0125b50UL, 0x00011301UL, 0x26f0387bUL, 0x3ca4c554UL,
  0xaea92ddfUL, 0x0001429aUL, 0x62523fb6UL, 0x3ca95153UL, 0x3c7d517aUL,
  0x000172b8UL, 0x3f1353bfUL, 0x3c8b898cUL, 0xeb6fcb75UL, 0x0001a35bUL,
  0x3e3a2f5fUL, 0x3c9aecf7UL, 0x3168b9aaUL, 0x0001d487UL, 0x44a6c38dUL,
  0x3c8a6f41UL, 0x88628cd6UL, 0x0002063bUL, 0xe3a8a894UL, 0x3c968efdUL,
  0x6e756238UL, 0x0002387aUL, 0x981fe7f2UL, 0x3c80472bUL, 0x65e27cddUL,
  0x00026b45UL, 0x6d09ab31UL, 0x3c82f7e1UL, 0xf51fdee1UL, 0x00029e9dUL,
  0x720c0ab3UL, 0x3c8b3782UL, 0xa6e4030bUL, 0x0002d285UL, 0x4db0abb6UL,
  0x3c834d75UL, 0x0a31b715UL, 0x000306feUL, 0x5dd3f84aUL, 0x3c8fdd39UL,
  0xb26416ffUL, 0x00033c08UL, 0xcc187d29UL, 0x3ca12f8cUL, 0x373aa9caUL,
  0x000371a7UL, 0x738b5e8bUL, 0x3ca7d229UL, 0x34e59ff6UL, 0x0003a7dbUL,
  0xa72a4c6dUL, 0x3c859f48UL, 0x4c123422UL, 0x0003dea6UL, 0x259d9205UL,
  0x3ca8b846UL, 0x21f72e29UL, 0x0004160aUL, 0x60c2ac12UL, 0x3c4363edUL,
  0x6061892dUL, 0x00044e08UL, 0xdaa10379UL, 0x3c6ecce1UL, 0xb5c13cd0UL,
  0x000486a2UL, 0xbb7aafb0UL, 0x3c7690ceUL, 0xd5362a27UL, 0x0004bfdaUL,
  0x9b282a09UL, 0x3ca083ccUL, 0x769d2ca6UL, 0x0004f9b2UL, 0xc1aae707UL,
  0x3ca509b0UL, 0x569d4f81UL, 0x0005342bUL, 0x18fdd78eUL, 0x3c933505UL,
  0x36b527daUL, 0x00056f47UL, 0xe21c5409UL, 0x3c9063e1UL, 0xdd485429UL,
  0x0005ab07UL, 0x2b64c035UL, 0x3c9432e6UL, 0x15ad2148UL, 0x0005e76fUL,
  0x99f08c0aUL, 0x3ca01284UL, 0xb03a5584UL, 0x0006247eUL, 0x0073dc06UL,
  0x3c99f087UL, 0x82552224UL, 0x00066238UL, 0x0da05571UL, 0x3c998d4dUL,
  0x667f3bccUL, 0x0006a09eUL, 0x86ce4786UL, 0x3ca52bb9UL, 0x3c651a2eUL,
  0x0006dfb2UL, 0x206f0dabUL, 0x3ca32092UL, 0xe8ec5f73UL, 0x00071f75UL,
  0x8e17a7a6UL, 0x3ca06122UL, 0x564267c8UL, 0x00075febUL, 0x461e9f86UL,
  0x3ca244acUL, 0x73eb0186UL, 0x0007a114UL, 0xabd66c55UL, 0x3c65ebe1UL,
  0x36cf4e62UL, 0x0007e2f3UL, 0xbbff67d0UL, 0x3c96fe9fUL, 0x994cce12UL,
  0x00082589UL, 0x14c801dfUL, 0x3c951f14UL, 0x9b4492ecUL, 0x000868d9UL,
  0xc1f0eab4UL, 0x3c8db72fUL, 0x422aa0dbUL, 0x0008ace5UL, 0x59f35f44UL,
  0x3c7bf683UL, 0x99157736UL, 0x0008f1aeUL, 0x9c06283cUL, 0x3ca360baUL,
  0xb0cdc5e4UL, 0x00093737UL, 0x20f962aaUL, 0x3c95e8d1UL, 0x9fde4e4fUL,
  0x00097d82UL, 0x2b91ce27UL, 0x3c71affcUL, 0x82a3f090UL, 0x0009c491UL,
  0x589a2ebdUL, 0x3c9b6d34UL, 0x7b5de564UL, 0x000a0c66UL, 0x9ab89880UL,
  0x3c95277cUL, 0xb23e255cUL, 0x000a5503UL, 0x6e735ab3UL, 0x3c846984UL,
  0x5579fdbfUL, 0x000a9e6bUL, 0x92cb3387UL, 0x3c8c1a77UL, 0x995ad3adUL,
  0x000ae89fUL, 0xdc2d1d96UL, 0x3ca22466UL, 0xb84f15faUL, 0x000b33a2UL,
  0xb19505aeUL, 0x3ca1112eUL, 0xf2fb5e46UL, 0x000b7f76UL, 0x0a5fddcdUL,
  0x3c74ffd7UL, 0x904bc1d2UL, 0x000bcc1eUL, 0x30af0cb3UL, 0x3c736eaeUL,
  0xdd85529cUL, 0x000c199bUL, 0xd10959acUL, 0x3c84e08fUL, 0x2e57d14bUL,
  0x000c67f1UL, 0x6c921968UL, 0x3c676b2cUL, 0xdcef9069UL, 0x000cb720UL,
  0x36df99b3UL, 0x3c937009UL, 0x4a07897bUL, 0x000d072dUL, 0xa63d07a7UL,
  0x3c74a385UL, 0xdcfba487UL, 0x000d5818UL, 0xd5c192acUL, 0x3c8e5a50UL,
  0x03db3285UL, 0x000da9e6UL, 0x1c4a9792UL, 0x3c98bb73UL, 0x337b9b5eUL,
  0x000dfc97UL, 0x603a88d3UL, 0x3c74b604UL, 0xe78b3ff6UL, 0x000e502eUL,
  0x92094926UL, 0x3c916f27UL, 0xa2a490d9UL, 0x000ea4afUL, 0x41aa2008UL,
  0x3c8ec3bcUL, 0xee615a27UL, 0x000efa1bUL, 0x31d185eeUL, 0x3c8a64a9UL,
  0x5b6e4540UL, 0x000f5076UL, 0x4d91cd9dUL, 0x3c77893bUL, 0x819e90d8UL,
  0x000fa7c1UL, 0x00000000UL, 0x3ff00000UL, 0x00000000UL, 0x7ff00000UL,
  0x00000000UL, 0x00000000UL, 0xffffffffUL, 0x7fefffffUL, 0x00000000UL,
  0x00100000UL
};
|
|
481 |
|
|
482 |
//registers,
|
|
483 |
// input: (rbp + 8)
|
|
484 |
// scratch: xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
|
|
485 |
// rax, rdx, rcx, rbx (tmp)
|
|
486 |
|
|
487 |
// Code generated by Intel C compiler for LIBM library
|
|
488 |
|
|
489 |
void MacroAssembler::fast_exp(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register eax, Register ecx, Register edx, Register tmp) {
|
|
490 |
Label L_2TAG_PACKET_0_0_2, L_2TAG_PACKET_1_0_2, L_2TAG_PACKET_2_0_2, L_2TAG_PACKET_3_0_2;
|
|
491 |
Label L_2TAG_PACKET_4_0_2, L_2TAG_PACKET_5_0_2, L_2TAG_PACKET_6_0_2, L_2TAG_PACKET_7_0_2;
|
|
492 |
Label L_2TAG_PACKET_8_0_2, L_2TAG_PACKET_9_0_2, L_2TAG_PACKET_10_0_2, L_2TAG_PACKET_11_0_2;
|
|
493 |
Label L_2TAG_PACKET_12_0_2, L_2TAG_PACKET_13_0_2, B1_3, B1_5, start;
|
|
494 |
|
|
495 |
assert_different_registers(tmp, eax, ecx, edx);
|
|
496 |
jmp(start);
|
|
497 |
address static_const_table = (address)_static_const_table;
|
|
498 |
|
|
499 |
bind(start);
|
|
500 |
subl(rsp, 120);
|
|
501 |
movl(Address(rsp, 64), tmp);
|
|
502 |
lea(tmp, ExternalAddress(static_const_table));
|
|
503 |
movdqu(xmm0, Address(rsp, 128));
|
|
504 |
unpcklpd(xmm0, xmm0);
|
|
505 |
movdqu(xmm1, Address(tmp, 64)); // 0x652b82feUL, 0x40571547UL, 0x652b82feUL, 0x40571547UL
|
|
506 |
movdqu(xmm6, Address(tmp, 48)); // 0x00000000UL, 0x43380000UL, 0x00000000UL, 0x43380000UL
|
|
507 |
movdqu(xmm2, Address(tmp, 80)); // 0xfefa0000UL, 0x3f862e42UL, 0xfefa0000UL, 0x3f862e42UL
|
|
508 |
movdqu(xmm3, Address(tmp, 96)); // 0xbc9e3b3aUL, 0x3d1cf79aUL, 0xbc9e3b3aUL, 0x3d1cf79aUL
|
|
509 |
pextrw(eax, xmm0, 3);
|
|
510 |
andl(eax, 32767);
|
|
511 |
movl(edx, 16527);
|
|
512 |
subl(edx, eax);
|
|
513 |
subl(eax, 15504);
|
|
514 |
orl(edx, eax);
|
|
515 |
cmpl(edx, INT_MIN);
|
|
516 |
jcc(Assembler::aboveEqual, L_2TAG_PACKET_0_0_2);
|
|
517 |
mulpd(xmm1, xmm0);
|
|
518 |
addpd(xmm1, xmm6);
|
|
519 |
movapd(xmm7, xmm1);
|
|
520 |
subpd(xmm1, xmm6);
|
|
521 |
mulpd(xmm2, xmm1);
|
|
522 |
movdqu(xmm4, Address(tmp, 128)); // 0xe3289860UL, 0x3f56c15cUL, 0x555b9e25UL, 0x3fa55555UL
|
|
523 |
mulpd(xmm3, xmm1);
|
|
524 |
movdqu(xmm5, Address(tmp, 144)); // 0xc090cf0fUL, 0x3f811115UL, 0x55548ba1UL, 0x3fc55555UL
|
|
525 |
subpd(xmm0, xmm2);
|
|
526 |
movdl(eax, xmm7);
|
|
527 |
movl(ecx, eax);
|
|
528 |
andl(ecx, 63);
|
|
529 |
shll(ecx, 4);
|
|
530 |
sarl(eax, 6);
|
|
531 |
movl(edx, eax);
|
|
532 |
movdqu(xmm6, Address(tmp, 16)); // 0xffffffc0UL, 0x00000000UL, 0xffffffc0UL, 0x00000000UL
|
|
533 |
pand(xmm7, xmm6);
|
|
534 |
movdqu(xmm6, Address(tmp, 32)); // 0x0000ffc0UL, 0x00000000UL, 0x0000ffc0UL, 0x00000000UL
|
|
535 |
paddq(xmm7, xmm6);
|
|
536 |
psllq(xmm7, 46);
|
|
537 |
subpd(xmm0, xmm3);
|
|
538 |
movdqu(xmm2, Address(tmp, ecx, Address::times_1, 160));
|
|
539 |
mulpd(xmm4, xmm0);
|
|
540 |
movapd(xmm6, xmm0);
|
|
541 |
movapd(xmm1, xmm0);
|
|
542 |
mulpd(xmm6, xmm6);
|
|
543 |
mulpd(xmm0, xmm6);
|
|
544 |
addpd(xmm5, xmm4);
|
|
545 |
mulsd(xmm0, xmm6);
|
|
546 |
mulpd(xmm6, Address(tmp, 112)); // 0xfffffffeUL, 0x3fdfffffUL, 0xfffffffeUL, 0x3fdfffffUL
|
|
547 |
addsd(xmm1, xmm2);
|
|
548 |
unpckhpd(xmm2, xmm2);
|
|
549 |
mulpd(xmm0, xmm5);
|
|
550 |
addsd(xmm1, xmm0);
|
|
551 |
por(xmm2, xmm7);
|
|
552 |
unpckhpd(xmm0, xmm0);
|
|
553 |
addsd(xmm0, xmm1);
|
|
554 |
addsd(xmm0, xmm6);
|
|
555 |
addl(edx, 894);
|
|
556 |
cmpl(edx, 1916);
|
|
557 |
jcc (Assembler::above, L_2TAG_PACKET_1_0_2);
|
|
558 |
mulsd(xmm0, xmm2);
|
|
559 |
addsd(xmm0, xmm2);
|
|
560 |
jmp(L_2TAG_PACKET_2_0_2);
|
|
561 |
|
|
562 |
bind(L_2TAG_PACKET_1_0_2);
|
|
563 |
fnstcw(Address(rsp, 24));
|
|
564 |
movzwl(edx, Address(rsp, 24));
|
|
565 |
orl(edx, 768);
|
|
566 |
movw(Address(rsp, 28), edx);
|
|
567 |
fldcw(Address(rsp, 28));
|
|
568 |
movl(edx, eax);
|
|
569 |
sarl(eax, 1);
|
|
570 |
subl(edx, eax);
|
|
571 |
movdqu(xmm6, Address(tmp, 0)); // 0x00000000UL, 0xfff00000UL, 0x00000000UL, 0xfff00000UL
|
|
572 |
pandn(xmm6, xmm2);
|
|
573 |
addl(eax, 1023);
|
|
574 |
movdl(xmm3, eax);
|
|
575 |
psllq(xmm3, 52);
|
|
576 |
por(xmm6, xmm3);
|
|
577 |
addl(edx, 1023);
|
|
578 |
movdl(xmm4, edx);
|
|
579 |
psllq(xmm4, 52);
|
|
580 |
movsd(Address(rsp, 8), xmm0);
|
|
581 |
fld_d(Address(rsp, 8));
|
|
582 |
movsd(Address(rsp, 16), xmm6);
|
|
583 |
fld_d(Address(rsp, 16));
|
|
584 |
fmula(1);
|
|
585 |
faddp(1);
|
|
586 |
movsd(Address(rsp, 8), xmm4);
|
|
587 |
fld_d(Address(rsp, 8));
|
|
588 |
fmulp(1);
|
|
589 |
fstp_d(Address(rsp, 8));
|
|
590 |
movsd(xmm0,Address(rsp, 8));
|
|
591 |
fldcw(Address(rsp, 24));
|
|
592 |
pextrw(ecx, xmm0, 3);
|
|
593 |
andl(ecx, 32752);
|
|
594 |
cmpl(ecx, 32752);
|
|
595 |
jcc(Assembler::greaterEqual, L_2TAG_PACKET_3_0_2);
|
|
596 |
cmpl(ecx, 0);
|
|
597 |
jcc(Assembler::equal, L_2TAG_PACKET_4_0_2);
|
|
598 |
jmp(L_2TAG_PACKET_2_0_2);
|
|
599 |
cmpl(ecx, INT_MIN);
|
|
600 |
jcc(Assembler::less, L_2TAG_PACKET_3_0_2);
|
|
601 |
cmpl(ecx, -1064950997);
|
|
602 |
jcc(Assembler::less, L_2TAG_PACKET_2_0_2);
|
|
603 |
jcc(Assembler::greater, L_2TAG_PACKET_4_0_2);
|
|
604 |
movl(edx, Address(rsp, 128));
|
|
605 |
cmpl(edx ,-17155601);
|
|
606 |
jcc(Assembler::less, L_2TAG_PACKET_2_0_2);
|
|
607 |
jmp(L_2TAG_PACKET_4_0_2);
|
|
608 |
|
|
609 |
bind(L_2TAG_PACKET_3_0_2);
|
|
610 |
movl(edx, 14);
|
|
611 |
jmp(L_2TAG_PACKET_5_0_2);
|
|
612 |
|
|
613 |
bind(L_2TAG_PACKET_4_0_2);
|
|
614 |
movl(edx, 15);
|
|
615 |
|
|
616 |
bind(L_2TAG_PACKET_5_0_2);
|
|
617 |
movsd(Address(rsp, 0), xmm0);
|
|
618 |
movsd(xmm0, Address(rsp, 128));
|
|
619 |
fld_d(Address(rsp, 0));
|
|
620 |
jmp(L_2TAG_PACKET_6_0_2);
|
|
621 |
|
|
622 |
bind(L_2TAG_PACKET_7_0_2);
|
|
623 |
cmpl(eax, 2146435072);
|
|
624 |
jcc(Assembler::greaterEqual, L_2TAG_PACKET_8_0_2);
|
|
625 |
movl(eax, Address(rsp, 132));
|
|
626 |
cmpl(eax, INT_MIN);
|
|
627 |
jcc(Assembler::greaterEqual, L_2TAG_PACKET_9_0_2);
|
|
628 |
movsd(xmm0, Address(tmp, 1208)); // 0xffffffffUL, 0x7fefffffUL
|
|
629 |
mulsd(xmm0, xmm0);
|
|
630 |
movl(edx, 14);
|
|
631 |
jmp(L_2TAG_PACKET_5_0_2);
|
|
632 |
|
|
633 |
bind(L_2TAG_PACKET_9_0_2);
|
|
634 |
movsd(xmm0, Address(tmp, 1216));
|
|
635 |
mulsd(xmm0, xmm0);
|
|
636 |
movl(edx, 15);
|
|
637 |
jmp(L_2TAG_PACKET_5_0_2);
|
|
638 |
|
|
639 |
bind(L_2TAG_PACKET_8_0_2);
|
|
640 |
movl(edx, Address(rsp, 128));
|
|
641 |
cmpl(eax, 2146435072);
|
|
642 |
jcc(Assembler::above, L_2TAG_PACKET_10_0_2);
|
|
643 |
cmpl(edx, 0);
|
|
644 |
jcc(Assembler::notEqual, L_2TAG_PACKET_10_0_2);
|
|
645 |
movl(eax, Address(rsp, 132));
|
|
646 |
cmpl(eax, 2146435072);
|
|
647 |
jcc(Assembler::notEqual, L_2TAG_PACKET_11_0_2);
|
|
648 |
movsd(xmm0, Address(tmp, 1192)); // 0x00000000UL, 0x7ff00000UL
|
|
649 |
jmp(L_2TAG_PACKET_2_0_2);
|
|
650 |
|
|
651 |
bind(L_2TAG_PACKET_11_0_2);
|
|
652 |
movsd(xmm0, Address(tmp, 1200)); // 0x00000000UL, 0x00000000UL
|
|
653 |
jmp(L_2TAG_PACKET_2_0_2);
|
|
654 |
|
|
655 |
bind(L_2TAG_PACKET_10_0_2);
|
|
656 |
movsd(xmm0, Address(rsp, 128));
|
|
657 |
addsd(xmm0, xmm0);
|
|
658 |
jmp(L_2TAG_PACKET_2_0_2);
|
|
659 |
|
|
660 |
bind(L_2TAG_PACKET_0_0_2);
|
|
661 |
movl(eax, Address(rsp, 132));
|
|
662 |
andl(eax, 2147483647);
|
|
663 |
cmpl(eax, 1083179008);
|
|
664 |
jcc(Assembler::aboveEqual, L_2TAG_PACKET_7_0_2);
|
|
665 |
movsd(xmm0, Address(rsp, 128));
|
|
666 |
addsd(xmm0, Address(tmp, 1184)); // 0x00000000UL, 0x3ff00000UL
|
|
667 |
jmp(L_2TAG_PACKET_2_0_2);
|
|
668 |
|
|
669 |
bind(L_2TAG_PACKET_2_0_2);
|
|
670 |
movsd(Address(rsp, 48), xmm0);
|
|
671 |
fld_d(Address(rsp, 48));
|
|
672 |
|
|
673 |
bind(L_2TAG_PACKET_6_0_2);
|
|
674 |
movl(tmp, Address(rsp, 64));
|
|
675 |
}
|
|
676 |
|
|
677 |
#endif
|
33465
|
678 |
|
|
679 |
/******************************************************************************/
|
|
680 |
// ALGORITHM DESCRIPTION - LOG()
|
|
681 |
// ---------------------
|
|
682 |
//
|
|
683 |
// x=2^k * mx, mx in [1,2)
|
|
684 |
//
|
|
685 |
// Get B~1/mx based on the output of rcpss instruction (B0)
|
|
686 |
// B = int((B0*2^7+0.5))/2^7
|
|
687 |
//
|
|
688 |
// Reduced argument: r=B*mx-1.0 (computed accurately in high and low parts)
|
|
689 |
//
|
|
690 |
// Result: k*log(2) - log(B) + p(r) if |x-1| >= small value (2^-6) and
|
|
691 |
// p(r) is a degree 7 polynomial
|
|
692 |
// -log(B) read from data table (high, low parts)
|
|
693 |
// Result is formed from high and low parts
|
|
694 |
//
|
|
695 |
// Special cases:
|
|
696 |
// log(NaN) = quiet NaN, and raise invalid exception
|
|
697 |
// log(+INF) = +INF
|
|
698 |
// log(0) = -INF with divide-by-zero exception raised
|
|
699 |
// log(1) = +0
|
|
700 |
// log(x) = NaN with invalid exception raised if x < -0, including -INF
|
|
701 |
//
|
|
702 |
/******************************************************************************/
|
|
703 |
|
|
704 |
#ifdef _LP64
|
|
705 |
|
|
706 |
// Lookup table for the 64-bit fast_log() below.
// Layout: 516 32-bit words = 129 entries of 16 bytes. Each entry is a pair of
// doubles, each double written as two 32-bit words with the low word first.
// Per the algorithm description above, an entry holds the (high, low) parts
// of -log(B). fast_log() loads one whole entry with movdqu, using a byte
// offset that is a multiple of 16 ((edx & 0xff0000) >> 12).
// The first entry's high part (0x3fe62e42fefa3800) is the leading bits of
// log(2); the last entry is the pair (+0.0, -0.0).
ALIGNED_(16) juint _L_tbl[] =
|
|
707 |
{
|
|
708 |
0xfefa3800UL, 0x3fe62e42UL, 0x93c76730UL, 0x3d2ef357UL, 0xaa241800UL,
|
|
709 |
0x3fe5ee82UL, 0x0cda46beUL, 0x3d220238UL, 0x5c364800UL, 0x3fe5af40UL,
|
|
710 |
0xac10c9fbUL, 0x3d2dfa63UL, 0x26bb8c00UL, 0x3fe5707aUL, 0xff3303ddUL,
|
|
711 |
0x3d09980bUL, 0x26867800UL, 0x3fe5322eUL, 0x5d257531UL, 0x3d05ccc4UL,
|
|
712 |
0x835a5000UL, 0x3fe4f45aUL, 0x6d93b8fbUL, 0xbd2e6c51UL, 0x6f970c00UL,
|
|
713 |
0x3fe4b6fdUL, 0xed4c541cUL, 0x3cef7115UL, 0x27e8a400UL, 0x3fe47a15UL,
|
|
714 |
0xf94d60aaUL, 0xbd22cb6aUL, 0xf2f92400UL, 0x3fe43d9fUL, 0x481051f7UL,
|
|
715 |
0xbcfd984fUL, 0x2125cc00UL, 0x3fe4019cUL, 0x30f0c74cUL, 0xbd26ce79UL,
|
|
716 |
0x0c36c000UL, 0x3fe3c608UL, 0x7cfe13c2UL, 0xbd02b736UL, 0x17197800UL,
|
|
717 |
0x3fe38ae2UL, 0xbb5569a4UL, 0xbd218b7aUL, 0xad9d8c00UL, 0x3fe35028UL,
|
|
718 |
0x9527e6acUL, 0x3d10b83fUL, 0x44340800UL, 0x3fe315daUL, 0xc5a0ed9cUL,
|
|
719 |
0xbd274e93UL, 0x57b0e000UL, 0x3fe2dbf5UL, 0x07b9dc11UL, 0xbd17a6e5UL,
|
|
720 |
0x6d0ec000UL, 0x3fe2a278UL, 0xe797882dUL, 0x3d206d2bUL, 0x1134dc00UL,
|
|
721 |
0x3fe26962UL, 0x05226250UL, 0xbd0b61f1UL, 0xd8bebc00UL, 0x3fe230b0UL,
|
|
722 |
0x6e48667bUL, 0x3d12fc06UL, 0x5fc61800UL, 0x3fe1f863UL, 0xc9fe81d3UL,
|
|
723 |
0xbd2a7242UL, 0x49ae6000UL, 0x3fe1c078UL, 0xed70e667UL, 0x3cccacdeUL,
|
|
724 |
0x40f23c00UL, 0x3fe188eeUL, 0xf8ab4650UL, 0x3d14cc4eUL, 0xf6f29800UL,
|
|
725 |
0x3fe151c3UL, 0xa293ae49UL, 0xbd2edd97UL, 0x23c75c00UL, 0x3fe11af8UL,
|
|
726 |
0xbb9ddcb2UL, 0xbd258647UL, 0x8611cc00UL, 0x3fe0e489UL, 0x07801742UL,
|
|
727 |
0x3d1c2998UL, 0xe2d05400UL, 0x3fe0ae76UL, 0x887e7e27UL, 0x3d1f486bUL,
|
|
728 |
0x0533c400UL, 0x3fe078bfUL, 0x41edf5fdUL, 0x3d268122UL, 0xbe760400UL,
|
|
729 |
0x3fe04360UL, 0xe79539e0UL, 0xbd04c45fUL, 0xe5b20800UL, 0x3fe00e5aUL,
|
|
730 |
0xb1727b1cUL, 0xbd053ba3UL, 0xaf7a4800UL, 0x3fdfb358UL, 0x3c164935UL,
|
|
731 |
0x3d0085faUL, 0xee031800UL, 0x3fdf4aa7UL, 0x6f014a8bUL, 0x3d12cde5UL,
|
|
732 |
0x56b41000UL, 0x3fdee2a1UL, 0x5a470251UL, 0x3d2f27f4UL, 0xc3ddb000UL,
|
|
733 |
0x3fde7b42UL, 0x5372bd08UL, 0xbd246550UL, 0x1a272800UL, 0x3fde148aUL,
|
|
734 |
0x07322938UL, 0xbd1326b2UL, 0x484c9800UL, 0x3fddae75UL, 0x60dc616aUL,
|
|
735 |
0xbd1ea42dUL, 0x46def800UL, 0x3fdd4902UL, 0xe9a767a8UL, 0x3d235bafUL,
|
|
736 |
0x18064800UL, 0x3fdce42fUL, 0x3ec7a6b0UL, 0xbd0797c3UL, 0xc7455800UL,
|
|
737 |
0x3fdc7ff9UL, 0xc15249aeUL, 0xbd29b6ddUL, 0x693fa000UL, 0x3fdc1c60UL,
|
|
738 |
0x7fe8e180UL, 0x3d2cec80UL, 0x1b80e000UL, 0x3fdbb961UL, 0xf40a666dUL,
|
|
739 |
0x3d27d85bUL, 0x04462800UL, 0x3fdb56faUL, 0x2d841995UL, 0x3d109525UL,
|
|
740 |
0x5248d000UL, 0x3fdaf529UL, 0x52774458UL, 0xbd217cc5UL, 0x3c8ad800UL,
|
|
741 |
0x3fda93edUL, 0xbea77a5dUL, 0x3d1e36f2UL, 0x0224f800UL, 0x3fda3344UL,
|
|
742 |
0x7f9d79f5UL, 0x3d23c645UL, 0xea15f000UL, 0x3fd9d32bUL, 0x10d0c0b0UL,
|
|
743 |
0xbd26279eUL, 0x43135800UL, 0x3fd973a3UL, 0xa502d9f0UL, 0xbd152313UL,
|
|
744 |
0x635bf800UL, 0x3fd914a8UL, 0x2ee6307dUL, 0xbd1766b5UL, 0xa88b3000UL,
|
|
745 |
0x3fd8b639UL, 0xe5e70470UL, 0xbd205ae1UL, 0x776dc800UL, 0x3fd85855UL,
|
|
746 |
0x3333778aUL, 0x3d2fd56fUL, 0x3bd81800UL, 0x3fd7fafaUL, 0xc812566aUL,
|
|
747 |
0xbd272090UL, 0x687cf800UL, 0x3fd79e26UL, 0x2efd1778UL, 0x3d29ec7dUL,
|
|
748 |
0x76c67800UL, 0x3fd741d8UL, 0x49dc60b3UL, 0x3d2d8b09UL, 0xe6af1800UL,
|
|
749 |
0x3fd6e60eUL, 0x7c222d87UL, 0x3d172165UL, 0x3e9c6800UL, 0x3fd68ac8UL,
|
|
750 |
0x2756eba0UL, 0x3d20a0d3UL, 0x0b3ab000UL, 0x3fd63003UL, 0xe731ae00UL,
|
|
751 |
0xbd2db623UL, 0xdf596000UL, 0x3fd5d5bdUL, 0x08a465dcUL, 0xbd0a0b2aUL,
|
|
752 |
0x53c8d000UL, 0x3fd57bf7UL, 0xee5d40efUL, 0x3d1fadedUL, 0x0738a000UL,
|
|
753 |
0x3fd522aeUL, 0x8164c759UL, 0x3d2ebe70UL, 0x9e173000UL, 0x3fd4c9e0UL,
|
|
754 |
0x1b0ad8a4UL, 0xbd2e2089UL, 0xc271c800UL, 0x3fd4718dUL, 0x0967d675UL,
|
|
755 |
0xbd2f27ceUL, 0x23d5e800UL, 0x3fd419b4UL, 0xec90e09dUL, 0x3d08e436UL,
|
|
756 |
0x77333000UL, 0x3fd3c252UL, 0xb606bd5cUL, 0x3d183b54UL, 0x76be1000UL,
|
|
757 |
0x3fd36b67UL, 0xb0f177c8UL, 0x3d116ecdUL, 0xe1d36000UL, 0x3fd314f1UL,
|
|
758 |
0xd3213cb8UL, 0xbd28e27aUL, 0x7cdc9000UL, 0x3fd2bef0UL, 0x4a5004f4UL,
|
|
759 |
0x3d2a9cfaUL, 0x1134d800UL, 0x3fd26962UL, 0xdf5bb3b6UL, 0x3d2c93c1UL,
|
|
760 |
0x6d0eb800UL, 0x3fd21445UL, 0xba46baeaUL, 0x3d0a87deUL, 0x635a6800UL,
|
|
761 |
0x3fd1bf99UL, 0x5147bdb7UL, 0x3d2ca6edUL, 0xcbacf800UL, 0x3fd16b5cUL,
|
|
762 |
0xf7a51681UL, 0x3d2b9acdUL, 0x8227e800UL, 0x3fd1178eUL, 0x63a5f01cUL,
|
|
763 |
0xbd2c210eUL, 0x67616000UL, 0x3fd0c42dUL, 0x163ceae9UL, 0x3d27188bUL,
|
|
764 |
0x604d5800UL, 0x3fd07138UL, 0x16ed4e91UL, 0x3cf89cdbUL, 0x5626c800UL,
|
|
765 |
0x3fd01eaeUL, 0x1485e94aUL, 0xbd16f08cUL, 0x6cb3b000UL, 0x3fcf991cUL,
|
|
766 |
0xca0cdf30UL, 0x3d1bcbecUL, 0xe4dd0000UL, 0x3fcef5adUL, 0x65bb8e11UL,
|
|
767 |
0xbcca2115UL, 0xffe71000UL, 0x3fce530eUL, 0x6041f430UL, 0x3cc21227UL,
|
|
768 |
0xb0d49000UL, 0x3fcdb13dUL, 0xf715b035UL, 0xbd2aff2aUL, 0xf2656000UL,
|
|
769 |
0x3fcd1037UL, 0x75b6f6e4UL, 0xbd084a7eUL, 0xc6f01000UL, 0x3fcc6ffbUL,
|
|
770 |
0xc5962bd2UL, 0xbcf1ec72UL, 0x383be000UL, 0x3fcbd087UL, 0x595412b6UL,
|
|
771 |
0xbd2d4bc4UL, 0x575bd000UL, 0x3fcb31d8UL, 0x4eace1aaUL, 0xbd0c358dUL,
|
|
772 |
0x3c8ae000UL, 0x3fca93edUL, 0x50562169UL, 0xbd287243UL, 0x07089000UL,
|
|
773 |
0x3fc9f6c4UL, 0x6865817aUL, 0x3d29904dUL, 0xdcf70000UL, 0x3fc95a5aUL,
|
|
774 |
0x58a0ff6fUL, 0x3d07f228UL, 0xeb390000UL, 0x3fc8beafUL, 0xaae92cd1UL,
|
|
775 |
0xbd073d54UL, 0x6551a000UL, 0x3fc823c1UL, 0x9a631e83UL, 0x3d1e0ddbUL,
|
|
776 |
0x85445000UL, 0x3fc7898dUL, 0x70914305UL, 0xbd1c6610UL, 0x8b757000UL,
|
|
777 |
0x3fc6f012UL, 0xe59c21e1UL, 0xbd25118dUL, 0xbe8c1000UL, 0x3fc6574eUL,
|
|
778 |
0x2c3c2e78UL, 0x3d19cf8bUL, 0x6b544000UL, 0x3fc5bf40UL, 0xeb68981cUL,
|
|
779 |
0xbd127023UL, 0xe4a1b000UL, 0x3fc527e5UL, 0xe5697dc7UL, 0x3d2633e8UL,
|
|
780 |
0x8333b000UL, 0x3fc4913dUL, 0x54fdb678UL, 0x3d258379UL, 0xa5993000UL,
|
|
781 |
0x3fc3fb45UL, 0x7e6a354dUL, 0xbd2cd1d8UL, 0xb0159000UL, 0x3fc365fcUL,
|
|
782 |
0x234b7289UL, 0x3cc62fa8UL, 0x0c868000UL, 0x3fc2d161UL, 0xcb81b4a1UL,
|
|
783 |
0x3d039d6cUL, 0x2a49c000UL, 0x3fc23d71UL, 0x8fd3df5cUL, 0x3d100d23UL,
|
|
784 |
0x7e23f000UL, 0x3fc1aa2bUL, 0x44389934UL, 0x3d2ca78eUL, 0x8227e000UL,
|
|
785 |
0x3fc1178eUL, 0xce2d07f2UL, 0x3d21ef78UL, 0xb59e4000UL, 0x3fc08598UL,
|
|
786 |
0x7009902cUL, 0xbd27e5ddUL, 0x39dbe000UL, 0x3fbfe891UL, 0x4fa10afdUL,
|
|
787 |
0xbd2534d6UL, 0x830a2000UL, 0x3fbec739UL, 0xafe645e0UL, 0xbd2dc068UL,
|
|
788 |
0x63844000UL, 0x3fbda727UL, 0x1fa71733UL, 0x3d1a8940UL, 0x01bc4000UL,
|
|
789 |
0x3fbc8858UL, 0xc65aacd3UL, 0x3d2646d1UL, 0x8dad6000UL, 0x3fbb6ac8UL,
|
|
790 |
0x2bf768e5UL, 0xbd139080UL, 0x40b1c000UL, 0x3fba4e76UL, 0xb94407c8UL,
|
|
791 |
0xbd0e42b6UL, 0x5d594000UL, 0x3fb9335eUL, 0x3abd47daUL, 0x3d23115cUL,
|
|
792 |
0x2f40e000UL, 0x3fb8197eUL, 0xf96ffdf7UL, 0x3d0f80dcUL, 0x0aeac000UL,
|
|
793 |
0x3fb700d3UL, 0xa99ded32UL, 0x3cec1e8dUL, 0x4d97a000UL, 0x3fb5e95aUL,
|
|
794 |
0x3c5d1d1eUL, 0xbd2c6906UL, 0x5d208000UL, 0x3fb4d311UL, 0x82f4e1efUL,
|
|
795 |
0xbcf53a25UL, 0xa7d1e000UL, 0x3fb3bdf5UL, 0xa5db4ed7UL, 0x3d2cc85eUL,
|
|
796 |
0xa4472000UL, 0x3fb2aa04UL, 0xae9c697dUL, 0xbd20b6e8UL, 0xd1466000UL,
|
|
797 |
0x3fb1973bUL, 0x560d9e9bUL, 0xbd25325dUL, 0xb59e4000UL, 0x3fb08598UL,
|
|
798 |
0x7009902cUL, 0xbd17e5ddUL, 0xc006c000UL, 0x3faeea31UL, 0x4fc93b7bUL,
|
|
799 |
0xbd0e113eUL, 0xcdddc000UL, 0x3faccb73UL, 0x47d82807UL, 0xbd1a68f2UL,
|
|
800 |
0xd0fb0000UL, 0x3faaaef2UL, 0x353bb42eUL, 0x3d20fc1aUL, 0x149fc000UL,
|
|
801 |
0x3fa894aaUL, 0xd05a267dUL, 0xbd197995UL, 0xf2d4c000UL, 0x3fa67c94UL,
|
|
802 |
0xec19afa2UL, 0xbd029efbUL, 0xd42e0000UL, 0x3fa466aeUL, 0x75bdfd28UL,
|
|
803 |
0xbd2c1673UL, 0x2f8d0000UL, 0x3fa252f3UL, 0xe021b67bUL, 0x3d283e9aUL,
|
|
804 |
0x89e74000UL, 0x3fa0415dUL, 0x5cf1d753UL, 0x3d0111c0UL, 0xec148000UL,
|
|
805 |
0x3f9c63d2UL, 0x3f9eb2f3UL, 0x3d2578c6UL, 0x28c90000UL, 0x3f984925UL,
|
|
806 |
0x325a0c34UL, 0xbd2aa0baUL, 0x25980000UL, 0x3f9432a9UL, 0x928637feUL,
|
|
807 |
0x3d098139UL, 0x58938000UL, 0x3f902056UL, 0x06e2f7d2UL, 0xbd23dc5bUL,
|
|
808 |
0xa3890000UL, 0x3f882448UL, 0xda74f640UL, 0xbd275577UL, 0x75890000UL,
|
|
809 |
0x3f801015UL, 0x999d2be8UL, 0xbd10c76bUL, 0x59580000UL, 0x3f700805UL,
|
|
810 |
0xcb31c67bUL, 0x3d2166afUL, 0x00000000UL, 0x00000000UL, 0x00000000UL,
|
|
811 |
0x80000000UL
|
|
812 |
};
|
|
813 |
|
|
814 |
// Two doubles (each written low 32-bit word first) used by the 64-bit
// fast_log() for the exponent term:
//   offset 0: 0x3fa62e42fefa3800  high part
//   offset 8: 0x3ceef35793c76730  low part
// The exponent fields are 4 lower than those of the first _L_tbl entry
// (0x3fe62e42fefa3800 / 0x3d2ef35793c76730), i.e. these are log(2)/16 in
// high/low form. fast_log() multiplies them by an integer that is a multiple
// of 16 (exponent bits masked with 0x7ff0, minus a bias that is a multiple of 16).
ALIGNED_(16) juint _log2[] =
|
|
815 |
{
|
|
816 |
0xfefa3800UL, 0x3fa62e42UL, 0x93c76730UL, 0x3ceef357UL
|
|
817 |
};
|
|
818 |
|
|
819 |
// Six doubles (each written low 32-bit word first) holding the polynomial
// coefficients used by the 64-bit fast_log(); they are loaded as three
// 16-byte pairs at byte offsets 0, 16 and 32:
//   offset  0: 0x3fc2492492492492 (~ 1/7)    offset  8: 0xbfd0000000000000 (-1/4)
//   offset 16: 0xbfc5555e3d6fb175 (~ -1/6)   offset 24: 0x3fd5555555555555 (~ 1/3)
//   offset 32: 0x3fc999999999999a (~ 1/5)    offset 40: 0xbfe0000000000000 (-1/2)
ALIGNED_(16) juint _coeff[] =
|
|
820 |
{
|
|
821 |
0x92492492UL, 0x3fc24924UL, 0x00000000UL, 0xbfd00000UL, 0x3d6fb175UL,
|
|
822 |
0xbfc5555eUL, 0x55555555UL, 0x3fd55555UL, 0x9999999aUL, 0x3fc99999UL,
|
|
823 |
0x00000000UL, 0xbfe00000UL
|
|
824 |
};
|
|
825 |
|
|
826 |
//registers,
|
|
827 |
// input: xmm0
|
|
828 |
// scratch: xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
|
|
829 |
// rax, rdx, rcx, r8, r11
|
|
830 |
|
|
831 |
// 64-bit variant: emits the instruction sequence that computes log(x) for the
// double passed in xmm0 and leaves the result in xmm0.
//   xmm1..xmm7     scratch XMM registers
//   eax, ecx, edx  scratch general registers. The body also names rax, rcx and
//                  rdx directly, so these parameters are presumably expected to
//                  be rax/rcx/rdx - confirm against callers.
//   tmp1           scratch, receives the 0xffffe00000000000 mask
//   tmp2           scratch, receives the address of _L_tbl
// The sequence reserves 24 bytes of stack on entry and releases them at B1_5:
//   [rsp+0] copy of x, [rsp+8] result on the special-value paths,
//   [rsp+16] an integer code (2 or 3) that is stored but not read back here.
void MacroAssembler::fast_log(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register eax, Register ecx, Register edx, Register tmp1, Register tmp2) {
|
|
832 |
Label L_2TAG_PACKET_0_0_2, L_2TAG_PACKET_1_0_2, L_2TAG_PACKET_2_0_2, L_2TAG_PACKET_3_0_2;
|
|
833 |
Label L_2TAG_PACKET_4_0_2, L_2TAG_PACKET_5_0_2, L_2TAG_PACKET_6_0_2, L_2TAG_PACKET_7_0_2;
|
|
834 |
Label L_2TAG_PACKET_8_0_2;
|
|
835 |
// L_2TAG_PACKET_12_0_2 and L_2TAG_PACKET_13_0_2 are never bound or used below;
// B1_3 is bound but is never a jump target.
Label L_2TAG_PACKET_12_0_2, L_2TAG_PACKET_13_0_2, B1_3, B1_5, start;
|
|
836 |
|
|
837 |
assert_different_registers(tmp1, tmp2, eax, ecx, edx);
|
|
838 |
// The statements between this jmp and bind(start) are C++ local declarations
// and emit no instructions, so the emitted jump targets the next instruction.
jmp(start);
|
|
839 |
address L_tbl = (address)_L_tbl;
|
|
840 |
address log2 = (address)_log2;
|
|
841 |
address coeff = (address)_coeff;
|
|
842 |
|
|
843 |
bind(start);
|
|
844 |
subq(rsp, 24);
|
|
845 |
// keep a copy of x for the special-value paths
movsd(Address(rsp, 0), xmm0);
|
|
846 |
// xmm2 = 1.0 (bit pattern 0x3ff0000000000000)
mov64(rax, 0x3ff0000000000000);
|
|
847 |
movdq(xmm2, rax);
|
|
848 |
// xmm3 = exponent pattern that is OR-ed onto the isolated mantissa of x below
mov64(rdx, 0x77f0000000000000);
|
|
849 |
movdq(xmm3, rdx);
|
|
850 |
// xmm4 = 0x8000, added to the reciprocal estimate before its bits are extracted
movl(ecx, 32768);
|
|
851 |
movdl(xmm4, rcx);
|
|
852 |
// xmm5 = mask keeping the upper 19 bits of a 64-bit lane
mov64(tmp1, 0xffffe00000000000);
|
|
853 |
movdq(xmm5, tmp1);
|
|
854 |
movdqu(xmm1, xmm0);
|
|
855 |
// eax = bits 48..63 of x: sign, 11 exponent bits, top 4 mantissa bits
pextrw(eax, xmm0, 3);
|
|
856 |
por(xmm0, xmm2);
|
|
857 |
// ecx = bias subtracted from the masked exponent bits on the main path
movl(ecx, 16352);
|
|
858 |
psrlq(xmm0, 27);
|
|
859 |
lea(tmp2, ExternalAddress(L_tbl));
|
|
860 |
psrld(xmm0, 2);
|
|
861 |
// reciprocal estimate (B0 in the algorithm description)
rcpps(xmm0, xmm0);
|
|
862 |
// shift left then right by 12: clears sign and exponent, xmm1 = mantissa bits of x
psllq(xmm1, 12);
|
|
863 |
// 228 (0xE4) is the identity shuffle: xmm6 = copy of the mask in xmm5
pshufd(xmm6, xmm5, 228);
|
|
864 |
psrlq(xmm1, 12);
|
|
865 |
// Unsigned range check on the high word: the branch is taken when eax < 16
// (exponent field is zero) or eax >= 0x7ff0 (exponent all ones, or sign bit set).
subl(eax, 16);
|
|
866 |
cmpl(eax, 32736);
|
|
867 |
jcc(Assembler::aboveEqual, L_2TAG_PACKET_0_0_2);
|
|
868 |
|
|
869 |
// Main path. Also entered from L_2TAG_PACKET_3_0_2 after a denormal input has
// been rescaled (with a different bias in ecx).
bind(L_2TAG_PACKET_1_0_2);
|
|
870 |
paddd(xmm0, xmm4);
|
|
871 |
por(xmm1, xmm3);
|
|
872 |
// edx = low 32 bits of the adjusted reciprocal; used below to index _L_tbl
movdl(edx, xmm0);
|
|
873 |
psllq(xmm0, 29);
|
|
874 |
// split xmm1 into a masked high part (xmm5) and the remainder (xmm1)
pand(xmm5, xmm1);
|
|
875 |
pand(xmm0, xmm6);
|
|
876 |
subsd(xmm1, xmm5);
|
|
877 |
mulpd(xmm5, xmm0);
|
|
878 |
// eax = (exponent bits of the high word) - bias; always a multiple of 16
andl(eax, 32752);
|
|
879 |
subl(eax, ecx);
|
|
880 |
cvtsi2sdl(xmm7, eax);
|
|
881 |
mulsd(xmm1, xmm0);
|
|
882 |
movq(xmm6, ExternalAddress(log2)); // 0xfefa3800UL, 0x3fa62e42UL
|
|
883 |
movdqu(xmm3, ExternalAddress(coeff)); // 0x92492492UL, 0x3fc24924UL, 0x00000000UL, 0xbfd00000UL
|
|
884 |
// high part of the product minus 1.0 (the algorithm's r = B*mx - 1.0 is formed in two parts)
subsd(xmm5, xmm2);
|
|
885 |
// table byte offset = bits 16..23 of edx, times 16
andl(edx, 16711680);
|
|
886 |
shrl(edx, 12);
|
|
887 |
// xmm0 = one 16-byte table entry (two doubles)
movdqu(xmm0, Address(tmp2, edx));
|
|
888 |
movdqu(xmm4, ExternalAddress(16 + coeff)); // 0x3d6fb175UL, 0xbfc5555eUL, 0x55555555UL, 0x3fd55555UL
|
|
889 |
// xmm1 = sum of the two parts of the reduced argument
addsd(xmm1, xmm5);
|
|
890 |
movdqu(xmm2, ExternalAddress(32 + coeff)); // 0x9999999aUL, 0x3fc99999UL, 0x00000000UL, 0xbfe00000UL
|
|
891 |
// xmm6 = exponent term * first _log2 constant
mulsd(xmm6, xmm7);
|
|
892 |
// xmm5 = reduced argument duplicated into both lanes for the packed polynomial steps
movddup(xmm5, xmm1);
|
|
893 |
// xmm7 = exponent term * second _log2 constant
mulsd(xmm7, ExternalAddress(8 + log2)); // 0x93c76730UL, 0x3ceef357UL
|
|
894 |
mulsd(xmm3, xmm1);
|
|
895 |
// low lane: first double of the table entry + exponent contribution
addsd(xmm0, xmm6);
|
|
896 |
mulpd(xmm4, xmm5);
|
|
897 |
mulpd(xmm5, xmm5);
|
|
898 |
movddup(xmm6, xmm0);
|
|
899 |
addsd(xmm0, xmm1);
|
|
900 |
addpd(xmm4, xmm2);
|
|
901 |
mulpd(xmm3, xmm5);
|
|
902 |
// xmm6 = (sum before adding xmm1) - (sum after); used to recover the rounding error of that add
subsd(xmm6, xmm0);
|
|
903 |
mulsd(xmm4, xmm1);
|
|
904 |
// 238 (0xEE) selects the upper qword: xmm2 low lane = second double of the table entry
pshufd(xmm2, xmm0, 238);
|
|
905 |
addsd(xmm1, xmm6);
|
|
906 |
mulsd(xmm5, xmm5);
|
|
907 |
addsd(xmm7, xmm2);
|
|
908 |
addpd(xmm4, xmm3);
|
|
909 |
addsd(xmm1, xmm7);
|
|
910 |
mulpd(xmm4, xmm5);
|
|
911 |
addsd(xmm1, xmm4);
|
|
912 |
// fold the upper lane of the packed polynomial result into the scalar sum
pshufd(xmm5, xmm4, 238);
|
|
913 |
addsd(xmm1, xmm5);
|
|
914 |
// final result = high sum + accumulated low-order terms
addsd(xmm0, xmm1);
|
|
915 |
jmp(B1_5);
|
|
916 |
|
|
917 |
// Special inputs: exponent field zero, exponent all ones, or sign bit set.
bind(L_2TAG_PACKET_0_0_2);
|
|
918 |
movq(xmm0, Address(rsp, 0));
|
|
919 |
movq(xmm1, Address(rsp, 0));
|
|
920 |
// undo the subl(eax, 16) done before the range check
addl(eax, 16);
|
|
921 |
// sign bit set -> negative-input handling
cmpl(eax, 32768);
|
|
922 |
jcc(Assembler::aboveEqual, L_2TAG_PACKET_2_0_2);
|
|
923 |
// exponent field zero -> zero/denormal handling
cmpl(eax, 16);
|
|
924 |
jcc(Assembler::below, L_2TAG_PACKET_3_0_2);
|
|
925 |
|
|
926 |
// Remaining case on fall-through: exponent all ones with sign clear (+Inf or NaN).
// Also the target for NaNs with the sign bit set. Result is x + x.
bind(L_2TAG_PACKET_4_0_2);
|
|
927 |
addsd(xmm0, xmm0);
|
|
928 |
jmp(B1_5);
|
|
929 |
|
|
930 |
// Reached only via the jcc in L_2TAG_PACKET_2_0_2, so the first branch below
// tests the flags of cmpl(ecx, -2097152): above means a non-zero mantissa in
// the high word (NaN). Otherwise the low word decides: non-zero -> NaN,
// zero -> -Inf, which is handled as an invalid input.
bind(L_2TAG_PACKET_5_0_2);
|
|
931 |
jcc(Assembler::above, L_2TAG_PACKET_4_0_2);
|
|
932 |
cmpl(edx, 0);
|
|
933 |
jcc(Assembler::above, L_2TAG_PACKET_4_0_2);
|
|
934 |
jmp(L_2TAG_PACKET_6_0_2);
|
|
935 |
|
|
936 |
// Exponent field zero with sign clear: +0 or a positive denormal.
bind(L_2TAG_PACKET_3_0_2);
|
|
937 |
xorpd(xmm1, xmm1);
|
|
938 |
addsd(xmm1, xmm0);
|
|
939 |
// edx = low word | high word; zero means the value is 0
movdl(edx, xmm1);
|
|
940 |
psrlq(xmm1, 32);
|
|
941 |
movdl(ecx, xmm1);
|
|
942 |
orl(edx, ecx);
|
|
943 |
cmpl(edx, 0);
|
|
944 |
jcc(Assembler::equal, L_2TAG_PACKET_7_0_2);
|
|
945 |
// Denormal: multiply by the double whose high word is 0x47f0 (2^128), then
// repeat the setup done at function entry and rejoin the main path with the
// bias in ecx changed to 18416.
xorpd(xmm1, xmm1);
|
|
946 |
movl(eax, 18416);
|
|
947 |
pinsrw(xmm1, eax, 3);
|
|
948 |
mulsd(xmm0, xmm1);
|
|
949 |
movdqu(xmm1, xmm0);
|
|
950 |
pextrw(eax, xmm0, 3);
|
|
951 |
por(xmm0, xmm2);
|
|
952 |
psrlq(xmm0, 27);
|
|
953 |
movl(ecx, 18416);
|
|
954 |
psrld(xmm0, 2);
|
|
955 |
rcpps(xmm0, xmm0);
|
|
956 |
psllq(xmm1, 12);
|
|
957 |
pshufd(xmm6, xmm5, 228);
|
|
958 |
psrlq(xmm1, 12);
|
|
959 |
jmp(L_2TAG_PACKET_1_0_2);
|
|
960 |
|
|
961 |
// Sign bit set.
bind(L_2TAG_PACKET_2_0_2);
|
|
962 |
movdl(edx, xmm1);
|
|
963 |
psrlq(xmm1, 32);
|
|
964 |
movdl(ecx, xmm1);
|
|
965 |
// doubling the high word shifts the sign bit out
addl(ecx, ecx);
|
|
966 |
// -2097152 = 0xffe00000 = (0x7ff00000 << 1): exponent all ones -> Inf/NaN
cmpl(ecx, -2097152);
|
|
967 |
jcc(Assembler::aboveEqual, L_2TAG_PACKET_5_0_2);
|
|
968 |
// all remaining bits zero -> the input is -0, handled like 0
orl(edx, ecx);
|
|
969 |
cmpl(edx, 0);
|
|
970 |
jcc(Assembler::equal, L_2TAG_PACKET_7_0_2);
|
|
971 |
|
|
972 |
// Negative non-zero input: form 0 * Inf (high word 0x7ff0), which yields NaN.
bind(L_2TAG_PACKET_6_0_2);
|
|
973 |
xorpd(xmm1, xmm1);
|
|
974 |
xorpd(xmm0, xmm0);
|
|
975 |
movl(eax, 32752);
|
|
976 |
pinsrw(xmm1, eax, 3);
|
|
977 |
mulsd(xmm0, xmm1);
|
|
978 |
// code 3 is stored but not read back in this function
movl(Address(rsp, 16), 3);
|
|
979 |
jmp(L_2TAG_PACKET_8_0_2);
|
|
980 |
// Zero input: form -1.0 (high word 0xbff0) / 0, which yields -Inf.
bind(L_2TAG_PACKET_7_0_2);
|
|
981 |
xorpd(xmm1, xmm1);
|
|
982 |
xorpd(xmm0, xmm0);
|
|
983 |
movl(eax, 49136);
|
|
984 |
pinsrw(xmm0, eax, 3);
|
|
985 |
divsd(xmm0, xmm1);
|
|
986 |
// code 2 is stored but not read back in this function
movl(Address(rsp, 16), 2);
|
|
987 |
|
|
988 |
// The special result is stored to [rsp+8] and immediately reloaded at B1_3.
bind(L_2TAG_PACKET_8_0_2);
|
|
989 |
movq(Address(rsp, 8), xmm0);
|
|
990 |
|
|
991 |
bind(B1_3);
|
|
992 |
movq(xmm0, Address(rsp, 8));
|
|
993 |
|
|
994 |
// Common exit: release the 24 bytes reserved at entry; the result is in xmm0.
bind(B1_5);
|
|
995 |
addq(rsp, 24);
|
|
996 |
}
|
|
997 |
|
|
998 |
#endif
|
|
999 |
|
|
1000 |
#ifndef _LP64
|
|
1001 |
|
|
1002 |
// Combined constant table for the 32-bit fast_log() below, addressed through
// a single base register. 536 32-bit words = 2144 bytes; each double is
// written as two 32-bit words with the low word first.
//   bytes    0..2063  129 entries of 16 bytes (pairs of doubles), loaded with
//                     movdqu at offset (edx & 0xff0000) >> 12
//   bytes 2064..2079  two doubles multiplied by the exponent term (offsets 2064, 2072)
//   bytes 2080..2127  six polynomial coefficients, loaded as pairs at 2080, 2096, 2112
//   bytes 2128..2143  0xffffe00000000000 twice; fast_log() reads the first copy as a mask
ALIGNED_(16) juint _static_const_table_log[] =
|
|
1003 |
{
|
|
1004 |
0xfefa3800UL, 0x3fe62e42UL, 0x93c76730UL, 0x3d2ef357UL, 0xaa241800UL,
|
|
1005 |
0x3fe5ee82UL, 0x0cda46beUL, 0x3d220238UL, 0x5c364800UL, 0x3fe5af40UL,
|
|
1006 |
0xac10c9fbUL, 0x3d2dfa63UL, 0x26bb8c00UL, 0x3fe5707aUL, 0xff3303ddUL,
|
|
1007 |
0x3d09980bUL, 0x26867800UL, 0x3fe5322eUL, 0x5d257531UL, 0x3d05ccc4UL,
|
|
1008 |
0x835a5000UL, 0x3fe4f45aUL, 0x6d93b8fbUL, 0xbd2e6c51UL, 0x6f970c00UL,
|
|
1009 |
0x3fe4b6fdUL, 0xed4c541cUL, 0x3cef7115UL, 0x27e8a400UL, 0x3fe47a15UL,
|
|
1010 |
0xf94d60aaUL, 0xbd22cb6aUL, 0xf2f92400UL, 0x3fe43d9fUL, 0x481051f7UL,
|
|
1011 |
0xbcfd984fUL, 0x2125cc00UL, 0x3fe4019cUL, 0x30f0c74cUL, 0xbd26ce79UL,
|
|
1012 |
0x0c36c000UL, 0x3fe3c608UL, 0x7cfe13c2UL, 0xbd02b736UL, 0x17197800UL,
|
|
1013 |
0x3fe38ae2UL, 0xbb5569a4UL, 0xbd218b7aUL, 0xad9d8c00UL, 0x3fe35028UL,
|
|
1014 |
0x9527e6acUL, 0x3d10b83fUL, 0x44340800UL, 0x3fe315daUL, 0xc5a0ed9cUL,
|
|
1015 |
0xbd274e93UL, 0x57b0e000UL, 0x3fe2dbf5UL, 0x07b9dc11UL, 0xbd17a6e5UL,
|
|
1016 |
0x6d0ec000UL, 0x3fe2a278UL, 0xe797882dUL, 0x3d206d2bUL, 0x1134dc00UL,
|
|
1017 |
0x3fe26962UL, 0x05226250UL, 0xbd0b61f1UL, 0xd8bebc00UL, 0x3fe230b0UL,
|
|
1018 |
0x6e48667bUL, 0x3d12fc06UL, 0x5fc61800UL, 0x3fe1f863UL, 0xc9fe81d3UL,
|
|
1019 |
0xbd2a7242UL, 0x49ae6000UL, 0x3fe1c078UL, 0xed70e667UL, 0x3cccacdeUL,
|
|
1020 |
0x40f23c00UL, 0x3fe188eeUL, 0xf8ab4650UL, 0x3d14cc4eUL, 0xf6f29800UL,
|
|
1021 |
0x3fe151c3UL, 0xa293ae49UL, 0xbd2edd97UL, 0x23c75c00UL, 0x3fe11af8UL,
|
|
1022 |
0xbb9ddcb2UL, 0xbd258647UL, 0x8611cc00UL, 0x3fe0e489UL, 0x07801742UL,
|
|
1023 |
0x3d1c2998UL, 0xe2d05400UL, 0x3fe0ae76UL, 0x887e7e27UL, 0x3d1f486bUL,
|
|
1024 |
0x0533c400UL, 0x3fe078bfUL, 0x41edf5fdUL, 0x3d268122UL, 0xbe760400UL,
|
|
1025 |
0x3fe04360UL, 0xe79539e0UL, 0xbd04c45fUL, 0xe5b20800UL, 0x3fe00e5aUL,
|
|
1026 |
0xb1727b1cUL, 0xbd053ba3UL, 0xaf7a4800UL, 0x3fdfb358UL, 0x3c164935UL,
|
|
1027 |
0x3d0085faUL, 0xee031800UL, 0x3fdf4aa7UL, 0x6f014a8bUL, 0x3d12cde5UL,
|
|
1028 |
0x56b41000UL, 0x3fdee2a1UL, 0x5a470251UL, 0x3d2f27f4UL, 0xc3ddb000UL,
|
|
1029 |
0x3fde7b42UL, 0x5372bd08UL, 0xbd246550UL, 0x1a272800UL, 0x3fde148aUL,
|
|
1030 |
0x07322938UL, 0xbd1326b2UL, 0x484c9800UL, 0x3fddae75UL, 0x60dc616aUL,
|
|
1031 |
0xbd1ea42dUL, 0x46def800UL, 0x3fdd4902UL, 0xe9a767a8UL, 0x3d235bafUL,
|
|
1032 |
0x18064800UL, 0x3fdce42fUL, 0x3ec7a6b0UL, 0xbd0797c3UL, 0xc7455800UL,
|
|
1033 |
0x3fdc7ff9UL, 0xc15249aeUL, 0xbd29b6ddUL, 0x693fa000UL, 0x3fdc1c60UL,
|
|
1034 |
0x7fe8e180UL, 0x3d2cec80UL, 0x1b80e000UL, 0x3fdbb961UL, 0xf40a666dUL,
|
|
1035 |
0x3d27d85bUL, 0x04462800UL, 0x3fdb56faUL, 0x2d841995UL, 0x3d109525UL,
|
|
1036 |
0x5248d000UL, 0x3fdaf529UL, 0x52774458UL, 0xbd217cc5UL, 0x3c8ad800UL,
|
|
1037 |
0x3fda93edUL, 0xbea77a5dUL, 0x3d1e36f2UL, 0x0224f800UL, 0x3fda3344UL,
|
|
1038 |
0x7f9d79f5UL, 0x3d23c645UL, 0xea15f000UL, 0x3fd9d32bUL, 0x10d0c0b0UL,
|
|
1039 |
0xbd26279eUL, 0x43135800UL, 0x3fd973a3UL, 0xa502d9f0UL, 0xbd152313UL,
|
|
1040 |
0x635bf800UL, 0x3fd914a8UL, 0x2ee6307dUL, 0xbd1766b5UL, 0xa88b3000UL,
|
|
1041 |
0x3fd8b639UL, 0xe5e70470UL, 0xbd205ae1UL, 0x776dc800UL, 0x3fd85855UL,
|
|
1042 |
0x3333778aUL, 0x3d2fd56fUL, 0x3bd81800UL, 0x3fd7fafaUL, 0xc812566aUL,
|
|
1043 |
0xbd272090UL, 0x687cf800UL, 0x3fd79e26UL, 0x2efd1778UL, 0x3d29ec7dUL,
|
|
1044 |
0x76c67800UL, 0x3fd741d8UL, 0x49dc60b3UL, 0x3d2d8b09UL, 0xe6af1800UL,
|
|
1045 |
0x3fd6e60eUL, 0x7c222d87UL, 0x3d172165UL, 0x3e9c6800UL, 0x3fd68ac8UL,
|
|
1046 |
0x2756eba0UL, 0x3d20a0d3UL, 0x0b3ab000UL, 0x3fd63003UL, 0xe731ae00UL,
|
|
1047 |
0xbd2db623UL, 0xdf596000UL, 0x3fd5d5bdUL, 0x08a465dcUL, 0xbd0a0b2aUL,
|
|
1048 |
0x53c8d000UL, 0x3fd57bf7UL, 0xee5d40efUL, 0x3d1fadedUL, 0x0738a000UL,
|
|
1049 |
0x3fd522aeUL, 0x8164c759UL, 0x3d2ebe70UL, 0x9e173000UL, 0x3fd4c9e0UL,
|
|
1050 |
0x1b0ad8a4UL, 0xbd2e2089UL, 0xc271c800UL, 0x3fd4718dUL, 0x0967d675UL,
|
|
1051 |
0xbd2f27ceUL, 0x23d5e800UL, 0x3fd419b4UL, 0xec90e09dUL, 0x3d08e436UL,
|
|
1052 |
0x77333000UL, 0x3fd3c252UL, 0xb606bd5cUL, 0x3d183b54UL, 0x76be1000UL,
|
|
1053 |
0x3fd36b67UL, 0xb0f177c8UL, 0x3d116ecdUL, 0xe1d36000UL, 0x3fd314f1UL,
|
|
1054 |
0xd3213cb8UL, 0xbd28e27aUL, 0x7cdc9000UL, 0x3fd2bef0UL, 0x4a5004f4UL,
|
|
1055 |
0x3d2a9cfaUL, 0x1134d800UL, 0x3fd26962UL, 0xdf5bb3b6UL, 0x3d2c93c1UL,
|
|
1056 |
0x6d0eb800UL, 0x3fd21445UL, 0xba46baeaUL, 0x3d0a87deUL, 0x635a6800UL,
|
|
1057 |
0x3fd1bf99UL, 0x5147bdb7UL, 0x3d2ca6edUL, 0xcbacf800UL, 0x3fd16b5cUL,
|
|
1058 |
0xf7a51681UL, 0x3d2b9acdUL, 0x8227e800UL, 0x3fd1178eUL, 0x63a5f01cUL,
|
|
1059 |
0xbd2c210eUL, 0x67616000UL, 0x3fd0c42dUL, 0x163ceae9UL, 0x3d27188bUL,
|
|
1060 |
0x604d5800UL, 0x3fd07138UL, 0x16ed4e91UL, 0x3cf89cdbUL, 0x5626c800UL,
|
|
1061 |
0x3fd01eaeUL, 0x1485e94aUL, 0xbd16f08cUL, 0x6cb3b000UL, 0x3fcf991cUL,
|
|
1062 |
0xca0cdf30UL, 0x3d1bcbecUL, 0xe4dd0000UL, 0x3fcef5adUL, 0x65bb8e11UL,
|
|
1063 |
0xbcca2115UL, 0xffe71000UL, 0x3fce530eUL, 0x6041f430UL, 0x3cc21227UL,
|
|
1064 |
0xb0d49000UL, 0x3fcdb13dUL, 0xf715b035UL, 0xbd2aff2aUL, 0xf2656000UL,
|
|
1065 |
0x3fcd1037UL, 0x75b6f6e4UL, 0xbd084a7eUL, 0xc6f01000UL, 0x3fcc6ffbUL,
|
|
1066 |
0xc5962bd2UL, 0xbcf1ec72UL, 0x383be000UL, 0x3fcbd087UL, 0x595412b6UL,
|
|
1067 |
0xbd2d4bc4UL, 0x575bd000UL, 0x3fcb31d8UL, 0x4eace1aaUL, 0xbd0c358dUL,
|
|
1068 |
0x3c8ae000UL, 0x3fca93edUL, 0x50562169UL, 0xbd287243UL, 0x07089000UL,
|
|
1069 |
0x3fc9f6c4UL, 0x6865817aUL, 0x3d29904dUL, 0xdcf70000UL, 0x3fc95a5aUL,
|
|
1070 |
0x58a0ff6fUL, 0x3d07f228UL, 0xeb390000UL, 0x3fc8beafUL, 0xaae92cd1UL,
|
|
1071 |
0xbd073d54UL, 0x6551a000UL, 0x3fc823c1UL, 0x9a631e83UL, 0x3d1e0ddbUL,
|
|
1072 |
0x85445000UL, 0x3fc7898dUL, 0x70914305UL, 0xbd1c6610UL, 0x8b757000UL,
|
|
1073 |
0x3fc6f012UL, 0xe59c21e1UL, 0xbd25118dUL, 0xbe8c1000UL, 0x3fc6574eUL,
|
|
1074 |
0x2c3c2e78UL, 0x3d19cf8bUL, 0x6b544000UL, 0x3fc5bf40UL, 0xeb68981cUL,
|
|
1075 |
0xbd127023UL, 0xe4a1b000UL, 0x3fc527e5UL, 0xe5697dc7UL, 0x3d2633e8UL,
|
|
1076 |
0x8333b000UL, 0x3fc4913dUL, 0x54fdb678UL, 0x3d258379UL, 0xa5993000UL,
|
|
1077 |
0x3fc3fb45UL, 0x7e6a354dUL, 0xbd2cd1d8UL, 0xb0159000UL, 0x3fc365fcUL,
|
|
1078 |
0x234b7289UL, 0x3cc62fa8UL, 0x0c868000UL, 0x3fc2d161UL, 0xcb81b4a1UL,
|
|
1079 |
0x3d039d6cUL, 0x2a49c000UL, 0x3fc23d71UL, 0x8fd3df5cUL, 0x3d100d23UL,
|
|
1080 |
0x7e23f000UL, 0x3fc1aa2bUL, 0x44389934UL, 0x3d2ca78eUL, 0x8227e000UL,
|
|
1081 |
0x3fc1178eUL, 0xce2d07f2UL, 0x3d21ef78UL, 0xb59e4000UL, 0x3fc08598UL,
|
|
1082 |
0x7009902cUL, 0xbd27e5ddUL, 0x39dbe000UL, 0x3fbfe891UL, 0x4fa10afdUL,
|
|
1083 |
0xbd2534d6UL, 0x830a2000UL, 0x3fbec739UL, 0xafe645e0UL, 0xbd2dc068UL,
|
|
1084 |
0x63844000UL, 0x3fbda727UL, 0x1fa71733UL, 0x3d1a8940UL, 0x01bc4000UL,
|
|
1085 |
0x3fbc8858UL, 0xc65aacd3UL, 0x3d2646d1UL, 0x8dad6000UL, 0x3fbb6ac8UL,
|
|
1086 |
0x2bf768e5UL, 0xbd139080UL, 0x40b1c000UL, 0x3fba4e76UL, 0xb94407c8UL,
|
|
1087 |
0xbd0e42b6UL, 0x5d594000UL, 0x3fb9335eUL, 0x3abd47daUL, 0x3d23115cUL,
|
|
1088 |
0x2f40e000UL, 0x3fb8197eUL, 0xf96ffdf7UL, 0x3d0f80dcUL, 0x0aeac000UL,
|
|
1089 |
0x3fb700d3UL, 0xa99ded32UL, 0x3cec1e8dUL, 0x4d97a000UL, 0x3fb5e95aUL,
|
|
1090 |
0x3c5d1d1eUL, 0xbd2c6906UL, 0x5d208000UL, 0x3fb4d311UL, 0x82f4e1efUL,
|
|
1091 |
0xbcf53a25UL, 0xa7d1e000UL, 0x3fb3bdf5UL, 0xa5db4ed7UL, 0x3d2cc85eUL,
|
|
1092 |
0xa4472000UL, 0x3fb2aa04UL, 0xae9c697dUL, 0xbd20b6e8UL, 0xd1466000UL,
|
|
1093 |
0x3fb1973bUL, 0x560d9e9bUL, 0xbd25325dUL, 0xb59e4000UL, 0x3fb08598UL,
|
|
1094 |
0x7009902cUL, 0xbd17e5ddUL, 0xc006c000UL, 0x3faeea31UL, 0x4fc93b7bUL,
|
|
1095 |
0xbd0e113eUL, 0xcdddc000UL, 0x3faccb73UL, 0x47d82807UL, 0xbd1a68f2UL,
|
|
1096 |
0xd0fb0000UL, 0x3faaaef2UL, 0x353bb42eUL, 0x3d20fc1aUL, 0x149fc000UL,
|
|
1097 |
0x3fa894aaUL, 0xd05a267dUL, 0xbd197995UL, 0xf2d4c000UL, 0x3fa67c94UL,
|
|
1098 |
0xec19afa2UL, 0xbd029efbUL, 0xd42e0000UL, 0x3fa466aeUL, 0x75bdfd28UL,
|
|
1099 |
0xbd2c1673UL, 0x2f8d0000UL, 0x3fa252f3UL, 0xe021b67bUL, 0x3d283e9aUL,
|
|
1100 |
0x89e74000UL, 0x3fa0415dUL, 0x5cf1d753UL, 0x3d0111c0UL, 0xec148000UL,
|
|
1101 |
0x3f9c63d2UL, 0x3f9eb2f3UL, 0x3d2578c6UL, 0x28c90000UL, 0x3f984925UL,
|
|
1102 |
0x325a0c34UL, 0xbd2aa0baUL, 0x25980000UL, 0x3f9432a9UL, 0x928637feUL,
|
|
1103 |
0x3d098139UL, 0x58938000UL, 0x3f902056UL, 0x06e2f7d2UL, 0xbd23dc5bUL,
|
|
1104 |
0xa3890000UL, 0x3f882448UL, 0xda74f640UL, 0xbd275577UL, 0x75890000UL,
|
|
1105 |
0x3f801015UL, 0x999d2be8UL, 0xbd10c76bUL, 0x59580000UL, 0x3f700805UL,
|
|
1106 |
0xcb31c67bUL, 0x3d2166afUL, 0x00000000UL, 0x00000000UL, 0x00000000UL,
|
|
1107 |
0x80000000UL, 0xfefa3800UL, 0x3fa62e42UL, 0x93c76730UL, 0x3ceef357UL,
|
|
1108 |
0x92492492UL, 0x3fc24924UL, 0x00000000UL, 0xbfd00000UL, 0x3d6fb175UL,
|
|
1109 |
0xbfc5555eUL, 0x55555555UL, 0x3fd55555UL, 0x9999999aUL, 0x3fc99999UL,
|
|
1110 |
0x00000000UL, 0xbfe00000UL, 0x00000000UL, 0xffffe000UL, 0x00000000UL,
|
|
1111 |
0xffffe000UL
|
|
1112 |
};
|
|
1113 |
//registers,
|
|
1114 |
// input: xmm0
|
|
1115 |
// scratch: xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
|
|
1116 |
// rax, rdx, rcx, rbx (tmp)
|
|
1117 |
|
|
1118 |
// 32-bit variant: emits the instruction sequence that computes log(x) for the
// double read from the stack at [rsp+112] (offset measured after the 104-byte
// adjustment below). On every path the result is pushed on the x87 stack with fld_d.
//   xmm0..xmm7     scratch XMM registers
//   eax, ecx, edx  scratch general registers
//   tmp            base register for _static_const_table_log; its incoming
//                  value is saved at [rsp+40] and restored before the end
// NOTE(review): rsp is lowered by 104 and is not raised again in this function;
// presumably the caller's epilogue restores it - confirm against the stub
// that invokes fast_log.
void MacroAssembler::fast_log(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register eax, Register ecx, Register edx, Register tmp) {
|
|
1119 |
Label L_2TAG_PACKET_0_0_2, L_2TAG_PACKET_1_0_2, L_2TAG_PACKET_2_0_2, L_2TAG_PACKET_3_0_2;
|
|
1120 |
Label L_2TAG_PACKET_4_0_2, L_2TAG_PACKET_5_0_2, L_2TAG_PACKET_6_0_2, L_2TAG_PACKET_7_0_2;
|
|
1121 |
Label L_2TAG_PACKET_8_0_2, L_2TAG_PACKET_9_0_2;
|
|
1122 |
Label L_2TAG_PACKET_10_0_2, start;
|
|
1123 |
|
|
1124 |
assert_different_registers(tmp, eax, ecx, edx);
|
|
1125 |
// The statement between this jmp and bind(start) is a C++ local declaration
// and emits no instructions, so the emitted jump targets the next instruction.
jmp(start);
|
|
1126 |
address static_const_table = (address)_static_const_table_log;
|
|
1127 |
|
|
1128 |
bind(start);
|
|
1129 |
subl(rsp, 104);
|
|
1130 |
// save tmp; it is reloaded at L_2TAG_PACKET_10_0_2
movl(Address(rsp, 40), tmp);
|
|
1131 |
lea(tmp, ExternalAddress(static_const_table));
|
|
1132 |
// xmm2 = 1.0 (high word 0x3ff0)
xorpd(xmm2, xmm2);
|
|
1133 |
movl(eax, 16368);
|
|
1134 |
pinsrw(xmm2, eax, 3);
|
|
1135 |
// xmm3 = exponent pattern (high word 0x77f0) OR-ed onto the isolated mantissa below
xorpd(xmm3, xmm3);
|
|
1136 |
movl(edx, 30704);
|
|
1137 |
pinsrw(xmm3, edx, 3);
|
|
1138 |
// load the argument x
movsd(xmm0, Address(rsp, 112));
|
|
1139 |
movapd(xmm1, xmm0);
|
|
1140 |
// xmm4 = 0x8000, added to the reciprocal estimate before its bits are extracted
movl(ecx, 32768);
|
|
1141 |
movdl(xmm4, ecx);
|
|
1142 |
// xmm5 = mask keeping the upper 19 bits of the double
movsd(xmm5, Address(tmp, 2128)); // 0x00000000UL, 0xffffe000UL
|
|
1143 |
// eax = bits 48..63 of x: sign, 11 exponent bits, top 4 mantissa bits
pextrw(eax, xmm0, 3);
|
|
1144 |
por(xmm0, xmm2);
|
|
1145 |
psllq(xmm0, 5);
|
|
1146 |
// ecx = bias subtracted from the masked exponent bits on the main path
movl(ecx, 16352);
|
|
1147 |
psrlq(xmm0, 34);
|
|
1148 |
// reciprocal estimate (B0 in the algorithm description)
rcpss(xmm0, xmm0);
|
|
1149 |
// shift left then right by 12: clears sign and exponent, xmm1 = mantissa bits of x
psllq(xmm1, 12);
|
|
1150 |
// 228 (0xE4) is the identity shuffle: xmm6 = copy of xmm5
pshufd(xmm6, xmm5, 228);
|
|
1151 |
psrlq(xmm1, 12);
|
|
1152 |
// Unsigned range check on the high word: the branch is taken when eax < 16
// (exponent field is zero) or eax >= 0x7ff0 (exponent all ones, or sign bit set).
subl(eax, 16);
|
|
1153 |
cmpl(eax, 32736);
|
|
1154 |
jcc(Assembler::aboveEqual, L_2TAG_PACKET_0_0_2);
|
|
1155 |
|
|
1156 |
// Main path. Also entered from L_2TAG_PACKET_4_0_2 after a denormal input has
// been rescaled (with a different bias in ecx).
bind(L_2TAG_PACKET_1_0_2);
|
|
1157 |
paddd(xmm0, xmm4);
|
|
1158 |
por(xmm1, xmm3);
|
|
1159 |
// edx = low 32 bits of the adjusted reciprocal; used below to index the table
movdl(edx, xmm0);
|
|
1160 |
psllq(xmm0, 29);
|
|
1161 |
// split xmm1 into a masked high part (xmm5) and the remainder (xmm1)
pand(xmm5, xmm1);
|
|
1162 |
pand(xmm0, xmm6);
|
|
1163 |
subsd(xmm1, xmm5);
|
|
1164 |
mulpd(xmm5, xmm0);
|
|
1165 |
// eax = (exponent bits of the high word) - bias; always a multiple of 16
andl(eax, 32752);
|
|
1166 |
subl(eax, ecx);
|
|
1167 |
cvtsi2sdl(xmm7, eax);
|
|
1168 |
mulsd(xmm1, xmm0);
|
|
1169 |
movsd(xmm6, Address(tmp, 2064)); // 0xfefa3800UL, 0x3fa62e42UL
|
|
1170 |
movdqu(xmm3, Address(tmp, 2080)); // 0x92492492UL, 0x3fc24924UL, 0x00000000UL, 0xbfd00000UL
|
|
1171 |
// high part of the product minus 1.0 (the algorithm's r = B*mx - 1.0 is formed in two parts)
subsd(xmm5, xmm2);
|
|
1172 |
// table byte offset = bits 16..23 of edx, times 16
andl(edx, 16711680);
|
|
1173 |
shrl(edx, 12);
|
|
1174 |
// xmm0 = one 16-byte table entry (two doubles)
movdqu(xmm0, Address(tmp, edx));
|
|
1175 |
movdqu(xmm4, Address(tmp, 2096)); // 0x3d6fb175UL, 0xbfc5555eUL, 0x55555555UL, 0x3fd55555UL
|
|
1176 |
// xmm1 = sum of the two parts of the reduced argument
addsd(xmm1, xmm5);
|
|
1177 |
movdqu(xmm2, Address(tmp, 2112)); // 0x9999999aUL, 0x3fc99999UL, 0x00000000UL, 0xbfe00000UL
|
|
1178 |
// xmm6 = exponent term * constant at offset 2064
mulsd(xmm6, xmm7);
|
|
1179 |
// 68 (0x44) copies the low qword to both halves: xmm5 = reduced argument in both lanes
pshufd(xmm5, xmm1, 68);
|
|
1180 |
// xmm7 = exponent term * constant at offset 2072 (mulsd reads only the first 8 bytes)
mulsd(xmm7, Address(tmp, 2072)); // 0x93c76730UL, 0x3ceef357UL, 0x92492492UL, 0x3fc24924UL
|
|
1181 |
mulsd(xmm3, xmm1);
|
|
1182 |
// low lane: first double of the table entry + exponent contribution
addsd(xmm0, xmm6);
|
|
1183 |
mulpd(xmm4, xmm5);
|
|
1184 |
mulpd(xmm5, xmm5);
|
|
1185 |
pshufd(xmm6, xmm0, 228);
|
|
1186 |
addsd(xmm0, xmm1);
|
|
1187 |
addpd(xmm4, xmm2);
|
|
1188 |
mulpd(xmm3, xmm5);
|
|
1189 |
// xmm6 = (sum before adding xmm1) - (sum after); used to recover the rounding error of that add
subsd(xmm6, xmm0);
|
|
1190 |
mulsd(xmm4, xmm1);
|
|
1191 |
// 238 (0xEE) selects the upper qword: xmm2 low lane = second double of the table entry
pshufd(xmm2, xmm0, 238);
|
|
1192 |
addsd(xmm1, xmm6);
|
|
1193 |
mulsd(xmm5, xmm5);
|
|
1194 |
addsd(xmm7, xmm2);
|
|
1195 |
addpd(xmm4, xmm3);
|
|
1196 |
addsd(xmm1, xmm7);
|
|
1197 |
mulpd(xmm4, xmm5);
|
|
1198 |
addsd(xmm1, xmm4);
|
|
1199 |
// fold the upper lane of the packed polynomial result into the scalar sum
pshufd(xmm5, xmm4, 238);
|
|
1200 |
addsd(xmm1, xmm5);
|
|
1201 |
// final result = high sum + accumulated low-order terms
addsd(xmm0, xmm1);
|
|
1202 |
jmp(L_2TAG_PACKET_2_0_2);
|
|
1203 |
|
|
1204 |
// Special inputs: exponent field zero, exponent all ones, or sign bit set.
bind(L_2TAG_PACKET_0_0_2);
|
|
1205 |
movsd(xmm0, Address(rsp, 112));
|
|
1206 |
movdqu(xmm1, xmm0);
|
|
1207 |
// undo the subl(eax, 16) done before the range check
addl(eax, 16);
|
|
1208 |
// sign bit set -> negative-input handling
cmpl(eax, 32768);
|
|
1209 |
jcc(Assembler::aboveEqual, L_2TAG_PACKET_3_0_2);
|
|
1210 |
// exponent field zero -> zero/denormal handling
cmpl(eax, 16);
|
|
1211 |
jcc(Assembler::below, L_2TAG_PACKET_4_0_2);
|
|
1212 |
|
|
1213 |
// Remaining case on fall-through: exponent all ones with sign clear (+Inf or NaN).
// Also the target for NaNs with the sign bit set. Result is x + x.
bind(L_2TAG_PACKET_5_0_2);
|
|
1214 |
addsd(xmm0, xmm0);
|
|
1215 |
jmp(L_2TAG_PACKET_2_0_2);
|
|
1216 |
|
|
1217 |
// Reached only via the jcc in L_2TAG_PACKET_3_0_2, so the first branch below
// tests the flags of cmpl(ecx, -2097152): above means a non-zero mantissa in
// the high word (NaN). Otherwise the low word decides: non-zero -> NaN,
// zero -> -Inf, which is handled as an invalid input.
bind(L_2TAG_PACKET_6_0_2);
|
|
1218 |
jcc(Assembler::above, L_2TAG_PACKET_5_0_2);
|
|
1219 |
cmpl(edx, 0);
|
|
1220 |
jcc(Assembler::above, L_2TAG_PACKET_5_0_2);
|
|
1221 |
jmp(L_2TAG_PACKET_7_0_2);
|
|
1222 |
|
|
1223 |
// Sign bit set.
bind(L_2TAG_PACKET_3_0_2);
|
|
1224 |
movdl(edx, xmm1);
|
|
1225 |
psrlq(xmm1, 32);
|
|
1226 |
movdl(ecx, xmm1);
|
|
1227 |
// doubling the high word shifts the sign bit out
addl(ecx, ecx);
|
|
1228 |
// -2097152 = 0xffe00000 = (0x7ff00000 << 1): exponent all ones -> Inf/NaN
cmpl(ecx, -2097152);
|
|
1229 |
jcc(Assembler::aboveEqual, L_2TAG_PACKET_6_0_2);
|
|
1230 |
// all remaining bits zero -> the input is -0, handled like 0
orl(edx, ecx);
|
|
1231 |
cmpl(edx, 0);
|
|
1232 |
jcc(Assembler::equal, L_2TAG_PACKET_8_0_2);
|
|
1233 |
|
|
1234 |
// Negative non-zero input: form 0 * Inf (high word 0x7ff0), which yields NaN.
bind(L_2TAG_PACKET_7_0_2);
|
|
1235 |
xorpd(xmm1, xmm1);
|
|
1236 |
xorpd(xmm0, xmm0);
|
|
1237 |
movl(eax, 32752);
|
|
1238 |
pinsrw(xmm1, eax, 3);
|
|
1239 |
// edx = 3 is set here but not read again in this function
movl(edx, 3);
|
|
1240 |
mulsd(xmm0, xmm1);
|
|
1241 |
|
|
1242 |
// Common tail of the two error paths: spill the special result, reload the
// original argument into xmm0, and push the result on the x87 stack.
bind(L_2TAG_PACKET_9_0_2);
|
|
1243 |
movsd(Address(rsp, 0), xmm0);
|
|
1244 |
movsd(xmm0, Address(rsp, 112));
|
|
1245 |
fld_d(Address(rsp, 0));
|
|
1246 |
jmp(L_2TAG_PACKET_10_0_2);
|
|
1247 |
|
|
1248 |
// Zero input: form -1.0 (high word 0xbff0) / 0, which yields -Inf.
bind(L_2TAG_PACKET_8_0_2);
|
|
1249 |
xorpd(xmm1, xmm1);
|
|
1250 |
xorpd(xmm0, xmm0);
|
|
1251 |
movl(eax, 49136);
|
|
1252 |
pinsrw(xmm0, eax, 3);
|
|
1253 |
divsd(xmm0, xmm1);
|
|
1254 |
// edx = 2 is set here but not read again in this function
movl(edx, 2);
|
|
1255 |
jmp(L_2TAG_PACKET_9_0_2);
|
|
1256 |
|
|
1257 |
// Exponent field zero with sign clear: +0 or a positive denormal.
bind(L_2TAG_PACKET_4_0_2);
|
|
1258 |
// edx = low word | high word; zero means the value is 0
movdl(edx, xmm1);
|
|
1259 |
psrlq(xmm1, 32);
|
|
1260 |
movdl(ecx, xmm1);
|
|
1261 |
orl(edx, ecx);
|
|
1262 |
cmpl(edx, 0);
|
|
1263 |
jcc(Assembler::equal, L_2TAG_PACKET_8_0_2);
|
|
1264 |
// Denormal: multiply by the double whose high word is 0x47f0 (2^128), then
// repeat the setup done at function entry and rejoin the main path with the
// bias in ecx changed to 18416.
xorpd(xmm1, xmm1);
|
|
1265 |
movl(eax, 18416);
|
|
1266 |
pinsrw(xmm1, eax, 3);
|
|
1267 |
mulsd(xmm0, xmm1);
|
|
1268 |
movapd(xmm1, xmm0);
|
|
1269 |
pextrw(eax, xmm0, 3);
|
|
1270 |
por(xmm0, xmm2);
|
|
1271 |
psllq(xmm0, 5);
|
|
1272 |
movl(ecx, 18416);
|
|
1273 |
psrlq(xmm0, 34);
|
|
1274 |
rcpss(xmm0, xmm0);
|
|
1275 |
psllq(xmm1, 12);
|
|
1276 |
pshufd(xmm6, xmm5, 228);
|
|
1277 |
psrlq(xmm1, 12);
|
|
1278 |
jmp(L_2TAG_PACKET_1_0_2);
|
|
1279 |
|
|
1280 |
// Normal exit: move the SSE result onto the x87 stack through [rsp+24].
bind(L_2TAG_PACKET_2_0_2);
|
|
1281 |
movsd(Address(rsp, 24), xmm0);
|
|
1282 |
fld_d(Address(rsp, 24));
|
|
1283 |
|
|
1284 |
// Common exit: restore the caller's value of tmp.
bind(L_2TAG_PACKET_10_0_2);
|
|
1285 |
movl(tmp, Address(rsp, 40));
|
|
1286 |
}
|
|
1287 |
|
|
1288 |
#endif
|