author | shade |
Thu, 13 Dec 2018 16:14:07 +0100 | |
changeset 53017 | e10a1f7aaa13 |
parent 52990 | 1ed8de9045a7 |
child 57786 | 948ac3112da8 |
permissions | -rw-r--r-- |
52990 | 1 |
/* |
2 |
* Copyright (c) 2018, Intel Corporation. |
|
3 |
* |
|
4 |
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
|
5 |
* |
|
6 |
* This code is free software; you can redistribute it and/or modify it |
|
7 |
* under the terms of the GNU General Public License version 2 only, as |
|
8 |
* published by the Free Software Foundation. |
|
9 |
* |
|
10 |
* This code is distributed in the hope that it will be useful, but WITHOUT |
|
11 |
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
|
12 |
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
|
13 |
* version 2 for more details (a copy is included in the LICENSE file that |
|
14 |
* accompanied this code). |
|
15 |
* |
|
16 |
* You should have received a copy of the GNU General Public License version |
|
17 |
* 2 along with this work; if not, write to the Free Software Foundation, |
|
18 |
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
|
19 |
* |
|
20 |
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
|
21 |
* or visit www.oracle.com if you need additional information or have any |
|
22 |
* questions. |
|
23 |
* |
|
24 |
*/ |
|
25 |
||
26 |
#include "precompiled.hpp" |
|
27 |
#include "asm/assembler.hpp" |
|
28 |
#include "asm/assembler.inline.hpp" |
|
29 |
#include "runtime/stubRoutines.hpp" |
|
30 |
#include "macroAssembler_x86.hpp" |
|
31 |
||
53017
e10a1f7aaa13
8215354: x86_32 build failures after JDK-8214074 (Ghash optimization using AVX instructions)
shade
parents:
52990
diff
changeset
|
32 |
#ifdef _LP64 |
52990 | 33 |
// Multiply 128 x 128 bits, using 4 pclmulqdq operations.
// Schoolbook (carry-less) multiply of 'data' by the i-th cached power of the
// hash subkey H, loaded from Address(htbl, i * 16). The four 64x64 partial
// products are XOR-accumulated into the caller's running sums:
//   tmp0 += a0*b0 (imm 0x00, low  128 bits)
//   tmp1 += a1*b1 (imm 0x11, high 128 bits)
//   tmp2 += a0*b1 + a1*b0 (imms 0x01/0x10, middle terms)
// tmp3 is scratch; xmm15 is clobbered (holds the table entry).
void MacroAssembler::schoolbookAAD(int i, Register htbl, XMMRegister data,
    XMMRegister tmp0, XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3) {
    movdqu(xmm15, Address(htbl, i * 16));
    vpclmulhqlqdq(tmp3, data, xmm15); // 0x01
    vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit);
    vpclmulldq(tmp3, data, xmm15); // 0x00
    vpxor(tmp0, tmp0, tmp3, Assembler::AVX_128bit);
    vpclmulhdq(tmp3, data, xmm15); // 0x11
    vpxor(tmp1, tmp1, tmp3, Assembler::AVX_128bit);
    vpclmullqhqdq(tmp3, data, xmm15); // 0x10
    vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit);
}
|
46 |
||
47 |
// Multiply two 128 bit numbers resulting in a 256 bit value.
// Result of the multiplication followed by reduction is stored in 'state'.
// Computes state = tmp0 * state in GF(2^128): a carry-less schoolbook
// multiply followed by the Shift-XOR reduction (Gueron-Kounavis, May 2010).
// NOTE: this generator ends with ret(0) — the emitted code is a local
// subroutine reached via call(GFMUL, relocInfo::none) from the ghash
// generators below, not straight-line code.
// Clobbers xmm4-xmm11 (tmp1-tmp4 and the reduction scratch registers).
void MacroAssembler::gfmul(XMMRegister tmp0, XMMRegister state) {
    const XMMRegister tmp1 = xmm4;
    const XMMRegister tmp2 = xmm5;
    const XMMRegister tmp3 = xmm6;
    const XMMRegister tmp4 = xmm7;

    // Four 64x64 carry-less partial products of state (a1:a0) and tmp0 (b1:b0).
    vpclmulldq(tmp1, state, tmp0); //0x00 (a0 * b0)
    vpclmulhdq(tmp4, state, tmp0);//0x11 (a1 * b1)
    vpclmullqhqdq(tmp2, state, tmp0);//0x10 (a1 * b0)
    vpclmulhqlqdq(tmp3, state, tmp0); //0x01 (a0 * b1)

    vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit); // (a0 * b1) + (a1 * b0)

    // Fold the middle 128 bits into the low (tmp1) and high (tmp4) halves.
    vpslldq(tmp3, tmp2, 8, Assembler::AVX_128bit);
    vpsrldq(tmp2, tmp2, 8, Assembler::AVX_128bit);
    vpxor(tmp1, tmp1, tmp3, Assembler::AVX_128bit); // tmp1 and tmp4 hold the result
    vpxor(tmp4, tmp4, tmp2, Assembler::AVX_128bit); // of carryless multiplication
    // Follows the reduction technique mentioned in
    // Shift-XOR reduction described in Gueron-Kounavis May 2010
    // First phase of reduction
    //
    vpslld(xmm8, tmp1, 31, Assembler::AVX_128bit);  // packed left shift << 31
    vpslld(xmm9, tmp1, 30, Assembler::AVX_128bit);  // packed left shift << 30
    vpslld(xmm10, tmp1, 25, Assembler::AVX_128bit); // packed left shift << 25
    // xor the shifted versions
    vpxor(xmm8, xmm8, xmm9, Assembler::AVX_128bit);
    vpxor(xmm8, xmm8, xmm10, Assembler::AVX_128bit);
    vpslldq(xmm9, xmm8, 12, Assembler::AVX_128bit);
    vpsrldq(xmm8, xmm8, 4, Assembler::AVX_128bit);
    vpxor(tmp1, tmp1, xmm9, Assembler::AVX_128bit); // first phase of the reduction complete
    //
    // Second phase of the reduction
    //
    vpsrld(xmm9, tmp1, 1, Assembler::AVX_128bit);  // packed right shift >> 1
    vpsrld(xmm10, tmp1, 2, Assembler::AVX_128bit); // packed right shift >> 2
    vpsrld(xmm11, tmp1, 7, Assembler::AVX_128bit); // packed right shift >> 7
    vpxor(xmm9, xmm9, xmm10, Assembler::AVX_128bit); // xor the shifted versions
    vpxor(xmm9, xmm9, xmm11, Assembler::AVX_128bit);
    vpxor(xmm9, xmm9, xmm8, Assembler::AVX_128bit);
    vpxor(tmp1, tmp1, xmm9, Assembler::AVX_128bit);
    vpxor(state, tmp4, tmp1, Assembler::AVX_128bit); // the result is in state
    ret(0);
}
|
92 |
||
93 |
// This method takes the subkey after expansion as input and generates the
// 1 * 16 power of subkey H. The power of H is used in the reduction process
// for one-block ghash.
// Reads the raw subkey at Address(htbl, 0), byte-reverses it with the long
// swap mask, computes H' = GFMUL(H, 2) (conditionally XOR-ing in the
// reduction polynomial when the top bit shifts out), and stores the result
// at Address(htbl, 1 * 16).
// NOTE: ends with ret(0) — emitted as a call target
// (call(GENERATE_HTBL_1_BLK, ...) in avx_ghash).
// Clobbers rax, xmm3-xmm5, xmm10, xmm13.
void MacroAssembler::generateHtbl_one_block(Register htbl) {
    const XMMRegister t = xmm13;

    // load the original subkey hash
    movdqu(t, Address(htbl, 0));
    // shuffle using long swap mask
    movdqu(xmm10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
    vpshufb(t, t, xmm10, Assembler::AVX_128bit);

    // Compute H' = GFMUL(H, 2)
    // Broadcast the sign of the top bit of each dword so xmm4 becomes an
    // all-ones/all-zeros mask selecting the reduction polynomial below.
    vpsrld(xmm3, t, 7, Assembler::AVX_128bit);
    movdqu(xmm4, ExternalAddress(StubRoutines::x86::ghash_shufflemask_addr()));
    vpshufb(xmm3, xmm3, xmm4, Assembler::AVX_128bit);
    movl(rax, 0xff00);
    movdl(xmm4, rax);
    vpshufb(xmm4, xmm4, xmm3, Assembler::AVX_128bit);
    movdqu(xmm5, ExternalAddress(StubRoutines::x86::ghash_polynomial_addr()));
    vpand(xmm5, xmm5, xmm4, Assembler::AVX_128bit);
    // 128-bit left shift by 1: carry each dword's top bit into the next dword.
    vpsrld(xmm3, t, 31, Assembler::AVX_128bit);
    vpslld(xmm4, t, 1, Assembler::AVX_128bit);
    vpslldq(xmm3, xmm3, 4, Assembler::AVX_128bit);
    vpxor(t, xmm4, xmm3, Assembler::AVX_128bit); // t holds p(x) << 1, i.e. H * 2

    // Adding p(x)<<1 to xmm5 which holds the (masked) reduction polynomial
    vpxor(t, t, xmm5, Assembler::AVX_128bit);
    movdqu(Address(htbl, 1 * 16), t); // H * 2

    ret(0);
}
|
124 |
||
125 |
// This method takes the subkey after expansion as input and generates the
// remaining powers of subkey H. The powers of H are used in the reduction
// process for eight-block ghash.
// Expects H * 2 at Address(htbl, 1 * 16) (produced by generateHtbl_one_block)
// and fills slots 2..8 of the table with H^2*2 .. H^8*2 by repeated
// GFMUL(H, H^k) calls into a locally emitted gfmul subroutine.
// NOTE: the main body ends with ret(0) — it is a call target
// (call(GENERATE_HTBL_8_BLKS, ...) in avx_ghash); the GFMUL subroutine body
// is emitted after that ret and returns via the ret(0) inside gfmul().
// Clobbers xmm1, xmm13 plus whatever gfmul clobbers (xmm4-xmm11).
void MacroAssembler::generateHtbl_eight_blocks(Register htbl) {
    const XMMRegister t = xmm13;
    const XMMRegister tmp0 = xmm1;
    Label GFMUL;

    movdqu(t, Address(htbl, 1 * 16));
    movdqu(tmp0, t);

    // tmp0 and t hold H. Now we compute powers of H by using GFMUL(H, H)
    call(GFMUL, relocInfo::none);
    movdqu(Address(htbl, 2 * 16), t); //H ^ 2 * 2
    call(GFMUL, relocInfo::none);
    movdqu(Address(htbl, 3 * 16), t); //H ^ 3 * 2
    call(GFMUL, relocInfo::none);
    movdqu(Address(htbl, 4 * 16), t); //H ^ 4 * 2
    call(GFMUL, relocInfo::none);
    movdqu(Address(htbl, 5 * 16), t); //H ^ 5 * 2
    call(GFMUL, relocInfo::none);
    movdqu(Address(htbl, 6 * 16), t); //H ^ 6 * 2
    call(GFMUL, relocInfo::none);
    movdqu(Address(htbl, 7 * 16), t); //H ^ 7 * 2
    call(GFMUL, relocInfo::none);
    movdqu(Address(htbl, 8 * 16), t); //H ^ 8 * 2
    ret(0);

    // Out-of-line GFMUL subroutine: t = tmp0 * t, returns via gfmul's ret(0).
    bind(GFMUL);
    gfmul(tmp0, t);
}
|
155 |
||
156 |
// Multiblock and single block GHASH computation using the Shift-XOR
// reduction technique.
// Folds 'blocks' 16-byte blocks at input_data into the 16-byte GHASH state
// at input_state (updated in place). Powers of the hash subkey H are cached
// in the table at htbl and generated lazily: the first power on first use,
// powers 2..8 only when at least 8 blocks are to be processed. Data is
// consumed 8 blocks at a time (one reduction per 8 multiplies), then block
// by block via the GFMUL subroutine.
// Subroutine bodies (GFMUL, GENERATE_HTBL_*) are emitted after the main
// control flow; each returns through the ret(0) its generator emits.
void MacroAssembler::avx_ghash(Register input_state, Register htbl,
                               Register input_data, Register blocks) {

    // temporary variables to hold input data and input state
    const XMMRegister data = xmm1;
    const XMMRegister state = xmm0;
    // temporary variables to hold intermediate results
    const XMMRegister tmp0 = xmm3;
    const XMMRegister tmp1 = xmm4;
    const XMMRegister tmp2 = xmm5;
    const XMMRegister tmp3 = xmm6;
    // temporary variables to hold byte and long swap masks
    const XMMRegister bswap_mask = xmm2;
    const XMMRegister lswap_mask = xmm14;

    Label GENERATE_HTBL_1_BLK, GENERATE_HTBL_8_BLKS, BEGIN_PROCESS, GFMUL, BLOCK8_REDUCTION,
          ONE_BLK_INIT, PROCESS_1_BLOCK, PROCESS_8_BLOCKS, SAVE_STATE, EXIT_GHASH;

    // Nothing to do for zero blocks.
    testptr(blocks, blocks);
    jcc(Assembler::zero, EXIT_GHASH);

    // Check if Hashtable (1*16) has been already generated (non-zero entry).
    // For anything less than 8 blocks, we generate only the first power of H.
    movdqu(tmp2, Address(htbl, 1 * 16));
    ptest(tmp2, tmp2);
    jcc(Assembler::notZero, BEGIN_PROCESS);
    call(GENERATE_HTBL_1_BLK, relocInfo::none);

    // Shuffle the input state
    bind(BEGIN_PROCESS);
    movdqu(lswap_mask, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
    movdqu(state, Address(input_state, 0));
    vpshufb(state, state, lswap_mask, Assembler::AVX_128bit);

    cmpl(blocks, 8);
    jcc(Assembler::below, ONE_BLK_INIT);
    // If we have 8 blocks or more data, then generate remaining powers of H
    // (again detected as already-present via a non-zero 8th table entry).
    movdqu(tmp2, Address(htbl, 8 * 16));
    ptest(tmp2, tmp2);
    jcc(Assembler::notZero, PROCESS_8_BLOCKS);
    call(GENERATE_HTBL_8_BLKS, relocInfo::none);

    // Do 8 multiplies followed by a reduction, processing 8 blocks of data
    // at a time. Each block = 16 bytes.
    bind(PROCESS_8_BLOCKS);
    subl(blocks, 8);
    movdqu(bswap_mask, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
    // Blocks are multiplied highest-offset first so block #k pairs with
    // H^(8-k): block #7 with H*2, ..., block #0 with H^8*2.
    movdqu(data, Address(input_data, 16 * 7));
    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
    // Loading 1*16 as the calculated powers of H start at that location.
    movdqu(xmm15, Address(htbl, 1 * 16));
    // Perform carryless multiplication of (H*2, data block #7); this first
    // multiply initializes the tmp0/tmp1/tmp2 accumulators directly.
    vpclmulhqlqdq(tmp2, data, xmm15);//a0 * b1
    vpclmulldq(tmp0, data, xmm15);//a0 * b0
    vpclmulhdq(tmp1, data, xmm15);//a1 * b1
    vpclmullqhqdq(tmp3, data, xmm15);//a1 * b0
    vpxor(tmp2, tmp2, tmp3, Assembler::AVX_128bit);// (a0 * b1) + (a1 * b0)

    movdqu(data, Address(input_data, 16 * 6));
    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
    // Perform carryless multiplication of (H^2 * 2, data block #6)
    schoolbookAAD(2, htbl, data, tmp0, tmp1, tmp2, tmp3);

    movdqu(data, Address(input_data, 16 * 5));
    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
    // Perform carryless multiplication of (H^3 * 2, data block #5)
    schoolbookAAD(3, htbl, data, tmp0, tmp1, tmp2, tmp3);
    movdqu(data, Address(input_data, 16 * 4));
    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
    // Perform carryless multiplication of (H^4 * 2, data block #4)
    schoolbookAAD(4, htbl, data, tmp0, tmp1, tmp2, tmp3);
    movdqu(data, Address(input_data, 16 * 3));
    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
    // Perform carryless multiplication of (H^5 * 2, data block #3)
    schoolbookAAD(5, htbl, data, tmp0, tmp1, tmp2, tmp3);
    movdqu(data, Address(input_data, 16 * 2));
    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
    // Perform carryless multiplication of (H^6 * 2, data block #2)
    schoolbookAAD(6, htbl, data, tmp0, tmp1, tmp2, tmp3);
    movdqu(data, Address(input_data, 16 * 1));
    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
    // Perform carryless multiplication of (H^7 * 2, data block #1)
    schoolbookAAD(7, htbl, data, tmp0, tmp1, tmp2, tmp3);
    movdqu(data, Address(input_data, 16 * 0));
    // xor data block #0 with input state before performing carry-less multiplication
    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
    vpxor(data, data, state, Assembler::AVX_128bit);
    // Perform carryless multiplication of (H^8 * 2, data block #0)
    schoolbookAAD(8, htbl, data, tmp0, tmp1, tmp2, tmp3);
    // Fold the middle terms (tmp2) into the low/high accumulators.
    vpslldq(tmp3, tmp2, 8, Assembler::AVX_128bit);
    vpsrldq(tmp2, tmp2, 8, Assembler::AVX_128bit);
    vpxor(tmp0, tmp0, tmp3, Assembler::AVX_128bit);// tmp0, tmp1 contain aggregated results of
    vpxor(tmp1, tmp1, tmp2, Assembler::AVX_128bit);// the multiplication operation

    // We have the two 128-bit partially accumulated multiplication results in
    // tmp0:tmp1, with the higher 128 bits in tmp1 and lower 128 bits in tmp0.
    // Follows the reduction technique mentioned in
    // Shift-XOR reduction described in Gueron-Kounavis May 2010.
    bind(BLOCK8_REDUCTION);
    // First phase of the reduction
    vpslld(xmm8, tmp0, 31, Assembler::AVX_128bit);  // packed left shift << 31
    vpslld(xmm9, tmp0, 30, Assembler::AVX_128bit);  // packed left shift << 30
    vpslld(xmm10, tmp0, 25, Assembler::AVX_128bit); // packed left shift << 25
    // xor the shifted versions
    vpxor(xmm8, xmm8, xmm10, Assembler::AVX_128bit);
    vpxor(xmm8, xmm8, xmm9, Assembler::AVX_128bit);

    vpslldq(xmm9, xmm8, 12, Assembler::AVX_128bit);
    vpsrldq(xmm8, xmm8, 4, Assembler::AVX_128bit);

    vpxor(tmp0, tmp0, xmm9, Assembler::AVX_128bit); // first phase of reduction is complete
    // Second phase of the reduction
    vpsrld(xmm9, tmp0, 1, Assembler::AVX_128bit);  // packed right shift >> 1
    vpsrld(xmm10, tmp0, 2, Assembler::AVX_128bit); // packed right shift >> 2
    vpsrld(tmp2, tmp0, 7, Assembler::AVX_128bit);  // packed right shift >> 7
    // xor the shifted versions
    vpxor(xmm9, xmm9, xmm10, Assembler::AVX_128bit);
    vpxor(xmm9, xmm9, tmp2, Assembler::AVX_128bit);
    vpxor(xmm9, xmm9, xmm8, Assembler::AVX_128bit);
    vpxor(tmp0, xmm9, tmp0, Assembler::AVX_128bit);
    // Final result is in state
    vpxor(state, tmp0, tmp1, Assembler::AVX_128bit);

    // Advance past the 8 consumed blocks; loop while >= 8 blocks remain.
    lea(input_data, Address(input_data, 16 * 8));
    cmpl(blocks, 8);
    jcc(Assembler::below, ONE_BLK_INIT);
    jmp(PROCESS_8_BLOCKS);

    // Since this is a one-block operation we will only use H * 2, i.e. the
    // first power of H.
    bind(ONE_BLK_INIT);
    movdqu(tmp0, Address(htbl, 1 * 16));
    movdqu(bswap_mask, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));

    // Do one (128 bit x 128 bit) carry-less multiplication at a time,
    // followed by a reduction.
    bind(PROCESS_1_BLOCK);
    cmpl(blocks, 0);
    jcc(Assembler::equal, SAVE_STATE);
    subl(blocks, 1);
    movdqu(data, Address(input_data, 0));
    vpshufb(data, data, bswap_mask, Assembler::AVX_128bit);
    vpxor(state, state, data, Assembler::AVX_128bit);
    // gfmul(H*2, state)
    call(GFMUL, relocInfo::none);
    addptr(input_data, 16);
    jmp(PROCESS_1_BLOCK);

    // Byte-swap the state back and store it to memory.
    bind(SAVE_STATE);
    vpshufb(state, state, lswap_mask, Assembler::AVX_128bit);
    movdqu(Address(input_state, 0), state);
    jmp(EXIT_GHASH);

    // --- Out-of-line subroutines (each returns via an emitted ret(0)) ---
    bind(GFMUL);
    gfmul(tmp0, state);

    bind(GENERATE_HTBL_1_BLK);
    generateHtbl_one_block(htbl);

    bind(GENERATE_HTBL_8_BLKS);
    generateHtbl_eight_blocks(htbl);

    bind(EXIT_GHASH);
    // zero out xmm registers used for Htbl storage
    vpxor(xmm0, xmm0, xmm0, Assembler::AVX_128bit);
    vpxor(xmm1, xmm1, xmm1, Assembler::AVX_128bit);
    vpxor(xmm3, xmm3, xmm3, Assembler::AVX_128bit);
    vpxor(xmm15, xmm15, xmm15, Assembler::AVX_128bit);
}
e10a1f7aaa13
8215354: x86_32 build failures after JDK-8214074 (Ghash optimization using AVX instructions)
shade
parents:
52990
diff
changeset
|
324 |
#endif // _LP64 |