|
/*
 * Copyright (c) 2016, Intel Corporation.
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "runtime/stubRoutines.hpp"
#include "macroAssembler_x86.hpp"

// ofs and limit are used for multi-block byte array.
// int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
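//
// A note on the SHA-NI flow below (informational summary, not text from the
// original sources): abcd carries the SHA-1 working state {a,b,c,d}, while e
// travels through e0/e1. Each sha1rnds4 performs four SHA-1 rounds; its
// immediate selects the round function and constant for the current group of
// 20 rounds:
//   imm 0: f = Ch(b,c,d),     K = 0x5a827999 (rounds  0-19)
//   imm 1: f = Parity(b,c,d), K = 0x6ed9eba1 (rounds 20-39)
//   imm 2: f = Maj(b,c,d),    K = 0x8f1bbcdc (rounds 40-59)
//   imm 3: f = Parity(b,c,d), K = 0xca62c1d6 (rounds 60-79)
// sha1msg1/sha1msg2 (plus one pxor) implement the message schedule
// w[t] = (w[t-3] ^ w[t-8] ^ w[t-14] ^ w[t-16]) rol 1, four lanes at a time.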
|
void MacroAssembler::fast_sha1(XMMRegister abcd, XMMRegister e0, XMMRegister e1, XMMRegister msg0,
  XMMRegister msg1, XMMRegister msg2, XMMRegister msg3, XMMRegister shuf_mask,
  Register buf, Register state, Register ofs, Register limit, Register rsp, bool multi_block) {

  Label start, done_hash, loop0;

  address upper_word_mask = StubRoutines::x86::upper_word_mask_addr();
  address shuffle_byte_flip_mask = StubRoutines::x86::shuffle_byte_flip_mask_addr();

  bind(start);
  movdqu(abcd, Address(state, 0));
  pinsrd(e0, Address(state, 16), 3);
  movdqu(shuf_mask, ExternalAddress(upper_word_mask)); // 0xFFFFFFFF000000000000000000000000
  pand(e0, shuf_mask);
  pshufd(abcd, abcd, 0x1B);
  movdqu(shuf_mask, ExternalAddress(shuffle_byte_flip_mask)); //0x000102030405060708090a0b0c0d0e0f

  bind(loop0);
  // Save hash values for addition after rounds
  movdqu(Address(rsp, 0), e0);
  movdqu(Address(rsp, 16), abcd);


  // Rounds 0 - 3
  movdqu(msg0, Address(buf, 0));
  pshufb(msg0, shuf_mask);
  paddd(e0, msg0);
  movdqa(e1, abcd);
  sha1rnds4(abcd, e0, 0);

  // Rounds 4 - 7
  movdqu(msg1, Address(buf, 16));
  pshufb(msg1, shuf_mask);
  sha1nexte(e1, msg1);
  movdqa(e0, abcd);
  sha1rnds4(abcd, e1, 0);
  sha1msg1(msg0, msg1);

  // Rounds 8 - 11
  movdqu(msg2, Address(buf, 32));
  pshufb(msg2, shuf_mask);
  sha1nexte(e0, msg2);
  movdqa(e1, abcd);
  sha1rnds4(abcd, e0, 0);
  sha1msg1(msg1, msg2);
  pxor(msg0, msg2);

  // Rounds 12 - 15
  movdqu(msg3, Address(buf, 48));
  pshufb(msg3, shuf_mask);
  sha1nexte(e1, msg3);
  movdqa(e0, abcd);
  sha1msg2(msg0, msg3);
  sha1rnds4(abcd, e1, 0);
  sha1msg1(msg2, msg3);
  pxor(msg1, msg3);

  // Rounds 16 - 19
  sha1nexte(e0, msg0);
  movdqa(e1, abcd);
  sha1msg2(msg1, msg0);
  sha1rnds4(abcd, e0, 0);
  sha1msg1(msg3, msg0);
  pxor(msg2, msg0);

  // Rounds 20 - 23
  sha1nexte(e1, msg1);
  movdqa(e0, abcd);
  sha1msg2(msg2, msg1);
  sha1rnds4(abcd, e1, 1);
  sha1msg1(msg0, msg1);
  pxor(msg3, msg1);

  // Rounds 24 - 27
  sha1nexte(e0, msg2);
  movdqa(e1, abcd);
  sha1msg2(msg3, msg2);
  sha1rnds4(abcd, e0, 1);
  sha1msg1(msg1, msg2);
  pxor(msg0, msg2);

  // Rounds 28 - 31
  sha1nexte(e1, msg3);
  movdqa(e0, abcd);
  sha1msg2(msg0, msg3);
  sha1rnds4(abcd, e1, 1);
  sha1msg1(msg2, msg3);
  pxor(msg1, msg3);

  // Rounds 32 - 35
  sha1nexte(e0, msg0);
  movdqa(e1, abcd);
  sha1msg2(msg1, msg0);
  sha1rnds4(abcd, e0, 1);
  sha1msg1(msg3, msg0);
  pxor(msg2, msg0);

  // Rounds 36 - 39
  sha1nexte(e1, msg1);
  movdqa(e0, abcd);
  sha1msg2(msg2, msg1);
  sha1rnds4(abcd, e1, 1);
  sha1msg1(msg0, msg1);
  pxor(msg3, msg1);

  // Rounds 40 - 43
  sha1nexte(e0, msg2);
  movdqa(e1, abcd);
  sha1msg2(msg3, msg2);
  sha1rnds4(abcd, e0, 2);
  sha1msg1(msg1, msg2);
  pxor(msg0, msg2);

  // Rounds 44 - 47
  sha1nexte(e1, msg3);
  movdqa(e0, abcd);
  sha1msg2(msg0, msg3);
  sha1rnds4(abcd, e1, 2);
  sha1msg1(msg2, msg3);
  pxor(msg1, msg3);

  // Rounds 48 - 51
  sha1nexte(e0, msg0);
  movdqa(e1, abcd);
  sha1msg2(msg1, msg0);
  sha1rnds4(abcd, e0, 2);
  sha1msg1(msg3, msg0);
  pxor(msg2, msg0);

  // Rounds 52 - 55
  sha1nexte(e1, msg1);
  movdqa(e0, abcd);
  sha1msg2(msg2, msg1);
  sha1rnds4(abcd, e1, 2);
  sha1msg1(msg0, msg1);
  pxor(msg3, msg1);

  // Rounds 56 - 59
  sha1nexte(e0, msg2);
  movdqa(e1, abcd);
  sha1msg2(msg3, msg2);
  sha1rnds4(abcd, e0, 2);
  sha1msg1(msg1, msg2);
  pxor(msg0, msg2);

  // Rounds 60 - 63
  sha1nexte(e1, msg3);
  movdqa(e0, abcd);
  sha1msg2(msg0, msg3);
  sha1rnds4(abcd, e1, 3);
  sha1msg1(msg2, msg3);
  pxor(msg1, msg3);

  // Rounds 64 - 67
  sha1nexte(e0, msg0);
  movdqa(e1, abcd);
  sha1msg2(msg1, msg0);
  sha1rnds4(abcd, e0, 3);
  sha1msg1(msg3, msg0);
  pxor(msg2, msg0);

  // Rounds 68 - 71
  sha1nexte(e1, msg1);
  movdqa(e0, abcd);
  sha1msg2(msg2, msg1);
  sha1rnds4(abcd, e1, 3);
  pxor(msg3, msg1);

  // Rounds 72 - 75
  sha1nexte(e0, msg2);
  movdqa(e1, abcd);
  sha1msg2(msg3, msg2);
  sha1rnds4(abcd, e0, 3);

  // Rounds 76 - 79
  sha1nexte(e1, msg3);
  movdqa(e0, abcd);
  sha1rnds4(abcd, e1, 3);
|
  // add the current hash values to the previously saved ones
  movdqu(msg0, Address(rsp, 0));
  sha1nexte(e0, msg0);
  movdqu(msg0, Address(rsp, 16));
  paddd(abcd, msg0);

  if (multi_block) {
    // increment data pointer and loop if more to process
    addptr(buf, 64);
    addptr(ofs, 64);
    cmpptr(ofs, limit);
    jcc(Assembler::belowEqual, loop0);
    movptr(rax, ofs); //return ofs
  }
  // write hash values back in the correct order
  pshufd(abcd, abcd, 0x1b);
  movdqu(Address(state, 0), abcd);
  pextrd(Address(state, 16), e0, 3);

  bind(done_hash);

}
|
// xmm0 (msg) is used as an implicit argument to sha256rnds2,
// so state0 and state1 can never use the xmm0 register.
// ofs and limit are used for multi-block byte array.
// int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
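//
// Layout note (informational, not from the original sources): sha256rnds2
// expects the working variables packed as state0 = {ABEF} and state1 = {CDGH};
// the pshufd/palignr/pblendw sequence below converts the natural {ABCD}/{EFGH}
// memory order into that form on entry and back again on exit. Each
// sha256rnds2 performs two rounds and consumes two w+k words from the
// implicit xmm0, which is why every four-round group does paddd(msg, k),
// sha256rnds2, pshufd(msg, msg, 0x0E) (move the upper two words down),
// sha256rnds2.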
|
#ifdef _LP64
void MacroAssembler::fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
  XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
  Register buf, Register state, Register ofs, Register limit, Register rsp,
  bool multi_block, XMMRegister shuf_mask) {
#else
void MacroAssembler::fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
  XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
  Register buf, Register state, Register ofs, Register limit, Register rsp,
  bool multi_block) {
#endif
|
  Label start, done_hash, loop0;

  address K256 = StubRoutines::x86::k256_addr();
  address pshuffle_byte_flip_mask = StubRoutines::x86::pshuffle_byte_flip_mask_addr();

  bind(start);
  movdqu(state0, Address(state, 0));
  movdqu(state1, Address(state, 16));

  pshufd(state0, state0, 0xB1);
  pshufd(state1, state1, 0x1B);
  movdqa(msgtmp4, state0);
  palignr(state0, state1, 8);
  pblendw(state1, msgtmp4, 0xF0);

#ifdef _LP64
  movdqu(shuf_mask, ExternalAddress(pshuffle_byte_flip_mask));
#endif
  lea(rax, ExternalAddress(K256));

  bind(loop0);
  movdqu(Address(rsp, 0), state0);
  movdqu(Address(rsp, 16), state1);

  // Rounds 0-3
  movdqu(msg, Address(buf, 0));
#ifdef _LP64
  pshufb(msg, shuf_mask);
#else
  pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
#endif
  movdqa(msgtmp0, msg);
  paddd(msg, Address(rax, 0));
  sha256rnds2(state1, state0);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);

  // Rounds 4-7
  movdqu(msg, Address(buf, 16));
#ifdef _LP64
  pshufb(msg, shuf_mask);
#else
  pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
#endif
  movdqa(msgtmp1, msg);
  paddd(msg, Address(rax, 16));
  sha256rnds2(state1, state0);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp0, msgtmp1);

  // Rounds 8-11
  movdqu(msg, Address(buf, 32));
#ifdef _LP64
  pshufb(msg, shuf_mask);
#else
  pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
#endif
  movdqa(msgtmp2, msg);
  paddd(msg, Address(rax, 32));
  sha256rnds2(state1, state0);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp1, msgtmp2);

  // Rounds 12-15
  movdqu(msg, Address(buf, 48));
#ifdef _LP64
  pshufb(msg, shuf_mask);
#else
  pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
#endif
  movdqa(msgtmp3, msg);
  paddd(msg, Address(rax, 48));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp3);
  palignr(msgtmp4, msgtmp2, 4);
  paddd(msgtmp0, msgtmp4);
  sha256msg2(msgtmp0, msgtmp3);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp2, msgtmp3);

  // Rounds 16-19
  movdqa(msg, msgtmp0);
  paddd(msg, Address(rax, 64));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp0);
  palignr(msgtmp4, msgtmp3, 4);
  paddd(msgtmp1, msgtmp4);
  sha256msg2(msgtmp1, msgtmp0);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp3, msgtmp0);

  // Rounds 20-23
  movdqa(msg, msgtmp1);
  paddd(msg, Address(rax, 80));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp1);
  palignr(msgtmp4, msgtmp0, 4);
  paddd(msgtmp2, msgtmp4);
  sha256msg2(msgtmp2, msgtmp1);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp0, msgtmp1);

  // Rounds 24-27
  movdqa(msg, msgtmp2);
  paddd(msg, Address(rax, 96));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp2);
  palignr(msgtmp4, msgtmp1, 4);
  paddd(msgtmp3, msgtmp4);
  sha256msg2(msgtmp3, msgtmp2);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp1, msgtmp2);

  // Rounds 28-31
  movdqa(msg, msgtmp3);
  paddd(msg, Address(rax, 112));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp3);
  palignr(msgtmp4, msgtmp2, 4);
  paddd(msgtmp0, msgtmp4);
  sha256msg2(msgtmp0, msgtmp3);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp2, msgtmp3);

  // Rounds 32-35
  movdqa(msg, msgtmp0);
  paddd(msg, Address(rax, 128));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp0);
  palignr(msgtmp4, msgtmp3, 4);
  paddd(msgtmp1, msgtmp4);
  sha256msg2(msgtmp1, msgtmp0);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp3, msgtmp0);

  // Rounds 36-39
  movdqa(msg, msgtmp1);
  paddd(msg, Address(rax, 144));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp1);
  palignr(msgtmp4, msgtmp0, 4);
  paddd(msgtmp2, msgtmp4);
  sha256msg2(msgtmp2, msgtmp1);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp0, msgtmp1);

  // Rounds 40-43
  movdqa(msg, msgtmp2);
  paddd(msg, Address(rax, 160));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp2);
  palignr(msgtmp4, msgtmp1, 4);
  paddd(msgtmp3, msgtmp4);
  sha256msg2(msgtmp3, msgtmp2);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp1, msgtmp2);

  // Rounds 44-47
  movdqa(msg, msgtmp3);
  paddd(msg, Address(rax, 176));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp3);
  palignr(msgtmp4, msgtmp2, 4);
  paddd(msgtmp0, msgtmp4);
  sha256msg2(msgtmp0, msgtmp3);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp2, msgtmp3);

  // Rounds 48-51
  movdqa(msg, msgtmp0);
  paddd(msg, Address(rax, 192));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp0);
  palignr(msgtmp4, msgtmp3, 4);
  paddd(msgtmp1, msgtmp4);
  sha256msg2(msgtmp1, msgtmp0);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  sha256msg1(msgtmp3, msgtmp0);

  // Rounds 52-55
  movdqa(msg, msgtmp1);
  paddd(msg, Address(rax, 208));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp1);
  palignr(msgtmp4, msgtmp0, 4);
  paddd(msgtmp2, msgtmp4);
  sha256msg2(msgtmp2, msgtmp1);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);

  // Rounds 56-59
  movdqa(msg, msgtmp2);
  paddd(msg, Address(rax, 224));
  sha256rnds2(state1, state0);
  movdqa(msgtmp4, msgtmp2);
  palignr(msgtmp4, msgtmp1, 4);
  paddd(msgtmp3, msgtmp4);
  sha256msg2(msgtmp3, msgtmp2);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);

  // Rounds 60-63
  movdqa(msg, msgtmp3);
  paddd(msg, Address(rax, 240));
  sha256rnds2(state1, state0);
  pshufd(msg, msg, 0x0E);
  sha256rnds2(state0, state1);
  movdqu(msg, Address(rsp, 0));
  paddd(state0, msg);
  movdqu(msg, Address(rsp, 16));
  paddd(state1, msg);

  if (multi_block) {
    // increment data pointer and loop if more to process
    addptr(buf, 64);
    addptr(ofs, 64);
    cmpptr(ofs, limit);
    jcc(Assembler::belowEqual, loop0);
    movptr(rax, ofs); //return ofs
  }

  pshufd(state0, state0, 0x1B);
  pshufd(state1, state1, 0xB1);
  movdqa(msgtmp4, state0);
  pblendw(state0, state1, 0xF0);
  palignr(state1, msgtmp4, 8);

  movdqu(Address(state, 0), state0);
  movdqu(Address(state, 16), state1);

  bind(done_hash);

}
|
#ifdef _LP64
/*
  The algorithm below is based on the Intel publication:
  "Fast SHA-256 Implementations on Intel® Architecture Processors" by Jim Guilford, Kirk Yap and Vinodh Gopal.
  The assembly code was originally provided by Sean Gulley and in many places preserves
  the original assembly NAMES and comments to simplify matching the Java assembly with its original.
  The Java version was substantially redesigned to replace 1200 assembly instructions with
  a much shorter run-time generator of the same code in memory.
*/
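// For reference while reading the round comments below (a summary of the
// standard SHA-256 round, not text from the original sources):
//   S1  = (e ror 6) ^ (e ror 11) ^ (e ror 25)
//   CH  = (e & f) ^ (~e & g)            -- computed here as ((f ^ g) & e) ^ g
//   S0  = (a ror 2) ^ (a ror 13) ^ (a ror 22)
//   MAJ = (a & b) ^ (a & c) ^ (b & c)   -- computed here as ((a | c) & b) | (a & c)
//   t1  = h + S1 + CH + k[i] + w[i]
//   t2  = S0 + MAJ
//   h = g; g = f; f = e; e = d + t1; d = c; c = b; b = a; a = t1 + t2
// Comments such as "reg_e >> 25" beside rorxd denote 32-bit rotates, not shifts.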
|
void MacroAssembler::sha256_AVX2_one_round_compute(
    Register reg_old_h,
    Register reg_a,
    Register reg_b,
    Register reg_c,
    Register reg_d,
    Register reg_e,
    Register reg_f,
    Register reg_g,
    Register reg_h,
    int iter) {
  const Register& reg_y0 = r13;
  const Register& reg_y1 = r14;
  const Register& reg_y2 = r15;
  const Register& reg_y3 = rcx;
  const Register& reg_T1 = r12;
  //;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND iter ;;;;;;;;;;;;;;;;;;;;;;;;;;;
  if (iter%4 > 0) {
    addl(reg_old_h, reg_y2);   // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; --
  }
  movl(reg_y2, reg_f);         // reg_y2 = reg_f; CH
  rorxd(reg_y0, reg_e, 25);    // reg_y0 = reg_e >> 25; S1A
  rorxd(reg_y1, reg_e, 11);    // reg_y1 = reg_e >> 11; S1B
  xorl(reg_y2, reg_g);         // reg_y2 = reg_f^reg_g; CH

  xorl(reg_y0, reg_y1);        // reg_y0 = (reg_e>>25) ^ (reg_e>>11); S1
  rorxd(reg_y1, reg_e, 6);     // reg_y1 = (reg_e >> 6); S1
  andl(reg_y2, reg_e);         // reg_y2 = (reg_f^reg_g)&reg_e; CH

  if (iter%4 > 0) {
    addl(reg_old_h, reg_y3);   // reg_h = t1 + S0 + MAJ; --
  }

  xorl(reg_y0, reg_y1);        // reg_y0 = (reg_e>>25) ^ (reg_e>>11) ^ (reg_e>>6); S1
  rorxd(reg_T1, reg_a, 13);    // reg_T1 = reg_a >> 13; S0B
  xorl(reg_y2, reg_g);         // reg_y2 = CH = ((reg_f^reg_g)&reg_e)^reg_g; CH
  rorxd(reg_y1, reg_a, 22);    // reg_y1 = reg_a >> 22; S0A
  movl(reg_y3, reg_a);         // reg_y3 = reg_a; MAJA

  xorl(reg_y1, reg_T1);        // reg_y1 = (reg_a>>22) ^ (reg_a>>13); S0
  rorxd(reg_T1, reg_a, 2);     // reg_T1 = (reg_a >> 2); S0
  addl(reg_h, Address(rsp, rdx, Address::times_1, 4*iter)); // reg_h = k + w + reg_h; --
  orl(reg_y3, reg_c);          // reg_y3 = reg_a|reg_c; MAJA

  xorl(reg_y1, reg_T1);        // reg_y1 = (reg_a>>22) ^ (reg_a>>13) ^ (reg_a>>2); S0
  movl(reg_T1, reg_a);         // reg_T1 = reg_a; MAJB
  andl(reg_y3, reg_b);         // reg_y3 = (reg_a|reg_c)&reg_b; MAJA
  andl(reg_T1, reg_c);         // reg_T1 = reg_a&reg_c; MAJB
  addl(reg_y2, reg_y0);        // reg_y2 = S1 + CH; --


  addl(reg_d, reg_h);          // reg_d = k + w + reg_h + reg_d; --
  orl(reg_y3, reg_T1);         // reg_y3 = MAJ = ((reg_a|reg_c)&reg_b)|(reg_a&reg_c); MAJ
  addl(reg_h, reg_y1);         // reg_h = k + w + reg_h + S0; --

  addl(reg_d, reg_y2);         // reg_d = k + w + reg_h + reg_d + S1 + CH = reg_d + t1; --


  if (iter%4 == 3) {
    addl(reg_h, reg_y2);       // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; --
    addl(reg_h, reg_y3);       // reg_h = t1 + S0 + MAJ; --
  }
}
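// Note (informational): within each four-round group, the final two additions
// of a round (h += S1 + CH and h += MAJ) are deferred into the next round's
// "if (iter%4 > 0)" prologue and are only completed in place at iter%4 == 3;
// this shortens the critical dependency chain between consecutive rounds.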
|
void MacroAssembler::sha256_AVX2_four_rounds_compute_first(int start) {
  sha256_AVX2_one_round_compute(rax, rax, rbx, rdi, rsi, r8,  r9,  r10, r11, start + 0);
  sha256_AVX2_one_round_compute(r11, r11, rax, rbx, rdi, rsi, r8,  r9,  r10, start + 1);
  sha256_AVX2_one_round_compute(r10, r10, r11, rax, rbx, rdi, rsi, r8,  r9,  start + 2);
  sha256_AVX2_one_round_compute(r9,  r9,  r10, r11, rax, rbx, rdi, rsi, r8,  start + 3);
}

void MacroAssembler::sha256_AVX2_four_rounds_compute_last(int start) {
  sha256_AVX2_one_round_compute(r8,  r8,  r9,  r10, r11, rax, rbx, rdi, rsi, start + 0);
  sha256_AVX2_one_round_compute(rsi, rsi, r8,  r9,  r10, r11, rax, rbx, rdi, start + 1);
  sha256_AVX2_one_round_compute(rdi, rdi, rsi, r8,  r9,  r10, r11, rax, rbx, start + 2);
  sha256_AVX2_one_round_compute(rbx, rbx, rdi, rsi, r8,  r9,  r10, r11, rax, start + 3);
}
|
void MacroAssembler::sha256_AVX2_one_round_and_sched(
    XMMRegister xmm_0,  /* == ymm4 on 0, 1, 2, 3 iterations, then rotate 4 registers left on 4, 8, 12 iterations */
    XMMRegister xmm_1,  /* ymm5 */  /* full cycle is 16 iterations */
    XMMRegister xmm_2,  /* ymm6 */
    XMMRegister xmm_3,  /* ymm7 */
    Register reg_a,     /* == rax on 0 iteration, then rotate 8 registers right on each next iteration */
    Register reg_b,     /* rbx */   /* full cycle is 8 iterations */
    Register reg_c,     /* rdi */
    Register reg_d,     /* rsi */
    Register reg_e,     /* r8 */
    Register reg_f,     /* r9d */
    Register reg_g,     /* r10d */
    Register reg_h,     /* r11d */
    int iter)
{
  movl(rcx, reg_a);          // rcx = reg_a; MAJA
  rorxd(r13, reg_e, 25);     // r13 = reg_e >> 25; S1A
  rorxd(r14, reg_e, 11);     // r14 = reg_e >> 11; S1B
  addl(reg_h, Address(rsp, rdx, Address::times_1, 4*iter));
  orl(rcx, reg_c);           // rcx = reg_a|reg_c; MAJA

  movl(r15, reg_f);          // r15 = reg_f; CH
  rorxd(r12, reg_a, 13);     // r12 = reg_a >> 13; S0B
  xorl(r13, r14);            // r13 = (reg_e>>25) ^ (reg_e>>11); S1
  xorl(r15, reg_g);          // r15 = reg_f^reg_g; CH

  rorxd(r14, reg_e, 6);      // r14 = (reg_e >> 6); S1
  andl(r15, reg_e);          // r15 = (reg_f^reg_g)&reg_e; CH

  xorl(r13, r14);            // r13 = (reg_e>>25) ^ (reg_e>>11) ^ (reg_e>>6); S1
  rorxd(r14, reg_a, 22);     // r14 = reg_a >> 22; S0A
  addl(reg_d, reg_h);        // reg_d = k + w + reg_h + reg_d; --

  andl(rcx, reg_b);          // rcx = (reg_a|reg_c)&reg_b; MAJA
  xorl(r14, r12);            // r14 = (reg_a>>22) ^ (reg_a>>13); S0

  rorxd(r12, reg_a, 2);      // r12 = (reg_a >> 2); S0
  xorl(r15, reg_g);          // r15 = CH = ((reg_f^reg_g)&reg_e)^reg_g; CH

  xorl(r14, r12);            // r14 = (reg_a>>22) ^ (reg_a>>13) ^ (reg_a>>2); S0
  movl(r12, reg_a);          // r12 = reg_a; MAJB
  andl(r12, reg_c);          // r12 = reg_a&reg_c; MAJB
  addl(r15, r13);            // r15 = S1 + CH; --

  orl(rcx, r12);             // rcx = MAJ = ((reg_a|reg_c)&reg_b)|(reg_a&reg_c); MAJ
  addl(reg_h, r14);          // reg_h = k + w + reg_h + S0; --
  addl(reg_d, r15);          // reg_d = k + w + reg_h + reg_d + S1 + CH = reg_d + t1; --

  addl(reg_h, r15);          // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; --
  addl(reg_h, rcx);          // reg_h = t1 + S0 + MAJ; --

  if (iter%4 == 0) {
    vpalignr(xmm0, xmm_3, xmm_2, 4, AVX_256bit);   // ymm0 = W[-7]
    vpaddd(xmm0, xmm0, xmm_0, AVX_256bit);         // ymm0 = W[-7] + W[-16]; y1 = (e >> 6); S1
    vpalignr(xmm1, xmm_1, xmm_0, 4, AVX_256bit);   // ymm1 = W[-15]
    vpsrld(xmm2, xmm1, 7, AVX_256bit);
    vpslld(xmm3, xmm1, 32-7, AVX_256bit);
    vpor(xmm3, xmm3, xmm2, AVX_256bit);            // ymm3 = W[-15] ror 7
    vpsrld(xmm2, xmm1, 18, AVX_256bit);
  } else if (iter%4 == 1) {
    vpsrld(xmm8, xmm1, 3, AVX_256bit);             // ymm8 = W[-15] >> 3
    vpslld(xmm1, xmm1, 32-18, AVX_256bit);
    vpxor(xmm3, xmm3, xmm1, AVX_256bit);
    vpxor(xmm3, xmm3, xmm2, AVX_256bit);           // ymm3 = W[-15] ror 7 ^ W[-15] ror 18
    vpxor(xmm1, xmm3, xmm8, AVX_256bit);           // ymm1 = s0
    vpshufd(xmm2, xmm_3, 0xFA, AVX_256bit);        // 11111010b; ymm2 = W[-2] {BBAA}
    vpaddd(xmm0, xmm0, xmm1, AVX_256bit);          // ymm0 = W[-16] + W[-7] + s0
    vpsrld(xmm8, xmm2, 10, AVX_256bit);            // ymm8 = W[-2] >> 10 {BBAA}
  } else if (iter%4 == 2) {
    vpsrlq(xmm3, xmm2, 19, AVX_256bit);            // ymm3 = W[-2] ror 19 {xBxA}
    vpsrlq(xmm2, xmm2, 17, AVX_256bit);            // ymm2 = W[-2] ror 17 {xBxA}
    vpxor(xmm2, xmm2, xmm3, AVX_256bit);
    vpxor(xmm8, xmm8, xmm2, AVX_256bit);           // ymm8 = s1 {xBxA}
    vpshufb(xmm8, xmm8, xmm10, AVX_256bit);        // ymm8 = s1 {00BA}
    vpaddd(xmm0, xmm0, xmm8, AVX_256bit);          // ymm0 = {..., ..., W[1], W[0]}
    vpshufd(xmm2, xmm0, 0x50, AVX_256bit);         // 01010000b; ymm2 = W[-2] {DDCC}
  } else if (iter%4 == 3) {
    vpsrld(xmm11, xmm2, 10, AVX_256bit);           // ymm11 = W[-2] >> 10 {DDCC}
    vpsrlq(xmm3, xmm2, 19, AVX_256bit);            // ymm3 = W[-2] ror 19 {xDxC}
    vpsrlq(xmm2, xmm2, 17, AVX_256bit);            // ymm2 = W[-2] ror 17 {xDxC}
    vpxor(xmm2, xmm2, xmm3, AVX_256bit);
    vpxor(xmm11, xmm11, xmm2, AVX_256bit);         // ymm11 = s1 {xDxC}
    vpshufb(xmm11, xmm11, xmm12, AVX_256bit);      // ymm11 = s1 {DC00}
    vpaddd(xmm_0, xmm11, xmm0, AVX_256bit);        // xmm_0 = {W[3], W[2], W[1], W[0]}
  }
}
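// addm: 32-bit accumulate-into-memory, i.e. [r1 + disp] += r2 (the sum is
// also left in r2); addmq is the 64-bit variant. Used below to fold the
// working variables back into the digest state.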
|
void MacroAssembler::addm(int disp, Register r1, Register r2) {
  addl(r2, Address(r1, disp));
  movl(Address(r1, disp), r2);
}

void MacroAssembler::addmq(int disp, Register r1, Register r2) {
  addq(r2, Address(r1, disp));
  movq(Address(r1, disp), r2);
}
|
void MacroAssembler::sha256_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
  XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
  Register buf, Register state, Register ofs, Register limit, Register rsp,
  bool multi_block, XMMRegister shuf_mask) {

  Label loop0, loop1, loop2, loop3,
        last_block_enter, do_last_block, only_one_block, done_hash,
        compute_size, compute_size_end,
        compute_size1, compute_size_end1;

  address K256_W = StubRoutines::x86::k256_W_addr();
  address pshuffle_byte_flip_mask = StubRoutines::x86::pshuffle_byte_flip_mask_addr();
  address pshuffle_byte_flip_mask_addr = 0;

  const XMMRegister& SHUF_00BA        = xmm10;    // ymm10: shuffle xBxA -> 00BA
  const XMMRegister& SHUF_DC00        = xmm12;    // ymm12: shuffle xDxC -> DC00
  const XMMRegister& BYTE_FLIP_MASK   = xmm13;    // ymm13

  const XMMRegister& X_BYTE_FLIP_MASK = xmm13;    // XMM version of BYTE_FLIP_MASK

  const Register& NUM_BLKS = r8;     // 3rd arg
  const Register& CTX      = rdx;    // 2nd arg
  const Register& INP      = rcx;    // 1st arg

  const Register& c  = rdi;
  const Register& d  = rsi;
  const Register& e  = r8;     // clobbers NUM_BLKS
  const Register& y3 = rcx;    // clobbers INP

  const Register& TBL  = rbp;
  const Register& SRND = CTX;  // SRND is same register as CTX

  const Register& a = rax;
  const Register& b = rbx;
  const Register& f = r9;
  const Register& g = r10;
  const Register& h = r11;

  const Register& T1 = r12;
  const Register& y0 = r13;
  const Register& y1 = r14;
  const Register& y2 = r15;


  enum {
    _XFER_SIZE    = 2*64*4, // 2 blocks, 64 rounds, 4 bytes/round
    _INP_END_SIZE = 8,
    _INP_SIZE     = 8,
    _CTX_SIZE     = 8,
    _RSP_SIZE     = 8,

    _XFER      = 0,
    _INP_END   = _XFER    + _XFER_SIZE,
    _INP       = _INP_END + _INP_END_SIZE,
    _CTX       = _INP     + _INP_SIZE,
    _RSP       = _CTX     + _CTX_SIZE,
    STACK_SIZE = _RSP     + _RSP_SIZE
  };
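  // Resulting frame layout (informational): _XFER at [rsp+0] holds 512 bytes
  // of w+k transfer values (2 blocks * 64 rounds * 4 bytes), then _INP_END at
  // 512, _INP at 520, _CTX at 528, and the caller rsp (_RSP) at 536; hence
  // STACK_SIZE = 544, carved out below and aligned down to 32 bytes.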
|
#ifndef _WIN64
  push(rcx);    // linux: this is limit, need it at the end
  push(rdx);    // linux: this is ofs
#else
  push(r8);     // win64: this is ofs
  push(r9);     // win64: this is limit, we need them again at the very end
#endif


  push(rbx);
#ifdef _WIN64
  push(rsi);
  push(rdi);
#endif
  push(rbp);
  push(r12);
  push(r13);
  push(r14);
  push(r15);

  movq(rax, rsp);
  subq(rsp, STACK_SIZE);
  andq(rsp, -32);
  movq(Address(rsp, _RSP), rax);

#ifndef _WIN64
  // copy linux params to win64 params, so the rest of the code is the same for both
  movq(r9,  rcx);
  movq(r8,  rdx);
  movq(rdx, rsi);
  movq(rcx, rdi);
#endif

  // setting original assembly ABI
  /** message to encrypt in INP */
  lea(INP, Address(rcx, 0));    // rcx == message (buf)   ;; linux: INP = buf = rdi
  /** digest in CTX */
  movq(CTX, rdx);               // rdx = digest (state)   ;; linux: CTX = state = rsi

  /** NUM_BLKS is the length of the message; set it from ofs and limit */
  if (multi_block) {

    // Win64: cannot directly update NUM_BLKS, since NUM_BLKS = ofs = r8
    // on entry r8 = ofs
    // on exit  r8 = NUM_BLKS

    xorq(rax, rax);

    bind(compute_size);
    cmpptr(r8, r9);    // assume the original ofs <= limit ;; linux: cmp rcx, rdx
    jccb(Assembler::aboveEqual, compute_size_end);
    addq(r8, 64);      //;; linux: ofs = rdx
    addq(rax, 64);
    jmpb(compute_size);

    bind(compute_size_end);
    movq(NUM_BLKS, rax);    // NUM_BLKS (r8) ;; linux: NUM_BLKS = rdx

    cmpq(NUM_BLKS, 0);
    jcc(Assembler::equal, done_hash);

  } else {
    xorq(NUM_BLKS, NUM_BLKS);
    addq(NUM_BLKS, 64);
  } //if (!multi_block)
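  // NUM_BLKS now holds the number of bytes to hash: 64 * ceil((limit - ofs)/64)
  // in the multi-block case, else exactly 64 (informational summary).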
|
  lea(NUM_BLKS, Address(INP, NUM_BLKS, Address::times_1, -64));    // pointer to the last block
  movq(Address(rsp, _INP_END), NUM_BLKS);

  cmpptr(INP, NUM_BLKS);                    //cmp INP, NUM_BLKS
  jcc(Assembler::equal, only_one_block);    //je only_one_block

  // load initial digest
  movl(a, Address(CTX, 4*0));
  movl(b, Address(CTX, 4*1));
  movl(c, Address(CTX, 4*2));
  movl(d, Address(CTX, 4*3));
  movl(e, Address(CTX, 4*4));
  movl(f, Address(CTX, 4*5));
  // load g - r10 after it is used as scratch
  movl(h, Address(CTX, 4*7));

  pshuffle_byte_flip_mask_addr = pshuffle_byte_flip_mask;
  vmovdqu(BYTE_FLIP_MASK, ExternalAddress(pshuffle_byte_flip_mask_addr + 0));    //[PSHUFFLE_BYTE_FLIP_MASK wrt rip]
  vmovdqu(SHUF_00BA, ExternalAddress(pshuffle_byte_flip_mask_addr + 32));        //[_SHUF_00BA wrt rip]
  vmovdqu(SHUF_DC00, ExternalAddress(pshuffle_byte_flip_mask_addr + 64));        //[_SHUF_DC00 wrt rip]

  movl(g, Address(CTX, 4*6));

  movq(Address(rsp, _CTX), CTX);    // store

  bind(loop0);
  lea(TBL, ExternalAddress(K256_W));

  // assume buffers not aligned

  // Load first 16 dwords from two blocks
  vmovdqu(xmm0, Address(INP, 0*32));
  vmovdqu(xmm1, Address(INP, 1*32));
  vmovdqu(xmm2, Address(INP, 2*32));
  vmovdqu(xmm3, Address(INP, 3*32));

  // byte swap data
  vpshufb(xmm0, xmm0, BYTE_FLIP_MASK, AVX_256bit);
  vpshufb(xmm1, xmm1, BYTE_FLIP_MASK, AVX_256bit);
  vpshufb(xmm2, xmm2, BYTE_FLIP_MASK, AVX_256bit);
  vpshufb(xmm3, xmm3, BYTE_FLIP_MASK, AVX_256bit);

  // transpose data into high/low halves
  vperm2i128(xmm4, xmm0, xmm2, 0x20);
  vperm2i128(xmm5, xmm0, xmm2, 0x31);
  vperm2i128(xmm6, xmm1, xmm3, 0x20);
  vperm2i128(xmm7, xmm1, xmm3, 0x31);
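  // vperm2i128 selector note (informational): imm 0x20 concatenates the low
  // 128-bit lanes of the two sources, imm 0x31 the high lanes; after these
  // permutes ymm4..ymm7 each pair a 16-byte quarter of block 1 with the
  // corresponding quarter of block 2, so one message-schedule pass covers
  // both blocks at once.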
|
  bind(last_block_enter);
  addq(INP, 64);
  movq(Address(rsp, _INP), INP);

  //;; schedule 48 input dwords, by doing 3 rounds of 12 each
  xorq(SRND, SRND);

  align(16);
  bind(loop1);
  vpaddd(xmm9, xmm4, Address(TBL, SRND, Address::times_1, 0*32), AVX_256bit);
  vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 0*32), xmm9);
  sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, rax, rbx, rdi, rsi, r8,  r9,  r10, r11, 0);
  sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r11, rax, rbx, rdi, rsi, r8,  r9,  r10, 1);
  sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r10, r11, rax, rbx, rdi, rsi, r8,  r9,  2);
  sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r9,  r10, r11, rax, rbx, rdi, rsi, r8,  3);

  vpaddd(xmm9, xmm5, Address(TBL, SRND, Address::times_1, 1*32), AVX_256bit);
  vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 1*32), xmm9);
  sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, r8,  r9,  r10, r11, rax, rbx, rdi, rsi, 8+0);
  sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rsi, r8,  r9,  r10, r11, rax, rbx, rdi, 8+1);
  sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rdi, rsi, r8,  r9,  r10, r11, rax, rbx, 8+2);
  sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rbx, rdi, rsi, r8,  r9,  r10, r11, rax, 8+3);

  vpaddd(xmm9, xmm6, Address(TBL, SRND, Address::times_1, 2*32), AVX_256bit);
  vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 2*32), xmm9);
  sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, rax, rbx, rdi, rsi, r8,  r9,  r10, r11, 16+0);
  sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r11, rax, rbx, rdi, rsi, r8,  r9,  r10, 16+1);
  sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r10, r11, rax, rbx, rdi, rsi, r8,  r9,  16+2);
  sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r9,  r10, r11, rax, rbx, rdi, rsi, r8,  16+3);

  vpaddd(xmm9, xmm7, Address(TBL, SRND, Address::times_1, 3*32), AVX_256bit);
  vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 3*32), xmm9);

  sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, r8,  r9,  r10, r11, rax, rbx, rdi, rsi, 24+0);
  sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rsi, r8,  r9,  r10, r11, rax, rbx, rdi, 24+1);
  sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rdi, rsi, r8,  r9,  r10, r11, rax, rbx, 24+2);
  sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rbx, rdi, rsi, r8,  r9,  r10, r11, rax, 24+3);

  addq(SRND, 4*32);
  cmpq(SRND, 3 * 4*32);
  jcc(Assembler::below, loop1);

  bind(loop2);
  // Do last 16 rounds with no scheduling
  vpaddd(xmm9, xmm4, Address(TBL, SRND, Address::times_1, 0*32), AVX_256bit);
  vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 0*32), xmm9);
  sha256_AVX2_four_rounds_compute_first(0);

  vpaddd(xmm9, xmm5, Address(TBL, SRND, Address::times_1, 1*32), AVX_256bit);
  vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 1*32), xmm9);
  sha256_AVX2_four_rounds_compute_last(0 + 8);

  addq(SRND, 2*32);

  vmovdqu(xmm4, xmm6);
  vmovdqu(xmm5, xmm7);

  cmpq(SRND, 4 * 4*32);
  jcc(Assembler::below, loop2);

  movq(CTX, Address(rsp, _CTX));
  movq(INP, Address(rsp, _INP));

  addm(4*0, CTX, a);
  addm(4*1, CTX, b);
  addm(4*2, CTX, c);
  addm(4*3, CTX, d);
  addm(4*4, CTX, e);
  addm(4*5, CTX, f);
  addm(4*6, CTX, g);
  addm(4*7, CTX, h);
|
  cmpq(INP, Address(rsp, _INP_END));
  jcc(Assembler::above, done_hash);

  //Do second block using previously scheduled results
  xorq(SRND, SRND);
  align(16);
  bind(loop3);
  sha256_AVX2_four_rounds_compute_first(4);
  sha256_AVX2_four_rounds_compute_last(4+8);

  addq(SRND, 2*32);
  cmpq(SRND, 4 * 4*32);
  jcc(Assembler::below, loop3);

  movq(CTX, Address(rsp, _CTX));
  movq(INP, Address(rsp, _INP));
  addq(INP, 64);

  addm(4*0, CTX, a);
  addm(4*1, CTX, b);
  addm(4*2, CTX, c);
  addm(4*3, CTX, d);
  addm(4*4, CTX, e);
  addm(4*5, CTX, f);
  addm(4*6, CTX, g);
  addm(4*7, CTX, h);

  cmpq(INP, Address(rsp, _INP_END));
  jcc(Assembler::below, loop0);
  jccb(Assembler::above, done_hash);

  bind(do_last_block);
  lea(TBL, ExternalAddress(K256_W));

  movdqu(xmm4, Address(INP, 0*16));
  movdqu(xmm5, Address(INP, 1*16));
  movdqu(xmm6, Address(INP, 2*16));
  movdqu(xmm7, Address(INP, 3*16));

  vpshufb(xmm4, xmm4, xmm13, AVX_128bit);
  vpshufb(xmm5, xmm5, xmm13, AVX_128bit);
  vpshufb(xmm6, xmm6, xmm13, AVX_128bit);
  vpshufb(xmm7, xmm7, xmm13, AVX_128bit);

  jmp(last_block_enter);

  bind(only_one_block);

  // load initial digest ;; table should be preloaded with following values
  movl(a, Address(CTX, 4*0));    // 0x6a09e667
  movl(b, Address(CTX, 4*1));    // 0xbb67ae85
  movl(c, Address(CTX, 4*2));    // 0x3c6ef372
  movl(d, Address(CTX, 4*3));    // 0xa54ff53a
  movl(e, Address(CTX, 4*4));    // 0x510e527f
  movl(f, Address(CTX, 4*5));    // 0x9b05688c
  // load g - r10 after use as scratch
  movl(h, Address(CTX, 4*7));    // 0x5be0cd19


  pshuffle_byte_flip_mask_addr = pshuffle_byte_flip_mask;
  vmovdqu(BYTE_FLIP_MASK, ExternalAddress(pshuffle_byte_flip_mask_addr + 0));    //[PSHUFFLE_BYTE_FLIP_MASK wrt rip]
  vmovdqu(SHUF_00BA, ExternalAddress(pshuffle_byte_flip_mask_addr + 32));        //[_SHUF_00BA wrt rip]
  vmovdqu(SHUF_DC00, ExternalAddress(pshuffle_byte_flip_mask_addr + 64));        //[_SHUF_DC00 wrt rip]

  movl(g, Address(CTX, 4*6));    // 0x1f83d9ab

  movq(Address(rsp, _CTX), CTX);
  jmpb(do_last_block);
|
996 bind(done_hash); |
|
997 |
|
998 movq(rsp, Address(rsp, _RSP)); |
|
999 |
|
1000 pop(r15); |
|
1001 pop(r14); |
|
1002 pop(r13); |
|
1003 pop(r12); |
|
1004 pop(rbp); |
|
1005 #ifdef _WIN64 |
|
1006 pop(rdi); |
|
1007 pop(rsi); |
|
1008 #endif |
|
1009 pop(rbx); |
|
1010 |
|
1011 #ifdef _WIN64 |
|
1012 pop(r9); |
|
1013 pop(r8); |
|
1014 #else |
|
1015 pop(rdx); |
|
1016 pop(rcx); |
|
1017 #endif |
|
1018 |
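  // Multi-block: recompute the intrinsic's int return value in rax by
  // stepping the original ofs forward by 64 until it reaches limit, mirroring
  // the offset advance performed while hashing (informational summary).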
|
  if (multi_block) {
#ifdef _WIN64
    const Register& limit_end = r9;
    const Register& ofs_end   = r8;
#else
    const Register& limit_end = rcx;
    const Register& ofs_end   = rdx;
#endif
    movq(rax, ofs_end);

    bind(compute_size1);
    cmpptr(rax, limit_end);    // assume the original ofs <= limit
    jccb(Assembler::aboveEqual, compute_size_end1);
    addq(rax, 64);
    jmpb(compute_size1);

    bind(compute_size_end1);
  }
}
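// Reference for the round comments below (standard SHA-512 round, not text
// from the original sources); all rotates are 64-bit:
//   S1  = (e ror 14) ^ (e ror 18) ^ (e ror 41)
//   CH  = (e & f) ^ (~e & g)            -- computed as ((f ^ g) & e) ^ g
//   S0  = (a ror 28) ^ (a ror 34) ^ (a ror 39)
//   MAJ = (a & b) ^ (a & c) ^ (b & c)   -- computed as ((a | c) & b) | (a & c)
// and the message schedule uses s0 = (w ror 1) ^ (w ror 8) ^ (w >> 7),
// s1 = (w ror 19) ^ (w ror 61) ^ (w >> 6).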
|
void MacroAssembler::sha512_AVX2_one_round_compute(Register old_h, Register a, Register b, Register c,
                                                   Register d, Register e, Register f, Register g, Register h,
                                                   int iteration)
{

  const Register& y0 = r13;
  const Register& y1 = r14;
  const Register& y2 = r15;
#ifdef _WIN64
  const Register& y3 = rcx;
#else
  const Register& y3 = rdi;
#endif
  const Register& T1 = r12;

  if (iteration % 4 > 0) {
    addq(old_h, y2);   //h = k + w + h + S0 + S1 + CH = t1 + S0;
  }
  movq(y2, f);         //y2 = f; CH
  rorxq(y0, e, 41);    //y0 = e >> 41; S1A
  rorxq(y1, e, 18);    //y1 = e >> 18; S1B
  xorq(y2, g);         //y2 = f^g; CH

  xorq(y0, y1);        //y0 = (e >> 41) ^ (e >> 18); S1
  rorxq(y1, e, 14);    //y1 = (e >> 14); S1
  andq(y2, e);         //y2 = (f^g)&e; CH

  if (iteration % 4 > 0) {
    addq(old_h, y3);   //h = t1 + S0 + MAJ
  }
  xorq(y0, y1);        //y0 = (e >> 41) ^ (e >> 18) ^ (e >> 14); S1
  rorxq(T1, a, 34);    //T1 = a >> 34; S0B
  xorq(y2, g);         //y2 = CH = ((f^g)&e)^g; CH
  rorxq(y1, a, 39);    //y1 = a >> 39; S0A
  movq(y3, a);         //y3 = a; MAJA

  xorq(y1, T1);        //y1 = (a >> 39) ^ (a >> 34); S0
  rorxq(T1, a, 28);    //T1 = (a >> 28); S0
  addq(h, Address(rsp, (8 * iteration)));    //h = k + w + h; --
  orq(y3, c);          //y3 = a | c; MAJA

  xorq(y1, T1);        //y1 = (a >> 39) ^ (a >> 34) ^ (a >> 28); S0
  movq(T1, a);         //T1 = a; MAJB
  andq(y3, b);         //y3 = (a | c)&b; MAJA
  andq(T1, c);         //T1 = a&c; MAJB
  addq(y2, y0);        //y2 = S1 + CH; --

  addq(d, h);          //d = k + w + h + d; --
  orq(y3, T1);         //y3 = MAJ = ((a | c)&b) | (a&c); MAJ
  addq(h, y1);         //h = k + w + h + S0; --

  addq(d, y2);         //d = k + w + h + d + S1 + CH = d + t1; --

  if (iteration % 4 == 3) {
    addq(h, y2);       //h = k + w + h + S0 + S1 + CH = t1 + S0; --
    addq(h, y3);       //h = t1 + S0 + MAJ; --
  }
}
|
void MacroAssembler::sha512_AVX2_one_round_and_schedule(
    XMMRegister xmm4, // ymm4
    XMMRegister xmm5, // ymm5
    XMMRegister xmm6, // ymm6
    XMMRegister xmm7, // ymm7
    Register a, //rax
    Register b, //rbx
    Register c, //rdi
    Register d, //rsi
    Register e, //r8
    Register f, //r9
    Register g, //r10
    Register h, //r11
    int iteration)
{

  const Register& y0 = r13;
  const Register& y1 = r14;
  const Register& y2 = r15;
#ifdef _WIN64
  const Register& y3 = rcx;
#else
  const Register& y3 = rdi;
#endif
  const Register& T1 = r12;

  if (iteration % 4 == 0) {
    // Extract w[t - 7]
    // xmm0 = W[-7]
    vperm2f128(xmm0, xmm7, xmm6, 3);
    vpalignr(xmm0, xmm0, xmm6, 8, AVX_256bit);

    // Calculate w[t - 16] + w[t - 7]
    vpaddq(xmm0, xmm0, xmm4, AVX_256bit);    //xmm0 = W[-7] + W[-16]
    // Extract w[t - 15]
    //xmm1 = W[-15]
    vperm2f128(xmm1, xmm5, xmm4, 3);
    vpalignr(xmm1, xmm1, xmm4, 8, AVX_256bit);

    // Calculate sigma0
    // Calculate w[t - 15] ror 1
    vpsrlq(xmm2, xmm1, 1, AVX_256bit);
    vpsllq(xmm3, xmm1, (64 - 1), AVX_256bit);
    vpor(xmm3, xmm3, xmm2, AVX_256bit);      //xmm3 = W[-15] ror 1
    // Calculate w[t - 15] shr 7
    vpsrlq(xmm8, xmm1, 7, AVX_256bit);       //xmm8 = W[-15] >> 7

  } else if (iteration % 4 == 1) {
    //Calculate w[t - 15] ror 8
    vpsrlq(xmm2, xmm1, 8, AVX_256bit);
    vpsllq(xmm1, xmm1, (64 - 8), AVX_256bit);
    vpor(xmm1, xmm1, xmm2, AVX_256bit);      //xmm1 = W[-15] ror 8

    //XOR the three components
    vpxor(xmm3, xmm3, xmm8, AVX_256bit);     //xmm3 = W[-15] ror 1 ^ W[-15] >> 7
    vpxor(xmm1, xmm3, xmm1, AVX_256bit);     //xmm1 = s0

    //Add three components, w[t - 16], w[t - 7] and sigma0
    vpaddq(xmm0, xmm0, xmm1, AVX_256bit);    //xmm0 = W[-16] + W[-7] + s0

    // Move to appropriate lanes for calculating w[16] and w[17]
    vperm2f128(xmm4, xmm0, xmm0, 0);         //xmm4 = W[-16] + W[-7] + s0 {BABA}

    //Move to appropriate lanes for calculating w[18] and w[19]
    vpand(xmm0, xmm0, xmm10, AVX_256bit);    //xmm0 = W[-16] + W[-7] + s0 {DC00}
    //Calculate w[16] and w[17] in both 128 bit lanes
    //Calculate sigma1 for w[16] and w[17] on both 128 bit lanes
    vperm2f128(xmm2, xmm7, xmm7, 17);        //xmm2 = W[-2] {BABA}
    vpsrlq(xmm8, xmm2, 6, AVX_256bit);       //xmm8 = W[-2] >> 6 {BABA}

  } else if (iteration % 4 == 2) {
    vpsrlq(xmm3, xmm2, 19, AVX_256bit);          //xmm3 = W[-2] >> 19 {BABA}
    vpsllq(xmm1, xmm2, (64 - 19), AVX_256bit);   //xmm1 = W[-2] << 19 {BABA}
    vpor(xmm3, xmm3, xmm1, AVX_256bit);          //xmm3 = W[-2] ror 19 {BABA}
    vpxor(xmm8, xmm8, xmm3, AVX_256bit);         //xmm8 = W[-2] ror 19 ^ W[-2] >> 6 {BABA}
    vpsrlq(xmm3, xmm2, 61, AVX_256bit);          //xmm3 = W[-2] >> 61 {BABA}
    vpsllq(xmm1, xmm2, (64 - 61), AVX_256bit);   //xmm1 = W[-2] << 61 {BABA}
    vpor(xmm3, xmm3, xmm1, AVX_256bit);          //xmm3 = W[-2] ror 61 {BABA}
    vpxor(xmm8, xmm8, xmm3, AVX_256bit);         //xmm8 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) {BABA}

    //Add sigma1 to the other components to get w[16] and w[17]
    vpaddq(xmm4, xmm4, xmm8, AVX_256bit);        //xmm4 = { W[1], W[0], W[1], W[0] }

    //Calculate sigma1 for w[18] and w[19] for upper 128 bit lane
    vpsrlq(xmm8, xmm4, 6, AVX_256bit);           //xmm8 = W[-2] >> 6 {DC--}

  } else if (iteration % 4 == 3) {
    vpsrlq(xmm3, xmm4, 19, AVX_256bit);          //xmm3 = W[-2] >> 19 {DC--}
    vpsllq(xmm1, xmm4, (64 - 19), AVX_256bit);   //xmm1 = W[-2] << 19 {DC--}
    vpor(xmm3, xmm3, xmm1, AVX_256bit);          //xmm3 = W[-2] ror 19 {DC--}
    vpxor(xmm8, xmm8, xmm3, AVX_256bit);         //xmm8 = W[-2] ror 19 ^ W[-2] >> 6 {DC--}
    vpsrlq(xmm3, xmm4, 61, AVX_256bit);          //xmm3 = W[-2] >> 61 {DC--}
    vpsllq(xmm1, xmm4, (64 - 61), AVX_256bit);   //xmm1 = W[-2] << 61 {DC--}
    vpor(xmm3, xmm3, xmm1, AVX_256bit);          //xmm3 = W[-2] ror 61 {DC--}
    vpxor(xmm8, xmm8, xmm3, AVX_256bit);         //xmm8 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) {DC--}

    //Add the sigma0 + w[t - 7] + w[t - 16] for w[18] and w[19] to newly calculated sigma1 to get w[18] and w[19]
    vpaddq(xmm2, xmm0, xmm8, AVX_256bit);        //xmm2 = { W[3], W[2], --, -- }

    //Form w[19], w[18], w[17], w[16]
    vpblendd(xmm4, xmm4, xmm2, 0xF0, AVX_256bit); //xmm4 = { W[3], W[2], W[1], W[0] }
  }

  movq(y3, a);         //y3 = a; MAJA
  rorxq(y0, e, 41);    //y0 = e >> 41; S1A
  rorxq(y1, e, 18);    //y1 = e >> 18; S1B
  addq(h, Address(rsp, (iteration * 8)));    //h = k + w + h; --
  orq(y3, c);          //y3 = a | c; MAJA
  movq(y2, f);         //y2 = f; CH

  xorq(y2, g);         //y2 = f^g; CH

  rorxq(T1, a, 34);    //T1 = a >> 34; S0B
  xorq(y0, y1);        //y0 = (e >> 41) ^ (e >> 18); S1

  rorxq(y1, e, 14);    //y1 = (e >> 14); S1

  andq(y2, e);         //y2 = (f^g) & e; CH
  addq(d, h);          //d = k + w + h + d; --

  andq(y3, b);         //y3 = (a | c)&b; MAJA
  xorq(y0, y1);        //y0 = (e >> 41) ^ (e >> 18) ^ (e >> 14); S1
  rorxq(y1, a, 39);    //y1 = a >> 39; S0A

  xorq(y1, T1);        //y1 = (a >> 39) ^ (a >> 34); S0
  rorxq(T1, a, 28);    //T1 = (a >> 28); S0
  xorq(y2, g);         //y2 = CH = ((f^g)&e) ^ g; CH

  xorq(y1, T1);        //y1 = (a >> 39) ^ (a >> 34) ^ (a >> 28); S0
  movq(T1, a);         //T1 = a; MAJB

  andq(T1, c);         //T1 = a&c; MAJB
  addq(y2, y0);        //y2 = S1 + CH; --

  orq(y3, T1);         //y3 = MAJ = ((a | c)&b) | (a&c); MAJ
  addq(h, y1);         //h = k + w + h + S0; --

  addq(d, y2);         //d = k + w + h + d + S1 + CH = d + t1; --
  addq(h, y2);         //h = k + w + h + S0 + S1 + CH = t1 + S0; --
  addq(h, y3);         //h = t1 + S0 + MAJ; --
}
|
void MacroAssembler::sha512_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
  XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
  Register buf, Register state, Register ofs, Register limit, Register rsp,
  bool multi_block, XMMRegister shuf_mask)
{

  Label loop0, loop1, loop2, done_hash,
        compute_block_size, compute_size,
        compute_block_size_end, compute_size_end;

  address K512_W = StubRoutines::x86::k512_W_addr();
  address pshuffle_byte_flip_mask_sha512 = StubRoutines::x86::pshuffle_byte_flip_mask_addr_sha512();
  address pshuffle_byte_flip_mask_addr = 0;

  const XMMRegister& XFER           = xmm0;   // YTMP0
  const XMMRegister& BYTE_FLIP_MASK = xmm9;   // ymm9
  const XMMRegister& YMM_MASK_LO    = xmm10;  // ymm10
#ifdef _WIN64
  const Register& INP         = rcx;  //1st arg
  const Register& CTX         = rdx;  //2nd arg
  const Register& NUM_BLKS    = r8;   //3rd arg
  const Register& c           = rdi;
  const Register& d           = rsi;
  const Register& e           = r8;
  const Register& y3          = rcx;
  const Register& offset      = r8;
  const Register& input_limit = r9;
#else
  const Register& INP         = rdi;  //1st arg
  const Register& CTX         = rsi;  //2nd arg
  const Register& NUM_BLKS    = rdx;  //3rd arg
  const Register& c           = rcx;
  const Register& d           = r8;
  const Register& e           = rdx;
  const Register& y3          = rdi;
  const Register& offset      = rdx;
  const Register& input_limit = rcx;
#endif

  const Register& TBL = rbp;

  const Register& a = rax;
  const Register& b = rbx;

  const Register& f = r9;
  const Register& g = r10;
  const Register& h = r11;

  //Local variables as defined in assembly file.
  enum
  {
    _XFER_SIZE     = 4 * 8,  // resq 4 => reserve 4 quadwords. Hence 4 * 8
    _SRND_SIZE     = 8,      // resq 1
    _INP_SIZE      = 8,
    _INP_END_SIZE  = 8,
    _RSP_SAVE_SIZE = 8,      // defined as resq 1

#ifdef _WIN64
    _GPR_SAVE_SIZE = 8 * 8,  // defined as resq 8
#else
    _GPR_SAVE_SIZE = 6 * 8   // resq 6
#endif
  };

  enum
  {
    _XFER       = 0,
    _SRND       = _XFER    + _XFER_SIZE,      // 32
    _INP        = _SRND    + _SRND_SIZE,      // 40
    _INP_END    = _INP     + _INP_SIZE,       // 48
    _RSP        = _INP_END + _INP_END_SIZE,   // 56
    _GPR        = _RSP     + _RSP_SAVE_SIZE,  // 64
    _STACK_SIZE = _GPR     + _GPR_SAVE_SIZE   // 128 for windows and 112 for linux.
  };
|
  //Saving offset and limit as it will help with block size calculation for multi-block SHA512.
#ifdef _WIN64
  push(r8);     // win64: this is ofs
  push(r9);     // win64: this is limit, we need them again at the very end.
#else
  push(rdx);    // linux: this is ofs, need it at the end for multi-block calculation
  push(rcx);    // linux: this is the limit.
#endif

  //Allocate Stack Space
  movq(rax, rsp);
  subq(rsp, _STACK_SIZE);
  andq(rsp, -32);
  movq(Address(rsp, _RSP), rax);

  //Save GPRs
  movq(Address(rsp, _GPR), rbp);
  movq(Address(rsp, (_GPR + 8)), rbx);
  movq(Address(rsp, (_GPR + 16)), r12);
  movq(Address(rsp, (_GPR + 24)), r13);
  movq(Address(rsp, (_GPR + 32)), r14);
  movq(Address(rsp, (_GPR + 40)), r15);

#ifdef _WIN64
  movq(Address(rsp, (_GPR + 48)), rsi);
  movq(Address(rsp, (_GPR + 56)), rdi);
#endif

  vpblendd(xmm0, xmm0, xmm1, 0xF0, AVX_128bit);
  vpblendd(xmm0, xmm0, xmm1, 0xF0, AVX_256bit);

  if (multi_block) {
    xorq(rax, rax);
    bind(compute_block_size);
    cmpptr(offset, input_limit);    // assuming that offset is less than limit
    jccb(Assembler::aboveEqual, compute_block_size_end);
    addq(offset, 128);
    addq(rax, 128);
    jmpb(compute_block_size);

    bind(compute_block_size_end);
    movq(NUM_BLKS, rax);

    cmpq(NUM_BLKS, 0);
    jcc(Assembler::equal, done_hash);
  } else {
    xorq(NUM_BLKS, NUM_BLKS);    //If single block.
    addq(NUM_BLKS, 128);
  }
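  // NUM_BLKS now holds the number of bytes to hash, a multiple of the
  // 128-byte SHA-512 block size (informational).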
|
  addq(NUM_BLKS, INP);    //pointer to end of data
  movq(Address(rsp, _INP_END), NUM_BLKS);

  //load initial digest
  movq(a, Address(CTX, 8 * 0));
  movq(b, Address(CTX, 8 * 1));
  movq(c, Address(CTX, 8 * 2));
  movq(d, Address(CTX, 8 * 3));
  movq(e, Address(CTX, 8 * 4));
  movq(f, Address(CTX, 8 * 5));
  // load g - r10 after it is used as scratch
  movq(h, Address(CTX, 8 * 7));

  pshuffle_byte_flip_mask_addr = pshuffle_byte_flip_mask_sha512;
  vmovdqu(BYTE_FLIP_MASK, ExternalAddress(pshuffle_byte_flip_mask_addr + 0));    //PSHUFFLE_BYTE_FLIP_MASK wrt rip
  vmovdqu(YMM_MASK_LO, ExternalAddress(pshuffle_byte_flip_mask_addr + 32));

  movq(g, Address(CTX, 8 * 6));

  bind(loop0);
  lea(TBL, ExternalAddress(K512_W));

  //byte swap first 16 qwords
  vmovdqu(xmm4, Address(INP, 32 * 0));
  vpshufb(xmm4, xmm4, BYTE_FLIP_MASK, AVX_256bit);
  vmovdqu(xmm5, Address(INP, 32 * 1));
  vpshufb(xmm5, xmm5, BYTE_FLIP_MASK, AVX_256bit);
  vmovdqu(xmm6, Address(INP, 32 * 2));
  vpshufb(xmm6, xmm6, BYTE_FLIP_MASK, AVX_256bit);
  vmovdqu(xmm7, Address(INP, 32 * 3));
  vpshufb(xmm7, xmm7, BYTE_FLIP_MASK, AVX_256bit);

  movq(Address(rsp, _INP), INP);

  movslq(Address(rsp, _SRND), 4);
  align(16);
|
  //Schedule 64 input qwords, by calling sha512_AVX2_one_round_and_schedule
  bind(loop1);
  vpaddq(xmm0, xmm4, Address(TBL, 0 * 32), AVX_256bit);
  vmovdqu(Address(rsp, _XFER), xmm0);
  //four rounds and schedule
  sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, a, b, c, d, e, f, g, h, 0);
  sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, h, a, b, c, d, e, f, g, 1);
  sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, g, h, a, b, c, d, e, f, 2);
  sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, f, g, h, a, b, c, d, e, 3);

  vpaddq(xmm0, xmm5, Address(TBL, 1 * 32), AVX_256bit);
  vmovdqu(Address(rsp, _XFER), xmm0);
  //four rounds and schedule
  sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, e, f, g, h, a, b, c, d, 0);
  sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, d, e, f, g, h, a, b, c, 1);
  sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, c, d, e, f, g, h, a, b, 2);
  sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, b, c, d, e, f, g, h, a, 3);

  vpaddq(xmm0, xmm6, Address(TBL, 2 * 32), AVX_256bit);
  vmovdqu(Address(rsp, _XFER), xmm0);
  //four rounds and schedule
  sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, a, b, c, d, e, f, g, h, 0);
  sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, h, a, b, c, d, e, f, g, 1);
  sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, g, h, a, b, c, d, e, f, 2);
  sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, f, g, h, a, b, c, d, e, 3);

  vpaddq(xmm0, xmm7, Address(TBL, 3 * 32), AVX_256bit);
  vmovdqu(Address(rsp, _XFER), xmm0);
  addq(TBL, 4 * 32);
  //four rounds and schedule
  sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, e, f, g, h, a, b, c, d, 0);
  sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, d, e, f, g, h, a, b, c, 1);
  sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, c, d, e, f, g, h, a, b, 2);
  sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, b, c, d, e, f, g, h, a, 3);

  subq(Address(rsp, _SRND), 1);
  jcc(Assembler::notEqual, loop1);
|
  movslq(Address(rsp, _SRND), 2);

  bind(loop2);
  vpaddq(xmm0, xmm4, Address(TBL, 0 * 32), AVX_256bit);
  vmovdqu(Address(rsp, _XFER), xmm0);
  //four rounds and compute.
  sha512_AVX2_one_round_compute(a, a, b, c, d, e, f, g, h, 0);
  sha512_AVX2_one_round_compute(h, h, a, b, c, d, e, f, g, 1);
  sha512_AVX2_one_round_compute(g, g, h, a, b, c, d, e, f, 2);
  sha512_AVX2_one_round_compute(f, f, g, h, a, b, c, d, e, 3);

  vpaddq(xmm0, xmm5, Address(TBL, 1 * 32), AVX_256bit);
  vmovdqu(Address(rsp, _XFER), xmm0);
  addq(TBL, 2 * 32);
  //four rounds and compute.
  sha512_AVX2_one_round_compute(e, e, f, g, h, a, b, c, d, 0);
  sha512_AVX2_one_round_compute(d, d, e, f, g, h, a, b, c, 1);
  sha512_AVX2_one_round_compute(c, c, d, e, f, g, h, a, b, 2);
  sha512_AVX2_one_round_compute(b, b, c, d, e, f, g, h, a, 3);

  vmovdqu(xmm4, xmm6);
  vmovdqu(xmm5, xmm7);

  subq(Address(rsp, _SRND), 1);
  jcc(Assembler::notEqual, loop2);

  addmq(8 * 0, CTX, a);
  addmq(8 * 1, CTX, b);
  addmq(8 * 2, CTX, c);
  addmq(8 * 3, CTX, d);
  addmq(8 * 4, CTX, e);
  addmq(8 * 5, CTX, f);
  addmq(8 * 6, CTX, g);
  addmq(8 * 7, CTX, h);

  movq(INP, Address(rsp, _INP));
  addq(INP, 128);
  cmpq(INP, Address(rsp, _INP_END));
  jcc(Assembler::notEqual, loop0);

  bind(done_hash);

  //Restore GPRs
  movq(rbp, Address(rsp, (_GPR + 0)));
  movq(rbx, Address(rsp, (_GPR + 8)));
  movq(r12, Address(rsp, (_GPR + 16)));
  movq(r13, Address(rsp, (_GPR + 24)));
  movq(r14, Address(rsp, (_GPR + 32)));
  movq(r15, Address(rsp, (_GPR + 40)));

#ifdef _WIN64
  movq(rsi, Address(rsp, (_GPR + 48)));
  movq(rdi, Address(rsp, (_GPR + 56)));
#endif

  //Restore Stack Pointer
  movq(rsp, Address(rsp, _RSP));

#ifdef _WIN64
  pop(r9);
  pop(r8);
#else
  pop(rcx);
  pop(rdx);
#endif
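  // Multi-block: recompute the return value in rax by stepping the original
  // ofs forward by 128 until it reaches limit, matching the 128-byte blocks
  // consumed above (informational summary).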
|
  if (multi_block) {
#ifdef _WIN64
    const Register& limit_end = r9;
    const Register& ofs_end   = r8;
#else
    const Register& limit_end = rcx;
    const Register& ofs_end   = rdx;
#endif
    movq(rax, ofs_end);
    bind(compute_size);
    cmpptr(rax, limit_end);
    jccb(Assembler::aboveEqual, compute_size_end);
    addq(rax, 128);
    jmpb(compute_size);
    bind(compute_size_end);
  }
}
|
#endif //#ifdef _LP64