src/hotspot/cpu/x86/macroAssembler_x86_sha.cpp
changeset 47216 71c04702a3d5
parent 43423 bcaab17f72a5
       
     1 /*
       
     2 * Copyright (c) 2016, Intel Corporation.
       
     3 *
       
     4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
       
     5 *
       
     6 * This code is free software; you can redistribute it and/or modify it
       
     7 * under the terms of the GNU General Public License version 2 only, as
       
     8 * published by the Free Software Foundation.
       
     9 *
       
    10 * This code is distributed in the hope that it will be useful, but WITHOUT
       
    11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
       
    12 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
       
    13 * version 2 for more details (a copy is included in the LICENSE file that
       
    14 * accompanied this code).
       
    15 *
       
    16 * You should have received a copy of the GNU General Public License version
       
    17 * 2 along with this work; if not, write to the Free Software Foundation,
       
    18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
       
    19 *
       
    20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
       
    21 * or visit www.oracle.com if you need additional information or have any
       
    22 * questions.
       
    23 *
       
    24 */
       
    25 
       
    26 #include "precompiled.hpp"
       
    27 #include "asm/assembler.hpp"
       
    28 #include "asm/assembler.inline.hpp"
       
    29 #include "runtime/stubRoutines.hpp"
       
    30 #include "macroAssembler_x86.hpp"
       
    31 
       
     32 // ofs and limit are used for the multi-block byte array intrinsic:
     33 // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
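        // Register/stack contract for fast_sha1:
        //   buf   - pointer to the current 64-byte input block(s)
        //   state - pointer to the SHA-1 state: a, b, c, d at offset 0, e at offset 16
        //   rsp   - caller-provided scratch area of at least 32 bytes (the working hash
        //           values are saved there before the rounds and re-added afterwards)
        //   When multi_block is true, ofs and limit drive the block loop and the
        //   updated ofs is returned in rax.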
       
    34 void MacroAssembler::fast_sha1(XMMRegister abcd, XMMRegister e0, XMMRegister e1, XMMRegister msg0,
       
    35   XMMRegister msg1, XMMRegister msg2, XMMRegister msg3, XMMRegister shuf_mask,
       
    36   Register buf, Register state, Register ofs, Register limit, Register rsp, bool multi_block) {
       
    37 
       
    38   Label start, done_hash, loop0;
       
    39 
       
    40   address upper_word_mask = StubRoutines::x86::upper_word_mask_addr();
       
    41   address shuffle_byte_flip_mask = StubRoutines::x86::shuffle_byte_flip_mask_addr();
       
    42 
       
    43   bind(start);
       
    44   movdqu(abcd, Address(state, 0));
       
    45   pinsrd(e0, Address(state, 16), 3);
       
    46   movdqu(shuf_mask, ExternalAddress(upper_word_mask)); // 0xFFFFFFFF000000000000000000000000
       
    47   pand(e0, shuf_mask);
       
    48   pshufd(abcd, abcd, 0x1B);
       
    49   movdqu(shuf_mask, ExternalAddress(shuffle_byte_flip_mask)); //0x000102030405060708090a0b0c0d0e0f
       
    50 
       
    51   bind(loop0);
       
    52   // Save hash values for addition after rounds
       
    53   movdqu(Address(rsp, 0), e0);
       
    54   movdqu(Address(rsp, 16), abcd);
       
    55 
       
    56 
       
    57   // Rounds 0 - 3
       
    58   movdqu(msg0, Address(buf, 0));
       
    59   pshufb(msg0, shuf_mask);
       
    60   paddd(e0, msg0);
       
    61   movdqa(e1, abcd);
       
    62   sha1rnds4(abcd, e0, 0);
       
    63 
       
    64   // Rounds 4 - 7
       
    65   movdqu(msg1, Address(buf, 16));
       
    66   pshufb(msg1, shuf_mask);
       
    67   sha1nexte(e1, msg1);
       
    68   movdqa(e0, abcd);
       
    69   sha1rnds4(abcd, e1, 0);
       
    70   sha1msg1(msg0, msg1);
       
    71 
       
    72   // Rounds 8 - 11
       
    73   movdqu(msg2, Address(buf, 32));
       
    74   pshufb(msg2, shuf_mask);
       
    75   sha1nexte(e0, msg2);
       
    76   movdqa(e1, abcd);
       
    77   sha1rnds4(abcd, e0, 0);
       
    78   sha1msg1(msg1, msg2);
       
    79   pxor(msg0, msg2);
       
    80 
       
    81   // Rounds 12 - 15
       
    82   movdqu(msg3, Address(buf, 48));
       
    83   pshufb(msg3, shuf_mask);
       
    84   sha1nexte(e1, msg3);
       
    85   movdqa(e0, abcd);
       
    86   sha1msg2(msg0, msg3);
       
    87   sha1rnds4(abcd, e1, 0);
       
    88   sha1msg1(msg2, msg3);
       
    89   pxor(msg1, msg3);
       
    90 
       
    91   // Rounds 16 - 19
       
    92   sha1nexte(e0, msg0);
       
    93   movdqa(e1, abcd);
       
    94   sha1msg2(msg1, msg0);
       
    95   sha1rnds4(abcd, e0, 0);
       
    96   sha1msg1(msg3, msg0);
       
    97   pxor(msg2, msg0);
       
    98 
       
    99   // Rounds 20 - 23
       
   100   sha1nexte(e1, msg1);
       
   101   movdqa(e0, abcd);
       
   102   sha1msg2(msg2, msg1);
       
   103   sha1rnds4(abcd, e1, 1);
       
   104   sha1msg1(msg0, msg1);
       
   105   pxor(msg3, msg1);
       
   106 
       
   107   // Rounds 24 - 27
       
   108   sha1nexte(e0, msg2);
       
   109   movdqa(e1, abcd);
       
   110   sha1msg2(msg3, msg2);
       
   111   sha1rnds4(abcd, e0, 1);
       
   112   sha1msg1(msg1, msg2);
       
   113   pxor(msg0, msg2);
       
   114 
       
   115   // Rounds 28 - 31
       
   116   sha1nexte(e1, msg3);
       
   117   movdqa(e0, abcd);
       
   118   sha1msg2(msg0, msg3);
       
   119   sha1rnds4(abcd, e1, 1);
       
   120   sha1msg1(msg2, msg3);
       
   121   pxor(msg1, msg3);
       
   122 
       
   123   // Rounds 32 - 35
       
   124   sha1nexte(e0, msg0);
       
   125   movdqa(e1, abcd);
       
   126   sha1msg2(msg1, msg0);
       
   127   sha1rnds4(abcd, e0, 1);
       
   128   sha1msg1(msg3, msg0);
       
   129   pxor(msg2, msg0);
       
   130 
       
   131   // Rounds 36 - 39
       
   132   sha1nexte(e1, msg1);
       
   133   movdqa(e0, abcd);
       
   134   sha1msg2(msg2, msg1);
       
   135   sha1rnds4(abcd, e1, 1);
       
   136   sha1msg1(msg0, msg1);
       
   137   pxor(msg3, msg1);
       
   138 
       
   139   // Rounds 40 - 43
       
   140   sha1nexte(e0, msg2);
       
   141   movdqa(e1, abcd);
       
   142   sha1msg2(msg3, msg2);
       
   143   sha1rnds4(abcd, e0, 2);
       
   144   sha1msg1(msg1, msg2);
       
   145   pxor(msg0, msg2);
       
   146 
       
   147   // Rounds 44 - 47
       
   148   sha1nexte(e1, msg3);
       
   149   movdqa(e0, abcd);
       
   150   sha1msg2(msg0, msg3);
       
   151   sha1rnds4(abcd, e1, 2);
       
   152   sha1msg1(msg2, msg3);
       
   153   pxor(msg1, msg3);
       
   154 
       
   155   // Rounds 48 - 51
       
   156   sha1nexte(e0, msg0);
       
   157   movdqa(e1, abcd);
       
   158   sha1msg2(msg1, msg0);
       
   159   sha1rnds4(abcd, e0, 2);
       
   160   sha1msg1(msg3, msg0);
       
   161   pxor(msg2, msg0);
       
   162 
       
   163   // Rounds 52 - 55
       
   164   sha1nexte(e1, msg1);
       
   165   movdqa(e0, abcd);
       
   166   sha1msg2(msg2, msg1);
       
   167   sha1rnds4(abcd, e1, 2);
       
   168   sha1msg1(msg0, msg1);
       
   169   pxor(msg3, msg1);
       
   170 
       
   171   // Rounds 56 - 59
       
   172   sha1nexte(e0, msg2);
       
   173   movdqa(e1, abcd);
       
   174   sha1msg2(msg3, msg2);
       
   175   sha1rnds4(abcd, e0, 2);
       
   176   sha1msg1(msg1, msg2);
       
   177   pxor(msg0, msg2);
       
   178 
       
   179   // Rounds 60 - 63
       
   180   sha1nexte(e1, msg3);
       
   181   movdqa(e0, abcd);
       
   182   sha1msg2(msg0, msg3);
       
   183   sha1rnds4(abcd, e1, 3);
       
   184   sha1msg1(msg2, msg3);
       
   185   pxor(msg1, msg3);
       
   186 
       
   187   // Rounds 64 - 67
       
   188   sha1nexte(e0, msg0);
       
   189   movdqa(e1, abcd);
       
   190   sha1msg2(msg1, msg0);
       
   191   sha1rnds4(abcd, e0, 3);
       
   192   sha1msg1(msg3, msg0);
       
   193   pxor(msg2, msg0);
       
   194 
       
   195   // Rounds 68 - 71
       
   196   sha1nexte(e1, msg1);
       
   197   movdqa(e0, abcd);
       
   198   sha1msg2(msg2, msg1);
       
   199   sha1rnds4(abcd, e1, 3);
       
   200   pxor(msg3, msg1);
       
   201 
       
   202   // Rounds 72 - 75
       
   203   sha1nexte(e0, msg2);
       
   204   movdqa(e1, abcd);
       
   205   sha1msg2(msg3, msg2);
       
   206   sha1rnds4(abcd, e0, 3);
       
   207 
       
   208   // Rounds 76 - 79
       
   209   sha1nexte(e1, msg3);
       
   210   movdqa(e0, abcd);
       
   211   sha1rnds4(abcd, e1, 3);
       
   212 
       
    213   // add the current hash values to the previously saved ones
       
   214   movdqu(msg0, Address(rsp, 0));
       
   215   sha1nexte(e0, msg0);
       
   216   movdqu(msg0, Address(rsp, 16));
       
   217   paddd(abcd, msg0);
       
   218 
       
   219   if (multi_block) {
       
   220     // increment data pointer and loop if more to process
       
   221     addptr(buf, 64);
       
   222     addptr(ofs, 64);
       
   223     cmpptr(ofs, limit);
       
   224     jcc(Assembler::belowEqual, loop0);
       
   225     movptr(rax, ofs); //return ofs
       
   226   }
       
   227   // write hash values back in the correct order
       
   228   pshufd(abcd, abcd, 0x1b);
       
   229   movdqu(Address(state, 0), abcd);
       
   230   pextrd(Address(state, 16), e0, 3);
       
   231 
       
   232   bind(done_hash);
       
   233 
       
   234 }
       
   235 
       
    236 // xmm0 (msg) is used as an implicit argument to sha256rnds2,
    237 // so state0 and state1 can never be assigned the xmm0 register.
    238 // ofs and limit are used for the multi-block byte array intrinsic:
    239 // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
       
   240 #ifdef _LP64
       
   241 void MacroAssembler::fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
       
   242   XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
       
   243   Register buf, Register state, Register ofs, Register limit, Register rsp,
       
   244   bool multi_block, XMMRegister shuf_mask) {
       
   245 #else
       
   246 void MacroAssembler::fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
       
   247   XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
       
   248   Register buf, Register state, Register ofs, Register limit, Register rsp,
       
   249   bool multi_block) {
       
   250 #endif
       
   251   Label start, done_hash, loop0;
       
   252 
       
   253   address K256 = StubRoutines::x86::k256_addr();
       
   254   address pshuffle_byte_flip_mask = StubRoutines::x86::pshuffle_byte_flip_mask_addr();
       
   255 
       
   256   bind(start);
       
   257   movdqu(state0, Address(state, 0));
       
   258   movdqu(state1, Address(state, 16));
       
   259 
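          // Repack the linear {A,B,C,D} / {E,F,G,H} words loaded above into the
          // {ABEF} / {CDGH} ordering that the sha256rnds2 instruction operates on.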
       
   260   pshufd(state0, state0, 0xB1);
       
   261   pshufd(state1, state1, 0x1B);
       
   262   movdqa(msgtmp4, state0);
       
   263   palignr(state0, state1, 8);
       
   264   pblendw(state1, msgtmp4, 0xF0);
       
   265 
       
   266 #ifdef _LP64
       
   267   movdqu(shuf_mask, ExternalAddress(pshuffle_byte_flip_mask));
       
   268 #endif
       
   269   lea(rax, ExternalAddress(K256));
       
   270 
       
   271   bind(loop0);
       
   272   movdqu(Address(rsp, 0), state0);
       
   273   movdqu(Address(rsp, 16), state1);
       
   274 
       
   275   // Rounds 0-3
       
   276   movdqu(msg, Address(buf, 0));
       
   277 #ifdef _LP64
       
   278   pshufb(msg, shuf_mask);
       
   279 #else
       
   280   pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
       
   281 #endif
       
   282   movdqa(msgtmp0, msg);
       
   283   paddd(msg, Address(rax, 0));
       
   284   sha256rnds2(state1, state0);
       
   285   pshufd(msg, msg, 0x0E);
       
   286   sha256rnds2(state0, state1);
       
   287 
       
   288   // Rounds 4-7
       
   289   movdqu(msg, Address(buf, 16));
       
   290 #ifdef _LP64
       
   291   pshufb(msg, shuf_mask);
       
   292 #else
       
   293   pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
       
   294 #endif
       
   295   movdqa(msgtmp1, msg);
       
   296   paddd(msg, Address(rax, 16));
       
   297   sha256rnds2(state1, state0);
       
   298   pshufd(msg, msg, 0x0E);
       
   299   sha256rnds2(state0, state1);
       
   300   sha256msg1(msgtmp0, msgtmp1);
       
   301 
       
   302   // Rounds 8-11
       
   303   movdqu(msg, Address(buf, 32));
       
   304 #ifdef _LP64
       
   305   pshufb(msg, shuf_mask);
       
   306 #else
       
   307   pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
       
   308 #endif
       
   309   movdqa(msgtmp2, msg);
       
   310   paddd(msg, Address(rax, 32));
       
   311   sha256rnds2(state1, state0);
       
   312   pshufd(msg, msg, 0x0E);
       
   313   sha256rnds2(state0, state1);
       
   314   sha256msg1(msgtmp1, msgtmp2);
       
   315 
       
   316   // Rounds 12-15
       
   317   movdqu(msg, Address(buf, 48));
       
   318 #ifdef _LP64
       
   319   pshufb(msg, shuf_mask);
       
   320 #else
       
   321   pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
       
   322 #endif
       
   323   movdqa(msgtmp3, msg);
       
   324   paddd(msg, Address(rax, 48));
       
   325   sha256rnds2(state1, state0);
       
   326   movdqa(msgtmp4, msgtmp3);
       
   327   palignr(msgtmp4, msgtmp2, 4);
       
   328   paddd(msgtmp0, msgtmp4);
       
   329   sha256msg2(msgtmp0, msgtmp3);
       
   330   pshufd(msg, msg, 0x0E);
       
   331   sha256rnds2(state0, state1);
       
   332   sha256msg1(msgtmp2, msgtmp3);
       
   333 
       
   334   // Rounds 16-19
       
   335   movdqa(msg, msgtmp0);
       
   336   paddd(msg, Address(rax, 64));
       
   337   sha256rnds2(state1, state0);
       
   338   movdqa(msgtmp4, msgtmp0);
       
   339   palignr(msgtmp4, msgtmp3, 4);
       
   340   paddd(msgtmp1, msgtmp4);
       
   341   sha256msg2(msgtmp1, msgtmp0);
       
   342   pshufd(msg, msg, 0x0E);
       
   343   sha256rnds2(state0, state1);
       
   344   sha256msg1(msgtmp3, msgtmp0);
       
   345 
       
   346   // Rounds 20-23
       
   347   movdqa(msg, msgtmp1);
       
   348   paddd(msg, Address(rax, 80));
       
   349   sha256rnds2(state1, state0);
       
   350   movdqa(msgtmp4, msgtmp1);
       
   351   palignr(msgtmp4, msgtmp0, 4);
       
   352   paddd(msgtmp2, msgtmp4);
       
   353   sha256msg2(msgtmp2, msgtmp1);
       
   354   pshufd(msg, msg, 0x0E);
       
   355   sha256rnds2(state0, state1);
       
   356   sha256msg1(msgtmp0, msgtmp1);
       
   357 
       
   358   // Rounds 24-27
       
   359   movdqa(msg, msgtmp2);
       
   360   paddd(msg, Address(rax, 96));
       
   361   sha256rnds2(state1, state0);
       
   362   movdqa(msgtmp4, msgtmp2);
       
   363   palignr(msgtmp4, msgtmp1, 4);
       
   364   paddd(msgtmp3, msgtmp4);
       
   365   sha256msg2(msgtmp3, msgtmp2);
       
   366   pshufd(msg, msg, 0x0E);
       
   367   sha256rnds2(state0, state1);
       
   368   sha256msg1(msgtmp1, msgtmp2);
       
   369 
       
   370   // Rounds 28-31
       
   371   movdqa(msg, msgtmp3);
       
   372   paddd(msg, Address(rax, 112));
       
   373   sha256rnds2(state1, state0);
       
   374   movdqa(msgtmp4, msgtmp3);
       
   375   palignr(msgtmp4, msgtmp2, 4);
       
   376   paddd(msgtmp0, msgtmp4);
       
   377   sha256msg2(msgtmp0, msgtmp3);
       
   378   pshufd(msg, msg, 0x0E);
       
   379   sha256rnds2(state0, state1);
       
   380   sha256msg1(msgtmp2, msgtmp3);
       
   381 
       
   382   // Rounds 32-35
       
   383   movdqa(msg, msgtmp0);
       
   384   paddd(msg, Address(rax, 128));
       
   385   sha256rnds2(state1, state0);
       
   386   movdqa(msgtmp4, msgtmp0);
       
   387   palignr(msgtmp4, msgtmp3, 4);
       
   388   paddd(msgtmp1, msgtmp4);
       
   389   sha256msg2(msgtmp1, msgtmp0);
       
   390   pshufd(msg, msg, 0x0E);
       
   391   sha256rnds2(state0, state1);
       
   392   sha256msg1(msgtmp3, msgtmp0);
       
   393 
       
   394   // Rounds 36-39
       
   395   movdqa(msg, msgtmp1);
       
   396   paddd(msg, Address(rax, 144));
       
   397   sha256rnds2(state1, state0);
       
   398   movdqa(msgtmp4, msgtmp1);
       
   399   palignr(msgtmp4, msgtmp0, 4);
       
   400   paddd(msgtmp2, msgtmp4);
       
   401   sha256msg2(msgtmp2, msgtmp1);
       
   402   pshufd(msg, msg, 0x0E);
       
   403   sha256rnds2(state0, state1);
       
   404   sha256msg1(msgtmp0, msgtmp1);
       
   405 
       
   406   // Rounds 40-43
       
   407   movdqa(msg, msgtmp2);
       
   408   paddd(msg, Address(rax, 160));
       
   409   sha256rnds2(state1, state0);
       
   410   movdqa(msgtmp4, msgtmp2);
       
   411   palignr(msgtmp4, msgtmp1, 4);
       
   412   paddd(msgtmp3, msgtmp4);
       
   413   sha256msg2(msgtmp3, msgtmp2);
       
   414   pshufd(msg, msg, 0x0E);
       
   415   sha256rnds2(state0, state1);
       
   416   sha256msg1(msgtmp1, msgtmp2);
       
   417 
       
   418   // Rounds 44-47
       
   419   movdqa(msg, msgtmp3);
       
   420   paddd(msg, Address(rax, 176));
       
   421   sha256rnds2(state1, state0);
       
   422   movdqa(msgtmp4, msgtmp3);
       
   423   palignr(msgtmp4, msgtmp2, 4);
       
   424   paddd(msgtmp0, msgtmp4);
       
   425   sha256msg2(msgtmp0, msgtmp3);
       
   426   pshufd(msg, msg, 0x0E);
       
   427   sha256rnds2(state0, state1);
       
   428   sha256msg1(msgtmp2, msgtmp3);
       
   429 
       
   430   // Rounds 48-51
       
   431   movdqa(msg, msgtmp0);
       
   432   paddd(msg, Address(rax, 192));
       
   433   sha256rnds2(state1, state0);
       
   434   movdqa(msgtmp4, msgtmp0);
       
   435   palignr(msgtmp4, msgtmp3, 4);
       
   436   paddd(msgtmp1, msgtmp4);
       
   437   sha256msg2(msgtmp1, msgtmp0);
       
   438   pshufd(msg, msg, 0x0E);
       
   439   sha256rnds2(state0, state1);
       
   440   sha256msg1(msgtmp3, msgtmp0);
       
   441 
       
   442   // Rounds 52-55
       
   443   movdqa(msg, msgtmp1);
       
   444   paddd(msg, Address(rax, 208));
       
   445   sha256rnds2(state1, state0);
       
   446   movdqa(msgtmp4, msgtmp1);
       
   447   palignr(msgtmp4, msgtmp0, 4);
       
   448   paddd(msgtmp2, msgtmp4);
       
   449   sha256msg2(msgtmp2, msgtmp1);
       
   450   pshufd(msg, msg, 0x0E);
       
   451   sha256rnds2(state0, state1);
       
   452 
       
   453   // Rounds 56-59
       
   454   movdqa(msg, msgtmp2);
       
   455   paddd(msg, Address(rax, 224));
       
   456   sha256rnds2(state1, state0);
       
   457   movdqa(msgtmp4, msgtmp2);
       
   458   palignr(msgtmp4, msgtmp1, 4);
       
   459   paddd(msgtmp3, msgtmp4);
       
   460   sha256msg2(msgtmp3, msgtmp2);
       
   461   pshufd(msg, msg, 0x0E);
       
   462   sha256rnds2(state0, state1);
       
   463 
       
   464   // Rounds 60-63
       
   465   movdqa(msg, msgtmp3);
       
   466   paddd(msg, Address(rax, 240));
       
   467   sha256rnds2(state1, state0);
       
   468   pshufd(msg, msg, 0x0E);
       
   469   sha256rnds2(state0, state1);
       
   470   movdqu(msg, Address(rsp, 0));
       
   471   paddd(state0, msg);
       
   472   movdqu(msg, Address(rsp, 16));
       
   473   paddd(state1, msg);
       
   474 
       
   475   if (multi_block) {
       
   476     // increment data pointer and loop if more to process
       
   477     addptr(buf, 64);
       
   478     addptr(ofs, 64);
       
   479     cmpptr(ofs, limit);
       
   480     jcc(Assembler::belowEqual, loop0);
       
   481     movptr(rax, ofs); //return ofs
       
   482   }
       
   483 
       
   484   pshufd(state0, state0, 0x1B);
       
   485   pshufd(state1, state1, 0xB1);
       
   486   movdqa(msgtmp4, state0);
       
   487   pblendw(state0, state1, 0xF0);
       
   488   palignr(state1, msgtmp4, 8);
       
   489 
       
   490   movdqu(Address(state, 0), state0);
       
   491   movdqu(Address(state, 16), state1);
       
   492 
       
   493   bind(done_hash);
       
   494 
       
   495 }
       
   496 
       
   497 #ifdef _LP64
       
    498 /*
    499   The algorithm below is based on the Intel publication:
    500   "Fast SHA-256 Implementations on Intel(R) Architecture Processors" by Jim Guilford, Kirk Yap and Vinodh Gopal.
    501   The assembly code was originally provided by Sean Gulley and in many places preserves
    502   the original assembly names and comments to make it easier to match the generated code with the original.
    503   The Java version was substantially redesigned to replace 1200 assembly instructions with a
    504   much shorter run-time generator of the same code in memory.
    505 */
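        // The AVX2 path processes two 64-byte blocks per iteration of loop0: the
        // message schedule for both blocks is carried in the 256-bit registers
        // (low lane = first block, high lane = second block) and the pre-added
        // K + W values are parked in the _XFER area on the stack, so the second
        // block (loop3) is compressed without redoing the scheduling.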
       
   506 
       
   507 void MacroAssembler::sha256_AVX2_one_round_compute(
       
   508     Register  reg_old_h,
       
   509     Register  reg_a,
       
   510     Register  reg_b,
       
   511     Register  reg_c,
       
   512     Register  reg_d,
       
   513     Register  reg_e,
       
   514     Register  reg_f,
       
   515     Register  reg_g,
       
   516     Register  reg_h,
       
   517     int iter) {
       
   518   const Register& reg_y0     = r13;
       
   519   const Register& reg_y1     = r14;
       
   520   const Register& reg_y2     = r15;
       
   521   const Register& reg_y3     = rcx;
       
   522   const Register& reg_T1     = r12;
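          // One SHA-256 round:
          //   T1 = h + Sigma1(e) + Ch(e,f,g) + K[t] + W[t]
          //   T2 = Sigma0(a) + Maj(a,b,c)
          //   d += T1;  h = T1 + T2
          // where Sigma1(e) = ror(e,6) ^ ror(e,11) ^ ror(e,25)
          //       Sigma0(a) = ror(a,2) ^ ror(a,13) ^ ror(a,22)
          // The final additions into h are deferred to the next call (via reg_old_h)
          // to hide latency; iter%4 == 3 completes them immediately.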
       
   523   //;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND iter ;;;;;;;;;;;;;;;;;;;;;;;;;;;
       
   524   if (iter%4 > 0) {
       
   525     addl(reg_old_h, reg_y2);   // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; --
       
   526   }
       
   527   movl(reg_y2, reg_f);         // reg_y2 = reg_f                                ; CH
       
   528   rorxd(reg_y0, reg_e, 25);    // reg_y0 = reg_e >> 25   ; S1A
       
   529   rorxd(reg_y1, reg_e, 11);    // reg_y1 = reg_e >> 11    ; S1B
       
   530   xorl(reg_y2, reg_g);         // reg_y2 = reg_f^reg_g                              ; CH
       
   531 
       
    532   xorl(reg_y0, reg_y1);        // reg_y0 = (reg_e>>25) ^ (reg_e>>11)  ; S1
       
   533   rorxd(reg_y1, reg_e, 6);     // reg_y1 = (reg_e >> 6)    ; S1
       
   534   andl(reg_y2, reg_e);         // reg_y2 = (reg_f^reg_g)&reg_e                          ; CH
       
   535 
       
   536   if (iter%4 > 0) {
       
   537     addl(reg_old_h, reg_y3);   // reg_h = t1 + S0 + MAJ                     ; --
       
   538   }
       
   539 
       
   540   xorl(reg_y0, reg_y1);       // reg_y0 = (reg_e>>25) ^ (reg_e>>11) ^ (reg_e>>6) ; S1
       
   541   rorxd(reg_T1, reg_a, 13);   // reg_T1 = reg_a >> 13    ; S0B
       
   542   xorl(reg_y2, reg_g);        // reg_y2 = CH = ((reg_f^reg_g)&reg_e)^reg_g                 ; CH
       
   543   rorxd(reg_y1, reg_a, 22);   // reg_y1 = reg_a >> 22    ; S0A
       
   544   movl(reg_y3, reg_a);        // reg_y3 = reg_a                                ; MAJA
       
   545 
       
   546   xorl(reg_y1, reg_T1);       // reg_y1 = (reg_a>>22) ^ (reg_a>>13)  ; S0
       
   547   rorxd(reg_T1, reg_a, 2);    // reg_T1 = (reg_a >> 2)    ; S0
       
   548   addl(reg_h, Address(rsp, rdx, Address::times_1, 4*iter)); // reg_h = k + w + reg_h ; --
       
   549   orl(reg_y3, reg_c);         // reg_y3 = reg_a|reg_c                              ; MAJA
       
   550 
       
   551   xorl(reg_y1, reg_T1);       // reg_y1 = (reg_a>>22) ^ (reg_a>>13) ^ (reg_a>>2) ; S0
       
   552   movl(reg_T1, reg_a);        // reg_T1 = reg_a                                ; MAJB
       
   553   andl(reg_y3, reg_b);        // reg_y3 = (reg_a|reg_c)&reg_b                          ; MAJA
       
   554   andl(reg_T1, reg_c);        // reg_T1 = reg_a&reg_c                              ; MAJB
       
   555   addl(reg_y2, reg_y0);       // reg_y2 = S1 + CH                          ; --
       
   556 
       
   557 
       
   558   addl(reg_d, reg_h);         // reg_d = k + w + reg_h + reg_d                     ; --
       
    559   orl(reg_y3, reg_T1);        // reg_y3 = MAJ = ((reg_a|reg_c)&reg_b)|(reg_a&reg_c)            ; MAJ
       
   560   addl(reg_h, reg_y1);        // reg_h = k + w + reg_h + S0                    ; --
       
   561 
       
   562   addl(reg_d, reg_y2);        // reg_d = k + w + reg_h + reg_d + S1 + CH = reg_d + t1  ; --
       
   563 
       
   564 
       
   565   if (iter%4 == 3) {
       
   566     addl(reg_h, reg_y2);      // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; --
       
   567     addl(reg_h, reg_y3);      // reg_h = t1 + S0 + MAJ                     ; --
       
   568   }
       
   569 }
       
   570 
       
   571 void MacroAssembler::sha256_AVX2_four_rounds_compute_first(int start) {
       
   572     sha256_AVX2_one_round_compute(rax, rax, rbx, rdi, rsi,  r8,  r9, r10, r11, start + 0);
       
   573     sha256_AVX2_one_round_compute(r11, r11, rax, rbx, rdi, rsi,  r8,  r9, r10, start + 1);
       
   574     sha256_AVX2_one_round_compute(r10, r10, r11, rax, rbx, rdi, rsi,  r8,  r9, start + 2);
       
   575     sha256_AVX2_one_round_compute(r9,  r9,  r10, r11, rax, rbx, rdi, rsi,  r8, start + 3);
       
   576 }
       
   577 
       
   578 void MacroAssembler::sha256_AVX2_four_rounds_compute_last(int start) {
       
   579     sha256_AVX2_one_round_compute(r8,  r8,   r9, r10, r11, rax, rbx, rdi, rsi, start + 0);
       
   580     sha256_AVX2_one_round_compute(rsi, rsi,  r8,  r9, r10, r11, rax, rbx, rdi, start + 1);
       
   581     sha256_AVX2_one_round_compute(rdi, rdi, rsi,  r8,  r9, r10, r11, rax, rbx, start + 2);
       
   582     sha256_AVX2_one_round_compute(rbx, rbx, rdi, rsi,  r8,  r9, r10, r11, rax, start + 3);
       
   583 }
       
   584 
       
   585 void MacroAssembler::sha256_AVX2_one_round_and_sched(
       
    586         XMMRegister  xmm_0,     /* == ymm4 on iterations 0-3, then the four registers rotate left at iterations 4, 8, 12 */
    587         XMMRegister  xmm_1,     /* ymm5 */  /* full cycle is 16 iterations */
    588         XMMRegister  xmm_2,     /* ymm6 */
    589         XMMRegister  xmm_3,     /* ymm7 */
    590         Register  reg_a,        /* == rax on iteration 0, then the eight registers rotate right on each subsequent iteration */
    591         Register  reg_b,        /* rbx */    /* full cycle is 8 iterations */
       
   592         Register  reg_c,        /* rdi */
       
   593         Register  reg_d,        /* rsi */
       
   594         Register  reg_e,        /* r8 */
       
   595         Register  reg_f,        /* r9d */
       
   596         Register  reg_g,        /* r10d */
       
   597         Register  reg_h,        /* r11d */
       
   598         int iter)
       
   599 {
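          // Besides one compression round, each call advances the message schedule
          //   W[t] = W[t-16] + s0(W[t-15]) + W[t-7] + s1(W[t-2])
          // with s0(x) = ror(x,7) ^ ror(x,18) ^ (x >> 3)
          //      s1(x) = ror(x,17) ^ ror(x,19) ^ (x >> 10)
          // spread over four consecutive iterations (iter%4 == 0..3), leaving the
          // next four W values in xmm_0.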
       
   600   movl(rcx, reg_a);           // rcx = reg_a               ; MAJA
       
   601   rorxd(r13, reg_e, 25);      // r13 = reg_e >> 25    ; S1A
       
   602   rorxd(r14, reg_e, 11);      //  r14 = reg_e >> 11    ; S1B
       
   603   addl(reg_h, Address(rsp, rdx, Address::times_1, 4*iter));
       
   604   orl(rcx, reg_c);            // rcx = reg_a|reg_c          ; MAJA
       
   605 
       
   606   movl(r15, reg_f);           // r15 = reg_f               ; CH
       
   607   rorxd(r12, reg_a, 13);      // r12 = reg_a >> 13      ; S0B
       
   608   xorl(r13, r14);             // r13 = (reg_e>>25) ^ (reg_e>>11)  ; S1
       
   609   xorl(r15, reg_g);           // r15 = reg_f^reg_g         ; CH
       
   610 
       
   611   rorxd(r14, reg_e, 6);       // r14 = (reg_e >> 6)    ; S1
       
   612   andl(r15, reg_e);           // r15 = (reg_f^reg_g)&reg_e ; CH
       
   613 
       
   614   xorl(r13, r14);             // r13 = (reg_e>>25) ^ (reg_e>>11) ^ (reg_e>>6) ; S1
       
   615   rorxd(r14, reg_a, 22);      // r14 = reg_a >> 22    ; S0A
       
   616   addl(reg_d, reg_h);         // reg_d = k + w + reg_h + reg_d                     ; --
       
   617 
       
   618   andl(rcx, reg_b);          // rcx = (reg_a|reg_c)&reg_b                          ; MAJA
       
   619   xorl(r14, r12);            // r14 = (reg_a>>22) ^ (reg_a>>13)  ; S0
       
   620 
       
   621   rorxd(r12, reg_a, 2);      // r12 = (reg_a >> 2)    ; S0
       
   622   xorl(r15, reg_g);          // r15 = CH = ((reg_f^reg_g)&reg_e)^reg_g                 ; CH
       
   623 
       
   624   xorl(r14, r12);            // r14 = (reg_a>>22) ^ (reg_a>>13) ^ (reg_a>>2) ; S0
       
   625   movl(r12, reg_a);          // r12 = reg_a                                ; MAJB
       
   626   andl(r12, reg_c);          // r12 = reg_a&reg_c                              ; MAJB
       
   627   addl(r15, r13);            // r15 = S1 + CH                          ; --
       
   628 
       
    629   orl(rcx, r12);             // rcx = MAJ = ((reg_a|reg_c)&reg_b)|(reg_a&reg_c)            ; MAJ
       
   630   addl(reg_h, r14);          // reg_h = k + w + reg_h + S0                    ; --
       
   631   addl(reg_d, r15);          // reg_d = k + w + reg_h + reg_d + S1 + CH = reg_d + t1  ; --
       
   632 
       
   633   addl(reg_h, r15);          // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; --
       
   634   addl(reg_h, rcx);          // reg_h = t1 + S0 + MAJ                     ; --
       
   635 
       
   636   if (iter%4 == 0) {
       
   637     vpalignr(xmm0, xmm_3, xmm_2, 4, AVX_256bit);   // ymm0 = W[-7]
       
   638     vpaddd(xmm0, xmm0, xmm_0, AVX_256bit);         // ymm0 = W[-7] + W[-16]; y1 = (e >> 6)     ; S1
       
   639     vpalignr(xmm1, xmm_1, xmm_0, 4, AVX_256bit);   // ymm1 = W[-15]
       
   640     vpsrld(xmm2, xmm1, 7, AVX_256bit);
       
   641     vpslld(xmm3, xmm1, 32-7, AVX_256bit);
       
   642     vpor(xmm3, xmm3, xmm2, AVX_256bit);            // ymm3 = W[-15] ror 7
       
   643     vpsrld(xmm2, xmm1,18, AVX_256bit);
       
   644   } else if (iter%4 == 1 ) {
       
   645     vpsrld(xmm8, xmm1, 3, AVX_256bit);             // ymm8 = W[-15] >> 3
       
   646     vpslld(xmm1, xmm1, 32-18, AVX_256bit);
       
   647     vpxor(xmm3, xmm3, xmm1, AVX_256bit);
       
   648     vpxor(xmm3, xmm3, xmm2, AVX_256bit);           // ymm3 = W[-15] ror 7 ^ W[-15] ror 18
       
   649     vpxor(xmm1, xmm3, xmm8, AVX_256bit);           // ymm1 = s0
       
   650     vpshufd(xmm2, xmm_3, 0xFA, AVX_256bit);        // 11111010b ; ymm2 = W[-2] {BBAA}
       
   651     vpaddd(xmm0, xmm0, xmm1, AVX_256bit);          // ymm0 = W[-16] + W[-7] + s0
       
   652     vpsrld(xmm8, xmm2, 10, AVX_256bit);            // ymm8 = W[-2] >> 10 {BBAA}
       
   653   } else if (iter%4 == 2) {
       
   654     vpsrlq(xmm3, xmm2, 19, AVX_256bit);            // ymm3 = W[-2] ror 19 {xBxA}
       
   655     vpsrlq(xmm2, xmm2, 17, AVX_256bit);            // ymm2 = W[-2] ror 17 {xBxA}
       
   656     vpxor(xmm2, xmm2, xmm3, AVX_256bit);
       
   657     vpxor(xmm8, xmm8, xmm2, AVX_256bit);           // ymm8 = s1 {xBxA}
       
   658     vpshufb(xmm8, xmm8, xmm10, AVX_256bit);        // ymm8 = s1 {00BA}
       
   659     vpaddd(xmm0, xmm0, xmm8, AVX_256bit);          // ymm0 = {..., ..., W[1], W[0]}
       
   660     vpshufd(xmm2, xmm0, 0x50, AVX_256bit);         // 01010000b ; ymm2 = W[-2] {DDCC}
       
   661   } else if (iter%4 == 3) {
       
   662     vpsrld(xmm11, xmm2, 10, AVX_256bit);           // ymm11 = W[-2] >> 10 {DDCC}
       
   663     vpsrlq(xmm3, xmm2, 19, AVX_256bit);            // ymm3 = W[-2] ror 19 {xDxC}
       
   664     vpsrlq(xmm2, xmm2, 17, AVX_256bit);            // ymm2 = W[-2] ror 17 {xDxC}
       
   665     vpxor(xmm2, xmm2, xmm3, AVX_256bit);
       
   666     vpxor(xmm11, xmm11, xmm2, AVX_256bit);         // ymm11 = s1 {xDxC}
       
   667     vpshufb(xmm11, xmm11, xmm12, AVX_256bit);      // ymm11 = s1 {DC00}
       
   668     vpaddd(xmm_0, xmm11, xmm0, AVX_256bit);        // xmm_0 = {W[3], W[2], W[1], W[0]}
       
   669   }
       
   670 }
       
   671 
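        // addm/addmq: add register r2 into the 32-/64-bit digest word at [r1 + disp],
        // leaving the sum both in r2 and back in memory.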
       
   672 void MacroAssembler::addm(int disp, Register r1, Register r2) {
       
   673   addl(r2, Address(r1, disp));
       
   674   movl(Address(r1, disp), r2);
       
   675 }
       
   676 
       
   677 void MacroAssembler::addmq(int disp, Register r1, Register r2) {
       
   678   addq(r2, Address(r1, disp));
       
   679   movq(Address(r1, disp), r2);
       
   680 }
       
   681 
       
   682 void MacroAssembler::sha256_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
       
   683   XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
       
   684   Register buf, Register state, Register ofs, Register limit, Register rsp,
       
   685   bool multi_block, XMMRegister shuf_mask) {
       
   686 
       
   687   Label loop0, loop1, loop2, loop3,
       
   688         last_block_enter, do_last_block, only_one_block, done_hash,
       
   689         compute_size, compute_size_end,
       
   690         compute_size1, compute_size_end1;
       
   691 
       
   692   address K256_W = StubRoutines::x86::k256_W_addr();
       
   693   address pshuffle_byte_flip_mask = StubRoutines::x86::pshuffle_byte_flip_mask_addr();
       
   694   address pshuffle_byte_flip_mask_addr = 0;
       
   695 
       
   696 const XMMRegister& SHUF_00BA        = xmm10;    // ymm10: shuffle xBxA -> 00BA
       
   697 const XMMRegister& SHUF_DC00        = xmm12;    // ymm12: shuffle xDxC -> DC00
       
   698 const XMMRegister& BYTE_FLIP_MASK   = xmm13;   // ymm13
       
   699 
       
   700 const XMMRegister& X_BYTE_FLIP_MASK = xmm13;   //XMM version of BYTE_FLIP_MASK
       
   701 
       
   702 const Register& NUM_BLKS = r8;   // 3rd arg
       
   703 const Register& CTX      = rdx;  // 2nd arg
       
   704 const Register& INP      = rcx;  // 1st arg
       
   705 
       
   706 const Register& c        = rdi;
       
   707 const Register& d        = rsi;
       
   708 const Register& e        = r8;    // clobbers NUM_BLKS
       
   709 const Register& y3       = rcx;  // clobbers INP
       
   710 
       
   711 const Register& TBL      = rbp;
       
   712 const Register& SRND     = CTX;   // SRND is same register as CTX
       
   713 
       
   714 const Register& a        = rax;
       
   715 const Register& b        = rbx;
       
   716 const Register& f        = r9;
       
   717 const Register& g        = r10;
       
   718 const Register& h        = r11;
       
   719 
       
   720 const Register& T1       = r12;
       
   721 const Register& y0       = r13;
       
   722 const Register& y1       = r14;
       
   723 const Register& y2       = r15;
       
   724 
       
   725 
       
   726 enum {
       
   727   _XFER_SIZE = 2*64*4, // 2 blocks, 64 rounds, 4 bytes/round
       
   728   _INP_END_SIZE = 8,
       
   729   _INP_SIZE = 8,
       
   730   _CTX_SIZE = 8,
       
   731   _RSP_SIZE = 8,
       
   732 
       
   733   _XFER = 0,
       
   734   _INP_END   = _XFER     + _XFER_SIZE,
       
   735   _INP       = _INP_END  + _INP_END_SIZE,
       
   736   _CTX       = _INP      + _INP_SIZE,
       
   737   _RSP       = _CTX      + _CTX_SIZE,
       
   738   STACK_SIZE = _RSP      + _RSP_SIZE
       
   739 };
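        // Stack frame: _XFER holds the pre-computed K + W values for two blocks
        // (2 * 64 rounds * 4 bytes), followed by the saved end-of-input pointer,
        // input pointer, state pointer and the caller's rsp. rsp is aligned down
        // to 32 bytes below and restored from _RSP in done_hash.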
       
   740 
       
    741 #ifndef _WIN64
    742   push(rcx);    // linux: this is limit, needed again at the end
    743   push(rdx);    // linux: this is ofs
    744 #else
    745   push(r8);     // win64: this is ofs
    746   push(r9);     // win64: this is limit, we need them again at the very end
    747 #endif
       
   748 
       
   749 
       
   750   push(rbx);
       
   751 #ifdef _WIN64
       
   752   push(rsi);
       
   753   push(rdi);
       
   754 #endif
       
   755   push(rbp);
       
   756   push(r12);
       
   757   push(r13);
       
   758   push(r14);
       
   759   push(r15);
       
   760 
       
   761   movq(rax, rsp);
       
   762   subq(rsp, STACK_SIZE);
       
   763   andq(rsp, -32);
       
   764   movq(Address(rsp, _RSP), rax);
       
   765 
       
   766 #ifndef _WIN64
       
    767   // copy the linux parameters into the win64 registers, so the rest of the code is the same for both ABIs
       
   768   movq(r9,  rcx);
       
   769   movq(r8,  rdx);
       
   770   movq(rdx, rsi);
       
   771   movq(rcx, rdi);
       
   772 #endif
       
   773 
       
    774   // set up the original assembly ABI
    775   /** message to hash in INP */
    776   lea(INP, Address(rcx, 0));    // rcx == message (buf)     ;; linux: INP = buf = rdi
    777   /** digest in CTX             */
    778   movq(CTX, rdx);               // rdx = digest  (state)    ;; linux: CTX = state = rsi
    779 
    780   /** NUM_BLKS is the length of the message in bytes; it has to be derived from ofs and limit */
       
   781   if (multi_block) {
       
   782 
       
   783     // Win64: cannot directly update NUM_BLKS, since NUM_BLKS = ofs = r8
       
   784     // on entry r8 = ofs
       
   785     // on exit  r8 = NUM_BLKS
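            // The counting loop below steps ofs forward by 64 until it reaches limit,
            // accumulating the same amount in rax; the resulting byte count (the
            // ofs-to-limit distance rounded up to whole 64-byte blocks) becomes NUM_BLKS.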
       
   786 
       
   787     xorq(rax, rax);
       
   788 
       
   789     bind(compute_size);
       
   790     cmpptr(r8, r9); // assume the original ofs <= limit ;; linux:  cmp rcx, rdx
       
   791     jccb(Assembler::aboveEqual, compute_size_end);
       
   792     addq(r8, 64);                                          //;; linux: ofs = rdx
       
   793     addq(rax, 64);
       
   794     jmpb(compute_size);
       
   795 
       
   796     bind(compute_size_end);
       
   797     movq(NUM_BLKS, rax);  // NUM_BLK (r8)                  ;; linux: NUM_BLK = rdx
       
   798 
       
    799     cmpq(NUM_BLKS, 0);
    800     jcc(Assembler::equal, done_hash);
    801 
    802   } else {
    803     xorq(NUM_BLKS, NUM_BLKS);
    804     addq(NUM_BLKS, 64);
    805   } // end if (multi_block)
       
   806 
       
   807   lea(NUM_BLKS, Address(INP, NUM_BLKS, Address::times_1, -64)); // pointer to the last block
       
   808   movq(Address(rsp, _INP_END), NUM_BLKS);  //
       
   809 
       
   810   cmpptr(INP, NUM_BLKS);                   //cmp INP, NUM_BLKS
       
   811   jcc(Assembler::equal, only_one_block);   //je only_one_block
       
   812 
       
   813   // load initial digest
       
   814   movl(a, Address(CTX, 4*0));
       
   815   movl(b, Address(CTX, 4*1));
       
   816   movl(c, Address(CTX, 4*2));
       
   817   movl(d, Address(CTX, 4*3));
       
   818   movl(e, Address(CTX, 4*4));
       
   819   movl(f, Address(CTX, 4*5));
       
   820   // load g - r10 after it is used as scratch
       
   821   movl(h, Address(CTX, 4*7));
       
   822 
       
   823   pshuffle_byte_flip_mask_addr = pshuffle_byte_flip_mask;
       
   824   vmovdqu(BYTE_FLIP_MASK, ExternalAddress(pshuffle_byte_flip_mask_addr +0)); //[PSHUFFLE_BYTE_FLIP_MASK wrt rip]
       
   825   vmovdqu(SHUF_00BA, ExternalAddress(pshuffle_byte_flip_mask_addr + 32));     //[_SHUF_00BA wrt rip]
       
   826   vmovdqu(SHUF_DC00, ExternalAddress(pshuffle_byte_flip_mask_addr + 64));     //[_SHUF_DC00 wrt rip]
       
   827 
       
   828   movl(g, Address(CTX, 4*6));
       
   829 
       
   830   movq(Address(rsp, _CTX), CTX);           // store
       
   831 
       
   832 bind(loop0);
       
   833   lea(TBL, ExternalAddress(K256_W));
       
   834 
       
   835   // assume buffers not aligned
       
   836 
       
   837   // Load first 16 dwords from two blocks
       
   838   vmovdqu(xmm0, Address(INP, 0*32));
       
   839   vmovdqu(xmm1, Address(INP, 1*32));
       
   840   vmovdqu(xmm2, Address(INP, 2*32));
       
   841   vmovdqu(xmm3, Address(INP, 3*32));
       
   842 
       
   843   // byte swap data
       
   844   vpshufb(xmm0, xmm0, BYTE_FLIP_MASK, AVX_256bit);
       
   845   vpshufb(xmm1, xmm1, BYTE_FLIP_MASK, AVX_256bit);
       
   846   vpshufb(xmm2, xmm2, BYTE_FLIP_MASK, AVX_256bit);
       
   847   vpshufb(xmm3, xmm3, BYTE_FLIP_MASK, AVX_256bit);
       
   848 
       
   849   // transpose data into high/low halves
       
   850   vperm2i128(xmm4, xmm0, xmm2, 0x20);
       
   851   vperm2i128(xmm5, xmm0, xmm2, 0x31);
       
   852   vperm2i128(xmm6, xmm1, xmm3, 0x20);
       
   853   vperm2i128(xmm7, xmm1, xmm3, 0x31);
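          // After the transpose each ymm register holds the same four message words
          // for both blocks: block 1 in the low 128-bit lane, block 2 in the high lane
          // (ymm4 = W[0..3], ymm5 = W[4..7], ymm6 = W[8..11], ymm7 = W[12..15]).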
       
   854 
       
   855 bind(last_block_enter);
       
   856   addq(INP, 64);
       
   857   movq(Address(rsp, _INP), INP);
       
   858 
       
   859   //;; schedule 48 input dwords, by doing 3 rounds of 12 each
       
   860   xorq(SRND, SRND);
       
   861 
       
   862 align(16);
       
   863 bind(loop1);
       
   864   vpaddd(xmm9, xmm4, Address(TBL, SRND, Address::times_1, 0*32), AVX_256bit);
       
   865   vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 0*32), xmm9);
       
   866   sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, rax, rbx, rdi, rsi, r8,  r9,  r10, r11, 0);
       
   867   sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r11, rax, rbx, rdi, rsi, r8,  r9,  r10, 1);
       
   868   sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r10, r11, rax, rbx, rdi, rsi, r8,  r9,  2);
       
   869   sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r9,  r10, r11, rax, rbx, rdi, rsi, r8,  3);
       
   870 
       
   871   vpaddd(xmm9, xmm5, Address(TBL, SRND, Address::times_1, 1*32), AVX_256bit);
       
   872   vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 1*32), xmm9);
       
   873   sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, r8,  r9,  r10, r11, rax, rbx, rdi, rsi,  8+0);
       
   874   sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rsi, r8,  r9,  r10, r11, rax, rbx, rdi,  8+1);
       
   875   sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rdi, rsi, r8,  r9,  r10, r11, rax, rbx,  8+2);
       
   876   sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rbx, rdi, rsi, r8,  r9,  r10, r11, rax,  8+3);
       
   877 
       
   878   vpaddd(xmm9, xmm6, Address(TBL, SRND, Address::times_1, 2*32), AVX_256bit);
       
   879   vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 2*32), xmm9);
       
   880   sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, rax, rbx, rdi, rsi, r8,  r9,  r10, r11, 16+0);
       
   881   sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r11, rax, rbx, rdi, rsi, r8,  r9,  r10, 16+1);
       
   882   sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r10, r11, rax, rbx, rdi, rsi, r8,  r9,  16+2);
       
   883   sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r9,  r10, r11, rax, rbx, rdi, rsi, r8,  16+3);
       
   884 
       
   885   vpaddd(xmm9, xmm7, Address(TBL, SRND, Address::times_1, 3*32), AVX_256bit);
       
   886   vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 3*32), xmm9);
       
   887 
       
   888   sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, r8,  r9,  r10, r11, rax, rbx, rdi, rsi,  24+0);
       
   889   sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rsi, r8,  r9,  r10, r11, rax, rbx, rdi,  24+1);
       
   890   sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rdi, rsi, r8,  r9,  r10, r11, rax, rbx,  24+2);
       
   891   sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rbx, rdi, rsi, r8,  r9,  r10, r11, rax,  24+3);
       
   892 
       
   893   addq(SRND, 4*32);
       
   894   cmpq(SRND, 3 * 4*32);
       
   895   jcc(Assembler::below, loop1);
       
   896 
       
   897 bind(loop2);
       
   898   // Do last 16 rounds with no scheduling
       
   899   vpaddd(xmm9, xmm4, Address(TBL, SRND, Address::times_1, 0*32), AVX_256bit);
       
   900   vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 0*32), xmm9);
       
   901   sha256_AVX2_four_rounds_compute_first(0);
       
   902 
       
   903   vpaddd(xmm9, xmm5, Address(TBL, SRND, Address::times_1, 1*32), AVX_256bit);
       
   904   vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 1*32), xmm9);
       
   905   sha256_AVX2_four_rounds_compute_last(0 + 8);
       
   906 
       
   907   addq(SRND, 2*32);
       
   908 
       
   909   vmovdqu(xmm4, xmm6);
       
   910   vmovdqu(xmm5, xmm7);
       
   911 
       
   912   cmpq(SRND, 4 * 4*32);
       
   913   jcc(Assembler::below, loop2);
       
   914 
       
   915   movq(CTX, Address(rsp, _CTX));
       
   916   movq(INP, Address(rsp, _INP));
       
   917 
       
   918   addm(4*0, CTX, a);
       
   919   addm(4*1, CTX, b);
       
   920   addm(4*2, CTX, c);
       
   921   addm(4*3, CTX, d);
       
   922   addm(4*4, CTX, e);
       
   923   addm(4*5, CTX, f);
       
   924   addm(4*6, CTX, g);
       
   925   addm(4*7, CTX, h);
       
   926 
       
   927   cmpq(INP, Address(rsp, _INP_END));
       
   928   jcc(Assembler::above, done_hash);
       
   929 
       
   930   //Do second block using previously scheduled results
       
   931   xorq(SRND, SRND);
       
   932 align(16);
       
   933 bind(loop3);
       
   934   sha256_AVX2_four_rounds_compute_first(4);
       
   935   sha256_AVX2_four_rounds_compute_last(4+8);
       
   936 
       
   937   addq(SRND, 2*32);
       
   938   cmpq(SRND, 4 * 4*32);
       
   939   jcc(Assembler::below, loop3);
       
   940 
       
   941   movq(CTX, Address(rsp, _CTX));
       
   942   movq(INP, Address(rsp, _INP));
       
   943   addq(INP, 64);
       
   944 
       
   945   addm(4*0, CTX, a);
       
   946   addm(4*1, CTX, b);
       
   947   addm(4*2, CTX, c);
       
   948   addm(4*3, CTX, d);
       
   949   addm(4*4, CTX, e);
       
   950   addm(4*5, CTX, f);
       
   951   addm(4*6, CTX, g);
       
   952   addm(4*7, CTX, h);
       
   953 
       
   954   cmpq(INP, Address(rsp, _INP_END));
       
   955   jcc(Assembler::below, loop0);
       
   956   jccb(Assembler::above, done_hash);
       
   957 
       
   958 bind(do_last_block);
       
   959   lea(TBL, ExternalAddress(K256_W));
       
   960 
       
   961   movdqu(xmm4, Address(INP, 0*16));
       
   962   movdqu(xmm5, Address(INP, 1*16));
       
   963   movdqu(xmm6, Address(INP, 2*16));
       
   964   movdqu(xmm7, Address(INP, 3*16));
       
   965 
       
   966   vpshufb(xmm4, xmm4, xmm13, AVX_128bit);
       
   967   vpshufb(xmm5, xmm5, xmm13, AVX_128bit);
       
   968   vpshufb(xmm6, xmm6, xmm13, AVX_128bit);
       
   969   vpshufb(xmm7, xmm7, xmm13, AVX_128bit);
       
   970 
       
   971   jmp(last_block_enter);
       
   972 
       
   973 bind(only_one_block);
       
   974 
       
    975   // load initial digest ;; the state should be preloaded with the following values
       
   976   movl(a, Address(CTX, 4*0));   // 0x6a09e667
       
   977   movl(b, Address(CTX, 4*1));   // 0xbb67ae85
       
   978   movl(c, Address(CTX, 4*2));   // 0x3c6ef372
       
   979   movl(d, Address(CTX, 4*3));   // 0xa54ff53a
       
   980   movl(e, Address(CTX, 4*4));   // 0x510e527f
       
   981   movl(f, Address(CTX, 4*5));   // 0x9b05688c
       
   982   // load g - r10 after use as scratch
       
   983   movl(h, Address(CTX, 4*7));   // 0x5be0cd19
       
   984 
       
   985 
       
   986   pshuffle_byte_flip_mask_addr = pshuffle_byte_flip_mask;
       
   987   vmovdqu(BYTE_FLIP_MASK, ExternalAddress(pshuffle_byte_flip_mask_addr + 0)); //[PSHUFFLE_BYTE_FLIP_MASK wrt rip]
       
   988   vmovdqu(SHUF_00BA, ExternalAddress(pshuffle_byte_flip_mask_addr + 32));     //[_SHUF_00BA wrt rip]
       
   989   vmovdqu(SHUF_DC00, ExternalAddress(pshuffle_byte_flip_mask_addr + 64));     //[_SHUF_DC00 wrt rip]
       
   990 
       
   991   movl(g, Address(CTX, 4*6));   // 0x1f83d9ab
       
   992 
       
   993   movq(Address(rsp, _CTX), CTX);
       
   994   jmpb(do_last_block);
       
   995 
       
   996 bind(done_hash);
       
   997 
       
   998   movq(rsp, Address(rsp, _RSP));
       
   999 
       
  1000   pop(r15);
       
  1001   pop(r14);
       
  1002   pop(r13);
       
  1003   pop(r12);
       
  1004   pop(rbp);
       
  1005 #ifdef _WIN64
       
  1006   pop(rdi);
       
  1007   pop(rsi);
       
  1008 #endif
       
  1009   pop(rbx);
       
  1010 
       
  1011 #ifdef _WIN64
       
  1012   pop(r9);
       
  1013   pop(r8);
       
  1014 #else
       
  1015   pop(rdx);
       
  1016   pop(rcx);
       
  1017 #endif
       
  1018 
       
  1019   if (multi_block) {
       
  1020 #ifdef _WIN64
       
  1021 const Register& limit_end = r9;
       
  1022 const Register& ofs_end   = r8;
       
  1023 #else
       
  1024 const Register& limit_end = rcx;
       
  1025 const Register& ofs_end   = rdx;
       
  1026 #endif
       
  1027     movq(rax, ofs_end);
       
  1028 
       
  1029 bind(compute_size1);
       
  1030     cmpptr(rax, limit_end); // assume the original ofs <= limit
       
  1031     jccb(Assembler::aboveEqual, compute_size_end1);
       
  1032     addq(rax, 64);
       
  1033     jmpb(compute_size1);
       
  1034 
       
  1035 bind(compute_size_end1);
       
  1036   }
       
  1037 }
       
  1038 
       
  1039 void MacroAssembler::sha512_AVX2_one_round_compute(Register  old_h, Register a, Register b, Register c,
       
  1040                                                    Register d, Register e, Register f, Register g, Register h,
       
  1041                                                    int iteration)
       
  1042 {
       
  1043 
       
  1044     const Register& y0 = r13;
       
  1045     const Register& y1 = r14;
       
  1046     const Register& y2 = r15;
       
  1047 #ifdef _WIN64
       
  1048     const Register& y3 = rcx;
       
  1049 #else
       
  1050     const Register& y3 = rdi;
       
  1051 #endif
       
  1052     const Register& T1 = r12;
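            // One SHA-512 round, same structure as the SHA-256 path but with 64-bit
            // rotation amounts:
            //   Sigma1(e) = ror(e,14) ^ ror(e,18) ^ ror(e,41)
            //   Sigma0(a) = ror(a,28) ^ ror(a,34) ^ ror(a,39)
            // The final additions into h are again deferred to the next call via old_h.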
       
  1053 
       
  1054     if (iteration % 4 > 0) {
       
  1055       addq(old_h, y2); //h = k + w + h + S0 + S1 + CH = t1 + S0;
       
  1056     }
       
  1057     movq(y2, f); //y2 = f; CH
       
  1058     rorxq(y0, e, 41); //y0 = e >> 41; S1A
       
  1059     rorxq(y1, e, 18); //y1 = e >> 18; S1B
       
  1060     xorq(y2, g); //y2 = f^g; CH
       
  1061 
       
  1062     xorq(y0, y1); //y0 = (e >> 41) ^ (e >> 18); S1
       
  1063     rorxq(y1, e, 14); //y1 = (e >> 14); S1
       
  1064     andq(y2, e); //y2 = (f^g)&e; CH
       
  1065 
       
  1066     if (iteration % 4 > 0 ) {
       
  1067       addq(old_h, y3); //h = t1 + S0 + MAJ
       
  1068     }
       
  1069     xorq(y0, y1); //y0 = (e >> 41) ^ (e >> 18) ^ (e >> 14); S1
       
  1070     rorxq(T1, a, 34); //T1 = a >> 34; S0B
       
  1071     xorq(y2, g); //y2 = CH = ((f^g)&e) ^g; CH
       
  1072     rorxq(y1, a, 39); //y1 = a >> 39; S0A
       
  1073     movq(y3, a); //y3 = a; MAJA
       
  1074 
       
  1075     xorq(y1, T1); //y1 = (a >> 39) ^ (a >> 34); S0
       
  1076     rorxq(T1, a, 28); //T1 = (a >> 28); S0
       
  1077     addq(h, Address(rsp, (8 * iteration))); //h = k + w + h; --
       
  1078     orq(y3, c); //y3 = a | c; MAJA
       
  1079 
       
  1080     xorq(y1, T1); //y1 = (a >> 39) ^ (a >> 34) ^ (a >> 28); S0
       
  1081     movq(T1, a); //T1 = a; MAJB
       
  1082     andq(y3, b); //y3 = (a | c)&b; MAJA
       
  1083     andq(T1, c); //T1 = a&c; MAJB
       
  1084     addq(y2, y0); //y2 = S1 + CH; --
       
  1085 
       
  1086     addq(d, h); //d = k + w + h + d; --
       
   1087     orq(y3, T1); //y3 = MAJ = ((a | c)&b) | (a&c); MAJ
       
  1088     addq(h, y1); //h = k + w + h + S0; --
       
  1089 
       
  1090     addq(d, y2); //d = k + w + h + d + S1 + CH = d + t1; --
       
  1091 
       
  1092     if (iteration % 4 == 3) {
       
  1093       addq(h, y2); //h = k + w + h + S0 + S1 + CH = t1 + S0; --
       
  1094       addq(h, y3); //h = t1 + S0 + MAJ; --
       
  1095     }
       
  1096 }
       
  1097 
       
  1098 void MacroAssembler::sha512_AVX2_one_round_and_schedule(
       
  1099     XMMRegister xmm4, // ymm4
       
  1100     XMMRegister xmm5, // ymm5
       
  1101     XMMRegister xmm6, // ymm6
       
  1102     XMMRegister xmm7, // ymm7
       
  1103     Register a, //rax
       
  1104     Register b, //rbx
       
  1105     Register c, //rdi
       
  1106     Register d, //rsi
       
  1107     Register e, //r8
       
  1108     Register f, //r9
       
  1109     Register g, //r10
       
  1110     Register h, //r11
       
  1111     int iteration)
       
  1112 {
       
  1113 
       
  1114     const Register& y0 = r13;
       
  1115     const Register& y1 = r14;
       
  1116     const Register& y2 = r15;
       
  1117 #ifdef _WIN64
       
  1118     const Register& y3 = rcx;
       
  1119 #else
       
  1120     const Register& y3 = rdi;
       
  1121 #endif
       
  1122     const Register& T1 = r12;
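            // The scheduling half of each call advances
            //   W[t] = W[t-16] + s0(W[t-15]) + W[t-7] + s1(W[t-2])
            // with s0(x) = ror(x,1) ^ ror(x,8) ^ (x >> 7)
            //      s1(x) = ror(x,19) ^ ror(x,61) ^ (x >> 6),
            // producing w[16..19] two words at a time over the four iterations of a group.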
       
  1123 
       
  1124     if (iteration % 4 == 0) {
       
  1125       // Extract w[t - 7]
       
  1126       // xmm0 = W[-7]
       
  1127       vperm2f128(xmm0, xmm7, xmm6, 3);
       
  1128       vpalignr(xmm0, xmm0, xmm6, 8, AVX_256bit);
       
  1129 
       
  1130       // Calculate w[t - 16] + w[t - 7]
       
  1131       vpaddq(xmm0, xmm0, xmm4, AVX_256bit); //xmm0 = W[-7] + W[-16]
       
  1132       // Extract w[t - 15]
       
  1133       //xmm1 = W[-15]
       
  1134       vperm2f128(xmm1, xmm5, xmm4, 3);
       
  1135       vpalignr(xmm1, xmm1, xmm4, 8, AVX_256bit);
       
  1136 
       
  1137       // Calculate sigma0
       
  1138       // Calculate w[t - 15] ror 1
       
  1139       vpsrlq(xmm2, xmm1, 1, AVX_256bit);
       
  1140       vpsllq(xmm3, xmm1, (64 - 1), AVX_256bit);
       
  1141       vpor(xmm3, xmm3, xmm2, AVX_256bit); //xmm3 = W[-15] ror 1
       
  1142       // Calculate w[t - 15] shr 7
       
  1143       vpsrlq(xmm8, xmm1, 7, AVX_256bit); //xmm8 = W[-15] >> 7
       
  1144 
       
  1145     } else if (iteration % 4 == 1) {
       
  1146       //Calculate w[t - 15] ror 8
       
  1147       vpsrlq(xmm2, xmm1, 8, AVX_256bit);
       
  1148       vpsllq(xmm1, xmm1, (64 - 8), AVX_256bit);
       
  1149       vpor(xmm1, xmm1, xmm2, AVX_256bit); //xmm1 = W[-15] ror 8
       
  1150 
       
  1151       //XOR the three components
       
  1152       vpxor(xmm3, xmm3, xmm8, AVX_256bit); //xmm3 = W[-15] ror 1 ^ W[-15] >> 7
       
  1153       vpxor(xmm1, xmm3, xmm1, AVX_256bit); //xmm1 = s0
       
  1154 
       
  1155       //Add three components, w[t - 16], w[t - 7] and sigma0
       
  1156       vpaddq(xmm0, xmm0, xmm1, AVX_256bit); //xmm0 = W[-16] + W[-7] + s0
       
  1157 
       
  1158       // Move to appropriate lanes for calculating w[16] and w[17]
       
  1159       vperm2f128(xmm4, xmm0, xmm0, 0); //xmm4 = W[-16] + W[-7] + s0{ BABA }
       
  1160 
       
  1161       //Move to appropriate lanes for calculating w[18] and w[19]
       
  1162       vpand(xmm0, xmm0, xmm10, AVX_256bit); //xmm0 = W[-16] + W[-7] + s0{ DC00 }
       
  1163       //Calculate w[16] and w[17] in both 128 bit lanes
       
  1164       //Calculate sigma1 for w[16] and w[17] on both 128 bit lanes
       
  1165       vperm2f128(xmm2, xmm7, xmm7, 17); //xmm2 = W[-2] {BABA}
       
  1166       vpsrlq(xmm8, xmm2, 6, AVX_256bit); //xmm8 = W[-2] >> 6 {BABA}
       
  1167 
       
  1168     } else if (iteration % 4 == 2) {
       
  1169       vpsrlq(xmm3, xmm2, 19, AVX_256bit); //xmm3 = W[-2] >> 19 {BABA}
       
  1170       vpsllq(xmm1, xmm2, (64 - 19), AVX_256bit); //xmm1 = W[-2] << 19 {BABA}
       
  1171       vpor(xmm3, xmm3, xmm1, AVX_256bit); //xmm3 = W[-2] ror 19 {BABA}
       
  1172       vpxor(xmm8, xmm8, xmm3, AVX_256bit);// xmm8 = W[-2] ror 19 ^ W[-2] >> 6 {BABA}
       
  1173       vpsrlq(xmm3, xmm2, 61, AVX_256bit); //xmm3 = W[-2] >> 61 {BABA}
       
  1174       vpsllq(xmm1, xmm2, (64 - 61), AVX_256bit); //xmm1 = W[-2] << 61 {BABA}
       
  1175       vpor(xmm3, xmm3, xmm1, AVX_256bit); //xmm3 = W[-2] ror 61 {BABA}
       
  1176       vpxor(xmm8, xmm8, xmm3, AVX_256bit); //xmm8 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) { BABA }
       
  1177 
       
  1178       //Add sigma1 to the other components to get w[16] and w[17]
       
  1179       vpaddq(xmm4, xmm4, xmm8, AVX_256bit); //xmm4 = { W[1], W[0], W[1], W[0] }
       
  1180 
       
  1181       //Calculate sigma1 for w[18] and w[19] for upper 128 bit lane
       
  1182       vpsrlq(xmm8, xmm4, 6, AVX_256bit); //xmm8 = W[-2] >> 6 {DC--}
       
  1183 
       
  1184     } else if (iteration % 4 == 3){
       
  1185       vpsrlq(xmm3, xmm4, 19, AVX_256bit); //xmm3 = W[-2] >> 19 {DC--}
       
  1186       vpsllq(xmm1, xmm4, (64 - 19), AVX_256bit); //xmm1 = W[-2] << 19 {DC--}
       
  1187       vpor(xmm3, xmm3, xmm1, AVX_256bit); //xmm3 = W[-2] ror 19 {DC--}
       
  1188       vpxor(xmm8, xmm8, xmm3, AVX_256bit); //xmm8 = W[-2] ror 19 ^ W[-2] >> 6 {DC--}
       
  1189       vpsrlq(xmm3, xmm4, 61, AVX_256bit); //xmm3 = W[-2] >> 61 {DC--}
       
  1190       vpsllq(xmm1, xmm4, (64 - 61), AVX_256bit); //xmm1 = W[-2] << 61 {DC--}
       
  1191       vpor(xmm3, xmm3, xmm1, AVX_256bit); //xmm3 = W[-2] ror 61 {DC--}
       
  1192       vpxor(xmm8, xmm8, xmm3, AVX_256bit); //xmm8 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) { DC-- }
       
  1193 
       
  1194       //Add sigma0 + w[t - 7] + w[t - 16] (already in xmm0, in the w[18]/w[19] lanes) to the newly calculated sigma1 to get w[18] and w[19]
       
  1195       vpaddq(xmm2, xmm0, xmm8, AVX_256bit); //xmm2 = { W[3], W[2], --, -- }
       
  1196 
       
  1197       //Form w[19], w[18], w[17], w[16]
       
  1198       vpblendd(xmm4, xmm4, xmm2, 0xF0, AVX_256bit); //xmm4 = { W[3], W[2], W[1], W[0] }
       
  1199     }
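    // A minimal scalar sketch (reference only, not emitted code) of the SHA-512
    // message-schedule recurrence that the four iteration cases above vectorize;
    // each group of four calls produces the next four words W[16..19] of the
    // current window. rotr64 is a hypothetical helper, not part of this file.
    //
    //   uint64_t rotr64(uint64_t x, int n) { return (x >> n) | (x << (64 - n)); }
    //   for (int t = 16; t < 80; t++) {
    //     uint64_t s0 = rotr64(w[t-15], 1) ^ rotr64(w[t-15], 8) ^ (w[t-15] >> 7);
    //     uint64_t s1 = rotr64(w[t-2], 19) ^ rotr64(w[t-2], 61) ^ (w[t-2] >> 6);
    //     w[t] = w[t-16] + s0 + w[t-7] + s1;
    //   }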
       
  1200 
       
  1201     movq(y3, a); //y3 = a; MAJA
       
  1202     rorxq(y0, e, 41); //y0 = e ror 41; S1A

  1203     rorxq(y1, e, 18); //y1 = e ror 18; S1B
       
  1204     addq(h, Address(rsp, (iteration * 8))); //h = k + w + h; --
       
  1205     orq(y3, c); //y3 = a | c; MAJA
       
  1206     movq(y2, f); //y2 = f; CH
       
  1207 
       
  1208     xorq(y2, g); //y2 = f^g; CH
       
  1209 
       
  1210     rorxq(T1, a, 34); //T1 = a ror 34; S0B

  1211     xorq(y0, y1); //y0 = (e ror 41) ^ (e ror 18); S1

  1212 

  1213     rorxq(y1, e, 14); //y1 = (e ror 14); S1
       
  1214 
       
  1215     andq(y2, e); //y2 = (f^g) & e; CH
       
  1216     addq(d, h); //d = k + w + h + d; --
       
  1217 
       
  1218     andq(y3, b); //y3 = (a | c)&b; MAJA
       
  1219     xorq(y0, y1); //y0 = (e ror 41) ^ (e ror 18) ^ (e ror 14); S1

  1220     rorxq(y1, a, 39); //y1 = a ror 39; S0A
       
  1221 
       
  1222     xorq(y1, T1); //y1 = (a ror 39) ^ (a ror 34); S0

  1223     rorxq(T1, a, 28); //T1 = (a ror 28); S0

  1224     xorq(y2, g); //y2 = CH = ((f^g)&e) ^ g; CH

  1225 

  1226     xorq(y1, T1); //y1 = (a ror 39) ^ (a ror 34) ^ (a ror 28); S0
       
  1227     movq(T1, a); //T1 = a; MAJB
       
  1228 
       
  1229     andq(T1, c); //T1 = a&c; MAJB
       
  1230     addq(y2, y0); //y2 = S1 + CH; --
       
  1231 
       
  1232     orq(y3, T1); //y3 = MAJ = ((a | c)&b) | (a&c); MAJ
       
  1233     addq(h, y1); //h = k + w + h + S0; --
       
  1234 
       
  1235     addq(d, y2); //d = k + w + h + d + S1 + CH = d + t1; --
       
  1236     addq(h, y2); //h = k + w + h + S0 + S1 + CH = t1 + S0; --
       
  1237     addq(h, y3); //h = t1 + S0 + MAJ; --
       
  1238 }
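// A hedged scalar sketch of the single SHA-512 round computed by the GPR code
// above; the caller rotates the working variables by passing them in shifted
// order rather than moving registers. rotr64 is a hypothetical helper and
// k_plus_w stands for the precomputed K[t] + W[t] value read from the _XFER
// slot on the stack.
//
//   uint64_t S1  = rotr64(e, 14) ^ rotr64(e, 18) ^ rotr64(e, 41);
//   uint64_t ch  = ((f ^ g) & e) ^ g;             // == (e & f) ^ (~e & g)
//   uint64_t S0  = rotr64(a, 28) ^ rotr64(a, 34) ^ rotr64(a, 39);
//   uint64_t maj = ((a | c) & b) | (a & c);       // == (a & b) ^ (a & c) ^ (b & c)
//   uint64_t t1  = h + S1 + ch + k_plus_w;
//   d += t1;                                      // becomes the new e
//   h  = t1 + S0 + maj;                           // becomes the new a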
       
  1239 
       
  1240 void MacroAssembler::sha512_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
       
  1241                                  XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
       
  1242                                  Register buf, Register state, Register ofs, Register limit, Register rsp,
       
  1243                                  bool multi_block, XMMRegister shuf_mask)
       
  1244 {
       
  1245 
       
  1246     Label loop0, loop1, loop2, done_hash,

  1247           compute_block_size, compute_size,

  1248           compute_block_size_end, compute_size_end;
       
  1249 
       
  1250     address K512_W = StubRoutines::x86::k512_W_addr();
       
  1251     address pshuffle_byte_flip_mask_sha512 = StubRoutines::x86::pshuffle_byte_flip_mask_addr_sha512();
       
  1252     address pshuffle_byte_flip_mask_addr = 0;
       
  1253 
       
  1254     const XMMRegister& XFER = xmm0; // YTMP0
       
  1255     const XMMRegister& BYTE_FLIP_MASK = xmm9; // ymm9
       
  1256     const XMMRegister& YMM_MASK_LO = xmm10; // ymm10
       
  1257 #ifdef _WIN64
       
  1258     const Register& INP = rcx; //1st arg
       
  1259     const Register& CTX = rdx; //2nd arg
       
  1260     const Register& NUM_BLKS = r8; //3rd arg
       
  1261     const Register& c = rdi;
       
  1262     const Register& d = rsi;
       
  1263     const Register& e = r8;
       
  1264     const Register& y3 = rcx;
       
  1265     const Register& offset = r8;
       
  1266     const Register& input_limit = r9;
       
  1267 #else
       
  1268     const Register& INP = rdi; //1st arg
       
  1269     const Register& CTX = rsi; //2nd arg
       
  1270     const Register& NUM_BLKS = rdx; //3rd arg
       
  1271     const Register& c  = rcx;
       
  1272     const Register& d  = r8;
       
  1273     const Register& e  = rdx;
       
  1274     const Register& y3 = rdi;
       
  1275     const Register& offset = rdx;
       
  1276     const Register& input_limit = rcx;
       
  1277 #endif
       
  1278 
       
  1279     const Register& TBL = rbp;
       
  1280 
       
  1281     const Register& a = rax;
       
  1282     const Register& b = rbx;
       
  1283 
       
  1284     const Register& f = r9;
       
  1285     const Register& g = r10;
       
  1286     const Register& h = r11;
       
  1287 
       
  1288     //Local variables as defined in assembly file.
       
  1289     enum
       
  1290     {
       
  1291       _XFER_SIZE = 4 * 8, // resq 4 => reserve 4 quadwords. Hence 4 * 8
       
  1292       _SRND_SIZE = 8, // resq 1
       
  1293       _INP_SIZE = 8,
       
  1294       _INP_END_SIZE = 8,
       
  1295       _RSP_SAVE_SIZE = 8,  // defined as resq 1
       
  1296 
       
  1297 #ifdef _WIN64
       
  1298       _GPR_SAVE_SIZE = 8 * 8, // defined as resq 8
       
  1299 #else
       
  1300       _GPR_SAVE_SIZE = 6 * 8 // resq 6
       
  1301 #endif
       
  1302     };
       
  1303 
       
  1304     enum
       
  1305     {
       
  1306       _XFER = 0,
       
  1307       _SRND = _XFER + _XFER_SIZE, // 32
       
  1308       _INP = _SRND + _SRND_SIZE, // 40
       
  1309       _INP_END = _INP + _INP_SIZE, // 48
       
  1310       _RSP = _INP_END + _INP_END_SIZE, // 56
       
  1311       _GPR = _RSP + _RSP_SAVE_SIZE, // 64
       
  1312       _STACK_SIZE = _GPR + _GPR_SAVE_SIZE // 128 for windows and 112 for linux.
       
  1313     };
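    // Resulting frame layout relative to the 32-byte-aligned rsp (offsets in bytes):
    //   [ 0..31]  _XFER     four K[t] + W[t] quadwords for the current round group
    //   [32..39]  _SRND     round-group counter
    //   [40..47]  _INP      saved input pointer
    //   [48..55]  _INP_END  pointer to the end of the input data
    //   [56..63]  _RSP      saved pre-alignment stack pointer
    //   [64.. ]   _GPR      callee-saved GPRs (8 on Windows, 6 elsewhere)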
       
  1314 
       
  1315 //Save offset and limit; they are needed for the block-size calculation in the multi-block SHA512 case.
       
  1316 #ifdef _WIN64
       
  1317     push(r8);    // win64: this is ofs
       
  1318     push(r9);    // win64: this is limit, we need them again at the very end.
       
  1319 #else
       
  1320     push(rdx);   // linux : this is ofs, need at the end for multiblock calculation
       
  1321     push(rcx);   // linux: This is the limit.
       
  1322 #endif
       
  1323 
       
  1324     //Allocate Stack Space
       
  1325     movq(rax, rsp);
       
  1326     subq(rsp, _STACK_SIZE);
       
  1327     andq(rsp, -32);
       
  1328     movq(Address(rsp, _RSP), rax);
       
  1329 
       
  1330     //Save GPRs
       
  1331     movq(Address(rsp, _GPR), rbp);
       
  1332     movq(Address(rsp, (_GPR + 8)), rbx);
       
  1333     movq(Address(rsp, (_GPR + 16)), r12);
       
  1334     movq(Address(rsp, (_GPR + 24)), r13);
       
  1335     movq(Address(rsp, (_GPR + 32)), r14);
       
  1336     movq(Address(rsp, (_GPR + 40)), r15);
       
  1337 
       
  1338 #ifdef _WIN64
       
  1339     movq(Address(rsp, (_GPR + 48)), rsi);
       
  1340     movq(Address(rsp, (_GPR + 56)), rdi);
       
  1341 #endif
       
  1342 
       
  1343     vpblendd(xmm0, xmm0, xmm1, 0xF0, AVX_128bit);
       
  1344     vpblendd(xmm0, xmm0, xmm1, 0xF0, AVX_256bit);
       
  1345 
       
  1346     if (multi_block) {
       
  1347       xorq(rax, rax);
       
  1348       bind(compute_block_size);
       
  1349       cmpptr(offset, input_limit); // Assuming that offset is less than limit.
       
  1350       jccb(Assembler::aboveEqual, compute_block_size_end);
       
  1351       addq(offset, 128);
       
  1352       addq(rax, 128);
       
  1353       jmpb(compute_block_size);
       
  1354 
       
  1355       bind(compute_block_size_end);
       
  1356       movq(NUM_BLKS, rax);
       
  1357 
       
  1358       cmpq(NUM_BLKS, 0);
       
  1359       jcc(Assembler::equal, done_hash);
       
  1360     } else {
       
  1361       xorq(NUM_BLKS, NUM_BLKS); //If single block.
       
  1362       addq(NUM_BLKS, 128);
       
  1363     }
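    // A minimal sketch of what the block-size computation above produces,
    // assuming ofs starts below limit and both advance in whole blocks;
    // note that, despite its name, NUM_BLKS ends up holding a byte count.
    //
    //   int64_t num_bytes = 0;
    //   if (multi_block) {
    //     for (int64_t o = ofs; o < limit; o += 128) {
    //       num_bytes += 128;              // one 128-byte SHA-512 block
    //     }
    //   } else {
    //     num_bytes = 128;                 // single block
    //   }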
       
  1364 
       
  1365     addq(NUM_BLKS, INP); //pointer to end of data
       
  1366     movq(Address(rsp, _INP_END), NUM_BLKS);
       
  1367 
       
  1368     //load initial digest
       
  1369     movq(a, Address(CTX, 8 * 0));
       
  1370     movq(b, Address(CTX, 8 * 1));
       
  1371     movq(c, Address(CTX, 8 * 2));
       
  1372     movq(d, Address(CTX, 8 * 3));
       
  1373     movq(e, Address(CTX, 8 * 4));
       
  1374     movq(f, Address(CTX, 8 * 5));
       
  1375     // g (r10) is loaded later, after r10 has been used as a scratch register
       
  1376     movq(h, Address(CTX, 8 * 7));
       
  1377 
       
  1378     pshuffle_byte_flip_mask_addr = pshuffle_byte_flip_mask_sha512;
       
  1379     vmovdqu(BYTE_FLIP_MASK, ExternalAddress(pshuffle_byte_flip_mask_addr + 0)); //PSHUFFLE_BYTE_FLIP_MASK wrt rip
       
  1380     vmovdqu(YMM_MASK_LO, ExternalAddress(pshuffle_byte_flip_mask_addr + 32));
       
  1381 
       
  1382     movq(g, Address(CTX, 8 * 6));
       
  1383 
       
  1384     bind(loop0);
       
  1385     lea(TBL, ExternalAddress(K512_W));
       
  1386 
       
  1387     //byte swap the 16 input qwords (one 128-byte block)
       
  1388     vmovdqu(xmm4, Address(INP, 32 * 0));
       
  1389     vpshufb(xmm4, xmm4, BYTE_FLIP_MASK, AVX_256bit);
       
  1390     vmovdqu(xmm5, Address(INP, 32 * 1));
       
  1391     vpshufb(xmm5, xmm5, BYTE_FLIP_MASK, AVX_256bit);
       
  1392     vmovdqu(xmm6, Address(INP, 32 * 2));
       
  1393     vpshufb(xmm6, xmm6, BYTE_FLIP_MASK, AVX_256bit);
       
  1394     vmovdqu(xmm7, Address(INP, 32 * 3));
       
  1395     vpshufb(xmm7, xmm7, BYTE_FLIP_MASK, AVX_256bit);
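    // Each vpshufb above uses BYTE_FLIP_MASK to reverse the byte order within
    // every 64-bit lane, converting the big-endian message words of the
    // 128-byte block (loaded as four 32-byte ymm registers) to little-endian.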
       
  1396 
       
  1397     movq(Address(rsp, _INP), INP);
       
  1398 
       
  1399     movslq(Address(rsp, _SRND), 4);
       
  1400     align(16);
       
  1401 
       
  1402     //Rounds 0-63: schedule the remaining message qwords W[16..79] while hashing, by calling sha512_AVX2_one_round_and_schedule
       
  1403     bind(loop1);
       
  1404     vpaddq(xmm0, xmm4, Address(TBL, 0 * 32), AVX_256bit);
       
  1405     vmovdqu(Address(rsp, _XFER), xmm0);
       
  1406     //four rounds and schedule
       
  1407     sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, a, b, c, d, e, f, g, h, 0);
       
  1408     sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, h, a, b, c, d, e, f, g, 1);
       
  1409     sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, g, h, a, b, c, d, e, f, 2);
       
  1410     sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, f, g, h, a, b, c, d, e, 3);
       
  1411 
       
  1412     vpaddq(xmm0, xmm5, Address(TBL, 1 * 32), AVX_256bit);
       
  1413     vmovdqu(Address(rsp, _XFER), xmm0);
       
  1414     //four rounds and schedule
       
  1415     sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, e, f, g, h, a, b, c, d, 0);
       
  1416     sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, d, e, f, g, h, a, b, c, 1);
       
  1417     sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, c, d, e, f, g, h, a, b, 2);
       
  1418     sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, b, c, d, e, f, g, h, a, 3);
       
  1419 
       
  1420     vpaddq(xmm0, xmm6, Address(TBL, 2 * 32), AVX_256bit);
       
  1421     vmovdqu(Address(rsp, _XFER), xmm0);
       
  1422     //four rounds and schedule
       
  1423     sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, a, b, c, d, e, f, g, h, 0);
       
  1424     sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, h, a, b, c, d, e, f, g, 1);
       
  1425     sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, g, h, a, b, c, d, e, f, 2);
       
  1426     sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, f, g, h, a, b, c, d, e, 3);
       
  1427 
       
  1428     vpaddq(xmm0, xmm7, Address(TBL, 3 * 32), AVX_256bit);
       
  1429     vmovdqu(Address(rsp, _XFER), xmm0);
       
  1430     addq(TBL, 4 * 32);
       
  1431     //four rounds and schedule
       
  1432     sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, e, f, g, h, a, b, c, d, 0);
       
  1433     sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, d, e, f, g, h, a, b, c, 1);
       
  1434     sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, c, d, e, f, g, h, a, b, 2);
       
  1435     sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, b, c, d, e, f, g, h, a, 3);
       
  1436 
       
  1437     subq(Address(rsp, _SRND), 1);
       
  1438     jcc(Assembler::notEqual, loop1);
       
  1439 
       
  1440     movslq(Address(rsp, _SRND), 2);
       
  1441 
       
  1442     bind(loop2);
       
  1443     vpaddq(xmm0, xmm4, Address(TBL, 0 * 32), AVX_256bit);
       
  1444     vmovdqu(Address(rsp, _XFER), xmm0);
       
  1445     //four rounds and compute.
       
  1446     sha512_AVX2_one_round_compute(a, a, b, c, d, e, f, g, h, 0);
       
  1447     sha512_AVX2_one_round_compute(h, h, a, b, c, d, e, f, g, 1);
       
  1448     sha512_AVX2_one_round_compute(g, g, h, a, b, c, d, e, f, 2);
       
  1449     sha512_AVX2_one_round_compute(f, f, g, h, a, b, c, d, e, 3);
       
  1450 
       
  1451     vpaddq(xmm0, xmm5, Address(TBL, 1 * 32), AVX_256bit);
       
  1452     vmovdqu(Address(rsp, _XFER), xmm0);
       
  1453     addq(TBL, 2 * 32);
       
  1454     // four rounds and compute.
       
  1455     sha512_AVX2_one_round_compute(e, e, f, g, h, a, b, c, d, 0);
       
  1456     sha512_AVX2_one_round_compute(d, d, e, f, g, h, a, b, c, 1);
       
  1457     sha512_AVX2_one_round_compute(c, c, d, e, f, g, h, a, b, 2);
       
  1458     sha512_AVX2_one_round_compute(b, b, c, d, e, f, g, h, a, 3);
       
  1459 
       
  1460     vmovdqu(xmm4, xmm6);
       
  1461     vmovdqu(xmm5, xmm7);
       
  1462 
       
  1463     subq(Address(rsp, _SRND), 1);
       
  1464     jcc(Assembler::notEqual, loop2);
       
  1465 
       
  1466     addmq(8 * 0, CTX, a);
       
  1467     addmq(8 * 1, CTX, b);
       
  1468     addmq(8 * 2, CTX, c);
       
  1469     addmq(8 * 3, CTX, d);
       
  1470     addmq(8 * 4, CTX, e);
       
  1471     addmq(8 * 5, CTX, f);
       
  1472     addmq(8 * 6, CTX, g);
       
  1473     addmq(8 * 7, CTX, h);
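    // addmq(disp, CTX, r) folds a working variable back into the saved state,
    // effectively CTX[disp] += r; the eight calls above therefore perform
    // state[i] += {a..h}, as required at the end of each 128-byte block.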
       
  1474 
       
  1475     movq(INP, Address(rsp, _INP));
       
  1476     addq(INP, 128);
       
  1477     cmpq(INP, Address(rsp, _INP_END));
       
  1478     jcc(Assembler::notEqual, loop0);
       
  1479 
       
  1480     bind(done_hash);
       
  1481 
       
  1482     //Restore GPRs
       
  1483     movq(rbp, Address(rsp, (_GPR + 0)));
       
  1484     movq(rbx, Address(rsp, (_GPR + 8)));
       
  1485     movq(r12, Address(rsp, (_GPR + 16)));
       
  1486     movq(r13, Address(rsp, (_GPR + 24)));
       
  1487     movq(r14, Address(rsp, (_GPR + 32)));
       
  1488     movq(r15, Address(rsp, (_GPR + 40)));
       
  1489 
       
  1490 #ifdef _WIN64
       
  1491     movq(rsi, Address(rsp, (_GPR + 48)));
       
  1492     movq(rdi, Address(rsp, (_GPR + 56)));
       
  1493 #endif
       
  1494 
       
  1495     //Restore Stack Pointer
       
  1496     movq(rsp, Address(rsp, _RSP));
       
  1497 
       
  1498 #ifdef _WIN64
       
  1499     pop(r9);
       
  1500     pop(r8);
       
  1501 #else
       
  1502     pop(rcx);
       
  1503     pop(rdx);
       
  1504 #endif
       
  1505 
       
  1506     if (multi_block) {
       
  1507 #ifdef _WIN64
       
  1508       const Register& limit_end = r9;
       
  1509       const Register& ofs_end = r8;
       
  1510 #else
       
  1511       const Register& limit_end = rcx;
       
  1512       const Register& ofs_end   = rdx;
       
  1513 #endif
       
  1514       movq(rax, ofs_end);
       
  1515       bind(compute_size);
       
  1516       cmpptr(rax, limit_end);
       
  1517       jccb(Assembler::aboveEqual, compute_size_end);
       
  1518       addq(rax, 128);
       
  1519       jmpb(compute_size);
       
  1520       bind(compute_size_end);
       
  1521     }
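    // A hedged sketch of the tail loop above: rax (the stub's return value for
    // the multi-block entry point) is the original ofs advanced in 128-byte
    // steps until it reaches limit, i.e. presumably the new offset expected by
    // DigestBase.implCompressMultiBlock.
    //
    //   int64_t new_ofs = ofs;
    //   while (new_ofs < limit) {
    //     new_ofs += 128;
    //   }
    //   return (int)new_ofs;               // left in rax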
       
  1522 }
       
  1523 
       
  1524 #endif //#ifdef _LP64
       
  1525