--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86_sha.cpp Tue Apr 26 20:43:59 2016 -0700
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86_sha.cpp Tue Apr 26 21:54:21 2016 -0700
@@ -493,3 +493,543 @@
bind(done_hash);
}
+
+#ifdef _LP64
+/*
+ The algorithm below is based on Intel publication:
+ "Fast SHA-256 Implementations on Intelë Architecture Processors" by Jim Guilford, Kirk Yap and Vinodh Gopal.
+ The assembly code was originally provided by Sean Gulley and in many places preserves
+ the original assembly NAMES and comments to simplify matching Java assembly with its original.
+ The Java version was substantially redesigned to replace 1200 assembly instructions with
+ a much shorter run-time generator of the same code in memory.
+*/
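+
+/*
+ For reference (standard FIPS 180-4 notation), each generated SHA-256 round computes:
+
+   S1  = (e ror 6) ^ (e ror 11) ^ (e ror 25)
+   S0  = (a ror 2) ^ (a ror 13) ^ (a ror 22)
+   CH  = (e & f) ^ (~e & g)             -- the code uses the equivalent ((f ^ g) & e) ^ g
+   MAJ = (a & b) ^ (a & c) ^ (b & c)    -- the code uses the equivalent ((a | c) & b) | (a & c)
+   t1  = h + S1 + CH + K[i] + W[i]
+   t2  = S0 + MAJ
+   h = g; g = f; f = e; e = d + t1; d = c; c = b; b = a; a = t1 + t2
+
+ The round helpers below generate this computation, with the eight working variables
+ kept in the general-purpose registers named in their comments.
+*/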
+
+void MacroAssembler::sha256_AVX2_one_round_compute(
+ Register reg_old_h,
+ Register reg_a,
+ Register reg_b,
+ Register reg_c,
+ Register reg_d,
+ Register reg_e,
+ Register reg_f,
+ Register reg_g,
+ Register reg_h,
+ int iter) {
+ const Register& reg_y0 = r13;
+ const Register& reg_y1 = r14;
+ const Register& reg_y2 = r15;
+ const Register& reg_y3 = rcx;
+ const Register& reg_T1 = r12;
+ //;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND iter ;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ if (iter%4 > 0) {
+ addl(reg_old_h, reg_y2); // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; --
+ }
+ movl(reg_y2, reg_f); // reg_y2 = reg_f ; CH
+ rorxd(reg_y0, reg_e, 25); // reg_y0 = reg_e >> 25 ; S1A
+ rorxd(reg_y1, reg_e, 11); // reg_y1 = reg_e >> 11 ; S1B
+ xorl(reg_y2, reg_g); // reg_y2 = reg_f^reg_g ; CH
+
+  xorl(reg_y0, reg_y1); // reg_y0 = (reg_e>>25) ^ (reg_e>>11) ; S1
+ rorxd(reg_y1, reg_e, 6); // reg_y1 = (reg_e >> 6) ; S1
+  andl(reg_y2, reg_e); // reg_y2 = (reg_f^reg_g)&reg_e ; CH
+
+ if (iter%4 > 0) {
+ addl(reg_old_h, reg_y3); // reg_h = t1 + S0 + MAJ ; --
+ }
+
+ xorl(reg_y0, reg_y1); // reg_y0 = (reg_e>>25) ^ (reg_e>>11) ^ (reg_e>>6) ; S1
+ rorxd(reg_T1, reg_a, 13); // reg_T1 = reg_a >> 13 ; S0B
+  xorl(reg_y2, reg_g); // reg_y2 = CH = ((reg_f^reg_g)&reg_e)^reg_g ; CH
+ rorxd(reg_y1, reg_a, 22); // reg_y1 = reg_a >> 22 ; S0A
+ movl(reg_y3, reg_a); // reg_y3 = reg_a ; MAJA
+
+ xorl(reg_y1, reg_T1); // reg_y1 = (reg_a>>22) ^ (reg_a>>13) ; S0
+ rorxd(reg_T1, reg_a, 2); // reg_T1 = (reg_a >> 2) ; S0
+ addl(reg_h, Address(rsp, rdx, Address::times_1, 4*iter)); // reg_h = k + w + reg_h ; --
+ orl(reg_y3, reg_c); // reg_y3 = reg_a|reg_c ; MAJA
+
+ xorl(reg_y1, reg_T1); // reg_y1 = (reg_a>>22) ^ (reg_a>>13) ^ (reg_a>>2) ; S0
+ movl(reg_T1, reg_a); // reg_T1 = reg_a ; MAJB
+  andl(reg_y3, reg_b); // reg_y3 = (reg_a|reg_c)&reg_b ; MAJA
+  andl(reg_T1, reg_c); // reg_T1 = reg_a&reg_c ; MAJB
+ addl(reg_y2, reg_y0); // reg_y2 = S1 + CH ; --
+
+
+ addl(reg_d, reg_h); // reg_d = k + w + reg_h + reg_d ; --
+  orl(reg_y3, reg_T1); // reg_y3 = MAJ = ((reg_a|reg_c)&reg_b)|(reg_a&reg_c) ; MAJ
+ addl(reg_h, reg_y1); // reg_h = k + w + reg_h + S0 ; --
+
+ addl(reg_d, reg_y2); // reg_d = k + w + reg_h + reg_d + S1 + CH = reg_d + t1 ; --
+
+
+ if (iter%4 == 3) {
+ addl(reg_h, reg_y2); // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; --
+ addl(reg_h, reg_y3); // reg_h = t1 + S0 + MAJ ; --
+ }
+}
+
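+// In the two helpers below the eight working variables rotate one position to the
+// right each round (new a = t1 + t2, then a->b, b->c, ..., g->h), so the register
+// arguments simply walk that rotation; after eight rounds the original mapping of
+// registers to variables is restored.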
+void MacroAssembler::sha256_AVX2_four_rounds_compute_first(int start) {
+ sha256_AVX2_one_round_compute(rax, rax, rbx, rdi, rsi, r8, r9, r10, r11, start + 0);
+ sha256_AVX2_one_round_compute(r11, r11, rax, rbx, rdi, rsi, r8, r9, r10, start + 1);
+ sha256_AVX2_one_round_compute(r10, r10, r11, rax, rbx, rdi, rsi, r8, r9, start + 2);
+ sha256_AVX2_one_round_compute(r9, r9, r10, r11, rax, rbx, rdi, rsi, r8, start + 3);
+}
+
+void MacroAssembler::sha256_AVX2_four_rounds_compute_last(int start) {
+ sha256_AVX2_one_round_compute(r8, r8, r9, r10, r11, rax, rbx, rdi, rsi, start + 0);
+ sha256_AVX2_one_round_compute(rsi, rsi, r8, r9, r10, r11, rax, rbx, rdi, start + 1);
+ sha256_AVX2_one_round_compute(rdi, rdi, rsi, r8, r9, r10, r11, rax, rbx, start + 2);
+ sha256_AVX2_one_round_compute(rbx, rbx, rdi, rsi, r8, r9, r10, r11, rax, start + 3);
+}
+
+void MacroAssembler::sha256_AVX2_one_round_and_sched(
+  XMMRegister xmm_0, /* == ymm4 on iterations 0-3, then the four registers rotate left on iterations 4, 8, 12 */
+ XMMRegister xmm_1, /* ymm5 */ /* full cycle is 16 iterations */
+ XMMRegister xmm_2, /* ymm6 */
+ XMMRegister xmm_3, /* ymm7 */
+  Register reg_a, /* == rax on iteration 0, then the eight registers rotate right on each subsequent iteration */
+ Register reg_b, /* rbx */ /* full cycle is 8 iterations */
+ Register reg_c, /* rdi */
+ Register reg_d, /* rsi */
+ Register reg_e, /* r8 */
+ Register reg_f, /* r9d */
+ Register reg_g, /* r10d */
+ Register reg_h, /* r11d */
+ int iter)
+{
+ movl(rcx, reg_a); // rcx = reg_a ; MAJA
+ rorxd(r13, reg_e, 25); // r13 = reg_e >> 25 ; S1A
+ rorxd(r14, reg_e, 11); // r14 = reg_e >> 11 ; S1B
+ addl(reg_h, Address(rsp, rdx, Address::times_1, 4*iter));
+ orl(rcx, reg_c); // rcx = reg_a|reg_c ; MAJA
+
+ movl(r15, reg_f); // r15 = reg_f ; CH
+ rorxd(r12, reg_a, 13); // r12 = reg_a >> 13 ; S0B
+ xorl(r13, r14); // r13 = (reg_e>>25) ^ (reg_e>>11) ; S1
+ xorl(r15, reg_g); // r15 = reg_f^reg_g ; CH
+
+ rorxd(r14, reg_e, 6); // r14 = (reg_e >> 6) ; S1
+  andl(r15, reg_e); // r15 = (reg_f^reg_g)&reg_e ; CH
+
+ xorl(r13, r14); // r13 = (reg_e>>25) ^ (reg_e>>11) ^ (reg_e>>6) ; S1
+ rorxd(r14, reg_a, 22); // r14 = reg_a >> 22 ; S0A
+ addl(reg_d, reg_h); // reg_d = k + w + reg_h + reg_d ; --
+
+  andl(rcx, reg_b); // rcx = (reg_a|reg_c)&reg_b ; MAJA
+ xorl(r14, r12); // r14 = (reg_a>>22) ^ (reg_a>>13) ; S0
+
+ rorxd(r12, reg_a, 2); // r12 = (reg_a >> 2) ; S0
+  xorl(r15, reg_g); // r15 = CH = ((reg_f^reg_g)&reg_e)^reg_g ; CH
+
+ xorl(r14, r12); // r14 = (reg_a>>22) ^ (reg_a>>13) ^ (reg_a>>2) ; S0
+ movl(r12, reg_a); // r12 = reg_a ; MAJB
+  andl(r12, reg_c); // r12 = reg_a&reg_c ; MAJB
+ addl(r15, r13); // r15 = S1 + CH ; --
+
+  orl(rcx, r12); // rcx = MAJ = ((reg_a|reg_c)&reg_b)|(reg_a&reg_c) ; MAJ
+ addl(reg_h, r14); // reg_h = k + w + reg_h + S0 ; --
+ addl(reg_d, r15); // reg_d = k + w + reg_h + reg_d + S1 + CH = reg_d + t1 ; --
+
+ addl(reg_h, r15); // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; --
+ addl(reg_h, rcx); // reg_h = t1 + S0 + MAJ ; --
+
+ if (iter%4 == 0) {
+ vpalignr(xmm0, xmm_3, xmm_2, 4, AVX_256bit); // ymm0 = W[-7]
+    vpaddd(xmm0, xmm0, xmm_0, AVX_256bit); // ymm0 = W[-7] + W[-16]
+ vpalignr(xmm1, xmm_1, xmm_0, 4, AVX_256bit); // ymm1 = W[-15]
+ vpsrld(xmm2, xmm1, 7, AVX_256bit);
+ vpslld(xmm3, xmm1, 32-7, AVX_256bit);
+ vpor(xmm3, xmm3, xmm2, AVX_256bit); // ymm3 = W[-15] ror 7
+ vpsrld(xmm2, xmm1,18, AVX_256bit);
+ } else if (iter%4 == 1 ) {
+ vpsrld(xmm8, xmm1, 3, AVX_256bit); // ymm8 = W[-15] >> 3
+ vpslld(xmm1, xmm1, 32-18, AVX_256bit);
+ vpxor(xmm3, xmm3, xmm1, AVX_256bit);
+ vpxor(xmm3, xmm3, xmm2, AVX_256bit); // ymm3 = W[-15] ror 7 ^ W[-15] ror 18
+ vpxor(xmm1, xmm3, xmm8, AVX_256bit); // ymm1 = s0
+ vpshufd(xmm2, xmm_3, 0xFA, AVX_256bit); // 11111010b ; ymm2 = W[-2] {BBAA}
+ vpaddd(xmm0, xmm0, xmm1, AVX_256bit); // ymm0 = W[-16] + W[-7] + s0
+ vpsrld(xmm8, xmm2, 10, AVX_256bit); // ymm8 = W[-2] >> 10 {BBAA}
+ } else if (iter%4 == 2) {
+ vpsrlq(xmm3, xmm2, 19, AVX_256bit); // ymm3 = W[-2] ror 19 {xBxA}
+ vpsrlq(xmm2, xmm2, 17, AVX_256bit); // ymm2 = W[-2] ror 17 {xBxA}
+ vpxor(xmm2, xmm2, xmm3, AVX_256bit);
+ vpxor(xmm8, xmm8, xmm2, AVX_256bit); // ymm8 = s1 {xBxA}
+ vpshufb(xmm8, xmm8, xmm10, AVX_256bit); // ymm8 = s1 {00BA}
+ vpaddd(xmm0, xmm0, xmm8, AVX_256bit); // ymm0 = {..., ..., W[1], W[0]}
+ vpshufd(xmm2, xmm0, 0x50, AVX_256bit); // 01010000b ; ymm2 = W[-2] {DDCC}
+ } else if (iter%4 == 3) {
+ vpsrld(xmm11, xmm2, 10, AVX_256bit); // ymm11 = W[-2] >> 10 {DDCC}
+ vpsrlq(xmm3, xmm2, 19, AVX_256bit); // ymm3 = W[-2] ror 19 {xDxC}
+ vpsrlq(xmm2, xmm2, 17, AVX_256bit); // ymm2 = W[-2] ror 17 {xDxC}
+ vpxor(xmm2, xmm2, xmm3, AVX_256bit);
+ vpxor(xmm11, xmm11, xmm2, AVX_256bit); // ymm11 = s1 {xDxC}
+ vpshufb(xmm11, xmm11, xmm12, AVX_256bit); // ymm11 = s1 {DC00}
+ vpaddd(xmm_0, xmm11, xmm0, AVX_256bit); // xmm_0 = {W[3], W[2], W[1], W[0]}
+ }
+}
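+
+/*
+ For reference, the vector half of the routine above implements the standard SHA-256
+ message schedule, producing four new words per group of four rounds:
+
+   s0   = (W[i-15] ror 7)  ^ (W[i-15] ror 18) ^ (W[i-15] >> 3)
+   s1   = (W[i-2]  ror 17) ^ (W[i-2]  ror 19) ^ (W[i-2]  >> 10)
+   W[i] = W[i-16] + s0 + W[i-7] + s1
+
+ xmm_0..xmm_3 hold W[i-16..i-1]: four dwords of the first block in the low 128-bit
+ lane and the matching dwords of the second block in the high lane. On the
+ iter%4 == 3 step xmm_0 receives the next four schedule words.
+*/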
+
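+// addm: add the 32-bit state word at [r1 + disp] into r2 and write the sum back, so
+// both the register and the in-memory digest word end up holding their sum.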
+void MacroAssembler::addm(int disp, Register r1, Register r2) {
+ addl(r2, Address(r1, disp));
+ movl(Address(r1, disp), r2);
+}
+
+void MacroAssembler::sha256_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
+ XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
+ Register buf, Register state, Register ofs, Register limit, Register rsp,
+ bool multi_block, XMMRegister shuf_mask) {
+
+ Label loop0, loop1, loop2, loop3,
+ last_block_enter, do_last_block, only_one_block, done_hash,
+ compute_size, compute_size_end,
+ compute_size1, compute_size_end1;
+
+ address K256_W = StubRoutines::x86::k256_W_addr();
+ address pshuffle_byte_flip_mask = StubRoutines::x86::pshuffle_byte_flip_mask_addr();
+ address pshuffle_byte_flip_mask_addr = 0;
+
+const XMMRegister& SHUF_00BA = xmm10; // ymm10: shuffle xBxA -> 00BA
+const XMMRegister& SHUF_DC00 = xmm12; // ymm12: shuffle xDxC -> DC00
+const XMMRegister& BYTE_FLIP_MASK = xmm13; // ymm13
+
+const XMMRegister& X_BYTE_FLIP_MASK = xmm13; //XMM version of BYTE_FLIP_MASK
+
+const Register& NUM_BLKS = r8; // 3rd arg
+const Register& CTX = rdx; // 2nd arg
+const Register& INP = rcx; // 1st arg
+
+const Register& c = rdi;
+const Register& d = rsi;
+const Register& e = r8; // clobbers NUM_BLKS
+const Register& y3 = rcx; // clobbers INP
+
+const Register& TBL = rbp;
+const Register& SRND = CTX; // SRND is same register as CTX
+
+const Register& a = rax;
+const Register& b = rbx;
+const Register& f = r9;
+const Register& g = r10;
+const Register& h = r11;
+
+const Register& T1 = r12;
+const Register& y0 = r13;
+const Register& y1 = r14;
+const Register& y2 = r15;
+
+
+enum {
+ _XFER_SIZE = 2*64*4, // 2 blocks, 64 rounds, 4 bytes/round
+#ifndef _WIN64
+ _XMM_SAVE_SIZE = 0,
+#else
+ _XMM_SAVE_SIZE = 8*16,
+#endif
+ _INP_END_SIZE = 8,
+ _INP_SIZE = 8,
+ _CTX_SIZE = 8,
+ _RSP_SIZE = 8,
+
+ _XFER = 0,
+ _XMM_SAVE = _XFER + _XFER_SIZE,
+ _INP_END = _XMM_SAVE + _XMM_SAVE_SIZE,
+ _INP = _INP_END + _INP_END_SIZE,
+ _CTX = _INP + _INP_SIZE,
+ _RSP = _CTX + _CTX_SIZE,
+ STACK_SIZE = _RSP + _RSP_SIZE
+};
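+
+// Resulting frame layout, from low to high offset off the 32-byte-aligned rsp:
+//   _XFER      pre-computed K[i]+W[i] values for two blocks (stored in loop1/loop2,
+//              read by the round helpers at rsp + SRND + 4*iter)
+//   _XMM_SAVE  xmm spill area reserved on Win64 only
+//   _INP_END   pointer to the last input block
+//   _INP       current input pointer
+//   _CTX       pointer to the state (digest)
+//   _RSP       caller's rsp, restored at done_hash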
+
+#ifndef _WIN64
+  push(rcx); // linux: this is limit, needed at the end
+ push(rdx); // linux: this is ofs
+#else
+ push(r8); // win64: this is ofs
+  push(r9); // win64: this is limit, we need it again at the very end
+#endif
+
+
+ push(rbx);
+#ifdef _WIN64
+ push(rsi);
+ push(rdi);
+#endif
+ push(rbp);
+ push(r12);
+ push(r13);
+ push(r14);
+ push(r15);
+
+ movq(rax, rsp);
+ subq(rsp, STACK_SIZE);
+ andq(rsp, -32);
+ movq(Address(rsp, _RSP), rax);
+
+#ifndef _WIN64
+  // copy linux params to win64 params, so the rest of the code is the same for both
+ movq(r9, rcx);
+ movq(r8, rdx);
+ movq(rdx, rsi);
+ movq(rcx, rdi);
+#endif
+
+ // setting original assembly ABI
+  /** message to hash in INP */
+ lea(INP, Address(rcx, 0)); // rcx == message (buf) ;; linux: INP = buf = rdi
+ /** digest in CTX */
+ movq(CTX, rdx); // rdx = digest (state) ;; linux: CTX = state = rsi
+
+  /** NUM_BLKS is the length of the message; it has to be derived from ofs and limit */
+ if (multi_block) {
+
+ // Win64: cannot directly update NUM_BLKS, since NUM_BLKS = ofs = r8
+ // on entry r8 = ofs
+ // on exit r8 = NUM_BLKS
+
+ xorq(rax, rax);
+
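+    // Count the input length in 64-byte steps: on exit rax = (limit - ofs) rounded up
+    // to a multiple of 64, i.e. the number of input bytes left to hash.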
+ bind(compute_size);
+ cmpptr(r8, r9); // assume the original ofs <= limit ;; linux: cmp rcx, rdx
+ jccb(Assembler::aboveEqual, compute_size_end);
+ addq(r8, 64); //;; linux: ofs = rdx
+ addq(rax, 64);
+ jmpb(compute_size);
+
+ bind(compute_size_end);
+ movq(NUM_BLKS, rax); // NUM_BLK (r8) ;; linux: NUM_BLK = rdx
+
+ cmpq(NUM_BLKS, 0);
+ jcc(Assembler::equal, done_hash);
+
+ } else {
+ xorq(NUM_BLKS, NUM_BLKS);
+ addq(NUM_BLKS, 64);
+ }//if (!multi_block)
+
+ lea(NUM_BLKS, Address(INP, NUM_BLKS, Address::times_1, -64)); // pointer to the last block
+ movq(Address(rsp, _INP_END), NUM_BLKS); //
+
+ cmpptr(INP, NUM_BLKS); //cmp INP, NUM_BLKS
+ jcc(Assembler::equal, only_one_block); //je only_one_block
+
+ // load initial digest
+ movl(a, Address(CTX, 4*0));
+ movl(b, Address(CTX, 4*1));
+ movl(c, Address(CTX, 4*2));
+ movl(d, Address(CTX, 4*3));
+ movl(e, Address(CTX, 4*4));
+ movl(f, Address(CTX, 4*5));
+ movl(g, Address(CTX, 4*6));
+ movl(h, Address(CTX, 4*7));
+
+ pshuffle_byte_flip_mask_addr = pshuffle_byte_flip_mask;
+ vmovdqu(BYTE_FLIP_MASK, ExternalAddress(pshuffle_byte_flip_mask_addr +0)); //[PSHUFFLE_BYTE_FLIP_MASK wrt rip]
+ vmovdqu(SHUF_00BA, ExternalAddress(pshuffle_byte_flip_mask_addr + 32)); //[_SHUF_00BA wrt rip]
+ vmovdqu(SHUF_DC00, ExternalAddress(pshuffle_byte_flip_mask_addr + 64)); //[_SHUF_DC00 wrt rip]
+
+ movq(Address(rsp, _CTX), CTX); // store
+
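+  // Main loop: each iteration consumes two 64-byte blocks. The message schedule is
+  // computed once for both blocks (first block in the low, second block in the high
+  // 128-bit lanes of the ymm registers); loop1 and loop2 hash the first block while
+  // storing K+W for both blocks in the _XFER area, and loop3 hashes the second block
+  // from the stored values.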
+bind(loop0);
+ lea(TBL, ExternalAddress(K256_W));
+
+ // assume buffers not aligned
+
+ // Load first 16 dwords from two blocks
+ vmovdqu(xmm0, Address(INP, 0*32));
+ vmovdqu(xmm1, Address(INP, 1*32));
+ vmovdqu(xmm2, Address(INP, 2*32));
+ vmovdqu(xmm3, Address(INP, 3*32));
+
+ // byte swap data
+ vpshufb(xmm0, xmm0, BYTE_FLIP_MASK, AVX_256bit);
+ vpshufb(xmm1, xmm1, BYTE_FLIP_MASK, AVX_256bit);
+ vpshufb(xmm2, xmm2, BYTE_FLIP_MASK, AVX_256bit);
+ vpshufb(xmm3, xmm3, BYTE_FLIP_MASK, AVX_256bit);
+
+ // transpose data into high/low halves
+ vperm2i128(xmm4, xmm0, xmm2, 0x20);
+ vperm2i128(xmm5, xmm0, xmm2, 0x31);
+ vperm2i128(xmm6, xmm1, xmm3, 0x20);
+ vperm2i128(xmm7, xmm1, xmm3, 0x31);
+
+bind(last_block_enter);
+ addq(INP, 64);
+ movq(Address(rsp, _INP), INP);
+
+  //;; schedule 48 input dwords, by doing 3 iterations of 16 rounds each
+ xorq(SRND, SRND);
+
+align(16);
+bind(loop1);
+ vpaddd(xmm9, xmm4, Address(TBL, SRND, Address::times_1, 0*32), AVX_256bit);
+ vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 0*32), xmm9);
+ sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, rax, rbx, rdi, rsi, r8, r9, r10, r11, 0);
+ sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r11, rax, rbx, rdi, rsi, r8, r9, r10, 1);
+ sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r10, r11, rax, rbx, rdi, rsi, r8, r9, 2);
+ sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r9, r10, r11, rax, rbx, rdi, rsi, r8, 3);
+
+ vpaddd(xmm9, xmm5, Address(TBL, SRND, Address::times_1, 1*32), AVX_256bit);
+ vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 1*32), xmm9);
+ sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, r8, r9, r10, r11, rax, rbx, rdi, rsi, 8+0);
+ sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rsi, r8, r9, r10, r11, rax, rbx, rdi, 8+1);
+ sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rdi, rsi, r8, r9, r10, r11, rax, rbx, 8+2);
+ sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rbx, rdi, rsi, r8, r9, r10, r11, rax, 8+3);
+
+ vpaddd(xmm9, xmm6, Address(TBL, SRND, Address::times_1, 2*32), AVX_256bit);
+ vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 2*32), xmm9);
+ sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, rax, rbx, rdi, rsi, r8, r9, r10, r11, 16+0);
+ sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r11, rax, rbx, rdi, rsi, r8, r9, r10, 16+1);
+ sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r10, r11, rax, rbx, rdi, rsi, r8, r9, 16+2);
+ sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r9, r10, r11, rax, rbx, rdi, rsi, r8, 16+3);
+
+ vpaddd(xmm9, xmm7, Address(TBL, SRND, Address::times_1, 3*32), AVX_256bit);
+ vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 3*32), xmm9);
+
+ sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, r8, r9, r10, r11, rax, rbx, rdi, rsi, 24+0);
+ sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rsi, r8, r9, r10, r11, rax, rbx, rdi, 24+1);
+ sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rdi, rsi, r8, r9, r10, r11, rax, rbx, 24+2);
+ sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rbx, rdi, rsi, r8, r9, r10, r11, rax, 24+3);
+
+ addq(SRND, 4*32);
+ cmpq(SRND, 3 * 4*32);
+ jcc(Assembler::below, loop1);
+
+bind(loop2);
+ // Do last 16 rounds with no scheduling
+ vpaddd(xmm9, xmm4, Address(TBL, SRND, Address::times_1, 0*32), AVX_256bit);
+ vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 0*32), xmm9);
+ sha256_AVX2_four_rounds_compute_first(0);
+
+ vpaddd(xmm9, xmm5, Address(TBL, SRND, Address::times_1, 1*32), AVX_256bit);
+ vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 1*32), xmm9);
+ sha256_AVX2_four_rounds_compute_last(0 + 8);
+
+ addq(SRND, 2*32);
+
+ vmovdqu(xmm4, xmm6);
+ vmovdqu(xmm5, xmm7);
+
+ cmpq(SRND, 4 * 4*32);
+ jcc(Assembler::below, loop2);
+
+ movq(CTX, Address(rsp, _CTX));
+ movq(INP, Address(rsp, _INP));
+
+ addm(4*0, CTX, a);
+ addm(4*1, CTX, b);
+ addm(4*2, CTX, c);
+ addm(4*3, CTX, d);
+ addm(4*4, CTX, e);
+ addm(4*5, CTX, f);
+ addm(4*6, CTX, g);
+ addm(4*7, CTX, h);
+
+ cmpq(INP, Address(rsp, _INP_END));
+ jcc(Assembler::above, done_hash);
+
+ //Do second block using previously scheduled results
+ xorq(SRND, SRND);
+align(16);
+bind(loop3);
+ sha256_AVX2_four_rounds_compute_first(4);
+ sha256_AVX2_four_rounds_compute_last(4+8);
+
+ addq(SRND, 2*32);
+ cmpq(SRND, 4 * 4*32);
+ jcc(Assembler::below, loop3);
+
+ movq(CTX, Address(rsp, _CTX));
+ movq(INP, Address(rsp, _INP));
+ addq(INP, 64);
+
+ addm(4*0, CTX, a);
+ addm(4*1, CTX, b);
+ addm(4*2, CTX, c);
+ addm(4*3, CTX, d);
+ addm(4*4, CTX, e);
+ addm(4*5, CTX, f);
+ addm(4*6, CTX, g);
+ addm(4*7, CTX, h);
+
+ cmpq(INP, Address(rsp, _INP_END));
+ jcc(Assembler::below, loop0);
+ jccb(Assembler::above, done_hash);
+
+bind(do_last_block);
+ lea(TBL, ExternalAddress(K256_W));
+
+ movdqu(xmm4, Address(INP, 0*16));
+ movdqu(xmm5, Address(INP, 1*16));
+ movdqu(xmm6, Address(INP, 2*16));
+ movdqu(xmm7, Address(INP, 3*16));
+
+ vpshufb(xmm4, xmm4, xmm13, AVX_128bit);
+ vpshufb(xmm5, xmm5, xmm13, AVX_128bit);
+ vpshufb(xmm6, xmm6, xmm13, AVX_128bit);
+ vpshufb(xmm7, xmm7, xmm13, AVX_128bit);
+
+ jmp(last_block_enter);
+
+bind(only_one_block);
+
+ // load initial digest ;; table should be preloaded with following values
+ movl(a, Address(CTX, 4*0)); // 0x6a09e667
+ movl(b, Address(CTX, 4*1)); // 0xbb67ae85
+ movl(c, Address(CTX, 4*2)); // 0x3c6ef372
+ movl(d, Address(CTX, 4*3)); // 0xa54ff53a
+ movl(e, Address(CTX, 4*4)); // 0x510e527f
+ movl(f, Address(CTX, 4*5)); // 0x9b05688c
+ movl(g, Address(CTX, 4*6)); // 0x1f83d9ab
+ movl(h, Address(CTX, 4*7)); // 0x5be0cd19
+
+
+ pshuffle_byte_flip_mask_addr = pshuffle_byte_flip_mask;
+ vmovdqu(BYTE_FLIP_MASK, ExternalAddress(pshuffle_byte_flip_mask_addr + 0)); //[PSHUFFLE_BYTE_FLIP_MASK wrt rip]
+ vmovdqu(SHUF_00BA, ExternalAddress(pshuffle_byte_flip_mask_addr + 32)); //[_SHUF_00BA wrt rip]
+ vmovdqu(SHUF_DC00, ExternalAddress(pshuffle_byte_flip_mask_addr + 64)); //[_SHUF_DC00 wrt rip]
+
+ movq(Address(rsp, _CTX), CTX);
+ jmpb(do_last_block);
+
+bind(done_hash);
+
+ movq(rsp, Address(rsp, _RSP));
+
+ pop(r15);
+ pop(r14);
+ pop(r13);
+ pop(r12);
+ pop(rbp);
+#ifdef _WIN64
+ pop(rdi);
+ pop(rsi);
+#endif
+ pop(rbx);
+
+#ifdef _WIN64
+ pop(r9);
+ pop(r8);
+#else
+ pop(rdx);
+ pop(rcx);
+#endif
+
+ if (multi_block) {
+#ifdef _WIN64
+const Register& limit_end = r9;
+const Register& ofs_end = r8;
+#else
+const Register& limit_end = rcx;
+const Register& ofs_end = rdx;
+#endif
+ movq(rax, ofs_end);
+
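+  // Leave in rax the offset advanced to the end of the processed input: ofs stepped
+  // up in 64-byte increments until it reaches limit, mirroring the amount of data
+  // consumed by the main loop above.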
+bind(compute_size1);
+ cmpptr(rax, limit_end); // assume the original ofs <= limit
+ jccb(Assembler::aboveEqual, compute_size_end1);
+ addq(rax, 64);
+ jmpb(compute_size1);
+
+bind(compute_size_end1);
+ }
+}
+#endif //#ifdef _LP64