src/hotspot/cpu/x86/macroAssembler_x86_sha.cpp
changeset 47216 71c04702a3d5
parent 43423 bcaab17f72a5
       
     1 /*
       
     2 * Copyright (c) 2016, Intel Corporation.
       
     3 *
       
     4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
       
     5 *
       
     6 * This code is free software; you can redistribute it and/or modify it
       
     7 * under the terms of the GNU General Public License version 2 only, as
       
     8 * published by the Free Software Foundation.
       
     9 *
       
    10 * This code is distributed in the hope that it will be useful, but WITHOUT
       
    11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
       
    12 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
       
    13 * version 2 for more details (a copy is included in the LICENSE file that
       
    14 * accompanied this code).
       
    15 *
       
    16 * You should have received a copy of the GNU General Public License version
       
    17 * 2 along with this work; if not, write to the Free Software Foundation,
       
    18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
       
    19 *
       
    20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
       
    21 * or visit www.oracle.com if you need additional information or have any
       
    22 * questions.
       
    23 *
       
    24 */
       
    25 
       
    26 #include "precompiled.hpp"
       
    27 #include "asm/assembler.hpp"
       
    28 #include "asm/assembler.inline.hpp"
       
    29 #include "runtime/stubRoutines.hpp"
       
    30 #include "macroAssembler_x86.hpp"
       
    31 
       
     32 // ofs and limit are used for the multi-block byte array intrinsic:
     33 // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
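        // Register/stack contract for fast_sha1:
        //   buf   - pointer to the current 64-byte input block(s)
        //   state - pointer to the SHA-1 state: a, b, c, d at offset 0, e at offset 16
        //   rsp   - caller-provided scratch area of at least 32 bytes (the working hash
        //           values are saved there before the rounds and re-added afterwards)
        //   When multi_block is true, ofs and limit drive the block loop and the
        //   updated ofs is returned in rax.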
       
    34 void MacroAssembler::fast_sha1(XMMRegister abcd, XMMRegister e0, XMMRegister e1, XMMRegister msg0,
       
    35   XMMRegister msg1, XMMRegister msg2, XMMRegister msg3, XMMRegister shuf_mask,
       
    36   Register buf, Register state, Register ofs, Register limit, Register rsp, bool multi_block) {
       
    37 
       
    38   Label start, done_hash, loop0;
       
    39 
       
    40   address upper_word_mask = StubRoutines::x86::upper_word_mask_addr();
       
    41   address shuffle_byte_flip_mask = StubRoutines::x86::shuffle_byte_flip_mask_addr();
       
    42 
       
    43   bind(start);
       
    44   movdqu(abcd, Address(state, 0));
       
    45   pinsrd(e0, Address(state, 16), 3);
       
    46   movdqu(shuf_mask, ExternalAddress(upper_word_mask)); // 0xFFFFFFFF000000000000000000000000
       
    47   pand(e0, shuf_mask);
       
    48   pshufd(abcd, abcd, 0x1B);
       
    49   movdqu(shuf_mask, ExternalAddress(shuffle_byte_flip_mask)); //0x000102030405060708090a0b0c0d0e0f
       
    50 
       
    51   bind(loop0);
       
    52   // Save hash values for addition after rounds
       
    53   movdqu(Address(rsp, 0), e0);
       
    54   movdqu(Address(rsp, 16), abcd);
       
    55 
       
    56 
       
    57   // Rounds 0 - 3
       
    58   movdqu(msg0, Address(buf, 0));
       
    59   pshufb(msg0, shuf_mask);
       
    60   paddd(e0, msg0);
       
    61   movdqa(e1, abcd);
       
    62   sha1rnds4(abcd, e0, 0);
       
    63 
       
    64   // Rounds 4 - 7
       
    65   movdqu(msg1, Address(buf, 16));
       
    66   pshufb(msg1, shuf_mask);
       
    67   sha1nexte(e1, msg1);
       
    68   movdqa(e0, abcd);
       
    69   sha1rnds4(abcd, e1, 0);
       
    70   sha1msg1(msg0, msg1);
       
    71 
       
    72   // Rounds 8 - 11
       
    73   movdqu(msg2, Address(buf, 32));
       
    74   pshufb(msg2, shuf_mask);
       
    75   sha1nexte(e0, msg2);
       
    76   movdqa(e1, abcd);
       
    77   sha1rnds4(abcd, e0, 0);
       
    78   sha1msg1(msg1, msg2);
       
    79   pxor(msg0, msg2);
       
    80 
       
    81   // Rounds 12 - 15
       
    82   movdqu(msg3, Address(buf, 48));
       
    83   pshufb(msg3, shuf_mask);
       
    84   sha1nexte(e1, msg3);
       
    85   movdqa(e0, abcd);
       
    86   sha1msg2(msg0, msg3);
       
    87   sha1rnds4(abcd, e1, 0);
       
    88   sha1msg1(msg2, msg3);
       
    89   pxor(msg1, msg3);
       
    90 
       
    91   // Rounds 16 - 19
       
    92   sha1nexte(e0, msg0);
       
    93   movdqa(e1, abcd);
       
    94   sha1msg2(msg1, msg0);
       
    95   sha1rnds4(abcd, e0, 0);
       
    96   sha1msg1(msg3, msg0);
       
    97   pxor(msg2, msg0);
       
    98 
       
    99   // Rounds 20 - 23
       
   100   sha1nexte(e1, msg1);
       
   101   movdqa(e0, abcd);
       
   102   sha1msg2(msg2, msg1);
       
   103   sha1rnds4(abcd, e1, 1);
       
   104   sha1msg1(msg0, msg1);
       
   105   pxor(msg3, msg1);
       
   106 
       
   107   // Rounds 24 - 27
       
   108   sha1nexte(e0, msg2);
       
   109   movdqa(e1, abcd);
       
   110   sha1msg2(msg3, msg2);
       
   111   sha1rnds4(abcd, e0, 1);
       
   112   sha1msg1(msg1, msg2);
       
   113   pxor(msg0, msg2);
       
   114 
       
   115   // Rounds 28 - 31
       
   116   sha1nexte(e1, msg3);
       
   117   movdqa(e0, abcd);
       
   118   sha1msg2(msg0, msg3);
       
   119   sha1rnds4(abcd, e1, 1);
       
   120   sha1msg1(msg2, msg3);
       
   121   pxor(msg1, msg3);
       
   122 
       
   123   // Rounds 32 - 35
       
   124   sha1nexte(e0, msg0);
       
   125   movdqa(e1, abcd);
       
   126   sha1msg2(msg1, msg0);
       
   127   sha1rnds4(abcd, e0, 1);
       
   128   sha1msg1(msg3, msg0);
       
   129   pxor(msg2, msg0);
       
   130 
       
   131   // Rounds 36 - 39
       
   132   sha1nexte(e1, msg1);
       
   133   movdqa(e0, abcd);
       
   134   sha1msg2(msg2, msg1);
       
   135   sha1rnds4(abcd, e1, 1);
       
   136   sha1msg1(msg0, msg1);
       
   137   pxor(msg3, msg1);
       
   138 
       
   139   // Rounds 40 - 43
       
   140   sha1nexte(e0, msg2);
       
   141   movdqa(e1, abcd);
       
   142   sha1msg2(msg3, msg2);
       
   143   sha1rnds4(abcd, e0, 2);
       
   144   sha1msg1(msg1, msg2);
       
   145   pxor(msg0, msg2);
       
   146 
       
   147   // Rounds 44 - 47
       
   148   sha1nexte(e1, msg3);
       
   149   movdqa(e0, abcd);
       
   150   sha1msg2(msg0, msg3);
       
   151   sha1rnds4(abcd, e1, 2);
       
   152   sha1msg1(msg2, msg3);
       
   153   pxor(msg1, msg3);
       
   154 
       
   155   // Rounds 48 - 51
       
   156   sha1nexte(e0, msg0);
       
   157   movdqa(e1, abcd);
       
   158   sha1msg2(msg1, msg0);
       
   159   sha1rnds4(abcd, e0, 2);
       
   160   sha1msg1(msg3, msg0);
       
   161   pxor(msg2, msg0);
       
   162 
       
   163   // Rounds 52 - 55
       
   164   sha1nexte(e1, msg1);
       
   165   movdqa(e0, abcd);
       
   166   sha1msg2(msg2, msg1);
       
   167   sha1rnds4(abcd, e1, 2);
       
   168   sha1msg1(msg0, msg1);
       
   169   pxor(msg3, msg1);
       
   170 
       
   171   // Rounds 56 - 59
       
   172   sha1nexte(e0, msg2);
       
   173   movdqa(e1, abcd);
       
   174   sha1msg2(msg3, msg2);
       
   175   sha1rnds4(abcd, e0, 2);
       
   176   sha1msg1(msg1, msg2);
       
   177   pxor(msg0, msg2);
       
   178 
       
   179   // Rounds 60 - 63
       
   180   sha1nexte(e1, msg3);
       
   181   movdqa(e0, abcd);
       
   182   sha1msg2(msg0, msg3);
       
   183   sha1rnds4(abcd, e1, 3);
       
   184   sha1msg1(msg2, msg3);
       
   185   pxor(msg1, msg3);
       
   186 
       
   187   // Rounds 64 - 67
       
   188   sha1nexte(e0, msg0);
       
   189   movdqa(e1, abcd);
       
   190   sha1msg2(msg1, msg0);
       
   191   sha1rnds4(abcd, e0, 3);
       
   192   sha1msg1(msg3, msg0);
       
   193   pxor(msg2, msg0);
       
   194 
       
   195   // Rounds 68 - 71
       
   196   sha1nexte(e1, msg1);
       
   197   movdqa(e0, abcd);
       
   198   sha1msg2(msg2, msg1);
       
   199   sha1rnds4(abcd, e1, 3);
       
   200   pxor(msg3, msg1);
       
   201 
       
   202   // Rounds 72 - 75
       
   203   sha1nexte(e0, msg2);
       
   204   movdqa(e1, abcd);
       
   205   sha1msg2(msg3, msg2);
       
   206   sha1rnds4(abcd, e0, 3);
       
   207 
       
   208   // Rounds 76 - 79
       
   209   sha1nexte(e1, msg3);
       
   210   movdqa(e0, abcd);
       
   211   sha1rnds4(abcd, e1, 3);
       
   212 
       
    213   // add the current hash values to the previously saved ones
       
   214   movdqu(msg0, Address(rsp, 0));
       
   215   sha1nexte(e0, msg0);
       
   216   movdqu(msg0, Address(rsp, 16));
       
   217   paddd(abcd, msg0);
       
   218 
       
   219   if (multi_block) {
       
   220     // increment data pointer and loop if more to process
       
   221     addptr(buf, 64);
       
   222     addptr(ofs, 64);
       
   223     cmpptr(ofs, limit);
       
   224     jcc(Assembler::belowEqual, loop0);
       
   225     movptr(rax, ofs); //return ofs
       
   226   }
       
   227   // write hash values back in the correct order
       
   228   pshufd(abcd, abcd, 0x1b);
       
   229   movdqu(Address(state, 0), abcd);
       
   230   pextrd(Address(state, 16), e0, 3);
       
   231 
       
   232   bind(done_hash);
       
   233 
       
   234 }
       
   235 
       
    236 // xmm0 (msg) is used as an implicit argument to sha256rnds2,
    237 // so state0 and state1 can never be assigned the xmm0 register.
    238 // ofs and limit are used for the multi-block byte array intrinsic:
    239 // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
       
   240 #ifdef _LP64
       
   241 void MacroAssembler::fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
       
   242   XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
       
   243   Register buf, Register state, Register ofs, Register limit, Register rsp,
       
   244   bool multi_block, XMMRegister shuf_mask) {
       
   245 #else
       
   246 void MacroAssembler::fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
       
   247   XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
       
   248   Register buf, Register state, Register ofs, Register limit, Register rsp,
       
   249   bool multi_block) {
       
   250 #endif
       
   251   Label start, done_hash, loop0;
       
   252 
       
   253   address K256 = StubRoutines::x86::k256_addr();
       
   254   address pshuffle_byte_flip_mask = StubRoutines::x86::pshuffle_byte_flip_mask_addr();
       
   255 
       
   256   bind(start);
       
   257   movdqu(state0, Address(state, 0));
       
   258   movdqu(state1, Address(state, 16));
       
   259 
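          // Repack the linear {A,B,C,D} / {E,F,G,H} words loaded above into the
          // {ABEF} / {CDGH} ordering that the sha256rnds2 instruction operates on.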
       
   260   pshufd(state0, state0, 0xB1);
       
   261   pshufd(state1, state1, 0x1B);
       
   262   movdqa(msgtmp4, state0);
       
   263   palignr(state0, state1, 8);
       
   264   pblendw(state1, msgtmp4, 0xF0);
       
   265 
       
   266 #ifdef _LP64
       
   267   movdqu(shuf_mask, ExternalAddress(pshuffle_byte_flip_mask));
       
   268 #endif
       
   269   lea(rax, ExternalAddress(K256));
       
   270 
       
   271   bind(loop0);
       
   272   movdqu(Address(rsp, 0), state0);
       
   273   movdqu(Address(rsp, 16), state1);
       
   274 
       
   275   // Rounds 0-3
       
   276   movdqu(msg, Address(buf, 0));
       
   277 #ifdef _LP64
       
   278   pshufb(msg, shuf_mask);
       
   279 #else
       
   280   pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
       
   281 #endif
       
   282   movdqa(msgtmp0, msg);
       
   283   paddd(msg, Address(rax, 0));
       
   284   sha256rnds2(state1, state0);
       
   285   pshufd(msg, msg, 0x0E);
       
   286   sha256rnds2(state0, state1);
       
   287 
       
   288   // Rounds 4-7
       
   289   movdqu(msg, Address(buf, 16));
       
   290 #ifdef _LP64
       
   291   pshufb(msg, shuf_mask);
       
   292 #else
       
   293   pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
       
   294 #endif
       
   295   movdqa(msgtmp1, msg);
       
   296   paddd(msg, Address(rax, 16));
       
   297   sha256rnds2(state1, state0);
       
   298   pshufd(msg, msg, 0x0E);
       
   299   sha256rnds2(state0, state1);
       
   300   sha256msg1(msgtmp0, msgtmp1);
       
   301 
       
   302   // Rounds 8-11
       
   303   movdqu(msg, Address(buf, 32));
       
   304 #ifdef _LP64
       
   305   pshufb(msg, shuf_mask);
       
   306 #else
       
   307   pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
       
   308 #endif
       
   309   movdqa(msgtmp2, msg);
       
   310   paddd(msg, Address(rax, 32));
       
   311   sha256rnds2(state1, state0);
       
   312   pshufd(msg, msg, 0x0E);
       
   313   sha256rnds2(state0, state1);
       
   314   sha256msg1(msgtmp1, msgtmp2);
       
   315 
       
   316   // Rounds 12-15
       
   317   movdqu(msg, Address(buf, 48));
       
   318 #ifdef _LP64
       
   319   pshufb(msg, shuf_mask);
       
   320 #else
       
   321   pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
       
   322 #endif
       
   323   movdqa(msgtmp3, msg);
       
   324   paddd(msg, Address(rax, 48));
       
   325   sha256rnds2(state1, state0);
       
   326   movdqa(msgtmp4, msgtmp3);
       
   327   palignr(msgtmp4, msgtmp2, 4);
       
   328   paddd(msgtmp0, msgtmp4);
       
   329   sha256msg2(msgtmp0, msgtmp3);
       
   330   pshufd(msg, msg, 0x0E);
       
   331   sha256rnds2(state0, state1);
       
   332   sha256msg1(msgtmp2, msgtmp3);
       
   333 
       
   334   // Rounds 16-19
       
   335   movdqa(msg, msgtmp0);
       
   336   paddd(msg, Address(rax, 64));
       
   337   sha256rnds2(state1, state0);
       
   338   movdqa(msgtmp4, msgtmp0);
       
   339   palignr(msgtmp4, msgtmp3, 4);
       
   340   paddd(msgtmp1, msgtmp4);
       
   341   sha256msg2(msgtmp1, msgtmp0);
       
   342   pshufd(msg, msg, 0x0E);
       
   343   sha256rnds2(state0, state1);
       
   344   sha256msg1(msgtmp3, msgtmp0);
       
   345 
       
   346   // Rounds 20-23
       
   347   movdqa(msg, msgtmp1);
       
   348   paddd(msg, Address(rax, 80));
       
   349   sha256rnds2(state1, state0);
       
   350   movdqa(msgtmp4, msgtmp1);
       
   351   palignr(msgtmp4, msgtmp0, 4);
       
   352   paddd(msgtmp2, msgtmp4);
       
   353   sha256msg2(msgtmp2, msgtmp1);
       
   354   pshufd(msg, msg, 0x0E);
       
   355   sha256rnds2(state0, state1);
       
   356   sha256msg1(msgtmp0, msgtmp1);
       
   357 
       
   358   // Rounds 24-27
       
   359   movdqa(msg, msgtmp2);
       
   360   paddd(msg, Address(rax, 96));
       
   361   sha256rnds2(state1, state0);
       
   362   movdqa(msgtmp4, msgtmp2);
       
   363   palignr(msgtmp4, msgtmp1, 4);
       
   364   paddd(msgtmp3, msgtmp4);
       
   365   sha256msg2(msgtmp3, msgtmp2);
       
   366   pshufd(msg, msg, 0x0E);
       
   367   sha256rnds2(state0, state1);
       
   368   sha256msg1(msgtmp1, msgtmp2);
       
   369 
       
   370   // Rounds 28-31
       
   371   movdqa(msg, msgtmp3);
       
   372   paddd(msg, Address(rax, 112));
       
   373   sha256rnds2(state1, state0);
       
   374   movdqa(msgtmp4, msgtmp3);
       
   375   palignr(msgtmp4, msgtmp2, 4);
       
   376   paddd(msgtmp0, msgtmp4);
       
   377   sha256msg2(msgtmp0, msgtmp3);
       
   378   pshufd(msg, msg, 0x0E);
       
   379   sha256rnds2(state0, state1);
       
   380   sha256msg1(msgtmp2, msgtmp3);
       
   381 
       
   382   // Rounds 32-35
       
   383   movdqa(msg, msgtmp0);
       
   384   paddd(msg, Address(rax, 128));
       
   385   sha256rnds2(state1, state0);
       
   386   movdqa(msgtmp4, msgtmp0);
       
   387   palignr(msgtmp4, msgtmp3, 4);
       
   388   paddd(msgtmp1, msgtmp4);
       
   389   sha256msg2(msgtmp1, msgtmp0);
       
   390   pshufd(msg, msg, 0x0E);
       
   391   sha256rnds2(state0, state1);
       
   392   sha256msg1(msgtmp3, msgtmp0);
       
   393 
       
   394   // Rounds 36-39
       
   395   movdqa(msg, msgtmp1);
       
   396   paddd(msg, Address(rax, 144));
       
   397   sha256rnds2(state1, state0);
       
   398   movdqa(msgtmp4, msgtmp1);
       
   399   palignr(msgtmp4, msgtmp0, 4);
       
   400   paddd(msgtmp2, msgtmp4);
       
   401   sha256msg2(msgtmp2, msgtmp1);
       
   402   pshufd(msg, msg, 0x0E);
       
   403   sha256rnds2(state0, state1);
       
   404   sha256msg1(msgtmp0, msgtmp1);
       
   405 
       
   406   // Rounds 40-43
       
   407   movdqa(msg, msgtmp2);
       
   408   paddd(msg, Address(rax, 160));
       
   409   sha256rnds2(state1, state0);
       
   410   movdqa(msgtmp4, msgtmp2);
       
   411   palignr(msgtmp4, msgtmp1, 4);
       
   412   paddd(msgtmp3, msgtmp4);
       
   413   sha256msg2(msgtmp3, msgtmp2);
       
   414   pshufd(msg, msg, 0x0E);
       
   415   sha256rnds2(state0, state1);
       
   416   sha256msg1(msgtmp1, msgtmp2);
       
   417 
       
   418   // Rounds 44-47
       
   419   movdqa(msg, msgtmp3);
       
   420   paddd(msg, Address(rax, 176));
       
   421   sha256rnds2(state1, state0);
       
   422   movdqa(msgtmp4, msgtmp3);
       
   423   palignr(msgtmp4, msgtmp2, 4);
       
   424   paddd(msgtmp0, msgtmp4);
       
   425   sha256msg2(msgtmp0, msgtmp3);
       
   426   pshufd(msg, msg, 0x0E);
       
   427   sha256rnds2(state0, state1);
       
   428   sha256msg1(msgtmp2, msgtmp3);
       
   429 
       
   430   // Rounds 48-51
       
   431   movdqa(msg, msgtmp0);
       
   432   paddd(msg, Address(rax, 192));
       
   433   sha256rnds2(state1, state0);
       
   434   movdqa(msgtmp4, msgtmp0);
       
   435   palignr(msgtmp4, msgtmp3, 4);
       
   436   paddd(msgtmp1, msgtmp4);
       
   437   sha256msg2(msgtmp1, msgtmp0);
       
   438   pshufd(msg, msg, 0x0E);
       
   439   sha256rnds2(state0, state1);
       
   440   sha256msg1(msgtmp3, msgtmp0);
       
   441 
       
   442   // Rounds 52-55
       
   443   movdqa(msg, msgtmp1);
       
   444   paddd(msg, Address(rax, 208));
       
   445   sha256rnds2(state1, state0);
       
   446   movdqa(msgtmp4, msgtmp1);
       
   447   palignr(msgtmp4, msgtmp0, 4);
       
   448   paddd(msgtmp2, msgtmp4);
       
   449   sha256msg2(msgtmp2, msgtmp1);
       
   450   pshufd(msg, msg, 0x0E);
       
   451   sha256rnds2(state0, state1);
       
   452 
       
   453   // Rounds 56-59
       
   454   movdqa(msg, msgtmp2);
       
   455   paddd(msg, Address(rax, 224));
       
   456   sha256rnds2(state1, state0);
       
   457   movdqa(msgtmp4, msgtmp2);
       
   458   palignr(msgtmp4, msgtmp1, 4);
       
   459   paddd(msgtmp3, msgtmp4);
       
   460   sha256msg2(msgtmp3, msgtmp2);
       
   461   pshufd(msg, msg, 0x0E);
       
   462   sha256rnds2(state0, state1);
       
   463 
       
   464   // Rounds 60-63
       
   465   movdqa(msg, msgtmp3);
       
   466   paddd(msg, Address(rax, 240));
       
   467   sha256rnds2(state1, state0);
       
   468   pshufd(msg, msg, 0x0E);
       
   469   sha256rnds2(state0, state1);
       
   470   movdqu(msg, Address(rsp, 0));
       
   471   paddd(state0, msg);
       
   472   movdqu(msg, Address(rsp, 16));
       
   473   paddd(state1, msg);
       
   474 
       
   475   if (multi_block) {
       
   476     // increment data pointer and loop if more to process
       
   477     addptr(buf, 64);
       
   478     addptr(ofs, 64);
       
   479     cmpptr(ofs, limit);
       
   480     jcc(Assembler::belowEqual, loop0);
       
   481     movptr(rax, ofs); //return ofs
       
   482   }
       
   483 
       
   484   pshufd(state0, state0, 0x1B);
       
   485   pshufd(state1, state1, 0xB1);
       
   486   movdqa(msgtmp4, state0);
       
   487   pblendw(state0, state1, 0xF0);
       
   488   palignr(state1, msgtmp4, 8);
       
   489 
       
   490   movdqu(Address(state, 0), state0);
       
   491   movdqu(Address(state, 16), state1);
       
   492 
       
   493   bind(done_hash);
       
   494 
       
   495 }
       
   496 
       
   497 #ifdef _LP64
       
    498 /*
    499   The algorithm below is based on the Intel publication:
    500   "Fast SHA-256 Implementations on Intel(R) Architecture Processors" by Jim Guilford, Kirk Yap and Vinodh Gopal.
    501   The assembly code was originally provided by Sean Gulley and in many places preserves
    502   the original assembly names and comments to make it easier to match the generated code with the original.
    503   The Java version was substantially redesigned to replace 1200 assembly instructions with a
    504   much shorter run-time generator of the same code in memory.
    505 */
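        // The AVX2 path processes two 64-byte blocks per iteration of loop0: the
        // message schedule for both blocks is carried in the 256-bit registers
        // (low lane = first block, high lane = second block) and the pre-added
        // K + W values are parked in the _XFER area on the stack, so the second
        // block (loop3) is compressed without redoing the scheduling.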
       
   506 
       
   507 void MacroAssembler::sha256_AVX2_one_round_compute(
       
   508     Register  reg_old_h,
       
   509     Register  reg_a,
       
   510     Register  reg_b,
       
   511     Register  reg_c,
       
   512     Register  reg_d,
       
   513     Register  reg_e,
       
   514     Register  reg_f,
       
   515     Register  reg_g,
       
   516     Register  reg_h,
       
   517     int iter) {
       
   518   const Register& reg_y0     = r13;
       
   519   const Register& reg_y1     = r14;
       
   520   const Register& reg_y2     = r15;
       
   521   const Register& reg_y3     = rcx;
       
   522   const Register& reg_T1     = r12;
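          // One SHA-256 round:
          //   T1 = h + Sigma1(e) + Ch(e,f,g) + K[t] + W[t]
          //   T2 = Sigma0(a) + Maj(a,b,c)
          //   d += T1;  h = T1 + T2
          // where Sigma1(e) = ror(e,6) ^ ror(e,11) ^ ror(e,25)
          //       Sigma0(a) = ror(a,2) ^ ror(a,13) ^ ror(a,22)
          // The final additions into h are deferred to the next call (via reg_old_h)
          // to hide latency; iter%4 == 3 completes them immediately.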
       
   523   //;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND iter ;;;;;;;;;;;;;;;;;;;;;;;;;;;
       
   524   if (iter%4 > 0) {
       
   525     addl(reg_old_h, reg_y2);   // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; --
       
   526   }
       
   527   movl(reg_y2, reg_f);         // reg_y2 = reg_f                                ; CH
       
   528   rorxd(reg_y0, reg_e, 25);    // reg_y0 = reg_e >> 25   ; S1A
       
   529   rorxd(reg_y1, reg_e, 11);    // reg_y1 = reg_e >> 11    ; S1B
       
   530   xorl(reg_y2, reg_g);         // reg_y2 = reg_f^reg_g                              ; CH
       
   531 
       
    532   xorl(reg_y0, reg_y1);        // reg_y0 = (reg_e>>25) ^ (reg_e>>11)  ; S1
       
   533   rorxd(reg_y1, reg_e, 6);     // reg_y1 = (reg_e >> 6)    ; S1
       
   534   andl(reg_y2, reg_e);         // reg_y2 = (reg_f^reg_g)&reg_e                          ; CH
       
   535 
       
   536   if (iter%4 > 0) {
       
   537     addl(reg_old_h, reg_y3);   // reg_h = t1 + S0 + MAJ                     ; --
       
   538   }
       
   539 
       
   540   xorl(reg_y0, reg_y1);       // reg_y0 = (reg_e>>25) ^ (reg_e>>11) ^ (reg_e>>6) ; S1
       
   541   rorxd(reg_T1, reg_a, 13);   // reg_T1 = reg_a >> 13    ; S0B
       
   542   xorl(reg_y2, reg_g);        // reg_y2 = CH = ((reg_f^reg_g)&reg_e)^reg_g                 ; CH
       
   543   rorxd(reg_y1, reg_a, 22);   // reg_y1 = reg_a >> 22    ; S0A
       
   544   movl(reg_y3, reg_a);        // reg_y3 = reg_a                                ; MAJA
       
   545 
       
   546   xorl(reg_y1, reg_T1);       // reg_y1 = (reg_a>>22) ^ (reg_a>>13)  ; S0
       
   547   rorxd(reg_T1, reg_a, 2);    // reg_T1 = (reg_a >> 2)    ; S0
       
   548   addl(reg_h, Address(rsp, rdx, Address::times_1, 4*iter)); // reg_h = k + w + reg_h ; --
       
   549   orl(reg_y3, reg_c);         // reg_y3 = reg_a|reg_c                              ; MAJA
       
   550 
       
   551   xorl(reg_y1, reg_T1);       // reg_y1 = (reg_a>>22) ^ (reg_a>>13) ^ (reg_a>>2) ; S0
       
   552   movl(reg_T1, reg_a);        // reg_T1 = reg_a                                ; MAJB
       
   553   andl(reg_y3, reg_b);        // reg_y3 = (reg_a|reg_c)&reg_b                          ; MAJA
       
   554   andl(reg_T1, reg_c);        // reg_T1 = reg_a&reg_c                              ; MAJB
       
   555   addl(reg_y2, reg_y0);       // reg_y2 = S1 + CH                          ; --
       
   556 
       
   557 
       
   558   addl(reg_d, reg_h);         // reg_d = k + w + reg_h + reg_d                     ; --
       
    559   orl(reg_y3, reg_T1);        // reg_y3 = MAJ = ((reg_a|reg_c)&reg_b)|(reg_a&reg_c)            ; MAJ
       
   560   addl(reg_h, reg_y1);        // reg_h = k + w + reg_h + S0                    ; --
       
   561 
       
   562   addl(reg_d, reg_y2);        // reg_d = k + w + reg_h + reg_d + S1 + CH = reg_d + t1  ; --
       
   563 
       
   564 
       
   565   if (iter%4 == 3) {
       
   566     addl(reg_h, reg_y2);      // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; --
       
   567     addl(reg_h, reg_y3);      // reg_h = t1 + S0 + MAJ                     ; --
       
   568   }
       
   569 }
       
   570 
       
   571 void MacroAssembler::sha256_AVX2_four_rounds_compute_first(int start) {
       
   572     sha256_AVX2_one_round_compute(rax, rax, rbx, rdi, rsi,  r8,  r9, r10, r11, start + 0);
       
   573     sha256_AVX2_one_round_compute(r11, r11, rax, rbx, rdi, rsi,  r8,  r9, r10, start + 1);
       
   574     sha256_AVX2_one_round_compute(r10, r10, r11, rax, rbx, rdi, rsi,  r8,  r9, start + 2);
       
   575     sha256_AVX2_one_round_compute(r9,  r9,  r10, r11, rax, rbx, rdi, rsi,  r8, start + 3);
       
   576 }
       
   577 
       
   578 void MacroAssembler::sha256_AVX2_four_rounds_compute_last(int start) {
       
   579     sha256_AVX2_one_round_compute(r8,  r8,   r9, r10, r11, rax, rbx, rdi, rsi, start + 0);
       
   580     sha256_AVX2_one_round_compute(rsi, rsi,  r8,  r9, r10, r11, rax, rbx, rdi, start + 1);
       
   581     sha256_AVX2_one_round_compute(rdi, rdi, rsi,  r8,  r9, r10, r11, rax, rbx, start + 2);
       
   582     sha256_AVX2_one_round_compute(rbx, rbx, rdi, rsi,  r8,  r9, r10, r11, rax, start + 3);
       
   583 }
       
   584 
       
   585 void MacroAssembler::sha256_AVX2_one_round_and_sched(
       
    586         XMMRegister  xmm_0,     /* == ymm4 on iterations 0-3, then the four registers rotate left at iterations 4, 8, 12 */
    587         XMMRegister  xmm_1,     /* ymm5 */  /* full cycle is 16 iterations */
    588         XMMRegister  xmm_2,     /* ymm6 */
    589         XMMRegister  xmm_3,     /* ymm7 */
    590         Register  reg_a,        /* == rax on iteration 0, then the eight registers rotate right on each subsequent iteration */
    591         Register  reg_b,        /* rbx */    /* full cycle is 8 iterations */
       
   592         Register  reg_c,        /* rdi */
       
   593         Register  reg_d,        /* rsi */
       
   594         Register  reg_e,        /* r8 */
       
   595         Register  reg_f,        /* r9d */
       
   596         Register  reg_g,        /* r10d */
       
   597         Register  reg_h,        /* r11d */
       
   598         int iter)
       
   599 {
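          // Besides one compression round, each call advances the message schedule
          //   W[t] = W[t-16] + s0(W[t-15]) + W[t-7] + s1(W[t-2])
          // with s0(x) = ror(x,7) ^ ror(x,18) ^ (x >> 3)
          //      s1(x) = ror(x,17) ^ ror(x,19) ^ (x >> 10)
          // spread over four consecutive iterations (iter%4 == 0..3), leaving the
          // next four W values in xmm_0.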
       
   600   movl(rcx, reg_a);           // rcx = reg_a               ; MAJA
       
   601   rorxd(r13, reg_e, 25);      // r13 = reg_e >> 25    ; S1A
       
   602   rorxd(r14, reg_e, 11);      //  r14 = reg_e >> 11    ; S1B
       
   603   addl(reg_h, Address(rsp, rdx, Address::times_1, 4*iter));
       
   604   orl(rcx, reg_c);            // rcx = reg_a|reg_c          ; MAJA
       
   605 
       
   606   movl(r15, reg_f);           // r15 = reg_f               ; CH
       
   607   rorxd(r12, reg_a, 13);      // r12 = reg_a >> 13      ; S0B
       
   608   xorl(r13, r14);             // r13 = (reg_e>>25) ^ (reg_e>>11)  ; S1
       
   609   xorl(r15, reg_g);           // r15 = reg_f^reg_g         ; CH
       
   610 
       
   611   rorxd(r14, reg_e, 6);       // r14 = (reg_e >> 6)    ; S1
       
   612   andl(r15, reg_e);           // r15 = (reg_f^reg_g)&reg_e ; CH
       
   613 
       
   614   xorl(r13, r14);             // r13 = (reg_e>>25) ^ (reg_e>>11) ^ (reg_e>>6) ; S1
       
   615   rorxd(r14, reg_a, 22);      // r14 = reg_a >> 22    ; S0A
       
   616   addl(reg_d, reg_h);         // reg_d = k + w + reg_h + reg_d                     ; --
       
   617 
       
   618   andl(rcx, reg_b);          // rcx = (reg_a|reg_c)&reg_b                          ; MAJA
       
   619   xorl(r14, r12);            // r14 = (reg_a>>22) ^ (reg_a>>13)  ; S0
       
   620 
       
   621   rorxd(r12, reg_a, 2);      // r12 = (reg_a >> 2)    ; S0
       
   622   xorl(r15, reg_g);          // r15 = CH = ((reg_f^reg_g)&reg_e)^reg_g                 ; CH
       
   623 
       
   624   xorl(r14, r12);            // r14 = (reg_a>>22) ^ (reg_a>>13) ^ (reg_a>>2) ; S0
       
   625   movl(r12, reg_a);          // r12 = reg_a                                ; MAJB
       
   626   andl(r12, reg_c);          // r12 = reg_a&reg_c                              ; MAJB
       
   627   addl(r15, r13);            // r15 = S1 + CH                          ; --
       
   628 
       
    629   orl(rcx, r12);             // rcx = MAJ = ((reg_a|reg_c)&reg_b)|(reg_a&reg_c)            ; MAJ
       
   630   addl(reg_h, r14);          // reg_h = k + w + reg_h + S0                    ; --
       
   631   addl(reg_d, r15);          // reg_d = k + w + reg_h + reg_d + S1 + CH = reg_d + t1  ; --
       
   632 
       
   633   addl(reg_h, r15);          // reg_h = k + w + reg_h + S0 + S1 + CH = t1 + S0; --
       
   634   addl(reg_h, rcx);          // reg_h = t1 + S0 + MAJ                     ; --
       
   635 
       
   636   if (iter%4 == 0) {
       
   637     vpalignr(xmm0, xmm_3, xmm_2, 4, AVX_256bit);   // ymm0 = W[-7]
       
   638     vpaddd(xmm0, xmm0, xmm_0, AVX_256bit);         // ymm0 = W[-7] + W[-16]; y1 = (e >> 6)     ; S1
       
   639     vpalignr(xmm1, xmm_1, xmm_0, 4, AVX_256bit);   // ymm1 = W[-15]
       
   640     vpsrld(xmm2, xmm1, 7, AVX_256bit);
       
   641     vpslld(xmm3, xmm1, 32-7, AVX_256bit);
       
   642     vpor(xmm3, xmm3, xmm2, AVX_256bit);            // ymm3 = W[-15] ror 7
       
   643     vpsrld(xmm2, xmm1,18, AVX_256bit);
       
   644   } else if (iter%4 == 1 ) {
       
   645     vpsrld(xmm8, xmm1, 3, AVX_256bit);             // ymm8 = W[-15] >> 3
       
   646     vpslld(xmm1, xmm1, 32-18, AVX_256bit);
       
   647     vpxor(xmm3, xmm3, xmm1, AVX_256bit);
       
   648     vpxor(xmm3, xmm3, xmm2, AVX_256bit);           // ymm3 = W[-15] ror 7 ^ W[-15] ror 18
       
   649     vpxor(xmm1, xmm3, xmm8, AVX_256bit);           // ymm1 = s0
       
   650     vpshufd(xmm2, xmm_3, 0xFA, AVX_256bit);        // 11111010b ; ymm2 = W[-2] {BBAA}
       
   651     vpaddd(xmm0, xmm0, xmm1, AVX_256bit);          // ymm0 = W[-16] + W[-7] + s0
       
   652     vpsrld(xmm8, xmm2, 10, AVX_256bit);            // ymm8 = W[-2] >> 10 {BBAA}
       
   653   } else if (iter%4 == 2) {
       
   654     vpsrlq(xmm3, xmm2, 19, AVX_256bit);            // ymm3 = W[-2] ror 19 {xBxA}
       
   655     vpsrlq(xmm2, xmm2, 17, AVX_256bit);            // ymm2 = W[-2] ror 17 {xBxA}
       
   656     vpxor(xmm2, xmm2, xmm3, AVX_256bit);
       
   657     vpxor(xmm8, xmm8, xmm2, AVX_256bit);           // ymm8 = s1 {xBxA}
       
   658     vpshufb(xmm8, xmm8, xmm10, AVX_256bit);        // ymm8 = s1 {00BA}
       
   659     vpaddd(xmm0, xmm0, xmm8, AVX_256bit);          // ymm0 = {..., ..., W[1], W[0]}
       
   660     vpshufd(xmm2, xmm0, 0x50, AVX_256bit);         // 01010000b ; ymm2 = W[-2] {DDCC}
       
   661   } else if (iter%4 == 3) {
       
   662     vpsrld(xmm11, xmm2, 10, AVX_256bit);           // ymm11 = W[-2] >> 10 {DDCC}
       
   663     vpsrlq(xmm3, xmm2, 19, AVX_256bit);            // ymm3 = W[-2] ror 19 {xDxC}
       
   664     vpsrlq(xmm2, xmm2, 17, AVX_256bit);            // ymm2 = W[-2] ror 17 {xDxC}
       
   665     vpxor(xmm2, xmm2, xmm3, AVX_256bit);
       
   666     vpxor(xmm11, xmm11, xmm2, AVX_256bit);         // ymm11 = s1 {xDxC}
       
   667     vpshufb(xmm11, xmm11, xmm12, AVX_256bit);      // ymm11 = s1 {DC00}
       
   668     vpaddd(xmm_0, xmm11, xmm0, AVX_256bit);        // xmm_0 = {W[3], W[2], W[1], W[0]}
       
   669   }
       
   670 }
       
   671 
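        // addm/addmq: add register r2 into the 32-/64-bit digest word at [r1 + disp],
        // leaving the sum both in r2 and back in memory.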
       
   672 void MacroAssembler::addm(int disp, Register r1, Register r2) {
       
   673   addl(r2, Address(r1, disp));
       
   674   movl(Address(r1, disp), r2);
       
   675 }
       
   676 
       
   677 void MacroAssembler::addmq(int disp, Register r1, Register r2) {
       
   678   addq(r2, Address(r1, disp));
       
   679   movq(Address(r1, disp), r2);
       
   680 }
       
   681 
       
   682 void MacroAssembler::sha256_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
       
   683   XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
       
   684   Register buf, Register state, Register ofs, Register limit, Register rsp,
       
   685   bool multi_block, XMMRegister shuf_mask) {
       
   686 
       
   687   Label loop0, loop1, loop2, loop3,
       
   688         last_block_enter, do_last_block, only_one_block, done_hash,
       
   689         compute_size, compute_size_end,
       
   690         compute_size1, compute_size_end1;
       
   691 
       
   692   address K256_W = StubRoutines::x86::k256_W_addr();
       
   693   address pshuffle_byte_flip_mask = StubRoutines::x86::pshuffle_byte_flip_mask_addr();
       
   694   address pshuffle_byte_flip_mask_addr = 0;
       
   695 
       
   696 const XMMRegister& SHUF_00BA        = xmm10;    // ymm10: shuffle xBxA -> 00BA
       
   697 const XMMRegister& SHUF_DC00        = xmm12;    // ymm12: shuffle xDxC -> DC00
       
   698 const XMMRegister& BYTE_FLIP_MASK   = xmm13;   // ymm13
       
   699 
       
   700 const XMMRegister& X_BYTE_FLIP_MASK = xmm13;   //XMM version of BYTE_FLIP_MASK
       
   701 
       
   702 const Register& NUM_BLKS = r8;   // 3rd arg
       
   703 const Register& CTX      = rdx;  // 2nd arg
       
   704 const Register& INP      = rcx;  // 1st arg
       
   705 
       
   706 const Register& c        = rdi;
       
   707 const Register& d        = rsi;
       
   708 const Register& e        = r8;    // clobbers NUM_BLKS
       
   709 const Register& y3       = rcx;  // clobbers INP
       
   710 
       
   711 const Register& TBL      = rbp;
       
   712 const Register& SRND     = CTX;   // SRND is same register as CTX
       
   713 
       
   714 const Register& a        = rax;
       
   715 const Register& b        = rbx;
       
   716 const Register& f        = r9;
       
   717 const Register& g        = r10;
       
   718 const Register& h        = r11;
       
   719 
       
   720 const Register& T1       = r12;
       
   721 const Register& y0       = r13;
       
   722 const Register& y1       = r14;
       
   723 const Register& y2       = r15;
       
   724 
       
   725 
       
   726 enum {
       
   727   _XFER_SIZE = 2*64*4, // 2 blocks, 64 rounds, 4 bytes/round
       
   728   _INP_END_SIZE = 8,
       
   729   _INP_SIZE = 8,
       
   730   _CTX_SIZE = 8,
       
   731   _RSP_SIZE = 8,
       
   732 
       
   733   _XFER = 0,
       
   734   _INP_END   = _XFER     + _XFER_SIZE,
       
   735   _INP       = _INP_END  + _INP_END_SIZE,
       
   736   _CTX       = _INP      + _INP_SIZE,
       
   737   _RSP       = _CTX      + _CTX_SIZE,
       
   738   STACK_SIZE = _RSP      + _RSP_SIZE
       
   739 };
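        // Stack frame: _XFER holds the pre-computed K + W values for two blocks
        // (2 * 64 rounds * 4 bytes), followed by the saved end-of-input pointer,
        // input pointer, state pointer and the caller's rsp. rsp is aligned down
        // to 32 bytes below and restored from _RSP in done_hash.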
       
   740 
       
    741 #ifndef _WIN64
    742   push(rcx);    // linux: this is limit, needed again at the end
    743   push(rdx);    // linux: this is ofs
    744 #else
    745   push(r8);     // win64: this is ofs
    746   push(r9);     // win64: this is limit, we need them again at the very end
    747 #endif
       
   748 
       
   749 
       
   750   push(rbx);
       
   751 #ifdef _WIN64
       
   752   push(rsi);
       
   753   push(rdi);
       
   754 #endif
       
   755   push(rbp);
       
   756   push(r12);
       
   757   push(r13);
       
   758   push(r14);
       
   759   push(r15);
       
   760 
       
   761   movq(rax, rsp);
       
   762   subq(rsp, STACK_SIZE);
       
   763   andq(rsp, -32);
       
   764   movq(Address(rsp, _RSP), rax);
       
   765 
       
   766 #ifndef _WIN64
       
    767   // copy the linux parameters into the win64 registers, so the rest of the code is the same for both ABIs
       
   768   movq(r9,  rcx);
       
   769   movq(r8,  rdx);
       
   770   movq(rdx, rsi);
       
   771   movq(rcx, rdi);
       
   772 #endif
       
   773 
       
    774   // set up the original assembly ABI
    775   /** message to hash in INP */
    776   lea(INP, Address(rcx, 0));    // rcx == message (buf)     ;; linux: INP = buf = rdi
    777   /** digest in CTX             */
    778   movq(CTX, rdx);               // rdx = digest  (state)    ;; linux: CTX = state = rsi
    779 
    780   /** NUM_BLKS is the length of the message in bytes; it has to be derived from ofs and limit */
       
   781   if (multi_block) {
       
   782 
       
   783     // Win64: cannot directly update NUM_BLKS, since NUM_BLKS = ofs = r8
       
   784     // on entry r8 = ofs
       
   785     // on exit  r8 = NUM_BLKS
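            // The counting loop below steps ofs forward by 64 until it reaches limit,
            // accumulating the same amount in rax; the resulting byte count (the
            // ofs-to-limit distance rounded up to whole 64-byte blocks) becomes NUM_BLKS.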
       
   786 
       
   787     xorq(rax, rax);
       
   788 
       
   789     bind(compute_size);
       
   790     cmpptr(r8, r9); // assume the original ofs <= limit ;; linux:  cmp rcx, rdx
       
   791     jccb(Assembler::aboveEqual, compute_size_end);
       
   792     addq(r8, 64);                                          //;; linux: ofs = rdx
       
   793     addq(rax, 64);
       
   794     jmpb(compute_size);
       
   795 
       
   796     bind(compute_size_end);
       
   797     movq(NUM_BLKS, rax);  // NUM_BLK (r8)                  ;; linux: NUM_BLK = rdx
       
   798 
       
    799     cmpq(NUM_BLKS, 0);
    800     jcc(Assembler::equal, done_hash);
    801 
    802   } else {
    803     xorq(NUM_BLKS, NUM_BLKS);
    804     addq(NUM_BLKS, 64);
    805   } // end if (multi_block)
       
   806 
       
   807   lea(NUM_BLKS, Address(INP, NUM_BLKS, Address::times_1, -64)); // pointer to the last block
       
   808   movq(Address(rsp, _INP_END), NUM_BLKS);  //
       
   809 
       
   810   cmpptr(INP, NUM_BLKS);                   //cmp INP, NUM_BLKS
       
   811   jcc(Assembler::equal, only_one_block);   //je only_one_block
       
   812 
       
   813   // load initial digest
       
   814   movl(a, Address(CTX, 4*0));
       
   815   movl(b, Address(CTX, 4*1));
       
   816   movl(c, Address(CTX, 4*2));
       
   817   movl(d, Address(CTX, 4*3));
       
   818   movl(e, Address(CTX, 4*4));
       
   819   movl(f, Address(CTX, 4*5));
       
   820   // load g - r10 after it is used as scratch
       
   821   movl(h, Address(CTX, 4*7));
       
   822 
       
   823   pshuffle_byte_flip_mask_addr = pshuffle_byte_flip_mask;
       
   824   vmovdqu(BYTE_FLIP_MASK, ExternalAddress(pshuffle_byte_flip_mask_addr +0)); //[PSHUFFLE_BYTE_FLIP_MASK wrt rip]
       
   825   vmovdqu(SHUF_00BA, ExternalAddress(pshuffle_byte_flip_mask_addr + 32));     //[_SHUF_00BA wrt rip]
       
   826   vmovdqu(SHUF_DC00, ExternalAddress(pshuffle_byte_flip_mask_addr + 64));     //[_SHUF_DC00 wrt rip]
       
   827 
       
   828   movl(g, Address(CTX, 4*6));
       
   829 
       
   830   movq(Address(rsp, _CTX), CTX);           // store
       
   831 
       
   832 bind(loop0);
       
   833   lea(TBL, ExternalAddress(K256_W));
       
   834 
       
   835   // assume buffers not aligned
       
   836 
       
   837   // Load first 16 dwords from two blocks
       
   838   vmovdqu(xmm0, Address(INP, 0*32));
       
   839   vmovdqu(xmm1, Address(INP, 1*32));
       
   840   vmovdqu(xmm2, Address(INP, 2*32));
       
   841   vmovdqu(xmm3, Address(INP, 3*32));
       
   842 
       
   843   // byte swap data
       
   844   vpshufb(xmm0, xmm0, BYTE_FLIP_MASK, AVX_256bit);
       
   845   vpshufb(xmm1, xmm1, BYTE_FLIP_MASK, AVX_256bit);
       
   846   vpshufb(xmm2, xmm2, BYTE_FLIP_MASK, AVX_256bit);
       
   847   vpshufb(xmm3, xmm3, BYTE_FLIP_MASK, AVX_256bit);
       
   848 
       
   849   // transpose data into high/low halves
       
   850   vperm2i128(xmm4, xmm0, xmm2, 0x20);
       
   851   vperm2i128(xmm5, xmm0, xmm2, 0x31);
       
   852   vperm2i128(xmm6, xmm1, xmm3, 0x20);
       
   853   vperm2i128(xmm7, xmm1, xmm3, 0x31);
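          // After the transpose each ymm register holds the same four message words
          // for both blocks: block 1 in the low 128-bit lane, block 2 in the high lane
          // (ymm4 = W[0..3], ymm5 = W[4..7], ymm6 = W[8..11], ymm7 = W[12..15]).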
       
   854 
       
   855 bind(last_block_enter);
       
   856   addq(INP, 64);
       
   857   movq(Address(rsp, _INP), INP);
       
   858 
       
   859   //;; schedule 48 input dwords, by doing 3 rounds of 12 each
       
   860   xorq(SRND, SRND);
       
   861 
       
   862 align(16);
       
   863 bind(loop1);
       
   864   vpaddd(xmm9, xmm4, Address(TBL, SRND, Address::times_1, 0*32), AVX_256bit);
       
   865   vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 0*32), xmm9);
       
   866   sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, rax, rbx, rdi, rsi, r8,  r9,  r10, r11, 0);
       
   867   sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r11, rax, rbx, rdi, rsi, r8,  r9,  r10, 1);
       
   868   sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r10, r11, rax, rbx, rdi, rsi, r8,  r9,  2);
       
   869   sha256_AVX2_one_round_and_sched(xmm4, xmm5, xmm6, xmm7, r9,  r10, r11, rax, rbx, rdi, rsi, r8,  3);
       
   870 
       
   871   vpaddd(xmm9, xmm5, Address(TBL, SRND, Address::times_1, 1*32), AVX_256bit);
       
   872   vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 1*32), xmm9);
       
   873   sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, r8,  r9,  r10, r11, rax, rbx, rdi, rsi,  8+0);
       
   874   sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rsi, r8,  r9,  r10, r11, rax, rbx, rdi,  8+1);
       
   875   sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rdi, rsi, r8,  r9,  r10, r11, rax, rbx,  8+2);
       
   876   sha256_AVX2_one_round_and_sched(xmm5, xmm6, xmm7, xmm4, rbx, rdi, rsi, r8,  r9,  r10, r11, rax,  8+3);
       
   877 
       
   878   vpaddd(xmm9, xmm6, Address(TBL, SRND, Address::times_1, 2*32), AVX_256bit);
       
   879   vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 2*32), xmm9);
       
   880   sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, rax, rbx, rdi, rsi, r8,  r9,  r10, r11, 16+0);
       
   881   sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r11, rax, rbx, rdi, rsi, r8,  r9,  r10, 16+1);
       
   882   sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r10, r11, rax, rbx, rdi, rsi, r8,  r9,  16+2);
       
   883   sha256_AVX2_one_round_and_sched(xmm6, xmm7, xmm4, xmm5, r9,  r10, r11, rax, rbx, rdi, rsi, r8,  16+3);
       
   884 
       
   885   vpaddd(xmm9, xmm7, Address(TBL, SRND, Address::times_1, 3*32), AVX_256bit);
       
   886   vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 3*32), xmm9);
       
   887 
       
   888   sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, r8,  r9,  r10, r11, rax, rbx, rdi, rsi,  24+0);
       
   889   sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rsi, r8,  r9,  r10, r11, rax, rbx, rdi,  24+1);
       
   890   sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rdi, rsi, r8,  r9,  r10, r11, rax, rbx,  24+2);
       
   891   sha256_AVX2_one_round_and_sched(xmm7, xmm4, xmm5, xmm6, rbx, rdi, rsi, r8,  r9,  r10, r11, rax,  24+3);
       
   892 
       
   893   addq(SRND, 4*32);
       
   894   cmpq(SRND, 3 * 4*32);
       
   895   jcc(Assembler::below, loop1);
       
   896 
       
   897 bind(loop2);
       
   898   // Do last 16 rounds with no scheduling
       
   899   vpaddd(xmm9, xmm4, Address(TBL, SRND, Address::times_1, 0*32), AVX_256bit);
       
   900   vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 0*32), xmm9);
       
   901   sha256_AVX2_four_rounds_compute_first(0);
       
   902 
       
   903   vpaddd(xmm9, xmm5, Address(TBL, SRND, Address::times_1, 1*32), AVX_256bit);
       
   904   vmovdqu(Address(rsp, SRND, Address::times_1, _XFER + 1*32), xmm9);
       
   905   sha256_AVX2_four_rounds_compute_last(0 + 8);
       
   906 
       
   907   addq(SRND, 2*32);
       
   908 
       
   909   vmovdqu(xmm4, xmm6);
       
   910   vmovdqu(xmm5, xmm7);
       
   911 
       
   912   cmpq(SRND, 4 * 4*32);
       
   913   jcc(Assembler::below, loop2);
       
   914 
       
   915   movq(CTX, Address(rsp, _CTX));
       
   916   movq(INP, Address(rsp, _INP));
       
   917 
       
   918   addm(4*0, CTX, a);
       
   919   addm(4*1, CTX, b);
       
   920   addm(4*2, CTX, c);
       
   921   addm(4*3, CTX, d);
       
   922   addm(4*4, CTX, e);
       
   923   addm(4*5, CTX, f);
       
   924   addm(4*6, CTX, g);
       
   925   addm(4*7, CTX, h);
       
   926 
       
   927   cmpq(INP, Address(rsp, _INP_END));
       
   928   jcc(Assembler::above, done_hash);
       
   929 
       
   930   //Do second block using previously scheduled results
       
   931   xorq(SRND, SRND);
       
   932 align(16);
       
   933 bind(loop3);
       
   934   sha256_AVX2_four_rounds_compute_first(4);
       
   935   sha256_AVX2_four_rounds_compute_last(4+8);
       
   936 
       
   937   addq(SRND, 2*32);
       
   938   cmpq(SRND, 4 * 4*32);
       
   939   jcc(Assembler::below, loop3);
       
   940 
       
   941   movq(CTX, Address(rsp, _CTX));
       
   942   movq(INP, Address(rsp, _INP));
       
   943   addq(INP, 64);
       
   944 
       
   945   addm(4*0, CTX, a);
       
   946   addm(4*1, CTX, b);
       
   947   addm(4*2, CTX, c);
       
   948   addm(4*3, CTX, d);
       
   949   addm(4*4, CTX, e);
       
   950   addm(4*5, CTX, f);
       
   951   addm(4*6, CTX, g);
       
   952   addm(4*7, CTX, h);
       
   953 
       
   954   cmpq(INP, Address(rsp, _INP_END));
       
   955   jcc(Assembler::below, loop0);
       
   956   jccb(Assembler::above, done_hash);
       
   957 
       
   958 bind(do_last_block);
       
   959   lea(TBL, ExternalAddress(K256_W));
       
   960 
       
   961   movdqu(xmm4, Address(INP, 0*16));
       
   962   movdqu(xmm5, Address(INP, 1*16));
       
   963   movdqu(xmm6, Address(INP, 2*16));
       
   964   movdqu(xmm7, Address(INP, 3*16));
       
   965 
       
   966   vpshufb(xmm4, xmm4, xmm13, AVX_128bit);
       
   967   vpshufb(xmm5, xmm5, xmm13, AVX_128bit);
       
   968   vpshufb(xmm6, xmm6, xmm13, AVX_128bit);
       
   969   vpshufb(xmm7, xmm7, xmm13, AVX_128bit);
       
   970 
       
   971   jmp(last_block_enter);
       
   972 
       
   973 bind(only_one_block);
       
   974 
       
    975   // load initial digest ;; the state should be preloaded with the following values
       
   976   movl(a, Address(CTX, 4*0));   // 0x6a09e667
       
   977   movl(b, Address(CTX, 4*1));   // 0xbb67ae85
       
   978   movl(c, Address(CTX, 4*2));   // 0x3c6ef372
       
   979   movl(d, Address(CTX, 4*3));   // 0xa54ff53a
       
   980   movl(e, Address(CTX, 4*4));   // 0x510e527f
       
   981   movl(f, Address(CTX, 4*5));   // 0x9b05688c
       
   982   // load g - r10 after use as scratch
       
   983   movl(h, Address(CTX, 4*7));   // 0x5be0cd19
       
   984 
       
   985 
       
   986   pshuffle_byte_flip_mask_addr = pshuffle_byte_flip_mask;
       
   987   vmovdqu(BYTE_FLIP_MASK, ExternalAddress(pshuffle_byte_flip_mask_addr + 0)); //[PSHUFFLE_BYTE_FLIP_MASK wrt rip]
       
   988   vmovdqu(SHUF_00BA, ExternalAddress(pshuffle_byte_flip_mask_addr + 32));     //[_SHUF_00BA wrt rip]
       
   989   vmovdqu(SHUF_DC00, ExternalAddress(pshuffle_byte_flip_mask_addr + 64));     //[_SHUF_DC00 wrt rip]
       
   990 
       
   991   movl(g, Address(CTX, 4*6));   // 0x1f83d9ab
       
   992 
       
   993   movq(Address(rsp, _CTX), CTX);
       
   994   jmpb(do_last_block);
       
   995 
       
   996 bind(done_hash);
       
   997 
       
   998   movq(rsp, Address(rsp, _RSP));
       
   999 
       
  1000   pop(r15);
       
  1001   pop(r14);
       
  1002   pop(r13);
       
  1003   pop(r12);
       
  1004   pop(rbp);
       
  1005 #ifdef _WIN64
       
  1006   pop(rdi);
       
  1007   pop(rsi);
       
  1008 #endif
       
  1009   pop(rbx);
       
  1010 
       
  1011 #ifdef _WIN64
       
  1012   pop(r9);
       
  1013   pop(r8);
       
  1014 #else
       
  1015   pop(rdx);
       
  1016   pop(rcx);
       
  1017 #endif
       
  1018 
       
  1019   if (multi_block) {
       
  1020 #ifdef _WIN64
       
  1021 const Register& limit_end = r9;
       
  1022 const Register& ofs_end   = r8;
       
  1023 #else
       
  1024 const Register& limit_end = rcx;
       
  1025 const Register& ofs_end   = rdx;
       
  1026 #endif
       
  1027     movq(rax, ofs_end);
       
  1028 
       
  1029 bind(compute_size1);
       
  1030     cmpptr(rax, limit_end); // assume the original ofs <= limit
       
  1031     jccb(Assembler::aboveEqual, compute_size_end1);
       
  1032     addq(rax, 64);
       
  1033     jmpb(compute_size1);
       
  1034 
       
  1035 bind(compute_size_end1);
       
  1036   }
       
  1037 }
       
  1038 
       
  1039 void MacroAssembler::sha512_AVX2_one_round_compute(Register  old_h, Register a, Register b, Register c,
       
  1040                                                    Register d, Register e, Register f, Register g, Register h,
       
  1041                                                    int iteration)
       
  1042 {
       
  1043 
       
  1044     const Register& y0 = r13;
       
  1045     const Register& y1 = r14;
       
  1046     const Register& y2 = r15;
       
  1047 #ifdef _WIN64
       
  1048     const Register& y3 = rcx;
       
  1049 #else
       
  1050     const Register& y3 = rdi;
       
  1051 #endif
       
  1052     const Register& T1 = r12;
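            // One SHA-512 round, same structure as the SHA-256 path but with 64-bit
            // rotation amounts:
            //   Sigma1(e) = ror(e,14) ^ ror(e,18) ^ ror(e,41)
            //   Sigma0(a) = ror(a,28) ^ ror(a,34) ^ ror(a,39)
            // The final additions into h are again deferred to the next call via old_h.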
       
  1053 
       
  1054     if (iteration % 4 > 0) {
       
  1055       addq(old_h, y2); //h = k + w + h + S0 + S1 + CH = t1 + S0;
       
  1056     }
       
  1057     movq(y2, f); //y2 = f; CH
       
  1058     rorxq(y0, e, 41); //y0 = e >> 41; S1A
       
  1059     rorxq(y1, e, 18); //y1 = e >> 18; S1B
       
  1060     xorq(y2, g); //y2 = f^g; CH
       
  1061 
       
  1062     xorq(y0, y1); //y0 = (e >> 41) ^ (e >> 18); S1
       
  1063     rorxq(y1, e, 14); //y1 = (e >> 14); S1
       
  1064     andq(y2, e); //y2 = (f^g)&e; CH
       
  1065 
       
  1066     if (iteration % 4 > 0 ) {
       
  1067       addq(old_h, y3); //h = t1 + S0 + MAJ
       
  1068     }
       
  1069     xorq(y0, y1); //y0 = (e >> 41) ^ (e >> 18) ^ (e >> 14); S1
       
  1070     rorxq(T1, a, 34); //T1 = a >> 34; S0B
       
  1071     xorq(y2, g); //y2 = CH = ((f^g)&e) ^g; CH
       
  1072     rorxq(y1, a, 39); //y1 = a >> 39; S0A
       
  1073     movq(y3, a); //y3 = a; MAJA
       
  1074 
       
  1075     xorq(y1, T1); //y1 = (a >> 39) ^ (a >> 34); S0
       
  1076     rorxq(T1, a, 28); //T1 = (a >> 28); S0
       
  1077     addq(h, Address(rsp, (8 * iteration))); //h = k + w + h; --
       
  1078     orq(y3, c); //y3 = a | c; MAJA
       
  1079 
       
  1080     xorq(y1, T1); //y1 = (a >> 39) ^ (a >> 34) ^ (a >> 28); S0
       
  1081     movq(T1, a); //T1 = a; MAJB
       
  1082     andq(y3, b); //y3 = (a | c)&b; MAJA
       
  1083     andq(T1, c); //T1 = a&c; MAJB
       
  1084     addq(y2, y0); //y2 = S1 + CH; --
       
  1085 
       
  1086     addq(d, h); //d = k + w + h + d; --
       
   1087     orq(y3, T1); //y3 = MAJ = ((a | c)&b) | (a&c); MAJ
       
  1088     addq(h, y1); //h = k + w + h + S0; --
       
  1089 
       
  1090     addq(d, y2); //d = k + w + h + d + S1 + CH = d + t1; --
       
  1091 
       
  1092     if (iteration % 4 == 3) {
       
  1093       addq(h, y2); //h = k + w + h + S0 + S1 + CH = t1 + S0; --
       
  1094       addq(h, y3); //h = t1 + S0 + MAJ; --
       
  1095     }
       
  1096 }
       
  1097 
       
  1098 void MacroAssembler::sha512_AVX2_one_round_and_schedule(
       
  1099     XMMRegister xmm4, // ymm4
       
  1100     XMMRegister xmm5, // ymm5
       
  1101     XMMRegister xmm6, // ymm6
       
  1102     XMMRegister xmm7, // ymm7
       
  1103     Register a, //rax
       
  1104     Register b, //rbx
       
  1105     Register c, //rdi
       
  1106     Register d, //rsi
       
  1107     Register e, //r8
       
  1108     Register f, //r9
       
  1109     Register g, //r10
       
  1110     Register h, //r11
       
  1111     int iteration)
       
  1112 {
       
  1113 
       
  1114     const Register& y0 = r13;
       
  1115     const Register& y1 = r14;
       
  1116     const Register& y2 = r15;
       
  1117 #ifdef _WIN64
       
  1118     const Register& y3 = rcx;
       
  1119 #else
       
  1120     const Register& y3 = rdi;
       
  1121 #endif
       
  1122     const Register& T1 = r12;
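            // The scheduling half of each call advances
            //   W[t] = W[t-16] + s0(W[t-15]) + W[t-7] + s1(W[t-2])
            // with s0(x) = ror(x,1) ^ ror(x,8) ^ (x >> 7)
            //      s1(x) = ror(x,19) ^ ror(x,61) ^ (x >> 6),
            // producing w[16..19] two words at a time over the four iterations of a group.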
       
  1123 
       
  1124     if (iteration % 4 == 0) {
       
  1125       // Extract w[t - 7]
       
  1126       // xmm0 = W[-7]
       
  1127       vperm2f128(xmm0, xmm7, xmm6, 3);
       
  1128       vpalignr(xmm0, xmm0, xmm6, 8, AVX_256bit);
       
  1129 
       
  1130       // Calculate w[t - 16] + w[t - 7]
       
  1131       vpaddq(xmm0, xmm0, xmm4, AVX_256bit); //xmm0 = W[-7] + W[-16]
       
  1132       // Extract w[t - 15]
       
  1133       //xmm1 = W[-15]
       
  1134       vperm2f128(xmm1, xmm5, xmm4, 3);
       
  1135       vpalignr(xmm1, xmm1, xmm4, 8, AVX_256bit);
       
  1136 
       
  1137       // Calculate sigma0
       
  1138       // Calculate w[t - 15] ror 1
       
  1139       vpsrlq(xmm2, xmm1, 1, AVX_256bit);
       
  1140       vpsllq(xmm3, xmm1, (64 - 1), AVX_256bit);
       
  1141       vpor(xmm3, xmm3, xmm2, AVX_256bit); //xmm3 = W[-15] ror 1
       
  1142       // Calculate w[t - 15] shr 7
       
  1143       vpsrlq(xmm8, xmm1, 7, AVX_256bit); //xmm8 = W[-15] >> 7
       
  1144 
       
  1145     } else if (iteration % 4 == 1) {
       
  1146       //Calculate w[t - 15] ror 8
       
  1147       vpsrlq(xmm2, xmm1, 8, AVX_256bit);
       
  1148       vpsllq(xmm1, xmm1, (64 - 8), AVX_256bit);
       
  1149       vpor(xmm1, xmm1, xmm2, AVX_256bit); //xmm1 = W[-15] ror 8
       
  1150 
       
  1151       //XOR the three components
       
  1152       vpxor(xmm3, xmm3, xmm8, AVX_256bit); //xmm3 = W[-15] ror 1 ^ W[-15] >> 7
       
  1153       vpxor(xmm1, xmm3, xmm1, AVX_256bit); //xmm1 = s0
       
  1154 
       
  1155       //Add three components, w[t - 16], w[t - 7] and sigma0
       
  1156       vpaddq(xmm0, xmm0, xmm1, AVX_256bit); //xmm0 = W[-16] + W[-7] + s0
       
  1157 
       
  1158       // Move to appropriate lanes for calculating w[16] and w[17]
       
  1159       vperm2f128(xmm4, xmm0, xmm0, 0); //xmm4 = W[-16] + W[-7] + s0{ BABA }
       
  1160 
       
  1161       //Move to appropriate lanes for calculating w[18] and w[19]
       
  1162       vpand(xmm0, xmm0, xmm10, AVX_256bit); //xmm0 = W[-16] + W[-7] + s0{ DC00 }
       
  1163       //Calculate w[16] and w[17] in both 128 bit lanes
       
  1164       //Calculate sigma1 for w[16] and w[17] on both 128 bit lanes
       
  1165       vperm2f128(xmm2, xmm7, xmm7, 17); //xmm2 = W[-2] {BABA}
       
  1166       vpsrlq(xmm8, xmm2, 6, AVX_256bit); //xmm8 = W[-2] >> 6 {BABA}
       
  1167 
       
  1168     } else if (iteration % 4 == 2) {
       
  1169       vpsrlq(xmm3, xmm2, 19, AVX_256bit); //xmm3 = W[-2] >> 19 {BABA}
       
  1170       vpsllq(xmm1, xmm2, (64 - 19), AVX_256bit); //xmm1 = W[-2] << 19 {BABA}
       
  1171       vpor(xmm3, xmm3, xmm1, AVX_256bit); //xmm3 = W[-2] ror 19 {BABA}
       
  1172       vpxor(xmm8, xmm8, xmm3, AVX_256bit);// xmm8 = W[-2] ror 19 ^ W[-2] >> 6 {BABA}
       
  1173       vpsrlq(xmm3, xmm2, 61, AVX_256bit); //xmm3 = W[-2] >> 61 {BABA}
       
  1174       vpsllq(xmm1, xmm2, (64 - 61), AVX_256bit); //xmm1 = W[-2] << 61 {BABA}
       
  1175       vpor(xmm3, xmm3, xmm1, AVX_256bit); //xmm3 = W[-2] ror 61 {BABA}
       
  1176       vpxor(xmm8, xmm8, xmm3, AVX_256bit); //xmm8 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) { BABA }
       
  1177 
       
  1178       //Add sigma1 to the other components to get w[16] and w[17]
       
  1179       vpaddq(xmm4, xmm4, xmm8, AVX_256bit); //xmm4 = { W[1], W[0], W[1], W[0] }
       
  1180 
       
  1181       //Calculate sigma1 for w[18] and w[19] for upper 128 bit lane
       
  1182       vpsrlq(xmm8, xmm4, 6, AVX_256bit); //xmm8 = W[-2] >> 6 {DC--}
       
  1183 
       
  1184     } else if (iteration % 4 == 3){
       
  1185       vpsrlq(xmm3, xmm4, 19, AVX_256bit); //xmm3 = W[-2] >> 19 {DC--}
       
  1186       vpsllq(xmm1, xmm4, (64 - 19), AVX_256bit); //xmm1 = W[-2] << 19 {DC--}
       
  1187       vpor(xmm3, xmm3, xmm1, AVX_256bit); //xmm3 = W[-2] ror 19 {DC--}
       
  1188       vpxor(xmm8, xmm8, xmm3, AVX_256bit); //xmm8 = W[-2] ror 19 ^ W[-2] >> 6 {DC--}
       
  1189       vpsrlq(xmm3, xmm4, 61, AVX_256bit); //xmm3 = W[-2] >> 61 {DC--}
       
  1190       vpsllq(xmm1, xmm4, (64 - 61), AVX_256bit); //xmm1 = W[-2] << 61 {DC--}
       
  1191       vpor(xmm3, xmm3, xmm1, AVX_256bit); //xmm3 = W[-2] ror 61 {DC--}
       
  1192       vpxor(xmm8, xmm8, xmm3, AVX_256bit); //xmm8 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) { DC-- }
       
  1193 
       
  1194       //Add sigma0 + w[t - 7] + w[t - 16] (already in xmm0, in the w[18]/w[19] lanes) to the newly calculated sigma1 to get w[18] and w[19]
       
  1195       vpaddq(xmm2, xmm0, xmm8, AVX_256bit); //xmm2 = { W[3], W[2], --, -- }
       
  1196 
       
  1197       //Form w[19], w[18], w[17], w[16]
       
  1198       vpblendd(xmm4, xmm4, xmm2, 0xF0, AVX_256bit); //xmm4 = { W[3], W[2], W[1], W[0] }
       
  1199     }
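    // A minimal scalar sketch (reference only, not emitted code) of the SHA-512
    // message-schedule recurrence that the four iteration cases above vectorize;
    // each group of four calls produces the next four words W[16..19] of the
    // current window. rotr64 is a hypothetical helper, not part of this file.
    //
    //   uint64_t rotr64(uint64_t x, int n) { return (x >> n) | (x << (64 - n)); }
    //   for (int t = 16; t < 80; t++) {
    //     uint64_t s0 = rotr64(w[t-15], 1) ^ rotr64(w[t-15], 8) ^ (w[t-15] >> 7);
    //     uint64_t s1 = rotr64(w[t-2], 19) ^ rotr64(w[t-2], 61) ^ (w[t-2] >> 6);
    //     w[t] = w[t-16] + s0 + w[t-7] + s1;
    //   }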
       
  1200 
       
  1201     movq(y3, a); //y3 = a; MAJA
       
  1202     rorxq(y0, e, 41); //y0 = e ror 41; S1A

  1203     rorxq(y1, e, 18); //y1 = e ror 18; S1B
       
  1204     addq(h, Address(rsp, (iteration * 8))); //h = k + w + h; --
       
  1205     orq(y3, c); //y3 = a | c; MAJA
       
  1206     movq(y2, f); //y2 = f; CH
       
  1207 
       
  1208     xorq(y2, g); //y2 = f^g; CH
       
  1209 
       
  1210     rorxq(T1, a, 34); //T1 = a ror 34; S0B

  1211     xorq(y0, y1); //y0 = (e ror 41) ^ (e ror 18); S1

  1212 

  1213     rorxq(y1, e, 14); //y1 = (e ror 14); S1
       
  1214 
       
  1215     andq(y2, e); //y2 = (f^g) & e; CH
       
  1216     addq(d, h); //d = k + w + h + d; --
       
  1217 
       
  1218     andq(y3, b); //y3 = (a | c)&b; MAJA
       
  1219     xorq(y0, y1); //y0 = (e ror 41) ^ (e ror 18) ^ (e ror 14); S1

  1220     rorxq(y1, a, 39); //y1 = a ror 39; S0A
       
  1221 
       
  1222     xorq(y1, T1); //y1 = (a ror 39) ^ (a ror 34); S0

  1223     rorxq(T1, a, 28); //T1 = (a ror 28); S0

  1224     xorq(y2, g); //y2 = CH = ((f^g)&e) ^ g; CH

  1225 

  1226     xorq(y1, T1); //y1 = (a ror 39) ^ (a ror 34) ^ (a ror 28); S0
       
  1227     movq(T1, a); //T1 = a; MAJB
       
  1228 
       
  1229     andq(T1, c); //T1 = a&c; MAJB
       
  1230     addq(y2, y0); //y2 = S1 + CH; --
       
  1231 
       
  1232     orq(y3, T1); //y3 = MAJ = ((a | c)&b) | (a&c); MAJ
       
  1233     addq(h, y1); //h = k + w + h + S0; --
       
  1234 
       
  1235     addq(d, y2); //d = k + w + h + d + S1 + CH = d + t1; --
       
  1236     addq(h, y2); //h = k + w + h + S0 + S1 + CH = t1 + S0; --
       
  1237     addq(h, y3); //h = t1 + S0 + MAJ; --
       
  1238 }
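// A hedged scalar sketch of the single SHA-512 round computed by the GPR code
// above; the caller rotates the working variables by passing them in shifted
// order rather than moving registers. rotr64 is a hypothetical helper and
// k_plus_w stands for the precomputed K[t] + W[t] value read from the _XFER
// slot on the stack.
//
//   uint64_t S1  = rotr64(e, 14) ^ rotr64(e, 18) ^ rotr64(e, 41);
//   uint64_t ch  = ((f ^ g) & e) ^ g;             // == (e & f) ^ (~e & g)
//   uint64_t S0  = rotr64(a, 28) ^ rotr64(a, 34) ^ rotr64(a, 39);
//   uint64_t maj = ((a | c) & b) | (a & c);       // == (a & b) ^ (a & c) ^ (b & c)
//   uint64_t t1  = h + S1 + ch + k_plus_w;
//   d += t1;                                      // becomes the new e
//   h  = t1 + S0 + maj;                           // becomes the new a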
       
  1239 
       
  1240 void MacroAssembler::sha512_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
       
  1241                                  XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
       
  1242                                  Register buf, Register state, Register ofs, Register limit, Register rsp,
       
  1243                                  bool multi_block, XMMRegister shuf_mask)
       
  1244 {
       
  1245 
       
  1246     Label loop0, loop1, loop2, done_hash,

  1247           compute_block_size, compute_size,

  1248           compute_block_size_end, compute_size_end;
       
  1249 
       
  1250     address K512_W = StubRoutines::x86::k512_W_addr();
       
  1251     address pshuffle_byte_flip_mask_sha512 = StubRoutines::x86::pshuffle_byte_flip_mask_addr_sha512();
       
  1252     address pshuffle_byte_flip_mask_addr = 0;
       
  1253 
       
  1254     const XMMRegister& XFER = xmm0; // YTMP0
       
  1255     const XMMRegister& BYTE_FLIP_MASK = xmm9; // ymm9
       
  1256     const XMMRegister& YMM_MASK_LO = xmm10; // ymm10
       
  1257 #ifdef _WIN64
       
  1258     const Register& INP = rcx; //1st arg
       
  1259     const Register& CTX = rdx; //2nd arg
       
  1260     const Register& NUM_BLKS = r8; //3rd arg
       
  1261     const Register& c = rdi;
       
  1262     const Register& d = rsi;
       
  1263     const Register& e = r8;
       
  1264     const Register& y3 = rcx;
       
  1265     const Register& offset = r8;
       
  1266     const Register& input_limit = r9;
       
  1267 #else
       
  1268     const Register& INP = rdi; //1st arg
       
  1269     const Register& CTX = rsi; //2nd arg
       
  1270     const Register& NUM_BLKS = rdx; //3rd arg
       
  1271     const Register& c  = rcx;
       
  1272     const Register& d  = r8;
       
  1273     const Register& e  = rdx;
       
  1274     const Register& y3 = rdi;
       
  1275     const Register& offset = rdx;
       
  1276     const Register& input_limit = rcx;
       
  1277 #endif
       
  1278 
       
  1279     const Register& TBL = rbp;
       
  1280 
       
  1281     const Register& a = rax;
       
  1282     const Register& b = rbx;
       
  1283 
       
  1284     const Register& f = r9;
       
  1285     const Register& g = r10;
       
  1286     const Register& h = r11;
       
  1287 
       
  1288     //Local variables as defined in assembly file.
       
  1289     enum
       
  1290     {
       
  1291       _XFER_SIZE = 4 * 8, // resq 4 => reserve 4 quadwords. Hence 4 * 8
       
  1292       _SRND_SIZE = 8, // resq 1
       
  1293       _INP_SIZE = 8,
       
  1294       _INP_END_SIZE = 8,
       
  1295       _RSP_SAVE_SIZE = 8,  // defined as resq 1
       
  1296 
       
  1297 #ifdef _WIN64
       
  1298       _GPR_SAVE_SIZE = 8 * 8, // defined as resq 8
       
  1299 #else
       
  1300       _GPR_SAVE_SIZE = 6 * 8 // resq 6
       
  1301 #endif
       
  1302     };
       
  1303 
       
  1304     enum
       
  1305     {
       
  1306       _XFER = 0,
       
  1307       _SRND = _XFER + _XFER_SIZE, // 32
       
  1308       _INP = _SRND + _SRND_SIZE, // 40
       
  1309       _INP_END = _INP + _INP_SIZE, // 48
       
  1310       _RSP = _INP_END + _INP_END_SIZE, // 56
       
  1311       _GPR = _RSP + _RSP_SAVE_SIZE, // 64
       
  1312       _STACK_SIZE = _GPR + _GPR_SAVE_SIZE // 128 for windows and 112 for linux.
       
  1313     };
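    // Resulting frame layout relative to the 32-byte-aligned rsp (offsets in bytes):
    //   [ 0..31]  _XFER     four K[t] + W[t] quadwords for the current round group
    //   [32..39]  _SRND     round-group counter
    //   [40..47]  _INP      saved input pointer
    //   [48..55]  _INP_END  pointer to the end of the input data
    //   [56..63]  _RSP      saved pre-alignment stack pointer
    //   [64.. ]   _GPR      callee-saved GPRs (8 on Windows, 6 elsewhere)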
       
  1314 
       
  1315 //Save offset and limit; they are needed for the block-size calculation in the multi-block SHA512 case.
       
  1316 #ifdef _WIN64
       
  1317     push(r8);    // win64: this is ofs
       
  1318     push(r9);    // win64: this is limit, we need them again at the very end.
       
  1319 #else
       
  1320     push(rdx);   // linux : this is ofs, need at the end for multiblock calculation
       
  1321     push(rcx);   // linux: This is the limit.
       
  1322 #endif
       
  1323 
       
  1324     //Allocate Stack Space
       
  1325     movq(rax, rsp);
       
  1326     subq(rsp, _STACK_SIZE);
       
  1327     andq(rsp, -32);
       
  1328     movq(Address(rsp, _RSP), rax);
       
  1329 
       
  1330     //Save GPRs
       
  1331     movq(Address(rsp, _GPR), rbp);
       
  1332     movq(Address(rsp, (_GPR + 8)), rbx);
       
  1333     movq(Address(rsp, (_GPR + 16)), r12);
       
  1334     movq(Address(rsp, (_GPR + 24)), r13);
       
  1335     movq(Address(rsp, (_GPR + 32)), r14);
       
  1336     movq(Address(rsp, (_GPR + 40)), r15);
       
  1337 
       
  1338 #ifdef _WIN64
       
  1339     movq(Address(rsp, (_GPR + 48)), rsi);
       
  1340     movq(Address(rsp, (_GPR + 56)), rdi);
       
  1341 #endif
       
  1342 
       
  1343     vpblendd(xmm0, xmm0, xmm1, 0xF0, AVX_128bit);
       
  1344     vpblendd(xmm0, xmm0, xmm1, 0xF0, AVX_256bit);
       
  1345 
       
  1346     if (multi_block) {
       
  1347       xorq(rax, rax);
       
  1348       bind(compute_block_size);
       
  1349       cmpptr(offset, input_limit); // Assuming that offset is less than limit.
       
  1350       jccb(Assembler::aboveEqual, compute_block_size_end);
       
  1351       addq(offset, 128);
       
  1352       addq(rax, 128);
       
  1353       jmpb(compute_block_size);
       
  1354 
       
  1355       bind(compute_block_size_end);
       
  1356       movq(NUM_BLKS, rax);
       
  1357 
       
  1358       cmpq(NUM_BLKS, 0);
       
  1359       jcc(Assembler::equal, done_hash);
       
  1360     } else {
       
  1361       xorq(NUM_BLKS, NUM_BLKS); //If single block.
       
  1362       addq(NUM_BLKS, 128);
       
  1363     }
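    // A minimal sketch of what the block-size computation above produces,
    // assuming ofs starts below limit and both advance in whole blocks;
    // note that, despite its name, NUM_BLKS ends up holding a byte count.
    //
    //   int64_t num_bytes = 0;
    //   if (multi_block) {
    //     for (int64_t o = ofs; o < limit; o += 128) {
    //       num_bytes += 128;              // one 128-byte SHA-512 block
    //     }
    //   } else {
    //     num_bytes = 128;                 // single block
    //   }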
       
  1364 
       
  1365     addq(NUM_BLKS, INP); //pointer to end of data
       
  1366     movq(Address(rsp, _INP_END), NUM_BLKS);
       
  1367 
       
  1368     //load initial digest
       
  1369     movq(a, Address(CTX, 8 * 0));
       
  1370     movq(b, Address(CTX, 8 * 1));
       
  1371     movq(c, Address(CTX, 8 * 2));
       
  1372     movq(d, Address(CTX, 8 * 3));
       
  1373     movq(e, Address(CTX, 8 * 4));
       
  1374     movq(f, Address(CTX, 8 * 5));
       
  1375     // g (r10) is loaded later, after r10 has been used as a scratch register
       
  1376     movq(h, Address(CTX, 8 * 7));
       
  1377 
       
  1378     pshuffle_byte_flip_mask_addr = pshuffle_byte_flip_mask_sha512;
       
  1379     vmovdqu(BYTE_FLIP_MASK, ExternalAddress(pshuffle_byte_flip_mask_addr + 0)); //PSHUFFLE_BYTE_FLIP_MASK wrt rip
       
  1380     vmovdqu(YMM_MASK_LO, ExternalAddress(pshuffle_byte_flip_mask_addr + 32));
       
  1381 
       
  1382     movq(g, Address(CTX, 8 * 6));
       
  1383 
       
  1384     bind(loop0);
       
  1385     lea(TBL, ExternalAddress(K512_W));
       
  1386 
       
  1387     //byte swap the 16 input qwords (one 128-byte block)
       
  1388     vmovdqu(xmm4, Address(INP, 32 * 0));
       
  1389     vpshufb(xmm4, xmm4, BYTE_FLIP_MASK, AVX_256bit);
       
  1390     vmovdqu(xmm5, Address(INP, 32 * 1));
       
  1391     vpshufb(xmm5, xmm5, BYTE_FLIP_MASK, AVX_256bit);
       
  1392     vmovdqu(xmm6, Address(INP, 32 * 2));
       
  1393     vpshufb(xmm6, xmm6, BYTE_FLIP_MASK, AVX_256bit);
       
  1394     vmovdqu(xmm7, Address(INP, 32 * 3));
       
  1395     vpshufb(xmm7, xmm7, BYTE_FLIP_MASK, AVX_256bit);
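    // Each vpshufb above uses BYTE_FLIP_MASK to reverse the byte order within
    // every 64-bit lane, converting the big-endian message words of the
    // 128-byte block (loaded as four 32-byte ymm registers) to little-endian.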
       
  1396 
       
  1397     movq(Address(rsp, _INP), INP);
       
  1398 
       
  1399     movslq(Address(rsp, _SRND), 4);
       
  1400     align(16);
       
  1401 
       
  1402     //Rounds 0-63: schedule the remaining message qwords W[16..79] while hashing, by calling sha512_AVX2_one_round_and_schedule
       
  1403     bind(loop1);
       
  1404     vpaddq(xmm0, xmm4, Address(TBL, 0 * 32), AVX_256bit);
       
  1405     vmovdqu(Address(rsp, _XFER), xmm0);
       
  1406     //four rounds and schedule
       
  1407     sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, a, b, c, d, e, f, g, h, 0);
       
  1408     sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, h, a, b, c, d, e, f, g, 1);
       
  1409     sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, g, h, a, b, c, d, e, f, 2);
       
  1410     sha512_AVX2_one_round_and_schedule(xmm4, xmm5, xmm6, xmm7, f, g, h, a, b, c, d, e, 3);
       
  1411 
       
  1412     vpaddq(xmm0, xmm5, Address(TBL, 1 * 32), AVX_256bit);
       
  1413     vmovdqu(Address(rsp, _XFER), xmm0);
       
  1414     //four rounds and schedule
       
  1415     sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, e, f, g, h, a, b, c, d, 0);
       
  1416     sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, d, e, f, g, h, a, b, c, 1);
       
  1417     sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, c, d, e, f, g, h, a, b, 2);
       
  1418     sha512_AVX2_one_round_and_schedule(xmm5, xmm6, xmm7, xmm4, b, c, d, e, f, g, h, a, 3);
       
  1419 
       
  1420     vpaddq(xmm0, xmm6, Address(TBL, 2 * 32), AVX_256bit);
       
  1421     vmovdqu(Address(rsp, _XFER), xmm0);
       
  1422     //four rounds and schedule
       
  1423     sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, a, b, c, d, e, f, g, h, 0);
       
  1424     sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, h, a, b, c, d, e, f, g, 1);
       
  1425     sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, g, h, a, b, c, d, e, f, 2);
       
  1426     sha512_AVX2_one_round_and_schedule(xmm6, xmm7, xmm4, xmm5, f, g, h, a, b, c, d, e, 3);
       
  1427 
       
  1428     vpaddq(xmm0, xmm7, Address(TBL, 3 * 32), AVX_256bit);
       
  1429     vmovdqu(Address(rsp, _XFER), xmm0);
       
  1430     addq(TBL, 4 * 32);
       
  1431     //four rounds and schedule
       
  1432     sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, e, f, g, h, a, b, c, d, 0);
       
  1433     sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, d, e, f, g, h, a, b, c, 1);
       
  1434     sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, c, d, e, f, g, h, a, b, 2);
       
  1435     sha512_AVX2_one_round_and_schedule(xmm7, xmm4, xmm5, xmm6, b, c, d, e, f, g, h, a, 3);
       
  1436 
       
  1437     subq(Address(rsp, _SRND), 1);
       
  1438     jcc(Assembler::notEqual, loop1);
       
  1439 
       
  1440     movslq(Address(rsp, _SRND), 2);
       
  1441 
       
  1442     bind(loop2);
       
  1443     vpaddq(xmm0, xmm4, Address(TBL, 0 * 32), AVX_256bit);
       
  1444     vmovdqu(Address(rsp, _XFER), xmm0);
       
  1445     //four rounds and compute.
       
  1446     sha512_AVX2_one_round_compute(a, a, b, c, d, e, f, g, h, 0);
       
  1447     sha512_AVX2_one_round_compute(h, h, a, b, c, d, e, f, g, 1);
       
  1448     sha512_AVX2_one_round_compute(g, g, h, a, b, c, d, e, f, 2);
       
  1449     sha512_AVX2_one_round_compute(f, f, g, h, a, b, c, d, e, 3);
       
  1450 
       
  1451     vpaddq(xmm0, xmm5, Address(TBL, 1 * 32), AVX_256bit);
       
  1452     vmovdqu(Address(rsp, _XFER), xmm0);
       
  1453     addq(TBL, 2 * 32);
       
  1454     // four rounds and compute.
       
  1455     sha512_AVX2_one_round_compute(e, e, f, g, h, a, b, c, d, 0);
       
  1456     sha512_AVX2_one_round_compute(d, d, e, f, g, h, a, b, c, 1);
       
  1457     sha512_AVX2_one_round_compute(c, c, d, e, f, g, h, a, b, 2);
       
  1458     sha512_AVX2_one_round_compute(b, b, c, d, e, f, g, h, a, 3);
       
  1459 
       
  1460     vmovdqu(xmm4, xmm6);
       
  1461     vmovdqu(xmm5, xmm7);
       
  1462 
       
  1463     subq(Address(rsp, _SRND), 1);
       
  1464     jcc(Assembler::notEqual, loop2);
       
  1465 
       
  1466     addmq(8 * 0, CTX, a);
       
  1467     addmq(8 * 1, CTX, b);
       
  1468     addmq(8 * 2, CTX, c);
       
  1469     addmq(8 * 3, CTX, d);
       
  1470     addmq(8 * 4, CTX, e);
       
  1471     addmq(8 * 5, CTX, f);
       
  1472     addmq(8 * 6, CTX, g);
       
  1473     addmq(8 * 7, CTX, h);
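    // addmq(disp, CTX, r) folds a working variable back into the saved state,
    // effectively CTX[disp] += r; the eight calls above therefore perform
    // state[i] += {a..h}, as required at the end of each 128-byte block.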
       
  1474 
       
  1475     movq(INP, Address(rsp, _INP));
       
  1476     addq(INP, 128);
       
  1477     cmpq(INP, Address(rsp, _INP_END));
       
  1478     jcc(Assembler::notEqual, loop0);
       
  1479 
       
  1480     bind(done_hash);
       
  1481 
       
  1482     //Restore GPRs
       
  1483     movq(rbp, Address(rsp, (_GPR + 0)));
       
  1484     movq(rbx, Address(rsp, (_GPR + 8)));
       
  1485     movq(r12, Address(rsp, (_GPR + 16)));
       
  1486     movq(r13, Address(rsp, (_GPR + 24)));
       
  1487     movq(r14, Address(rsp, (_GPR + 32)));
       
  1488     movq(r15, Address(rsp, (_GPR + 40)));
       
  1489 
       
  1490 #ifdef _WIN64
       
  1491     movq(rsi, Address(rsp, (_GPR + 48)));
       
  1492     movq(rdi, Address(rsp, (_GPR + 56)));
       
  1493 #endif
       
  1494 
       
  1495     //Restore Stack Pointer
       
  1496     movq(rsp, Address(rsp, _RSP));
       
  1497 
       
  1498 #ifdef _WIN64
       
  1499     pop(r9);
       
  1500     pop(r8);
       
  1501 #else
       
  1502     pop(rcx);
       
  1503     pop(rdx);
       
  1504 #endif
       
  1505 
       
  1506     if (multi_block) {
       
  1507 #ifdef _WIN64
       
  1508       const Register& limit_end = r9;
       
  1509       const Register& ofs_end = r8;
       
  1510 #else
       
  1511       const Register& limit_end = rcx;
       
  1512       const Register& ofs_end   = rdx;
       
  1513 #endif
       
  1514       movq(rax, ofs_end);
       
  1515       bind(compute_size);
       
  1516       cmpptr(rax, limit_end);
       
  1517       jccb(Assembler::aboveEqual, compute_size_end);
       
  1518       addq(rax, 128);
       
  1519       jmpb(compute_size);
       
  1520       bind(compute_size_end);
       
  1521     }
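    // A hedged sketch of the tail loop above: rax (the stub's return value for
    // the multi-block entry point) is the original ofs advanced in 128-byte
    // steps until it reaches limit, i.e. presumably the new offset expected by
    // DigestBase.implCompressMultiBlock.
    //
    //   int64_t new_ofs = ofs;
    //   while (new_ofs < limit) {
    //     new_ofs += 128;
    //   }
    //   return (int)new_ofs;               // left in rax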
       
  1522 }
       
  1523 
       
  1524 #endif //#ifdef _LP64
       
  1525