hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp
changeset 36825 6ebe5519b753
parent 36561 b18243f4d955
child 38018 1dc6c6f21231
child 37466 287c4ebd11b0
--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Thu Mar 24 17:52:39 2016 +0000
+++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Thu Mar 24 11:48:37 2016 -0700
@@ -3469,16 +3469,12 @@
   // Output:
   //   rax       - input length
   //
-
   address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
     assert(UseAES, "need AES instructions and misaligned SSE support");
     __ align(CodeEntryAlignment);
     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
     address start = __ pc();
 
-    Label L_exit, L_key_192_256, L_key_256;
-    Label L_singleBlock_loopTop_128, L_multiBlock_loopTop_128;
-    Label L_singleBlock_loopTop_192, L_singleBlock_loopTop_256;
     const Register from        = c_rarg0;  // source array address
     const Register to          = c_rarg1;  // destination array address
     const Register key         = c_rarg2;  // key array address
@@ -3492,7 +3488,17 @@
 #endif
     const Register pos         = rax;
 
-    // keys 0-10 preloaded into xmm2-xmm12
+    const int PARALLEL_FACTOR = 4;
+    const int ROUNDS[3] = { 10, 12, 14 }; // aes rounds for key128, key192, key256
+
+    Label L_exit;
+    Label L_singleBlock_loopTopHead[3]; // 128, 192, 256
+    Label L_singleBlock_loopTopHead2[3]; // 128, 192, 256
+    Label L_singleBlock_loopTop[3]; // 128, 192, 256
+    Label L_multiBlock_loopTopHead[3]; // 128, 192, 256
+    Label L_multiBlock_loopTop[3]; // 128, 192, 256
+
+    // keys 0-10 preloaded into xmm5-xmm15
     const int XMM_REG_NUM_KEY_FIRST = 5;
     const int XMM_REG_NUM_KEY_LAST  = 15;
     const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
@@ -3519,7 +3525,7 @@
 #else
     __ push(len_reg); // Save
 #endif
-
+    __ push(rbx); // rbx (callee-saved) is used below to hold the expanded key length
     // the java expanded key ordering is rotated one position from what we want
     // so we start from 0x10 here and hit 0x00 last
     const XMMRegister xmm_key_shuf_mask = xmm1;  // used temporarily to swap key bytes up front
@@ -3541,85 +3547,173 @@
 
     __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));   // initialize with initial rvec
 
+    __ xorptr(pos, pos);
+
     // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
-    __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
-    __ cmpl(rax, 44);
-    __ jcc(Assembler::notEqual, L_key_192_256);
-
-
-    // 128-bit code follows here, parallelized
-    __ movptr(pos, 0);
-    __ align(OptoLoopAlignment);
-    __ BIND(L_multiBlock_loopTop_128);
-    __ cmpptr(len_reg, 4*AESBlockSize);           // see if at least 4 blocks left
-    __ jcc(Assembler::less, L_singleBlock_loopTop_128);
-
-    __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0*AESBlockSize));   // get next 4 blocks into xmmresult registers
-    __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1*AESBlockSize));
-    __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2*AESBlockSize));
-    __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3*AESBlockSize));
-
-#define DoFour(opc, src_reg)                    \
-    __ opc(xmm_result0, src_reg);               \
-    __ opc(xmm_result1, src_reg);               \
-    __ opc(xmm_result2, src_reg);               \
-    __ opc(xmm_result3, src_reg);
-
-    DoFour(pxor, xmm_key_first);
-    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
-      DoFour(aesdec, as_XMMRegister(rnum));
-    }
-    DoFour(aesdeclast, xmm_key_last);
-    // for each result, xor with the r vector of previous cipher block
-    __ pxor(xmm_result0, xmm_prev_block_cipher);
-    __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0*AESBlockSize));
-    __ pxor(xmm_result1, xmm_prev_block_cipher);
-    __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1*AESBlockSize));
-    __ pxor(xmm_result2, xmm_prev_block_cipher);
-    __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2*AESBlockSize));
-    __ pxor(xmm_result3, xmm_prev_block_cipher);
-    __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3*AESBlockSize));   // this will carry over to next set of blocks
-
-    __ movdqu(Address(to, pos, Address::times_1, 0*AESBlockSize), xmm_result0);     // store 4 results into the next 64 bytes of output
-    __ movdqu(Address(to, pos, Address::times_1, 1*AESBlockSize), xmm_result1);
-    __ movdqu(Address(to, pos, Address::times_1, 2*AESBlockSize), xmm_result2);
-    __ movdqu(Address(to, pos, Address::times_1, 3*AESBlockSize), xmm_result3);
-
-    __ addptr(pos, 4*AESBlockSize);
-    __ subptr(len_reg, 4*AESBlockSize);
-    __ jmp(L_multiBlock_loopTop_128);
-
-    // registers used in the non-parallelized loops
-    // xmm register assignments for the loops below
-    const XMMRegister xmm_result = xmm0;
-    const XMMRegister xmm_prev_block_cipher_save = xmm2;
-    const XMMRegister xmm_key11 = xmm3;
-    const XMMRegister xmm_key12 = xmm4;
-    const XMMRegister xmm_temp  = xmm4;
-
-    __ align(OptoLoopAlignment);
-    __ BIND(L_singleBlock_loopTop_128);
-    __ cmpptr(len_reg, 0);           // any blocks left??
-    __ jcc(Assembler::equal, L_exit);
-    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
-    __ movdqa(xmm_prev_block_cipher_save, xmm_result);              // save for next r vector
-    __ pxor  (xmm_result, xmm_key_first);               // do the aes dec rounds
-    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
-      __ aesdec(xmm_result, as_XMMRegister(rnum));
-    }
-    __ aesdeclast(xmm_result, xmm_key_last);
-    __ pxor  (xmm_result, xmm_prev_block_cipher);               // xor with the current r vector
-    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
-    // no need to store r to memory until we exit
-    __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);              // set up next r vector with cipher input from this block
-
-    __ addptr(pos, AESBlockSize);
-    __ subptr(len_reg, AESBlockSize);
-    __ jmp(L_singleBlock_loopTop_128);
-
+    __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+    __ cmpl(rbx, 52);
+    __ jcc(Assembler::equal, L_multiBlock_loopTopHead[1]);
+    __ cmpl(rbx, 60);
+    __ jcc(Assembler::equal, L_multiBlock_loopTopHead[2]);
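+    // a 44-int key array (AES-128, 10 rounds) falls through to the k == 0 path below;
+    // 52 ints selects the 192-bit (12-round) path and 60 ints the 256-bit (14-round) path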
+
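+// DoFour applies one instruction to each of the four blocks kept in flight by the parallel loop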
+#define DoFour(opc, src_reg)          \
+  __ opc(xmm_result0, src_reg);       \
+  __ opc(xmm_result1, src_reg);       \
+  __ opc(xmm_result2, src_reg);       \
+  __ opc(xmm_result3, src_reg);
+
+    for (int k = 0; k < 3; ++k) {
+      __ BIND(L_multiBlock_loopTopHead[k]);
+      if (k != 0) {
+        __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
+        __ jcc(Assembler::less, L_singleBlock_loopTopHead2[k]);
+      }
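+      // xmm5-xmm15 only hold the 11 round keys of a 128-bit schedule; for 192/256-bit
+      // keys the extra round keys (0xb0-0xe0) are staged on the stack and cycled
+      // through xmm1/xmm15 inside the parallel loop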
+      if (k == 1) {
+        __ subptr(rsp, 6 * wordSize);
+        __ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15
+        load_key(xmm15, key, 0xb0); // 0xb0; 192-bit key goes up to 0xc0
+        __ movdqu(Address(rsp, 2 * wordSize), xmm15);
+        load_key(xmm1, key, 0xc0);  // 0xc0;
+        __ movdqu(Address(rsp, 4 * wordSize), xmm1);
+      } else if (k == 2) {
+        __ subptr(rsp, 10 * wordSize);
+        __ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15
+        load_key(xmm15, key, 0xd0); // 0xd0; 256-bit key goes up to 0xe0
+        __ movdqu(Address(rsp, 6 * wordSize), xmm15);
+        load_key(xmm1, key, 0xe0);  // 0xe0;
+        __ movdqu(Address(rsp, 8 * wordSize), xmm1);
+        load_key(xmm15, key, 0xb0); // 0xb0;
+        __ movdqu(Address(rsp, 2 * wordSize), xmm15);
+        load_key(xmm1, key, 0xc0);  // 0xc0;
+        __ movdqu(Address(rsp, 4 * wordSize), xmm1);
+      }
+      __ align(OptoLoopAlignment);
+      __ BIND(L_multiBlock_loopTop[k]);
+      __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
+      __ jcc(Assembler::less, L_singleBlock_loopTopHead[k]);
+
+      if (k != 0) {
+        __ movdqu(xmm15, Address(rsp, 2 * wordSize));
+        __ movdqu(xmm1, Address(rsp, 4 * wordSize));
+      }
+
+      __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); // get next 4 blocks into xmmresult registers
+      __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
+      __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
+      __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
+
+      DoFour(pxor, xmm_key_first);
+      if (k == 0) {
+        for (int rnum = 1; rnum < ROUNDS[k]; rnum++) {
+          DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
+        }
+        DoFour(aesdeclast, xmm_key_last);
+      } else if (k == 1) {
+        for (int rnum = 1; rnum <= ROUNDS[k]-2; rnum++) {
+          DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
+        }
+        __ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again.
+        DoFour(aesdec, xmm1);  // key : 0xc0
+        __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));  // xmm1 was clobbered as a key temp; reload the chaining value
+        DoFour(aesdeclast, xmm_key_last);
+      } else if (k == 2) {
+        for (int rnum = 1; rnum <= ROUNDS[k] - 4; rnum++) {
+          DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
+        }
+        DoFour(aesdec, xmm1);  // key : 0xc0
+        __ movdqu(xmm15, Address(rsp, 6 * wordSize));
+        __ movdqu(xmm1, Address(rsp, 8 * wordSize));
+        DoFour(aesdec, xmm15);  // key : 0xd0
+        __ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again.
+        DoFour(aesdec, xmm1);  // key : 0xe0
+        __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));  // xmm1 was clobbered as a key temp; reload the chaining value
+        DoFour(aesdeclast, xmm_key_last);
+      }
+
+      // for each result, xor with the r vector of previous cipher block
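+      // CBC decrypt: plain[i] = AES_dec(cipher[i]) ^ cipher[i-1], with cipher[-1] = IV held in rvec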
+      __ pxor(xmm_result0, xmm_prev_block_cipher);
+      __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0 * AESBlockSize));
+      __ pxor(xmm_result1, xmm_prev_block_cipher);
+      __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1 * AESBlockSize));
+      __ pxor(xmm_result2, xmm_prev_block_cipher);
+      __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2 * AESBlockSize));
+      __ pxor(xmm_result3, xmm_prev_block_cipher);
+      __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3 * AESBlockSize));   // this will carry over to next set of blocks
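+      // for 192/256-bit keys xmm1 doubles as a key temp above, so the chaining value
+      // must be kept current in rvec for the reload inside the next iteration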
+      if (k != 0) {
+        __ movdqu(Address(rvec, 0x00), xmm_prev_block_cipher);
+      }
+
+      __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);     // store 4 results into the next 64 bytes of output
+      __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
+      __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
+      __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
+
+      __ addptr(pos, PARALLEL_FACTOR * AESBlockSize);
+      __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize);
+      __ jmp(L_multiBlock_loopTop[k]);
+
+      // registers used in the non-parallelized loops
+      // xmm register assignments for the loops below
+      const XMMRegister xmm_result = xmm0;
+      const XMMRegister xmm_prev_block_cipher_save = xmm2;
+      const XMMRegister xmm_key11 = xmm3;
+      const XMMRegister xmm_key12 = xmm4;
+      const XMMRegister key_tmp = xmm4;
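+      // note: xmm_key12 (used when k == 1) and key_tmp (used when k == 2) both alias xmm4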
+
+      __ BIND(L_singleBlock_loopTopHead[k]);
+      if (k == 1) {
+        __ addptr(rsp, 6 * wordSize);
+      } else if (k == 2) {
+        __ addptr(rsp, 10 * wordSize);
+      }
+      __ cmpptr(len_reg, 0); // any blocks left??
+      __ jcc(Assembler::equal, L_exit);
+      __ BIND(L_singleBlock_loopTopHead2[k]);
+      if (k == 1) {
+        load_key(xmm_key11, key, 0xb0); // 0xb0; 192-bit key goes up to 0xc0
+        load_key(xmm_key12, key, 0xc0); // 0xc0; 192-bit key goes up to 0xc0
+      }
+      if (k == 2) {
+        load_key(xmm_key11, key, 0xb0); // 0xb0; 256-bit key goes up to 0xe0
+      }
+      __ align(OptoLoopAlignment);
+      __ BIND(L_singleBlock_loopTop[k]);
+      __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
+      __ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector
+      __ pxor(xmm_result, xmm_key_first); // do the aes dec rounds
+      for (int rnum = 1; rnum <= 9; rnum++) { // rounds 1-9 use the preloaded keys in xmm6-xmm14
+        __ aesdec(xmm_result, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
+      }
+      if (k == 1) {
+        __ aesdec(xmm_result, xmm_key11);
+        __ aesdec(xmm_result, xmm_key12);
+      }
+      if (k == 2) {
+        __ aesdec(xmm_result, xmm_key11);
+        load_key(key_tmp, key, 0xc0);
+        __ aesdec(xmm_result, key_tmp);
+        load_key(key_tmp, key, 0xd0);
+        __ aesdec(xmm_result, key_tmp);
+        load_key(key_tmp, key, 0xe0);
+        __ aesdec(xmm_result, key_tmp);
+      }
+
+      __ aesdeclast(xmm_result, xmm_key_last); // xmm15 always came from key+0
+      __ pxor(xmm_result, xmm_prev_block_cipher); // xor with the current r vector
+      __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
+      // no need to store r to memory until we exit
+      __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block
+      __ addptr(pos, AESBlockSize);
+      __ subptr(len_reg, AESBlockSize);
+      __ jcc(Assembler::notEqual, L_singleBlock_loopTop[k]);
+      if (k != 2) {
+        __ jmp(L_exit);
+      }
+    } // for 128/192/256
 
     __ BIND(L_exit);
     __ movdqu(Address(rvec, 0), xmm_prev_block_cipher);     // final value of r stored in rvec of CipherBlockChaining object
+    __ pop(rbx);
 #ifdef _WIN64
     // restore regs belonging to calling function
     for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
@@ -3631,69 +3725,8 @@
 #endif
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);
-
-
-    __ BIND(L_key_192_256);
-    // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
-    load_key(xmm_key11, key, 0xb0);
-    __ cmpl(rax, 52);
-    __ jcc(Assembler::notEqual, L_key_256);
-
-    // 192-bit code follows here (could be optimized to use parallelism)
-    load_key(xmm_key12, key, 0xc0);     // 192-bit key goes up to c0
-    __ movptr(pos, 0);
-    __ align(OptoLoopAlignment);
-
-    __ BIND(L_singleBlock_loopTop_192);
-    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
-    __ movdqa(xmm_prev_block_cipher_save, xmm_result);              // save for next r vector
-    __ pxor  (xmm_result, xmm_key_first);               // do the aes dec rounds
-    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
-      __ aesdec(xmm_result, as_XMMRegister(rnum));
-    }
-    __ aesdec(xmm_result, xmm_key11);
-    __ aesdec(xmm_result, xmm_key12);
-    __ aesdeclast(xmm_result, xmm_key_last);                    // xmm15 always came from key+0
-    __ pxor  (xmm_result, xmm_prev_block_cipher);               // xor with the current r vector
-    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);  // store into the next 16 bytes of output
-    // no need to store r to memory until we exit
-    __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);  // set up next r vector with cipher input from this block
-    __ addptr(pos, AESBlockSize);
-    __ subptr(len_reg, AESBlockSize);
-    __ jcc(Assembler::notEqual,L_singleBlock_loopTop_192);
-    __ jmp(L_exit);
-
-    __ BIND(L_key_256);
-    // 256-bit code follows here (could be optimized to use parallelism)
-    __ movptr(pos, 0);
-    __ align(OptoLoopAlignment);
-
-    __ BIND(L_singleBlock_loopTop_256);
-    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
-    __ movdqa(xmm_prev_block_cipher_save, xmm_result);              // save for next r vector
-    __ pxor  (xmm_result, xmm_key_first);               // do the aes dec rounds
-    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
-      __ aesdec(xmm_result, as_XMMRegister(rnum));
-    }
-    __ aesdec(xmm_result, xmm_key11);
-    load_key(xmm_temp, key, 0xc0);
-    __ aesdec(xmm_result, xmm_temp);
-    load_key(xmm_temp, key, 0xd0);
-    __ aesdec(xmm_result, xmm_temp);
-    load_key(xmm_temp, key, 0xe0);     // 256-bit key goes up to e0
-    __ aesdec(xmm_result, xmm_temp);
-    __ aesdeclast(xmm_result, xmm_key_last);          // xmm15 came from key+0
-    __ pxor  (xmm_result, xmm_prev_block_cipher);               // xor with the current r vector
-    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);  // store into the next 16 bytes of output
-    // no need to store r to memory until we exit
-    __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);  // set up next r vector with cipher input from this block
-    __ addptr(pos, AESBlockSize);
-    __ subptr(len_reg, AESBlockSize);
-    __ jcc(Assembler::notEqual,L_singleBlock_loopTop_256);
-    __ jmp(L_exit);
-
     return start;
   }
 
   address generate_upper_word_mask() {
     __ align(64);