8152354: Update for x86 AES CBC Decryption
authorvdeshpande
Thu, 24 Mar 2016 11:48:37 -0700
changeset 36825 6ebe5519b753
parent 36824 593d798b0954
child 36826 62836dfa9191
child 36827 53d2d43551d2
8152354: Update for x86 AES CBC Decryption Summary: 4 way parallel version for key sizes 192 and 256 for 64 bit and for key sizes 128, 192 and 256 for 32 bit. Reviewed-by: kvn
hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp
hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp
--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Thu Mar 24 17:52:39 2016 +0000
+++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Thu Mar 24 11:48:37 2016 -0700
@@ -2600,15 +2600,12 @@
   //   rax       - input length
   //
 
-  address generate_cipherBlockChaining_decryptAESCrypt() {
+  address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
     assert(UseAES, "need AES instructions and misaligned SSE support");
     __ align(CodeEntryAlignment);
     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
     address start = __ pc();
 
-    Label L_exit, L_key_192_256, L_key_256;
-    Label L_singleBlock_loopTop_128;
-    Label L_singleBlock_loopTop_192, L_singleBlock_loopTop_256;
     const Register from        = rsi;      // source array address
     const Register to          = rdx;      // destination array address
     const Register key         = rcx;      // key array address
@@ -2617,14 +2614,24 @@
     const Register len_reg     = rbx;      // src len (must be multiple of blocksize 16)
     const Register pos         = rax;
 
-    // xmm register assignments for the loops below
-    const XMMRegister xmm_result = xmm0;
-    const XMMRegister xmm_temp   = xmm1;
-    // first 6 keys preloaded into xmm2-xmm7
-    const int XMM_REG_NUM_KEY_FIRST = 2;
-    const int XMM_REG_NUM_KEY_LAST  = 7;
-    const int FIRST_NON_REG_KEY_offset = 0x70;
-    const XMMRegister xmm_key_first   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
+    const int PARALLEL_FACTOR = 4;
+    const int ROUNDS[3] = { 10, 12, 14 }; //aes rounds for key128, key192, key256
+
+    Label L_exit;
+    Label L_singleBlock_loopTop[3]; //128, 192, 256
+    Label L_multiBlock_loopTop[3]; //128, 192, 256
+
+    const XMMRegister xmm_prev_block_cipher = xmm0; // holds cipher of previous block
+    const XMMRegister xmm_key_shuf_mask = xmm1;
+
+    const XMMRegister xmm_key_tmp0 = xmm2;
+    const XMMRegister xmm_key_tmp1 = xmm3;
+
+    // registers holding the six results in the parallelized loop
+    const XMMRegister xmm_result0 = xmm4;
+    const XMMRegister xmm_result1 = xmm5;
+    const XMMRegister xmm_result2 = xmm6;
+    const XMMRegister xmm_result3 = xmm7;
 
     __ enter(); // required for proper stackwalking of RuntimeStub frame
     handleSOERegisters(true /*saving*/);
@@ -2643,126 +2650,123 @@
     const Address  key_param (rbp, 8+8);
     const Address  rvec_param (rbp, 8+12);
     const Address  len_param  (rbp, 8+16);
+
     __ movptr(from , from_param);
     __ movptr(to   , to_param);
     __ movptr(key  , key_param);
     __ movptr(rvec , rvec_param);
     __ movptr(len_reg , len_param);
 
-    // the java expanded key ordering is rotated one position from what we want
-    // so we start from 0x10 here and hit 0x00 last
-    const XMMRegister xmm_key_shuf_mask = xmm1;  // used temporarily to swap key bytes up front
     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
-    // load up xmm regs 2 thru 6 with first 5 keys
-    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
-      load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
-      offset += 0x10;
-    }
-
-    // inside here, use the rvec register to point to previous block cipher
-    // with which we xor at the end of each newly decrypted block
-    const Register  prev_block_cipher_ptr = rvec;
+    __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00)); // initialize with initial rvec
+
+    __ xorptr(pos, pos);
 
     // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
-    __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
-    __ cmpl(rax, 44);
-    __ jcc(Assembler::notEqual, L_key_192_256);
-
-
-    // 128-bit code follows here, parallelized
-    __ movl(pos, 0);
-    __ align(OptoLoopAlignment);
-    __ BIND(L_singleBlock_loopTop_128);
-    __ cmpptr(len_reg, 0);           // any blocks left??
-    __ jcc(Assembler::equal, L_exit);
-    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
-    __ pxor  (xmm_result, xmm_key_first);                             // do the aes dec rounds
-    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
-      __ aesdec(xmm_result, as_XMMRegister(rnum));
-    }
-    for (int key_offset = FIRST_NON_REG_KEY_offset; key_offset <= 0xa0; key_offset += 0x10) {   // 128-bit runs up to key offset a0
-      aes_dec_key(xmm_result, xmm_temp, key, key_offset);
-    }
-    load_key(xmm_temp, key, 0x00);                                     // final key is stored in java expanded array at offset 0
-    __ aesdeclast(xmm_result, xmm_temp);
-    __ movdqu(xmm_temp, Address(prev_block_cipher_ptr, 0x00));
-    __ pxor  (xmm_result, xmm_temp);                                  // xor with the current r vector
-    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
-    // no need to store r to memory until we exit
-    __ lea(prev_block_cipher_ptr, Address(from, pos, Address::times_1, 0));     // set up new ptr
-    __ addptr(pos, AESBlockSize);
-    __ subptr(len_reg, AESBlockSize);
-    __ jmp(L_singleBlock_loopTop_128);
-
+    // rvec is reused
+    __ movl(rvec, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+    __ cmpl(rvec, 52);
+    __ jcc(Assembler::equal, L_multiBlock_loopTop[1]);
+    __ cmpl(rvec, 60);
+    __ jcc(Assembler::equal, L_multiBlock_loopTop[2]);
+
+#define DoFour(opc, src_reg)           \
+  __ opc(xmm_result0, src_reg);         \
+  __ opc(xmm_result1, src_reg);         \
+  __ opc(xmm_result2, src_reg);         \
+  __ opc(xmm_result3, src_reg);         \
+
+    for (int k = 0; k < 3; ++k) {
+      __ align(OptoLoopAlignment);
+      __ BIND(L_multiBlock_loopTop[k]);
+      __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
+      __ jcc(Assembler::less, L_singleBlock_loopTop[k]);
+
+      __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); // get next 4 blocks into xmmresult registers
+      __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
+      __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
+      __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
+
+      // the java expanded key ordering is rotated one position from what we want
+      // so we start from 0x10 here and hit 0x00 last
+      load_key(xmm_key_tmp0, key, 0x10, xmm_key_shuf_mask);
+      DoFour(pxor, xmm_key_tmp0); //xor with first key
+      // do the aes dec rounds
+      for (int rnum = 1; rnum <= ROUNDS[k];) {
+        //load two keys at a time
+        //k1->0x20, ..., k9->0xa0, k10->0x00
+        load_key(xmm_key_tmp1, key, (rnum + 1) * 0x10, xmm_key_shuf_mask);
+        load_key(xmm_key_tmp0, key, ((rnum + 2) % (ROUNDS[k] + 1)) * 0x10, xmm_key_shuf_mask); // hit 0x00 last!
+        DoFour(aesdec, xmm_key_tmp1);
+        rnum++;
+        if (rnum != ROUNDS[k]) {
+          DoFour(aesdec, xmm_key_tmp0);
+        }
+        else {
+          DoFour(aesdeclast, xmm_key_tmp0);
+        }
+        rnum++;
+      }
+
+      // for each result, xor with the r vector of previous cipher block
+      __ pxor(xmm_result0, xmm_prev_block_cipher);
+      __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0 * AESBlockSize));
+      __ pxor(xmm_result1, xmm_prev_block_cipher);
+      __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1 * AESBlockSize));
+      __ pxor(xmm_result2, xmm_prev_block_cipher);
+      __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2 * AESBlockSize));
+      __ pxor(xmm_result3, xmm_prev_block_cipher);
+      __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3 * AESBlockSize)); // this will carry over to next set of blocks
+
+            // store 4 results into the next 64 bytes of output
+       __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
+       __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
+       __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
+       __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
+
+       __ addptr(pos, 4 * AESBlockSize);
+       __ subptr(len_reg, 4 * AESBlockSize);
+       __ jmp(L_multiBlock_loopTop[k]);
+
+       //singleBlock starts here
+       __ align(OptoLoopAlignment);
+       __ BIND(L_singleBlock_loopTop[k]);
+       __ cmpptr(len_reg, 0); // any blocks left?
+       __ jcc(Assembler::equal, L_exit);
+       __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
+       __ movdqa(xmm_result1, xmm_result0);
+
+       load_key(xmm_key_tmp0, key, 0x10, xmm_key_shuf_mask);
+       __ pxor(xmm_result0, xmm_key_tmp0);
+       // do the aes dec rounds
+       for (int rnum = 1; rnum < ROUNDS[k]; rnum++) {
+         // the java expanded key ordering is rotated one position from what we want
+         load_key(xmm_key_tmp0, key, (rnum + 1) * 0x10, xmm_key_shuf_mask);
+         __ aesdec(xmm_result0, xmm_key_tmp0);
+       }
+       load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
+       __ aesdeclast(xmm_result0, xmm_key_tmp0);
+       __ pxor(xmm_result0, xmm_prev_block_cipher); // xor with the current r vector
+       __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result0); // store into the next 16 bytes of output
+       // no need to store r to memory until we exit
+       __ movdqa(xmm_prev_block_cipher, xmm_result1); // set up next r vector with cipher input from this block
+
+       __ addptr(pos, AESBlockSize);
+       __ subptr(len_reg, AESBlockSize);
+       __ jmp(L_singleBlock_loopTop[k]);
+    }//for 128/192/256
 
     __ BIND(L_exit);
-    __ movdqu(xmm_temp, Address(prev_block_cipher_ptr, 0x00));
-    __ movptr(rvec , rvec_param);                                     // restore this since used in loop
-    __ movdqu(Address(rvec, 0), xmm_temp);                            // final value of r stored in rvec of CipherBlockChaining object
+    __ movptr(rvec, rvec_param);                        // restore this since reused earlier
+    __ movdqu(Address(rvec, 0), xmm_prev_block_cipher); // final value of r stored in rvec of CipherBlockChaining object
     handleSOERegisters(false /*restoring*/);
-    __ movptr(rax, len_param); // return length
-    __ leave();                                                       // required for proper stackwalking of RuntimeStub frame
+    __ movptr(rax, len_param);                          // return length
+    __ leave();                                         // required for proper stackwalking of RuntimeStub frame
     __ ret(0);
 
-
-    __ BIND(L_key_192_256);
-    // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
-    __ cmpl(rax, 52);
-    __ jcc(Assembler::notEqual, L_key_256);
-
-    // 192-bit code follows here (could be optimized to use parallelism)
-    __ movl(pos, 0);
-    __ align(OptoLoopAlignment);
-    __ BIND(L_singleBlock_loopTop_192);
-    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
-    __ pxor  (xmm_result, xmm_key_first);                             // do the aes dec rounds
-    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
-      __ aesdec(xmm_result, as_XMMRegister(rnum));
-    }
-    for (int key_offset = FIRST_NON_REG_KEY_offset; key_offset <= 0xc0; key_offset += 0x10) {   // 192-bit runs up to key offset c0
-      aes_dec_key(xmm_result, xmm_temp, key, key_offset);
-    }
-    load_key(xmm_temp, key, 0x00);                                     // final key is stored in java expanded array at offset 0
-    __ aesdeclast(xmm_result, xmm_temp);
-    __ movdqu(xmm_temp, Address(prev_block_cipher_ptr, 0x00));
-    __ pxor  (xmm_result, xmm_temp);                                  // xor with the current r vector
-    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
-    // no need to store r to memory until we exit
-    __ lea(prev_block_cipher_ptr, Address(from, pos, Address::times_1, 0));     // set up new ptr
-    __ addptr(pos, AESBlockSize);
-    __ subptr(len_reg, AESBlockSize);
-    __ jcc(Assembler::notEqual,L_singleBlock_loopTop_192);
-    __ jmp(L_exit);
-
-    __ BIND(L_key_256);
-    // 256-bit code follows here (could be optimized to use parallelism)
-    __ movl(pos, 0);
-    __ align(OptoLoopAlignment);
-    __ BIND(L_singleBlock_loopTop_256);
-    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
-    __ pxor  (xmm_result, xmm_key_first);                             // do the aes dec rounds
-    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
-      __ aesdec(xmm_result, as_XMMRegister(rnum));
-    }
-    for (int key_offset = FIRST_NON_REG_KEY_offset; key_offset <= 0xe0; key_offset += 0x10) {   // 256-bit runs up to key offset e0
-      aes_dec_key(xmm_result, xmm_temp, key, key_offset);
-    }
-    load_key(xmm_temp, key, 0x00);                                     // final key is stored in java expanded array at offset 0
-    __ aesdeclast(xmm_result, xmm_temp);
-    __ movdqu(xmm_temp, Address(prev_block_cipher_ptr, 0x00));
-    __ pxor  (xmm_result, xmm_temp);                                  // xor with the current r vector
-    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
-    // no need to store r to memory until we exit
-    __ lea(prev_block_cipher_ptr, Address(from, pos, Address::times_1, 0));     // set up new ptr
-    __ addptr(pos, AESBlockSize);
-    __ subptr(len_reg, AESBlockSize);
-    __ jcc(Assembler::notEqual,L_singleBlock_loopTop_256);
-    __ jmp(L_exit);
-
     return start;
   }
 
-
   // CTR AES crypt.
   // In 32-bit stub, parallelize 4 blocks at a time
   // Arguments:
@@ -3894,7 +3898,7 @@
       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
-      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
+      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
     }
 
     if (UseAESCTRIntrinsics) {
--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Thu Mar 24 17:52:39 2016 +0000
+++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Thu Mar 24 11:48:37 2016 -0700
@@ -3469,16 +3469,12 @@
   // Output:
   //   rax       - input length
   //
-
   address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
     assert(UseAES, "need AES instructions and misaligned SSE support");
     __ align(CodeEntryAlignment);
     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
     address start = __ pc();
 
-    Label L_exit, L_key_192_256, L_key_256;
-    Label L_singleBlock_loopTop_128, L_multiBlock_loopTop_128;
-    Label L_singleBlock_loopTop_192, L_singleBlock_loopTop_256;
     const Register from        = c_rarg0;  // source array address
     const Register to          = c_rarg1;  // destination array address
     const Register key         = c_rarg2;  // key array address
@@ -3492,7 +3488,17 @@
 #endif
     const Register pos         = rax;
 
-    // keys 0-10 preloaded into xmm2-xmm12
+    const int PARALLEL_FACTOR = 4;
+    const int ROUNDS[3] = { 10, 12, 14 }; // aes rounds for key128, key192, key256
+
+    Label L_exit;
+    Label L_singleBlock_loopTopHead[3]; // 128, 192, 256
+    Label L_singleBlock_loopTopHead2[3]; // 128, 192, 256
+    Label L_singleBlock_loopTop[3]; // 128, 192, 256
+    Label L_multiBlock_loopTopHead[3]; // 128, 192, 256
+    Label L_multiBlock_loopTop[3]; // 128, 192, 256
+
+    // keys 0-10 preloaded into xmm5-xmm15
     const int XMM_REG_NUM_KEY_FIRST = 5;
     const int XMM_REG_NUM_KEY_LAST  = 15;
     const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
@@ -3519,7 +3525,7 @@
 #else
     __ push(len_reg); // Save
 #endif
-
+    __ push(rbx);
     // the java expanded key ordering is rotated one position from what we want
     // so we start from 0x10 here and hit 0x00 last
     const XMMRegister xmm_key_shuf_mask = xmm1;  // used temporarily to swap key bytes up front
@@ -3541,85 +3547,173 @@
 
     __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));   // initialize with initial rvec
 
+    __ xorptr(pos, pos);
+
     // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
-    __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
-    __ cmpl(rax, 44);
-    __ jcc(Assembler::notEqual, L_key_192_256);
-
-
-    // 128-bit code follows here, parallelized
-    __ movptr(pos, 0);
-    __ align(OptoLoopAlignment);
-    __ BIND(L_multiBlock_loopTop_128);
-    __ cmpptr(len_reg, 4*AESBlockSize);           // see if at least 4 blocks left
-    __ jcc(Assembler::less, L_singleBlock_loopTop_128);
-
-    __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0*AESBlockSize));   // get next 4 blocks into xmmresult registers
-    __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1*AESBlockSize));
-    __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2*AESBlockSize));
-    __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3*AESBlockSize));
-
-#define DoFour(opc, src_reg)                    \
-    __ opc(xmm_result0, src_reg);               \
-    __ opc(xmm_result1, src_reg);               \
-    __ opc(xmm_result2, src_reg);               \
-    __ opc(xmm_result3, src_reg);
-
-    DoFour(pxor, xmm_key_first);
-    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
-      DoFour(aesdec, as_XMMRegister(rnum));
-    }
-    DoFour(aesdeclast, xmm_key_last);
-    // for each result, xor with the r vector of previous cipher block
-    __ pxor(xmm_result0, xmm_prev_block_cipher);
-    __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0*AESBlockSize));
-    __ pxor(xmm_result1, xmm_prev_block_cipher);
-    __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1*AESBlockSize));
-    __ pxor(xmm_result2, xmm_prev_block_cipher);
-    __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2*AESBlockSize));
-    __ pxor(xmm_result3, xmm_prev_block_cipher);
-    __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3*AESBlockSize));   // this will carry over to next set of blocks
-
-    __ movdqu(Address(to, pos, Address::times_1, 0*AESBlockSize), xmm_result0);     // store 4 results into the next 64 bytes of output
-    __ movdqu(Address(to, pos, Address::times_1, 1*AESBlockSize), xmm_result1);
-    __ movdqu(Address(to, pos, Address::times_1, 2*AESBlockSize), xmm_result2);
-    __ movdqu(Address(to, pos, Address::times_1, 3*AESBlockSize), xmm_result3);
-
-    __ addptr(pos, 4*AESBlockSize);
-    __ subptr(len_reg, 4*AESBlockSize);
-    __ jmp(L_multiBlock_loopTop_128);
-
-    // registers used in the non-parallelized loops
-    // xmm register assignments for the loops below
-    const XMMRegister xmm_result = xmm0;
-    const XMMRegister xmm_prev_block_cipher_save = xmm2;
-    const XMMRegister xmm_key11 = xmm3;
-    const XMMRegister xmm_key12 = xmm4;
-    const XMMRegister xmm_temp  = xmm4;
-
-    __ align(OptoLoopAlignment);
-    __ BIND(L_singleBlock_loopTop_128);
-    __ cmpptr(len_reg, 0);           // any blocks left??
-    __ jcc(Assembler::equal, L_exit);
-    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
-    __ movdqa(xmm_prev_block_cipher_save, xmm_result);              // save for next r vector
-    __ pxor  (xmm_result, xmm_key_first);               // do the aes dec rounds
-    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
-      __ aesdec(xmm_result, as_XMMRegister(rnum));
-    }
-    __ aesdeclast(xmm_result, xmm_key_last);
-    __ pxor  (xmm_result, xmm_prev_block_cipher);               // xor with the current r vector
-    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
-    // no need to store r to memory until we exit
-    __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);              // set up next r vector with cipher input from this block
-
-    __ addptr(pos, AESBlockSize);
-    __ subptr(len_reg, AESBlockSize);
-    __ jmp(L_singleBlock_loopTop_128);
-
+    __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+    __ cmpl(rbx, 52);
+    __ jcc(Assembler::equal, L_multiBlock_loopTopHead[1]);
+    __ cmpl(rbx, 60);
+    __ jcc(Assembler::equal, L_multiBlock_loopTopHead[2]);
+
+#define DoFour(opc, src_reg)           \
+  __ opc(xmm_result0, src_reg);         \
+  __ opc(xmm_result1, src_reg);         \
+  __ opc(xmm_result2, src_reg);         \
+  __ opc(xmm_result3, src_reg);         \
+
+    for (int k = 0; k < 3; ++k) {
+      __ BIND(L_multiBlock_loopTopHead[k]);
+      if (k != 0) {
+        __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
+        __ jcc(Assembler::less, L_singleBlock_loopTopHead2[k]);
+      }
+      if (k == 1) {
+        __ subptr(rsp, 6 * wordSize);
+        __ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15
+        load_key(xmm15, key, 0xb0); // 0xb0; 192-bit key goes up to 0xc0
+        __ movdqu(Address(rsp, 2 * wordSize), xmm15);
+        load_key(xmm1, key, 0xc0);  // 0xc0;
+        __ movdqu(Address(rsp, 4 * wordSize), xmm1);
+      } else if (k == 2) {
+        __ subptr(rsp, 10 * wordSize);
+        __ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15
+        load_key(xmm15, key, 0xd0); // 0xd0; 256-bit key goes upto 0xe0
+        __ movdqu(Address(rsp, 6 * wordSize), xmm15);
+        load_key(xmm1, key, 0xe0);  // 0xe0;
+        __ movdqu(Address(rsp, 8 * wordSize), xmm1);
+        load_key(xmm15, key, 0xb0); // 0xb0;
+        __ movdqu(Address(rsp, 2 * wordSize), xmm15);
+        load_key(xmm1, key, 0xc0);  // 0xc0;
+        __ movdqu(Address(rsp, 4 * wordSize), xmm1);
+      }
+      __ align(OptoLoopAlignment);
+      __ BIND(L_multiBlock_loopTop[k]);
+      __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
+      __ jcc(Assembler::less, L_singleBlock_loopTopHead[k]);
+
+      if  (k != 0) {
+        __ movdqu(xmm15, Address(rsp, 2 * wordSize));
+        __ movdqu(xmm1, Address(rsp, 4 * wordSize));
+      }
+
+      __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); // get next 4 blocks into xmmresult registers
+      __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
+      __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
+      __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
+
+      DoFour(pxor, xmm_key_first);
+      if (k == 0) {
+        for (int rnum = 1; rnum < ROUNDS[k]; rnum++) {
+          DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
+        }
+        DoFour(aesdeclast, xmm_key_last);
+      } else if (k == 1) {
+        for (int rnum = 1; rnum <= ROUNDS[k]-2; rnum++) {
+          DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
+        }
+        __ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again.
+        DoFour(aesdec, xmm1);  // key : 0xc0
+        __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));  // xmm1 needs to be loaded again
+        DoFour(aesdeclast, xmm_key_last);
+      } else if (k == 2) {
+        for (int rnum = 1; rnum <= ROUNDS[k] - 4; rnum++) {
+          DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
+        }
+        DoFour(aesdec, xmm1);  // key : 0xc0
+        __ movdqu(xmm15, Address(rsp, 6 * wordSize));
+        __ movdqu(xmm1, Address(rsp, 8 * wordSize));
+        DoFour(aesdec, xmm15);  // key : 0xd0
+        __ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again.
+        DoFour(aesdec, xmm1);  // key : 0xe0
+        __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));  // xmm1 needs to be loaded again
+        DoFour(aesdeclast, xmm_key_last);
+      }
+
+      // for each result, xor with the r vector of previous cipher block
+      __ pxor(xmm_result0, xmm_prev_block_cipher);
+      __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0 * AESBlockSize));
+      __ pxor(xmm_result1, xmm_prev_block_cipher);
+      __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1 * AESBlockSize));
+      __ pxor(xmm_result2, xmm_prev_block_cipher);
+      __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2 * AESBlockSize));
+      __ pxor(xmm_result3, xmm_prev_block_cipher);
+      __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3 * AESBlockSize));   // this will carry over to next set of blocks
+      if (k != 0) {
+        __ movdqu(Address(rvec, 0x00), xmm_prev_block_cipher);
+      }
+
+      __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);     // store 4 results into the next 64 bytes of output
+      __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
+      __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
+      __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
+
+      __ addptr(pos, PARALLEL_FACTOR * AESBlockSize);
+      __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize);
+      __ jmp(L_multiBlock_loopTop[k]);
+
+      // registers used in the non-parallelized loops
+      // xmm register assignments for the loops below
+      const XMMRegister xmm_result = xmm0;
+      const XMMRegister xmm_prev_block_cipher_save = xmm2;
+      const XMMRegister xmm_key11 = xmm3;
+      const XMMRegister xmm_key12 = xmm4;
+      const XMMRegister key_tmp = xmm4;
+
+      __ BIND(L_singleBlock_loopTopHead[k]);
+      if (k == 1) {
+        __ addptr(rsp, 6 * wordSize);
+      } else if (k == 2) {
+        __ addptr(rsp, 10 * wordSize);
+      }
+      __ cmpptr(len_reg, 0); // any blocks left??
+      __ jcc(Assembler::equal, L_exit);
+      __ BIND(L_singleBlock_loopTopHead2[k]);
+      if (k == 1) {
+        load_key(xmm_key11, key, 0xb0); // 0xb0; 192-bit key goes upto 0xc0
+        load_key(xmm_key12, key, 0xc0); // 0xc0; 192-bit key goes upto 0xc0
+      }
+      if (k == 2) {
+        load_key(xmm_key11, key, 0xb0); // 0xb0; 256-bit key goes upto 0xe0
+      }
+      __ align(OptoLoopAlignment);
+      __ BIND(L_singleBlock_loopTop[k]);
+      __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
+      __ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector
+      __ pxor(xmm_result, xmm_key_first); // do the aes dec rounds
+      for (int rnum = 1; rnum <= 9 ; rnum++) {
+          __ aesdec(xmm_result, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
+      }
+      if (k == 1) {
+        __ aesdec(xmm_result, xmm_key11);
+        __ aesdec(xmm_result, xmm_key12);
+      }
+      if (k == 2) {
+        __ aesdec(xmm_result, xmm_key11);
+        load_key(key_tmp, key, 0xc0);
+        __ aesdec(xmm_result, key_tmp);
+        load_key(key_tmp, key, 0xd0);
+        __ aesdec(xmm_result, key_tmp);
+        load_key(key_tmp, key, 0xe0);
+        __ aesdec(xmm_result, key_tmp);
+      }
+
+      __ aesdeclast(xmm_result, xmm_key_last); // xmm15 always came from key+0
+      __ pxor(xmm_result, xmm_prev_block_cipher); // xor with the current r vector
+      __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
+      // no need to store r to memory until we exit
+      __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block
+      __ addptr(pos, AESBlockSize);
+      __ subptr(len_reg, AESBlockSize);
+      __ jcc(Assembler::notEqual, L_singleBlock_loopTop[k]);
+      if (k != 2) {
+        __ jmp(L_exit);
+      }
+    } //for 128/192/256
 
     __ BIND(L_exit);
     __ movdqu(Address(rvec, 0), xmm_prev_block_cipher);     // final value of r stored in rvec of CipherBlockChaining object
+    __ pop(rbx);
 #ifdef _WIN64
     // restore regs belonging to calling function
     for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
@@ -3631,69 +3725,8 @@
 #endif
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);
-
-
-    __ BIND(L_key_192_256);
-    // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
-    load_key(xmm_key11, key, 0xb0);
-    __ cmpl(rax, 52);
-    __ jcc(Assembler::notEqual, L_key_256);
-
-    // 192-bit code follows here (could be optimized to use parallelism)
-    load_key(xmm_key12, key, 0xc0);     // 192-bit key goes up to c0
-    __ movptr(pos, 0);
-    __ align(OptoLoopAlignment);
-
-    __ BIND(L_singleBlock_loopTop_192);
-    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
-    __ movdqa(xmm_prev_block_cipher_save, xmm_result);              // save for next r vector
-    __ pxor  (xmm_result, xmm_key_first);               // do the aes dec rounds
-    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
-      __ aesdec(xmm_result, as_XMMRegister(rnum));
-    }
-    __ aesdec(xmm_result, xmm_key11);
-    __ aesdec(xmm_result, xmm_key12);
-    __ aesdeclast(xmm_result, xmm_key_last);                    // xmm15 always came from key+0
-    __ pxor  (xmm_result, xmm_prev_block_cipher);               // xor with the current r vector
-    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);  // store into the next 16 bytes of output
-    // no need to store r to memory until we exit
-    __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);  // set up next r vector with cipher input from this block
-    __ addptr(pos, AESBlockSize);
-    __ subptr(len_reg, AESBlockSize);
-    __ jcc(Assembler::notEqual,L_singleBlock_loopTop_192);
-    __ jmp(L_exit);
-
-    __ BIND(L_key_256);
-    // 256-bit code follows here (could be optimized to use parallelism)
-    __ movptr(pos, 0);
-    __ align(OptoLoopAlignment);
-
-    __ BIND(L_singleBlock_loopTop_256);
-    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
-    __ movdqa(xmm_prev_block_cipher_save, xmm_result);              // save for next r vector
-    __ pxor  (xmm_result, xmm_key_first);               // do the aes dec rounds
-    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
-      __ aesdec(xmm_result, as_XMMRegister(rnum));
-    }
-    __ aesdec(xmm_result, xmm_key11);
-    load_key(xmm_temp, key, 0xc0);
-    __ aesdec(xmm_result, xmm_temp);
-    load_key(xmm_temp, key, 0xd0);
-    __ aesdec(xmm_result, xmm_temp);
-    load_key(xmm_temp, key, 0xe0);     // 256-bit key goes up to e0
-    __ aesdec(xmm_result, xmm_temp);
-    __ aesdeclast(xmm_result, xmm_key_last);          // xmm15 came from key+0
-    __ pxor  (xmm_result, xmm_prev_block_cipher);               // xor with the current r vector
-    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);  // store into the next 16 bytes of output
-    // no need to store r to memory until we exit
-    __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);  // set up next r vector with cipher input from this block
-    __ addptr(pos, AESBlockSize);
-    __ subptr(len_reg, AESBlockSize);
-    __ jcc(Assembler::notEqual,L_singleBlock_loopTop_256);
-    __ jmp(L_exit);
-
     return start;
-  }
+}
 
   address generate_upper_word_mask() {
     __ align(64);