8143925: Enhancing CounterMode.crypt() for AES
author kvn
Mon, 28 Dec 2015 23:11:01 -0800
changeset 35154 a9b3c1984a01
parent 35153 0341260cd1f2
child 35155 db692d3ebbcc
8143925: Enhancing CounterMode.crypt() for AES
Summary: Add intrinsic for CounterMode.crypt() to leverage the parallel nature of AES in Counter (CTR) mode.
Reviewed-by: kvn, ascarpino
Contributed-by: kishor.kharbas@intel.com
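
For context, CTR mode turns AES into a stream cipher: each 16-byte output block is the input XORed with AES applied to a per-block counter, so every keystream block depends only on its own counter value and several blocks can be computed in flight. That independence is what the stubs below exploit (4 blocks per iteration on x86_32, 6 on x86_64). A minimal Java model of the operation being intrinsified (an illustrative sketch, not the JDK's CounterMode source; the real code also carries leftover keystream bytes between calls in its encryptedCounter/used state):

    import javax.crypto.Cipher;
    import javax.crypto.spec.SecretKeySpec;

    // Illustrative AES/CTR model: out[i] = in[i] ^ E_K(counter), counter incremented per block.
    public class CtrSketch {
        static byte[] ctrCrypt(byte[] key, byte[] counter, byte[] in) throws Exception {
            Cipher aes = Cipher.getInstance("AES/ECB/NoPadding"); // single-block AES for the keystream
            aes.init(Cipher.ENCRYPT_MODE, new SecretKeySpec(key, "AES"));
            byte[] out = new byte[in.length];
            byte[] ctr = counter.clone();
            for (int pos = 0; pos < in.length; pos += 16) {
                byte[] keystream = aes.doFinal(ctr);             // E_K(counter); blocks are independent
                for (int i = 0; i < Math.min(16, in.length - pos); i++) {
                    out[pos + i] = (byte) (in[pos + i] ^ keystream[i]);
                }
                increment(ctr);                                  // big-endian +1 with carry
            }
            return out;
        }
        static void increment(byte[] ctr) {
            for (int i = ctr.length - 1; i >= 0 && ++ctr[i] == 0; i--) {
                // empty: ++ctr[i] wraps to 0 exactly when the carry must ripple to the next byte
            }
        }
    }
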
hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp
hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp
hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp
hotspot/src/cpu/x86/vm/assembler_x86.cpp
hotspot/src/cpu/x86/vm/assembler_x86.hpp
hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp
hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp
hotspot/src/cpu/x86/vm/stubRoutines_x86.cpp
hotspot/src/cpu/x86/vm/stubRoutines_x86.hpp
hotspot/src/cpu/x86/vm/stubRoutines_x86_32.hpp
hotspot/src/cpu/x86/vm/stubRoutines_x86_64.hpp
hotspot/src/cpu/x86/vm/vm_version_x86.cpp
hotspot/src/share/vm/classfile/vmSymbols.cpp
hotspot/src/share/vm/classfile/vmSymbols.hpp
hotspot/src/share/vm/opto/c2compiler.cpp
hotspot/src/share/vm/opto/escape.cpp
hotspot/src/share/vm/opto/library_call.cpp
hotspot/src/share/vm/opto/runtime.cpp
hotspot/src/share/vm/opto/runtime.hpp
hotspot/src/share/vm/runtime/globals.hpp
hotspot/src/share/vm/runtime/stubRoutines.cpp
hotspot/src/share/vm/runtime/stubRoutines.hpp
hotspot/src/share/vm/runtime/vmStructs.cpp
hotspot/test/compiler/codegen/7184394/TestAESBase.java
hotspot/test/compiler/codegen/7184394/TestAESMain.java
--- a/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp	Mon Dec 28 10:10:37 2015 -1000
+++ b/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp	Mon Dec 28 23:11:01 2015 -0800
@@ -202,6 +202,11 @@
     }
   }
 
+  if (UseAESCTRIntrinsics) {
+    warning("AES/CTR intrinsics are not available on this CPU");
+    FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false);
+  }
+
   if (FLAG_IS_DEFAULT(UseCRC32Intrinsics)) {
     UseCRC32Intrinsics = true;
   }
--- a/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp	Mon Dec 28 10:10:37 2015 -1000
+++ b/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp	Mon Dec 28 23:11:01 2015 -0800
@@ -196,6 +196,11 @@
     FLAG_SET_DEFAULT(UseAESIntrinsics, false);
   }
 
+  if (UseAESCTRIntrinsics) {
+    warning("AES/CTR intrinsics are not available on this CPU");
+    FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false);
+  }
+
   if (UseGHASHIntrinsics) {
     warning("GHASH intrinsics are not available on this CPU");
     FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
--- a/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp	Mon Dec 28 10:10:37 2015 -1000
+++ b/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp	Mon Dec 28 23:11:01 2015 -0800
@@ -260,6 +260,11 @@
     }
   }
 
+  if (UseAESCTRIntrinsics) {
+    warning("AES/CTR intrinsics are not available on this CPU");
+    FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false);
+  }
+
   // GHASH/GCM intrinsics
   if (has_vis3() && (UseVIS > 2)) {
     if (FLAG_IS_DEFAULT(UseGHASHIntrinsics)) {
--- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp	Mon Dec 28 10:10:37 2015 -1000
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp	Mon Dec 28 23:11:01 2015 -0800
@@ -3349,22 +3349,41 @@
 void Assembler::pextrd(Register dst, XMMRegister src, int imm8) {
   assert(VM_Version::supports_sse4_1(), "");
   InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false);
-  int encode = simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  int encode = simd_prefix_and_encode(src, xnoreg, as_XMMRegister(dst->encoding()), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x16);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(imm8);
 }
 
+void Assembler::pextrd(Address dst, XMMRegister src, int imm8) {
+  assert(VM_Version::supports_sse4_1(), "");
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit);
+  simd_prefix(src, xnoreg, dst, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  emit_int8(0x16);
+  emit_operand(src, dst);
+  emit_int8(imm8);
+}
+
 void Assembler::pextrq(Register dst, XMMRegister src, int imm8) {
   assert(VM_Version::supports_sse4_1(), "");
   InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false);
-  int encode = simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  int encode = simd_prefix_and_encode(src, xnoreg, as_XMMRegister(dst->encoding()), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
   emit_int8(0x16);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(imm8);
 }
 
-// The encoding for pextrw is SSE2 to support the LIBM implementation.
+void Assembler::pextrq(Address dst, XMMRegister src, int imm8) {
+  assert(VM_Version::supports_sse4_1(), "");
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
+  simd_prefix(src, xnoreg, dst, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  emit_int8(0x16);
+  emit_operand(src, dst);
+  emit_int8(imm8);
+}
+
 void Assembler::pextrw(Register dst, XMMRegister src, int imm8) {
   assert(VM_Version::supports_sse2(), "");
   InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
@@ -3374,6 +3393,26 @@
   emit_int8(imm8);
 }
 
+void Assembler::pextrw(Address dst, XMMRegister src, int imm8) {
+  assert(VM_Version::supports_sse4_1(), "");
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_16bit);
+  simd_prefix(src, xnoreg, dst, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  emit_int8((unsigned char)0x15);
+  emit_operand(src, dst);
+  emit_int8(imm8);
+}
+
+void Assembler::pextrb(Address dst, XMMRegister src, int imm8) {
+  assert(VM_Version::supports_sse4_1(), "");
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_8bit);
+  simd_prefix(src, xnoreg, dst, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  emit_int8(0x14);
+  emit_operand(src, dst);
+  emit_int8(imm8);
+}
+
 void Assembler::pinsrd(XMMRegister dst, Register src, int imm8) {
   assert(VM_Version::supports_sse4_1(), "");
   InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false);
@@ -3383,6 +3422,16 @@
   emit_int8(imm8);
 }
 
+void Assembler::pinsrd(XMMRegister dst, Address src, int imm8) {
+  assert(VM_Version::supports_sse4_1(), "");
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit);
+  simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  emit_int8(0x22);
+  emit_operand(dst,src);
+  emit_int8(imm8);
+}
+
 void Assembler::pinsrq(XMMRegister dst, Register src, int imm8) {
   assert(VM_Version::supports_sse4_1(), "");
   InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false);
@@ -3392,6 +3441,16 @@
   emit_int8(imm8);
 }
 
+void Assembler::pinsrq(XMMRegister dst, Address src, int imm8) {
+  assert(VM_Version::supports_sse4_1(), "");
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ _legacy_mode_dq, /* no_mask_reg */ true, /* uses_vl */ false);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
+  simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  emit_int8(0x22);
+  emit_operand(dst, src);
+  emit_int8(imm8);
+}
+
 void Assembler::pinsrw(XMMRegister dst, Register src, int imm8) {
   assert(VM_Version::supports_sse2(), "");
   InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
@@ -3401,6 +3460,26 @@
   emit_int8(imm8);
 }
 
+void Assembler::pinsrw(XMMRegister dst, Address src, int imm8) {
+  assert(VM_Version::supports_sse2(), "");
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_16bit);
+  simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  emit_int8((unsigned char)0xC4);
+  emit_operand(dst, src);
+  emit_int8(imm8);
+}
+
+void Assembler::pinsrb(XMMRegister dst, Address src, int imm8) {
+  assert(VM_Version::supports_sse4_1(), "");
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ false);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_8bit);
+  simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
+  emit_int8(0x20);
+  emit_operand(dst, src);
+  emit_int8(imm8);
+}
+
 void Assembler::pmovzxbw(XMMRegister dst, Address src) {
   assert(VM_Version::supports_sse4_1(), "");
   InstructionMark im(this);
@@ -4188,6 +4267,12 @@
   emit_arith(0x33, 0xC0, dst, src);
 }
 
+void Assembler::xorb(Register dst, Address src) {
+  InstructionMark im(this);
+  prefix(src, dst);
+  emit_int8(0x32);
+  emit_operand(dst, src);
+}
 
 // AVX 3-operands scalar float-point arithmetic instructions
 
--- a/hotspot/src/cpu/x86/vm/assembler_x86.hpp	Mon Dec 28 10:10:37 2015 -1000
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.hpp	Mon Dec 28 23:11:01 2015 -0800
@@ -1543,14 +1543,22 @@
   // SSE 4.1 extract
   void pextrd(Register dst, XMMRegister src, int imm8);
   void pextrq(Register dst, XMMRegister src, int imm8);
+  void pextrd(Address dst, XMMRegister src, int imm8);
+  void pextrq(Address dst, XMMRegister src, int imm8);
+  void pextrb(Address dst, XMMRegister src, int imm8);
   // SSE 2 extract
   void pextrw(Register dst, XMMRegister src, int imm8);
+  void pextrw(Address dst, XMMRegister src, int imm8);
 
   // SSE 4.1 insert
   void pinsrd(XMMRegister dst, Register src, int imm8);
   void pinsrq(XMMRegister dst, Register src, int imm8);
+  void pinsrd(XMMRegister dst, Address src, int imm8);
+  void pinsrq(XMMRegister dst, Address src, int imm8);
+  void pinsrb(XMMRegister dst, Address src, int imm8);
   // SSE 2 insert
   void pinsrw(XMMRegister dst, Register src, int imm8);
+  void pinsrw(XMMRegister dst, Address src, int imm8);
 
   // SSE4.1 packed move
   void pmovzxbw(XMMRegister dst, XMMRegister src);
@@ -1762,6 +1770,8 @@
   void xorl(Register dst, Address src);
   void xorl(Register dst, Register src);
 
+  void xorb(Register dst, Address src);
+
   void xorq(Register dst, Address src);
   void xorq(Register dst, Register src);
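 
 The Address-operand forms added above let the CTR stub move individual lanes directly between memory and an XMM register when handling partial blocks. As a reference for the lane semantics, an illustrative Java model with int[4] standing in for the 128-bit register (helper names are hypothetical):
 
     // pinsrd writes the 32-bit lane selected by imm8; pextrd reads one.
     static void pinsrd(int[] xmm, int value, int imm8) { xmm[imm8 & 3] = value; }
     static int  pextrd(int[] xmm, int imm8)            { return xmm[imm8 & 3]; }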
 
--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Mon Dec 28 10:10:37 2015 -1000
+++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Mon Dec 28 23:11:01 2015 -0800
@@ -2142,6 +2142,17 @@
     return start;
   }
 
+  address generate_counter_shuffle_mask() {
+    __ align(16);
+    StubCodeMark mark(this, "StubRoutines", "counter_shuffle_mask");
+    address start = __ pc();
+    __ emit_data(0x0c0d0e0f, relocInfo::none, 0);
+    __ emit_data(0x08090a0b, relocInfo::none, 0);
+    __ emit_data(0x04050607, relocInfo::none, 0);
+    __ emit_data(0x00010203, relocInfo::none, 0);
+    return start;
+  }
+
   // Utility routine for loading a 128-bit key word in little endian format
   // can optionally specify that the shuffle mask is already in an xmmregister
   void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
@@ -2167,6 +2178,31 @@
     __ aesdec(xmmdst, xmmtmp);
   }
 
+  // Utility routine for incrementing the 128-bit counter (the IV in CTR mode)
+  //  XMM_128bit lane layout:  D3, D2, D1, D0
+  void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block) {
+    __ pextrd(reg, xmmdst, 0x0);
+    __ addl(reg, inc_delta);
+    __ pinsrd(xmmdst, reg, 0x0);
+    __ jcc(Assembler::carryClear, next_block); // jump if no carry
+
+    __ pextrd(reg, xmmdst, 0x01); // Carry-> D1
+    __ addl(reg, 0x01);
+    __ pinsrd(xmmdst, reg, 0x01);
+    __ jcc(Assembler::carryClear, next_block); // jump if no carry
+
+    __ pextrd(reg, xmmdst, 0x02); // Carry-> D2
+    __ addl(reg, 0x01);
+    __ pinsrd(xmmdst, reg, 0x02);
+    __ jcc(Assembler::carryClear, next_block); // jump if no carry
+
+    __ pextrd(reg, xmmdst, 0x03); // Carry -> D3
+    __ addl(reg, 0x01);
+    __ pinsrd(xmmdst, reg, 0x03);
+
+    __ BIND(next_block);          // next instruction
+  }
+
 
   // Arguments:
   //
@@ -2742,6 +2778,317 @@
     return start;
   }
 
+
+  // CTR AES crypt.
+  // The 32-bit stub parallelizes 4 blocks at a time
+  // Arguments:
+  //
+  // Inputs:
+  //   c_rarg0   - source byte array address
+  //   c_rarg1   - destination byte array address
+  //   c_rarg2   - K (key) in little endian int array
+  //   c_rarg3   - counter vector byte array address
+  //   c_rarg4   - input length
+  //
+  // Output:
+  //   rax       - input length
+  //
+  address generate_counterMode_AESCrypt_Parallel() {
+    assert(UseAES, "need AES instructions and misaligned SSE support");
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
+    address start = __ pc();
+    const Register from        = rsi;      // source array address
+    const Register to          = rdx;      // destination array address
+    const Register key         = rcx;      // key array address
+    const Register counter     = rdi;      // counter byte array, initialized from the initvector (IV) array address
+                                           // and left with the results of the last encryption block
+
+    const Register len_reg     = rbx;
+    const Register pos         = rax;
+
+    __ enter(); // required for proper stackwalking of RuntimeStub frame
+    handleSOERegisters(true /*saving*/); // save rbx, rsi, rdi
+
+    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
+    // context for the registers used, where all instructions below use 128-bit mode.
+    // On EVEX without VL and BW, these instructions will all be AVX.
+    if (VM_Version::supports_avx512vlbw()) {
+      __ movl(rdx, 0xffff);
+      __ kmovdl(k1, rdx);
+    }
+
+    // load registers from incoming parameters
+    const Address  from_param(rbp, 8+0);
+    const Address  to_param  (rbp, 8+4);
+    const Address  key_param (rbp, 8+8);
+    const Address  rvec_param (rbp, 8+12);
+    const Address  len_param  (rbp, 8+16);
+    const Address  saved_counter_param(rbp, 8 + 20);
+    const Address  used_addr_param(rbp, 8 + 24);
+
+    __ movptr(from , from_param);
+    __ movptr(to   , to_param);
+    //__ movptr(key, key_param);
+    //__ movptr(counter, rvec_param);
+    __ movptr(len_reg , len_param);
+    //__ movptr(pos, 0);
+
+    // Use the partially used encrypted counter from the last invocation
+    Label L_exit_preLoop, L_preLoop_start;
+
+    // Use the registers 'counter' and 'key' in this preloop
+    // to hold the last 2 params, 'used' and 'saved_encCounter_start'
+    Register used = counter;
+    Register saved_encCounter_start = key;
+    Register used_addr = saved_encCounter_start;
+
+    __ movptr(used_addr, used_addr_param);
+    __ movptr(used, Address(used_addr, 0));
+    __ movptr(saved_encCounter_start, saved_counter_param);
+
+    __ BIND(L_preLoop_start);
+    __ cmpptr(used, 16);
+    __ jcc(Assembler::aboveEqual, L_exit_preLoop);
+    __ cmpptr(len_reg, 0);
+    __ jcc(Assembler::lessEqual, L_exit_preLoop);
+    __ movb(rax, Address(saved_encCounter_start, used));
+    __ xorb(rax, Address(from, 0));
+    __ movb(Address(to, 0), rax);
+    __ addptr(from, 1);
+    __ addptr(to, 1);
+    __ addptr(used, 1);
+    __ subptr(len_reg, 1);
+
+    __ jmp(L_preLoop_start);
+
+    __ BIND(L_exit_preLoop);
+    __ movptr(used_addr, used_addr_param);
+    __ movl(Address(used_addr, 0), used);
+
+    // load the parameters 'key' and 'counter'
+    __ movptr(key, key_param);
+    __ movptr(counter, rvec_param);
+
+    // xmm register assignments for the loops below
+    const XMMRegister xmm_curr_counter      = xmm0;
+    const XMMRegister xmm_counter_shuf_mask = xmm1;  // needs to be reloaded
+    const XMMRegister xmm_key_shuf_mask     = xmm2;  // needs to be reloaded
+    const XMMRegister xmm_key               = xmm3;
+    const XMMRegister xmm_result0           = xmm4;
+    const XMMRegister xmm_result1           = xmm5;
+    const XMMRegister xmm_result2           = xmm6;
+    const XMMRegister xmm_result3           = xmm7;
+    const XMMRegister xmm_from0             = xmm1;   // reuse XMM registers
+    const XMMRegister xmm_from1             = xmm2;
+    const XMMRegister xmm_from2             = xmm3;
+    const XMMRegister xmm_from3             = xmm4;
+
+    //for key_128, key_192, key_256
+    const int rounds[3] = {10, 12, 14};
+    Label L_singleBlockLoopTop[3];
+    Label L_multiBlock_loopTop[3];
+    Label L_key192_top, L_key256_top;
+    Label L_incCounter[3][4]; // 3: different key length,  4: 4 blocks at a time
+    Label L_incCounter_single[3]; //for single block, key128, key192, key256
+    Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3];
+    Label L_processTail_extr[3], L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3];
+
+    Label L_exit;
+    const int PARALLEL_FACTOR = 4;  // limited by the number of available registers
+
+    // initialize counter with initial counter
+    __ movdqu(xmm_curr_counter, Address(counter, 0x00));
+    __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()));
+    __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); // counter is byte-shuffled so it can be incremented
+
+    // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
+    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+    __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+    __ cmpl(rax, 52);
+    __ jcc(Assembler::equal, L_key192_top);
+    __ cmpl(rax, 60);
+    __ jcc(Assembler::equal, L_key256_top);
+
+    //key128 begins here
+    __ movptr(pos, 0); // init pos before L_multiBlock_loopTop
+
+#define CTR_DoFour(opc, src_reg)               \
+    __ opc(xmm_result0, src_reg);              \
+    __ opc(xmm_result1, src_reg);              \
+    __ opc(xmm_result2, src_reg);              \
+    __ opc(xmm_result3, src_reg);
+
+    // k == 0 :  generate code for key_128
+    // k == 1 :  generate code for key_192
+    // k == 2 :  generate code for key_256
+    for (int k = 0; k < 3; ++k) {
+      // multi-block loop starts here
+      __ align(OptoLoopAlignment);
+      __ BIND(L_multiBlock_loopTop[k]);
+      __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left
+      __ jcc(Assembler::less, L_singleBlockLoopTop[k]);
+
+      __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+      __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()));
+
+      //load, then increase counters
+      CTR_DoFour(movdqa, xmm_curr_counter);
+      __ push(rbx);
+      inc_counter(rbx, xmm_result1, 0x01, L_incCounter[k][0]);
+      inc_counter(rbx, xmm_result2, 0x02, L_incCounter[k][1]);
+      inc_counter(rbx, xmm_result3, 0x03, L_incCounter[k][2]);
+      inc_counter(rbx, xmm_curr_counter, 0x04, L_incCounter[k][3]);
+      __ pop (rbx);
+
+      load_key(xmm_key, key, 0x00, xmm_key_shuf_mask); // load Round 0 key. interleaving for better performance
+
+      CTR_DoFour(pshufb, xmm_counter_shuf_mask); // after incrementing, shuffle counters back for PXOR
+      CTR_DoFour(pxor, xmm_key);   //PXOR with Round 0 key
+
+      for (int i = 1; i < rounds[k]; ++i) {
+        load_key(xmm_key, key, (0x10 * i), xmm_key_shuf_mask);
+        CTR_DoFour(aesenc, xmm_key);
+      }
+      load_key(xmm_key, key, (0x10 * rounds[k]), xmm_key_shuf_mask);
+      CTR_DoFour(aesenclast, xmm_key);
+
+      // get next PARALLEL_FACTOR blocks into xmm_from registers
+      __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
+      __ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
+      __ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
+
+      // PXOR with input text
+      __ pxor(xmm_result0, xmm_from0); //result0 is xmm4
+      __ pxor(xmm_result1, xmm_from1);
+      __ pxor(xmm_result2, xmm_from2);
+
+      // store PARALLEL_FACTOR results into the next 64 bytes of output
+      __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
+      __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
+      __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
+
+      // do it here after xmm_result0 is saved, because xmm_from3 reuses the same register as xmm_result0.
+      __ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
+      __ pxor(xmm_result3, xmm_from3);
+      __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
+
+      __ addptr(pos, PARALLEL_FACTOR * AESBlockSize); // advance the position past the processed blocks
+      __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // decrease the remaining length
+      __ jmp(L_multiBlock_loopTop[k]);
+
+      // singleBlock starts here
+      __ align(OptoLoopAlignment);
+      __ BIND(L_singleBlockLoopTop[k]);
+      __ cmpptr(len_reg, 0);
+      __ jcc(Assembler::equal, L_exit);
+      __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+      __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()));
+      __ movdqa(xmm_result0, xmm_curr_counter);
+      load_key(xmm_key, key, 0x00, xmm_key_shuf_mask);
+      __ push(rbx); // rbx is used for incrementing the counter
+      inc_counter(rbx, xmm_curr_counter, 0x01, L_incCounter_single[k]);
+      __ pop (rbx);
+      __ pshufb(xmm_result0, xmm_counter_shuf_mask);
+      __ pxor(xmm_result0, xmm_key);
+      for (int i = 1; i < rounds[k]; i++) {
+        load_key(xmm_key, key, (0x10 * i), xmm_key_shuf_mask);
+        __ aesenc(xmm_result0, xmm_key);
+      }
+      load_key(xmm_key, key, (0x10 * rounds[k]), xmm_key_shuf_mask);
+      __ aesenclast(xmm_result0, xmm_key);
+      __ cmpptr(len_reg, AESBlockSize);
+      __ jcc(Assembler::less, L_processTail_insr[k]);
+        __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
+        __ pxor(xmm_result0, xmm_from0);
+        __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
+        __ addptr(pos, AESBlockSize);
+        __ subptr(len_reg, AESBlockSize);
+        __ jmp(L_singleBlockLoopTop[k]);
+
+      __ BIND(L_processTail_insr[k]);
+        __ addptr(pos, len_reg);
+        __ testptr(len_reg, 8);
+        __ jcc(Assembler::zero, L_processTail_4_insr[k]);
+          __ subptr(pos,8);
+          __ pinsrd(xmm_from0, Address(from, pos), 0);
+          __ pinsrd(xmm_from0, Address(from, pos, Address::times_1, 4), 1);
+        __ BIND(L_processTail_4_insr[k]);
+        __ testptr(len_reg, 4);
+        __ jcc(Assembler::zero, L_processTail_2_insr[k]);
+          __ subptr(pos,4);
+          __ pslldq(xmm_from0, 4);
+          __ pinsrd(xmm_from0, Address(from, pos), 0);
+        __ BIND(L_processTail_2_insr[k]);
+        __ testptr(len_reg, 2);
+        __ jcc(Assembler::zero, L_processTail_1_insr[k]);
+          __ subptr(pos, 2);
+          __ pslldq(xmm_from0, 2);
+          __ pinsrw(xmm_from0, Address(from, pos), 0);
+        __ BIND(L_processTail_1_insr[k]);
+        __ testptr(len_reg, 1);
+        __ jcc(Assembler::zero, L_processTail_exit_insr[k]);
+          __ subptr(pos, 1);
+          __ pslldq(xmm_from0, 1);
+          __ pinsrb(xmm_from0, Address(from, pos), 0);
+        __ BIND(L_processTail_exit_insr[k]);
+
+        __ movptr(saved_encCounter_start, saved_counter_param);
+        __ movdqu(Address(saved_encCounter_start, 0), xmm_result0);
+        __ pxor(xmm_result0, xmm_from0);
+
+        __ testptr(len_reg, 8);
+        __ jcc(Assembler::zero, L_processTail_4_extr[k]);
+          __ pextrd(Address(to, pos), xmm_result0, 0);
+          __ pextrd(Address(to, pos, Address::times_1, 4), xmm_result0, 1);
+          __ psrldq(xmm_result0, 8);
+          __ addptr(pos, 8);
+        __ BIND(L_processTail_4_extr[k]);
+        __ testptr(len_reg, 4);
+        __ jcc(Assembler::zero, L_processTail_2_extr[k]);
+          __ pextrd(Address(to, pos), xmm_result0, 0);
+          __ psrldq(xmm_result0, 4);
+          __ addptr(pos, 4);
+        __ BIND(L_processTail_2_extr[k]);
+        __ testptr(len_reg, 2);
+        __ jcc(Assembler::zero, L_processTail_1_extr[k]);
+          __ pextrb(Address(to, pos), xmm_result0, 0);
+          __ pextrb(Address(to, pos, Address::times_1, 1), xmm_result0, 1);
+          __ psrldq(xmm_result0, 2);
+          __ addptr(pos, 2);
+        __ BIND(L_processTail_1_extr[k]);
+        __ testptr(len_reg, 1);
+        __ jcc(Assembler::zero, L_processTail_exit_extr[k]);
+          __ pextrb(Address(to, pos), xmm_result0, 0);
+
+        __ BIND(L_processTail_exit_extr[k]);
+        __ movptr(used_addr, used_addr_param);
+        __ movl(Address(used_addr, 0), len_reg);
+        __ jmp(L_exit);
+    }
+
+    __ BIND(L_exit);
+    __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()));
+    __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled back.
+    __ movdqu(Address(counter, 0), xmm_curr_counter); //save counter back
+    handleSOERegisters(false /*restoring*/);
+    __ movptr(rax, len_param); // return length
+    __ leave();                // required for proper stackwalking of RuntimeStub frame
+    __ ret(0);
+
+    __ BIND (L_key192_top);
+    __ movptr(pos, 0); // init pos before L_multiBlock_loopTop
+    __ jmp(L_multiBlock_loopTop[1]); //key192
+
+    __ BIND (L_key256_top);
+    __ movptr(pos, 0); // init pos before L_multiBlock_loopTop
+    __ jmp(L_multiBlock_loopTop[2]); //key256
+
+    return start;
+  }
+
+
   // byte swap x86 long
   address generate_ghash_long_swap_mask() {
     __ align(CodeEntryAlignment);
@@ -3360,6 +3707,11 @@
       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
     }
 
+    if (UseAESCTRIntrinsics) {
+      StubRoutines::x86::_counter_shuffle_mask_addr = generate_counter_shuffle_mask();
+      StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel();
+    }
+
     // Generate GHASH intrinsics code
     if (UseGHASHIntrinsics) {
       StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
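
The inc_counter helpers in this stub (and in the 64-bit one below) ripple a carry across the 128-bit counter one lane at a time, taking an early exit as soon as a lane does not wrap; because the counter was byte-reversed with the counter shuffle mask beforehand, plain little-endian adds implement the big-endian increment. A sketch of the 32-bit variant in Java (illustrative only; d[0]..d[3] model the four dword lanes, d[0] least significant after the shuffle):

    static void incCounter(int[] d, int delta) {
        long sum = (d[0] & 0xFFFFFFFFL) + (delta & 0xFFFFFFFFL);
        d[0] = (int) sum;
        // propagate the carry upward, mirroring the jcc(carryClear, next_block) early exits
        for (int lane = 1; lane < 4 && (sum >>> 32) != 0; lane++) {
            sum = (d[lane] & 0xFFFFFFFFL) + 1L;
            d[lane] = (int) sum;
        }
    }
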
--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Mon Dec 28 10:10:37 2015 -1000
+++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Mon Dec 28 23:11:01 2015 -0800
@@ -3039,6 +3039,15 @@
     return start;
   }
 
+  address generate_counter_shuffle_mask() {
+    __ align(16);
+    StubCodeMark mark(this, "StubRoutines", "counter_shuffle_mask");
+    address start = __ pc();
+    __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
+    __ emit_data64(0x0001020304050607, relocInfo::none);
+    return start;
+  }
+
   // Utility routine for loading a 128-bit key word in little endian format
   // can optionally specify that the shuffle mask is already in an xmmregister
   void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
@@ -3050,6 +3059,18 @@
     }
   }
 
+  // Utility routine for incrementing the 128-bit counter (the IV in CTR mode)
+  void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block) {
+    __ pextrq(reg, xmmdst, 0x0);
+    __ addq(reg, inc_delta);
+    __ pinsrq(xmmdst, reg, 0x0);
+    __ jcc(Assembler::carryClear, next_block); // jump if no carry
+    __ pextrq(reg, xmmdst, 0x01); // Carry
+    __ addq(reg, 0x01);
+    __ pinsrq(xmmdst, reg, 0x01); //Carry end
+    __ BIND(next_block);          // next instruction
+  }
+
   // Arguments:
   //
   // Inputs:
@@ -3700,6 +3721,328 @@
     return start;
   }
 
+  // This is a version of CTR/AES crypt which does 6 blocks per loop iteration
+  // to hide instruction latency
+  //
+  // Arguments:
+  //
+  // Inputs:
+  //   c_rarg0   - source byte array address
+  //   c_rarg1   - destination byte array address
+  //   c_rarg2   - K (key) in little endian int array
+  //   c_rarg3   - counter vector byte array address
+  //   Linux
+  //     c_rarg4   -          input length
+  //     c_rarg5   -          saved encryptedCounter start
+  //     rbp + 6 * wordSize - saved used length
+  //   Windows
+  //     rbp + 6 * wordSize - input length
+  //     rbp + 7 * wordSize - saved encryptedCounter start
+  //     rbp + 8 * wordSize - saved used length
+  //
+  // Output:
+  //   rax       - input length
+  //
+  address generate_counterMode_AESCrypt_Parallel() {
+    assert(UseAES, "need AES instructions and misaligned SSE support");
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
+    address start = __ pc();
+    const Register from = c_rarg0; // source array address
+    const Register to = c_rarg1; // destination array address
+    const Register key = c_rarg2; // key array address
+    const Register counter = c_rarg3; // counter byte array initialized from counter array address
+    // and left with the results of the last encryption block
+#ifndef _WIN64
+    const Register len_reg = c_rarg4;
+    const Register saved_encCounter_start = c_rarg5;
+    const Register used_addr = r10;
+    const Address  used_mem(rbp, 2 * wordSize);
+    const Register used = r11;
+#else
+    const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
+    const Address saved_encCounter_mem(rbp, 7 * wordSize); // saved encryptedCounter is on stack on Win64
+    const Address used_mem(rbp, 8 * wordSize); // saved used length is on stack on Win64
+    const Register len_reg = r10; // pick the first volatile windows register
+    const Register saved_encCounter_start = r11;
+    const Register used_addr = r13;
+    const Register used = r14;
+#endif
+    const Register pos = rax;
+
+    const int PARALLEL_FACTOR = 6;
+    const XMMRegister xmm_counter_shuf_mask = xmm0;
+    const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front
+    const XMMRegister xmm_curr_counter = xmm2;
+
+    const XMMRegister xmm_key_tmp0 = xmm3;
+    const XMMRegister xmm_key_tmp1 = xmm4;
+
+    // registers holding the six results in the parallelized loop
+    const XMMRegister xmm_result0 = xmm5;
+    const XMMRegister xmm_result1 = xmm6;
+    const XMMRegister xmm_result2 = xmm7;
+    const XMMRegister xmm_result3 = xmm8;
+    const XMMRegister xmm_result4 = xmm9;
+    const XMMRegister xmm_result5 = xmm10;
+
+    const XMMRegister xmm_from0 = xmm11;
+    const XMMRegister xmm_from1 = xmm12;
+    const XMMRegister xmm_from2 = xmm13;
+    const XMMRegister xmm_from3 = xmm14; // the last one is xmm14; we have to preserve it on WIN64
+    const XMMRegister xmm_from4 = xmm3;  // reuse xmm3-4; xmm_key_tmp0-1 are no longer needed once the input text is loaded
+    const XMMRegister xmm_from5 = xmm4;
+
+    //for key_128, key_192, key_256
+    const int rounds[3] = {10, 12, 14};
+    Label L_exit_preLoop, L_preLoop_start;
+    Label L_multiBlock_loopTop[3];
+    Label L_singleBlockLoopTop[3];
+    Label L__incCounter[3][6]; //for 6 blocks
+    Label L__incCounter_single[3]; //for single block, key128, key192, key256
+    Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3];
+    Label L_processTail_extr[3], L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3];
+
+    Label L_exit;
+
+    __ enter(); // required for proper stackwalking of RuntimeStub frame
+
+    // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
+    // context for the registers used, where all instructions below use 128-bit mode.
+    // On EVEX without VL and BW, these instructions will all be AVX.
+    if (VM_Version::supports_avx512vlbw()) {
+        __ movl(rax, 0xffff);
+        __ kmovql(k1, rax);
+    }
+
+#ifdef _WIN64
+    // save the xmm registers that must be preserved (xmm6-xmm14)
+    const int XMM_REG_NUM_KEY_LAST = 14;
+    __ subptr(rsp, -rsp_after_call_off * wordSize);
+    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
+      __ movdqu(xmm_save(i), as_XMMRegister(i));
+    }
+
+    const Address r13_save(rbp, rdi_off * wordSize);
+    const Address r14_save(rbp, rsi_off * wordSize);
+
+    __ movptr(r13_save, r13);
+    __ movptr(r14_save, r14);
+
+    // on win64, fill len_reg from stack position
+    __ movl(len_reg, len_mem);
+    __ movptr(saved_encCounter_start, saved_encCounter_mem);
+    __ movptr(used_addr, used_mem);
+    __ movl(used, Address(used_addr, 0));
+#else
+    __ push(len_reg); // Save
+    __ movptr(used_addr, used_mem);
+    __ movl(used, Address(used_addr, 0));
+#endif
+
+    __ push(rbx); // Save RBX
+    __ movdqu(xmm_curr_counter, Address(counter, 0x00)); // initialize counter with initial counter
+    __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()));
+    __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled
+    __ movptr(pos, 0);
+
+    // Use the partially used encrypted counter from the last invocation
+    __ BIND(L_preLoop_start);
+    __ cmpptr(used, 16);
+    __ jcc(Assembler::aboveEqual, L_exit_preLoop);
+      __ cmpptr(len_reg, 0);
+      __ jcc(Assembler::lessEqual, L_exit_preLoop);
+      __ movb(rbx, Address(saved_encCounter_start, used));
+      __ xorb(rbx, Address(from, pos));
+      __ movb(Address(to, pos), rbx);
+      __ addptr(pos, 1);
+      __ addptr(used, 1);
+      __ subptr(len_reg, 1);
+
+    __ jmp(L_preLoop_start);
+
+    __ BIND(L_exit_preLoop);
+    __ movl(Address(used_addr, 0), used);
+
+    // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
+    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+    __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+    __ cmpl(rbx, 52);
+    __ jcc(Assembler::equal, L_multiBlock_loopTop[1]);
+    __ cmpl(rbx, 60);
+    __ jcc(Assembler::equal, L_multiBlock_loopTop[2]);
+
+#define CTR_DoSix(opc, src_reg)                \
+    __ opc(xmm_result0, src_reg);              \
+    __ opc(xmm_result1, src_reg);              \
+    __ opc(xmm_result2, src_reg);              \
+    __ opc(xmm_result3, src_reg);              \
+    __ opc(xmm_result4, src_reg);              \
+    __ opc(xmm_result5, src_reg);
+
+    // k == 0 :  generate code for key_128
+    // k == 1 :  generate code for key_192
+    // k == 2 :  generate code for key_256
+    for (int k = 0; k < 3; ++k) {
+      // multi-block loop starts here
+      __ align(OptoLoopAlignment);
+      __ BIND(L_multiBlock_loopTop[k]);
+      __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left
+      __ jcc(Assembler::less, L_singleBlockLoopTop[k]);
+      load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
+
+      // load, then increment counters
+      CTR_DoSix(movdqa, xmm_curr_counter);
+      inc_counter(rbx, xmm_result1, 0x01, L__incCounter[k][0]);
+      inc_counter(rbx, xmm_result2, 0x02, L__incCounter[k][1]);
+      inc_counter(rbx, xmm_result3, 0x03, L__incCounter[k][2]);
+      inc_counter(rbx, xmm_result4, 0x04, L__incCounter[k][3]);
+      inc_counter(rbx, xmm_result5,  0x05, L__incCounter[k][4]);
+      inc_counter(rbx, xmm_curr_counter, 0x06, L__incCounter[k][5]);
+      CTR_DoSix(pshufb, xmm_counter_shuf_mask); // after incrementing, shuffle counters back for PXOR
+      CTR_DoSix(pxor, xmm_key_tmp0);   //PXOR with Round 0 key
+
+      //load two ROUND_KEYs at a time
+      for (int i = 1; i < rounds[k]; ) {
+        load_key(xmm_key_tmp1, key, (0x10 * i), xmm_key_shuf_mask);
+        load_key(xmm_key_tmp0, key, (0x10 * (i+1)), xmm_key_shuf_mask);
+        CTR_DoSix(aesenc, xmm_key_tmp1);
+        i++;
+        if (i != rounds[k]) {
+          CTR_DoSix(aesenc, xmm_key_tmp0);
+        } else {
+          CTR_DoSix(aesenclast, xmm_key_tmp0);
+        }
+        i++;
+      }
+
+      // get next PARALLEL_FACTOR blocks into the xmm_from registers
+      __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
+      __ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
+      __ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
+      __ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
+      __ movdqu(xmm_from4, Address(from, pos, Address::times_1, 4 * AESBlockSize));
+      __ movdqu(xmm_from5, Address(from, pos, Address::times_1, 5 * AESBlockSize));
+
+      __ pxor(xmm_result0, xmm_from0);
+      __ pxor(xmm_result1, xmm_from1);
+      __ pxor(xmm_result2, xmm_from2);
+      __ pxor(xmm_result3, xmm_from3);
+      __ pxor(xmm_result4, xmm_from4);
+      __ pxor(xmm_result5, xmm_from5);
+
+      // store 6 results into the next 96 bytes of output
+      __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
+      __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
+      __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
+      __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
+      __ movdqu(Address(to, pos, Address::times_1, 4 * AESBlockSize), xmm_result4);
+      __ movdqu(Address(to, pos, Address::times_1, 5 * AESBlockSize), xmm_result5);
+
+      __ addptr(pos, PARALLEL_FACTOR * AESBlockSize); // advance the position past the processed blocks
+      __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // decrease the remaining length
+      __ jmp(L_multiBlock_loopTop[k]);
+
+      // singleBlock starts here
+      __ align(OptoLoopAlignment);
+      __ BIND(L_singleBlockLoopTop[k]);
+      __ cmpptr(len_reg, 0);
+      __ jcc(Assembler::lessEqual, L_exit);
+      load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
+      __ movdqa(xmm_result0, xmm_curr_counter);
+      inc_counter(rbx, xmm_curr_counter, 0x01, L__incCounter_single[k]);
+      __ pshufb(xmm_result0, xmm_counter_shuf_mask);
+      __ pxor(xmm_result0, xmm_key_tmp0);
+      for (int i = 1; i < rounds[k]; i++) {
+        load_key(xmm_key_tmp0, key, (0x10 * i), xmm_key_shuf_mask);
+        __ aesenc(xmm_result0, xmm_key_tmp0);
+      }
+      load_key(xmm_key_tmp0, key, (rounds[k] * 0x10), xmm_key_shuf_mask);
+      __ aesenclast(xmm_result0, xmm_key_tmp0);
+      __ cmpptr(len_reg, AESBlockSize);
+      __ jcc(Assembler::less, L_processTail_insr[k]);
+        __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
+        __ pxor(xmm_result0, xmm_from0);
+        __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
+        __ addptr(pos, AESBlockSize);
+        __ subptr(len_reg, AESBlockSize);
+        __ jmp(L_singleBlockLoopTop[k]);
+      __ BIND(L_processTail_insr[k]);
+        __ addptr(pos, len_reg);
+        __ testptr(len_reg, 8);
+        __ jcc(Assembler::zero, L_processTail_4_insr[k]);
+          __ subptr(pos,8);
+          __ pinsrq(xmm_from0, Address(from, pos), 0);
+        __ BIND(L_processTail_4_insr[k]);
+        __ testptr(len_reg, 4);
+        __ jcc(Assembler::zero, L_processTail_2_insr[k]);
+          __ subptr(pos,4);
+          __ pslldq(xmm_from0, 4);
+          __ pinsrd(xmm_from0, Address(from, pos), 0);
+        __ BIND(L_processTail_2_insr[k]);
+        __ testptr(len_reg, 2);
+        __ jcc(Assembler::zero, L_processTail_1_insr[k]);
+          __ subptr(pos, 2);
+          __ pslldq(xmm_from0, 2);
+          __ pinsrw(xmm_from0, Address(from, pos), 0);
+        __ BIND(L_processTail_1_insr[k]);
+        __ testptr(len_reg, 1);
+        __ jcc(Assembler::zero, L_processTail_exit_insr[k]);
+          __ subptr(pos, 1);
+          __ pslldq(xmm_from0, 1);
+          __ pinsrb(xmm_from0, Address(from, pos), 0);
+        __ BIND(L_processTail_exit_insr[k]);
+
+        __ movdqu(Address(saved_encCounter_start, 0), xmm_result0);
+        __ pxor(xmm_result0, xmm_from0);
+
+        __ testptr(len_reg, 8);
+        __ jcc(Assembler::zero, L_processTail_4_extr[k]);
+          __ pextrq(Address(to, pos), xmm_result0, 0);
+          __ psrldq(xmm_result0, 8);
+          __ addptr(pos, 8);
+        __ BIND(L_processTail_4_extr[k]);
+        __ testptr(len_reg, 4);
+        __ jcc(Assembler::zero, L_processTail_2_extr[k]);
+          __ pextrd(Address(to, pos), xmm_result0, 0);
+          __ psrldq(xmm_result0, 4);
+          __ addptr(pos, 4);
+        __ BIND(L_processTail_2_extr[k]);
+        __ testptr(len_reg, 2);
+        __ jcc(Assembler::zero, L_processTail_1_extr[k]);
+          __ pextrw(Address(to, pos), xmm_result0, 0);
+          __ psrldq(xmm_result0, 2);
+          __ addptr(pos, 2);
+        __ BIND(L_processTail_1_extr[k]);
+        __ testptr(len_reg, 1);
+        __ jcc(Assembler::zero, L_processTail_exit_extr[k]);
+          __ pextrb(Address(to, pos), xmm_result0, 0);
+
+        __ BIND(L_processTail_exit_extr[k]);
+        __ movl(Address(used_addr, 0), len_reg);
+        __ jmp(L_exit);
+
+    }
+
+    __ BIND(L_exit);
+    __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled back.
+    __ movdqu(Address(counter, 0), xmm_curr_counter); //save counter back
+    __ pop(rbx); // pop the saved RBX.
+#ifdef _WIN64
+    // restore regs belonging to calling function
+    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
+      __ movdqu(as_XMMRegister(i), xmm_save(i));
+    }
+    __ movl(rax, len_mem);
+    __ movptr(r13, r13_save);
+    __ movptr(r14, r14_save);
+#else
+    __ pop(rax); // return 'len'
+#endif
+    __ leave(); // required for proper stackwalking of RuntimeStub frame
+    __ ret(0);
+    return start;
+  }
 
   // byte swap x86 long
   address generate_ghash_long_swap_mask() {
@@ -4555,12 +4898,15 @@
     // don't bother generating these AES intrinsic stubs unless global flag is set
     if (UseAESIntrinsics) {
       StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask();  // needed by the others
-
       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
     }
+    if (UseAESCTRIntrinsics) {
+      StubRoutines::x86::_counter_shuffle_mask_addr = generate_counter_shuffle_mask();
+      StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel();
+    }
 
     // Generate GHASH intrinsics code
     if (UseGHASHIntrinsics) {
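
The L_processTail code in both stubs handles the final len % 16 bytes without touching memory past the arrays: it gathers the tail into an XMM register by shifting left (pslldq) and inserting 8-, 4-, 2- and 1-byte chunks (pinsrq/pinsrd/pinsrw/pinsrb) while scanning backward from the end, XORs with the keystream, and scatters the result back with the mirrored pextr*/psrldq sequence; the whole keystream block is stashed in saved_encCounter and 'used' is updated so a later call can resume mid-block. A hypothetical Java model of the gather step (byte[16] standing in for the XMM register):

    // Gather src[end - tailLen .. end) into the low bytes of a 16-byte block (tailLen < 16).
    static byte[] gatherTail(byte[] src, int end, int tailLen) {
        byte[] block = new byte[16];
        int pos = end;                                     // one past the last tail byte
        for (int chunk = 8; chunk >= 1; chunk >>= 1) {
            if ((tailLen & chunk) != 0) {
                pos -= chunk;
                // pslldq: shift the partially assembled block up by 'chunk' bytes
                // (a no-op for the first inserted chunk, since the block is still all zero)
                System.arraycopy(block, 0, block, chunk, 16 - chunk);
                // pinsrq/d/w/b: place the next 'chunk' input bytes at the low end
                System.arraycopy(src, pos, block, 0, chunk);
            }
        }
        return block;
    }
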
--- a/hotspot/src/cpu/x86/vm/stubRoutines_x86.cpp	Mon Dec 28 10:10:37 2015 -1000
+++ b/hotspot/src/cpu/x86/vm/stubRoutines_x86.cpp	Mon Dec 28 23:11:01 2015 -0800
@@ -34,6 +34,7 @@
 
 address StubRoutines::x86::_verify_mxcsr_entry = NULL;
 address StubRoutines::x86::_key_shuffle_mask_addr = NULL;
+address StubRoutines::x86::_counter_shuffle_mask_addr = NULL;
 address StubRoutines::x86::_ghash_long_swap_mask_addr = NULL;
 address StubRoutines::x86::_ghash_byte_swap_mask_addr = NULL;
 
--- a/hotspot/src/cpu/x86/vm/stubRoutines_x86.hpp	Mon Dec 28 10:10:37 2015 -1000
+++ b/hotspot/src/cpu/x86/vm/stubRoutines_x86.hpp	Mon Dec 28 23:11:01 2015 -0800
@@ -33,6 +33,10 @@
   static address _verify_mxcsr_entry;
   // shuffle mask for fixing up 128-bit words consisting of big-endian 32-bit integers
   static address _key_shuffle_mask_addr;
+
+  // shuffle mask for big-endian 128-bit integers
+  static address _counter_shuffle_mask_addr;
+
   // masks and table for CRC32
   static uint64_t _crc_by128_masks[];
   static juint    _crc_table[];
@@ -45,9 +49,9 @@
  public:
   static address verify_mxcsr_entry()    { return _verify_mxcsr_entry; }
   static address key_shuffle_mask_addr() { return _key_shuffle_mask_addr; }
+  static address counter_shuffle_mask_addr() { return _counter_shuffle_mask_addr; }
   static address crc_by128_masks_addr()  { return (address)_crc_by128_masks; }
   static address ghash_long_swap_mask_addr() { return _ghash_long_swap_mask_addr; }
   static address ghash_byte_swap_mask_addr() { return _ghash_byte_swap_mask_addr; }
   static void generate_CRC32C_table(bool is_pclmulqdq_supported);
-
 #endif // CPU_X86_VM_STUBROUTINES_X86_32_HPP
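
The counter shuffle mask emitted by generate_counter_shuffle_mask() is the byte sequence 0x0f..0x00, so pshufb with it performs a full 16-byte reversal, converting the big-endian counter into little-endian lanes for the increment (and back again before storing). A short illustrative Java model of pshufb's byte-select semantics:

    // dst[i] = (mask[i] has its high bit set) ? 0 : src[mask[i] & 0x0f]
    static byte[] pshufb(byte[] src, byte[] mask) {
        byte[] dst = new byte[16];
        for (int i = 0; i < 16; i++) {
            dst[i] = (mask[i] < 0) ? 0 : src[mask[i] & 0x0f];
        }
        return dst;
    }
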
--- a/hotspot/src/cpu/x86/vm/stubRoutines_x86_32.hpp	Mon Dec 28 10:10:37 2015 -1000
+++ b/hotspot/src/cpu/x86/vm/stubRoutines_x86_32.hpp	Mon Dec 28 23:11:01 2015 -0800
@@ -31,7 +31,7 @@
 
 enum platform_dependent_constants {
   code_size1 =  9000,           // simply increase if too small (assembler will crash if too small)
-  code_size2 = 30000            // simply increase if too small (assembler will crash if too small)
+  code_size2 = 33800            // simply increase if too small (assembler will crash if too small)
 };
 
 class x86 {
--- a/hotspot/src/cpu/x86/vm/stubRoutines_x86_64.hpp	Mon Dec 28 10:10:37 2015 -1000
+++ b/hotspot/src/cpu/x86/vm/stubRoutines_x86_64.hpp	Mon Dec 28 23:11:01 2015 -0800
@@ -33,7 +33,7 @@
 
 enum platform_dependent_constants {
   code_size1 = 19000,          // simply increase if too small (assembler will crash if too small)
-  code_size2 = 32000           // simply increase if too small (assembler will crash if too small)
+  code_size2 = 35000           // simply increase if too small (assembler will crash if too small)
 };
 
 class x86 {
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.cpp	Mon Dec 28 10:10:37 2015 -1000
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.cpp	Mon Dec 28 23:11:01 2015 -0800
@@ -648,6 +648,28 @@
         }
         FLAG_SET_DEFAULT(UseAESIntrinsics, false);
       }
+
+      // --AES-CTR begins--
+      if (!UseAESIntrinsics) {
+        if (UseAESCTRIntrinsics && !FLAG_IS_DEFAULT(UseAESCTRIntrinsics)) {
+          warning("AES-CTR intrinsics require UseAESIntrinsics flag to be enabled. Intrinsics will be disabled.");
+          FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false);
+        }
+      } else {
+        if (supports_sse4_1() && UseSSE >= 4) {
+          if (FLAG_IS_DEFAULT(UseAESCTRIntrinsics)) {
+            FLAG_SET_DEFAULT(UseAESCTRIntrinsics, true);
+          }
+        } else {
+          // The AES-CTR intrinsic stubs require AES instruction support (of course)
+          // but also require SSE4.1 or higher for the instructions they use.
+          if (UseAESCTRIntrinsics && !FLAG_IS_DEFAULT(UseAESCTRIntrinsics)) {
+            warning("X86 AES-CTR intrinsics require SSE4.1 instructions or higher. Intrinsics will be disabled.");
+          }
+          FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false);
+        }
+      }
+      // --AES-CTR ends--
     }
   } else if (UseAES || UseAESIntrinsics) {
     if (UseAES && !FLAG_IS_DEFAULT(UseAES)) {
@@ -658,6 +680,10 @@
       warning("AES intrinsics are not available on this CPU");
       FLAG_SET_DEFAULT(UseAESIntrinsics, false);
     }
+    if (UseAESCTRIntrinsics && !FLAG_IS_DEFAULT(UseAESCTRIntrinsics)) {
+      warning("AES-CTR intrinsics are not available on this CPU");
+      FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false);
+    }
   }
 
   // Use CLMUL instructions if available.
@@ -681,6 +707,16 @@
     FLAG_SET_DEFAULT(UseCRC32Intrinsics, false);
   }
 
+  if (UseAESIntrinsics) {
+    if (FLAG_IS_DEFAULT(UseAESCTRIntrinsics)) {
+      UseAESCTRIntrinsics = true;
+    }
+  } else if (UseAESCTRIntrinsics) {
+    if (!FLAG_IS_DEFAULT(UseAESCTRIntrinsics)) {
+      warning("AES/CTR intrinsics are not available on this CPU");
+    }
+    FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false);
+  }
+
   if (supports_sse4_2()) {
     if (FLAG_IS_DEFAULT(UseCRC32CIntrinsics)) {
       UseCRC32CIntrinsics = true;
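
With the flag logic above, UseAESCTRIntrinsics defaults to on for x86 chips that have AES and SSE4.1 support whenever UseAESIntrinsics is enabled. One way to exercise the new stub (a hedged sketch; class and flag names are those introduced by this change, and the intrinsified CounterMode.implCrypt only runs through the SunJCE provider once C2 has compiled the method):

    import javax.crypto.Cipher;
    import javax.crypto.spec.IvParameterSpec;
    import javax.crypto.spec.SecretKeySpec;

    // Run with, e.g.: java -XX:+UseAES -XX:+UseAESCTRIntrinsics CtrDemo
    public class CtrDemo {
        public static void main(String[] args) throws Exception {
            byte[] key = new byte[16], iv = new byte[16], msg = new byte[64 * 1024];
            Cipher c = Cipher.getInstance("AES/CTR/NoPadding");
            c.init(Cipher.ENCRYPT_MODE, new SecretKeySpec(key, "AES"), new IvParameterSpec(iv));
            byte[] out = null;
            for (int i = 0; i < 20_000; i++) {   // warm up so the crypt loop gets JIT-compiled
                out = c.update(msg);
            }
            System.out.println(out.length);
        }
    }
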
--- a/hotspot/src/share/vm/classfile/vmSymbols.cpp	Mon Dec 28 10:10:37 2015 -1000
+++ b/hotspot/src/share/vm/classfile/vmSymbols.cpp	Mon Dec 28 23:11:01 2015 -0800
@@ -409,6 +409,7 @@
   switch (id) {
   case vmIntrinsics::_cipherBlockChaining_encryptAESCrypt:
   case vmIntrinsics::_cipherBlockChaining_decryptAESCrypt:
+  case vmIntrinsics::_counterMode_AESCrypt:
     return 1;
   case vmIntrinsics::_digestBase_implCompressMB:
     return 3;
@@ -597,6 +598,9 @@
   case vmIntrinsics::_cipherBlockChaining_decryptAESCrypt:
     if (!UseAESIntrinsics) return true;
     break;
+  case vmIntrinsics::_counterMode_AESCrypt:
+    if (!UseAESCTRIntrinsics) return true;
+    break;
   case vmIntrinsics::_sha_implCompress:
     if (!UseSHA1Intrinsics) return true;
     break;
--- a/hotspot/src/share/vm/classfile/vmSymbols.hpp	Mon Dec 28 10:10:37 2015 -1000
+++ b/hotspot/src/share/vm/classfile/vmSymbols.hpp	Mon Dec 28 23:11:01 2015 -0800
@@ -981,6 +981,10 @@
    do_name(     decrypt_name,                                      "implDecrypt")                                       \
    do_signature(byteArray_int_int_byteArray_int_signature,         "([BII[BI)I")                                        \
                                                                                                                         \
+  do_class(com_sun_crypto_provider_counterMode,      "com/sun/crypto/provider/CounterMode")                             \
+   do_intrinsic(_counterMode_AESCrypt, com_sun_crypto_provider_counterMode, crypt_name, byteArray_int_int_byteArray_int_signature, F_R)   \
+   do_name(     crypt_name,                                 "implCrypt")                                                    \
+                                                                                                                        \
   /* support for sun.security.provider.SHA */                                                                           \
   do_class(sun_security_provider_sha,                              "sun/security/provider/SHA")                         \
   do_intrinsic(_sha_implCompress, sun_security_provider_sha, implCompress_name, implCompress_signature, F_R)            \
--- a/hotspot/src/share/vm/opto/c2compiler.cpp	Mon Dec 28 10:10:37 2015 -1000
+++ b/hotspot/src/share/vm/opto/c2compiler.cpp	Mon Dec 28 23:11:01 2015 -0800
@@ -432,6 +432,7 @@
   case vmIntrinsics::_aescrypt_decryptBlock:
   case vmIntrinsics::_cipherBlockChaining_encryptAESCrypt:
   case vmIntrinsics::_cipherBlockChaining_decryptAESCrypt:
+  case vmIntrinsics::_counterMode_AESCrypt:
   case vmIntrinsics::_sha_implCompress:
   case vmIntrinsics::_sha2_implCompress:
   case vmIntrinsics::_sha5_implCompress:
--- a/hotspot/src/share/vm/opto/escape.cpp	Mon Dec 28 10:10:37 2015 -1000
+++ b/hotspot/src/share/vm/opto/escape.cpp	Mon Dec 28 23:11:01 2015 -0800
@@ -976,6 +976,7 @@
                   strcmp(call->as_CallLeaf()->_name, "aescrypt_decryptBlock") == 0 ||
                   strcmp(call->as_CallLeaf()->_name, "cipherBlockChaining_encryptAESCrypt") == 0 ||
                   strcmp(call->as_CallLeaf()->_name, "cipherBlockChaining_decryptAESCrypt") == 0 ||
+                  strcmp(call->as_CallLeaf()->_name, "counterMode_AESCrypt") == 0 ||
                   strcmp(call->as_CallLeaf()->_name, "ghash_processBlocks") == 0 ||
                   strcmp(call->as_CallLeaf()->_name, "sha1_implCompress") == 0 ||
                   strcmp(call->as_CallLeaf()->_name, "sha1_implCompressMB") == 0 ||
--- a/hotspot/src/share/vm/opto/library_call.cpp	Mon Dec 28 10:10:37 2015 -1000
+++ b/hotspot/src/share/vm/opto/library_call.cpp	Mon Dec 28 23:11:01 2015 -0800
@@ -201,6 +201,7 @@
     return generate_method_call(method_id, true, false);
   }
   Node * load_field_from_object(Node * fromObj, const char * fieldName, const char * fieldTypeString, bool is_exact, bool is_static, ciInstanceKlass * fromKls);
+  Node * field_address_from_object(Node * fromObj, const char * fieldName, const char * fieldTypeString, bool is_exact, bool is_static, ciInstanceKlass * fromKls);
 
   Node* make_string_method_node(int opcode, Node* str1_start, Node* cnt1, Node* str2_start, Node* cnt2, StrIntrinsicNode::ArgEnc ae);
   bool inline_string_compareTo(StrIntrinsicNode::ArgEnc ae);
@@ -283,7 +284,9 @@
   bool inline_Class_cast();
   bool inline_aescrypt_Block(vmIntrinsics::ID id);
   bool inline_cipherBlockChaining_AESCrypt(vmIntrinsics::ID id);
+  bool inline_counterMode_AESCrypt(vmIntrinsics::ID id);
   Node* inline_cipherBlockChaining_AESCrypt_predicate(bool decrypting);
+  Node* inline_counterMode_AESCrypt_predicate();
   Node* get_key_start_from_aescrypt_object(Node* aescrypt_object);
   Node* get_original_key_start_from_aescrypt_object(Node* aescrypt_object);
   bool inline_ghash_processBlocks();
@@ -697,6 +700,9 @@
   case vmIntrinsics::_cipherBlockChaining_decryptAESCrypt:
     return inline_cipherBlockChaining_AESCrypt(intrinsic_id());
 
+  case vmIntrinsics::_counterMode_AESCrypt:
+    return inline_counterMode_AESCrypt(intrinsic_id());
+
   case vmIntrinsics::_sha_implCompress:
   case vmIntrinsics::_sha2_implCompress:
   case vmIntrinsics::_sha5_implCompress:
@@ -784,6 +790,8 @@
     return inline_cipherBlockChaining_AESCrypt_predicate(false);
   case vmIntrinsics::_cipherBlockChaining_decryptAESCrypt:
     return inline_cipherBlockChaining_AESCrypt_predicate(true);
+  case vmIntrinsics::_counterMode_AESCrypt:
+    return inline_counterMode_AESCrypt_predicate();
   case vmIntrinsics::_digestBase_implCompressMB:
     return inline_digestBase_implCompressMB_predicate(predicate);
 
@@ -5778,6 +5786,39 @@
   return loadedField;
 }
 
+Node * LibraryCallKit::field_address_from_object(Node * fromObj, const char * fieldName, const char * fieldTypeString,
+                                                 bool is_exact = true, bool is_static = false,
+                                                 ciInstanceKlass * fromKls = NULL) {
+  if (fromKls == NULL) {
+    const TypeInstPtr* tinst = _gvn.type(fromObj)->isa_instptr();
+    assert(tinst != NULL, "obj is null");
+    assert(tinst->klass()->is_loaded(), "obj is not loaded");
+    assert(!is_exact || tinst->klass_is_exact(), "klass not exact");
+    fromKls = tinst->klass()->as_instance_klass();
+  }
+  else {
+    assert(is_static, "only for static field access");
+  }
+  ciField* field = fromKls->get_field_by_name(ciSymbol::make(fieldName),
+    ciSymbol::make(fieldTypeString),
+    is_static);
+
+  assert(field != NULL, "undefined field");
+  assert(!field->is_volatile(), "not defined for volatile fields");
+
+  if (is_static) {
+    const TypeInstPtr* tip = TypeInstPtr::make(fromKls->java_mirror());
+    fromObj = makecon(tip);
+  }
+
+  // Next code copied from Parse::do_get_xxx():
+
+  // Compute address and memory type.
+  int offset = field->offset_in_bytes();
+  Node *adr = basic_plus_adr(fromObj, fromObj, offset);
+
+  return adr;
+}
 
 //------------------------------inline_aescrypt_Block-----------------------
 bool LibraryCallKit::inline_aescrypt_Block(vmIntrinsics::ID id) {
@@ -5944,6 +5985,90 @@
   return true;
 }
 
+//------------------------------inline_counterMode_AESCrypt-----------------------
+bool LibraryCallKit::inline_counterMode_AESCrypt(vmIntrinsics::ID id) {
+  assert(UseAES, "need AES instruction support");
+  if (!UseAESCTRIntrinsics) return false;
+
+  address stubAddr = NULL;
+  const char *stubName = NULL;
+  if (id == vmIntrinsics::_counterMode_AESCrypt) {
+    stubAddr = StubRoutines::counterMode_AESCrypt();
+    stubName = "counterMode_AESCrypt";
+  }
+  if (stubAddr == NULL) return false;
+
+  Node* counterMode_object = argument(0);
+  Node* src = argument(1);
+  Node* src_offset = argument(2);
+  Node* len = argument(3);
+  Node* dest = argument(4);
+  Node* dest_offset = argument(5);
+
+  // (1) src and dest are arrays.
+  const Type* src_type = src->Value(&_gvn);
+  const Type* dest_type = dest->Value(&_gvn);
+  const TypeAryPtr* top_src = src_type->isa_aryptr();
+  const TypeAryPtr* top_dest = dest_type->isa_aryptr();
+  assert(top_src != NULL && top_src->klass() != NULL &&
+         top_dest != NULL && top_dest->klass() != NULL, "args are strange");
+
+  // checks are the responsibility of the caller
+  Node* src_start = src;
+  Node* dest_start = dest;
+  if (src_offset != NULL || dest_offset != NULL) {
+    assert(src_offset != NULL && dest_offset != NULL, "");
+    src_start = array_element_address(src, src_offset, T_BYTE);
+    dest_start = array_element_address(dest, dest_offset, T_BYTE);
+  }
+
+  // If we are in this set of code, we "know" the embeddedCipher is an AESCrypt object
+  // (because of the predicate logic executed earlier), so we can cast it here safely.
+  // This requires a newer class file in which the expanded key array is stored as
+  // little-endian ints; otherwise we revert to the Java path.
+  Node* embeddedCipherObj = load_field_from_object(counterMode_object, "embeddedCipher", "Lcom/sun/crypto/provider/SymmetricCipher;", /*is_exact*/ false);
+  if (embeddedCipherObj == NULL) return false;
+  // cast it to what we know it will be at runtime
+  const TypeInstPtr* tinst = _gvn.type(counterMode_object)->isa_instptr();
+  assert(tinst != NULL, "CTR obj is null");
+  assert(tinst->klass()->is_loaded(), "CTR obj is not loaded");
+  ciKlass* klass_AESCrypt = tinst->klass()->as_instance_klass()->find_klass(ciSymbol::make("com/sun/crypto/provider/AESCrypt"));
+  assert(klass_AESCrypt->is_loaded(), "predicate checks that this class is loaded");
+  ciInstanceKlass* instklass_AESCrypt = klass_AESCrypt->as_instance_klass();
+  const TypeKlassPtr* aklass = TypeKlassPtr::make(instklass_AESCrypt);
+  const TypeOopPtr* xtype = aklass->as_instance_type();
+  Node* aescrypt_object = new CheckCastPPNode(control(), embeddedCipherObj, xtype);
+  aescrypt_object = _gvn.transform(aescrypt_object);
+  // we need to get the start of the aescrypt_object's expanded key array
+  Node* k_start = get_key_start_from_aescrypt_object(aescrypt_object);
+  if (k_start == NULL) return false;
+  // similarly, get the start address of the counter block
+  Node* obj_counter = load_field_from_object(counterMode_object, "counter", "[B", /*is_exact*/ false);
+  if (obj_counter == NULL) return false;
+  Node* cnt_start = array_element_address(obj_counter, intcon(0), T_BYTE);
+
+  Node* saved_encCounter = load_field_from_object(counterMode_object, "encryptedCounter", "[B", /*is_exact*/ false);
+  if (saved_encCounter == NULL) return false;
+  Node* saved_encCounter_start = array_element_address(saved_encCounter, intcon(0), T_BYTE);
+  Node* used = field_address_from_object(counterMode_object, "used", "I", /*is_exact*/ false);
+
+  Node* ctrCrypt;
+  if (Matcher::pass_original_key_for_aes()) {
+    // There is no SPARC implementation of the AES/CTR intrinsic yet.
+    return false;
+  }
+  // Call the stub, passing src_start, dest_start, k_start, cnt_start, len, saved_encCounter_start and used
+  ctrCrypt = make_runtime_call(RC_LEAF|RC_NO_FP,
+                               OptoRuntime::counterMode_aescrypt_Type(),
+                               stubAddr, stubName, TypePtr::BOTTOM,
+                               src_start, dest_start, k_start, cnt_start, len, saved_encCounter_start, used);
+
+  // return cipher length (int)
+  Node* retvalue = _gvn.transform(new ProjNode(ctrCrypt, TypeFunc::Parms));
+  set_result(retvalue);
+  return true;
+}
+
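For orientation, here is a minimal Java sketch of the byte-at-a-time CTR transform that the counterMode_AESCrypt stub replaces. It is assumed to mirror com.sun.crypto.provider.CounterMode.crypt over the fields sketched earlier; increment() and encryptBlock() are paraphrased placeholders, not the real JCE signatures:

    // Hedged sketch: the scalar CTR loop the stub is free to parallelize,
    // since every output byte depends only on the key, counter and input.
    int crypt(byte[] in, int inOff, int len, byte[] out, int outOff) {
        int result = len;
        while (len-- > 0) {
            if (used >= encryptedCounter.length) {
                // refresh the keystream block and step the counter
                encryptBlock(counter, encryptedCounter); // placeholder for AESCrypt.encryptBlock
                increment(counter);                      // big-endian +1 over the counter block
                used = 0;
            }
            out[outOff++] = (byte) (in[inOff++] ^ encryptedCounter[used++]);
        }
        return result;
    }

The saved_encCounter_start and used arguments passed to the stub above let it resume a partially consumed keystream block exactly where the Java code left off.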
 //------------------------------get_key_start_from_aescrypt_object-----------------------
 Node * LibraryCallKit::get_key_start_from_aescrypt_object(Node *aescrypt_object) {
   Node* objAESCryptKey = load_field_from_object(aescrypt_object, "K", "[I", /*is_exact*/ false);
@@ -6025,6 +6150,48 @@
   return _gvn.transform(region);
 }
 
+//----------------------------inline_counterMode_AESCrypt_predicate----------------------------
+// Return node representing slow path of predicate check.
+// The pseudo code we want to emulate with this predicate is:
+//    if (embeddedCipherObj instanceof AESCrypt) do_intrinsic, else do_javapath
+// CTR mode uses the same transformation for encryption and decryption, so a
+// single instanceof check suffices; unlike the CBC predicate, no cipher!=plain
+// test is needed here.
+//
+
+Node* LibraryCallKit::inline_counterMode_AESCrypt_predicate() {
+  // The receiver was checked for NULL already.
+  Node* objCTR = argument(0);
+
+  // Load embeddedCipher field of CounterMode object.
+  Node* embeddedCipherObj = load_field_from_object(objCTR, "embeddedCipher", "Lcom/sun/crypto/provider/SymmetricCipher;", /*is_exact*/ false);
+
+  // get AESCrypt klass for instanceOf check
+  // AESCrypt might not be loaded yet if some other SymmetricCipher got us to this compile point
+  // it will have the same class loader as the CounterMode object
+  const TypeInstPtr* tinst = _gvn.type(objCTR)->isa_instptr();
+  assert(tinst != NULL, "CTRobj is null");
+  assert(tinst->klass()->is_loaded(), "CTRobj is not loaded");
+
+  // we want to do an instanceof comparison against the AESCrypt class
+  ciKlass* klass_AESCrypt = tinst->klass()->as_instance_klass()->find_klass(ciSymbol::make("com/sun/crypto/provider/AESCrypt"));
+  if (!klass_AESCrypt->is_loaded()) {
+    // if AESCrypt is not even loaded, we never take the intrinsic fast path
+    Node* ctrl = control();
+    set_control(top()); // no regular fast path
+    return ctrl;
+  }
+
+  ciInstanceKlass* instklass_AESCrypt = klass_AESCrypt->as_instance_klass();
+  Node* instof = gen_instanceof(embeddedCipherObj, makecon(TypeKlassPtr::make(instklass_AESCrypt)));
+  Node* cmp_instof = _gvn.transform(new CmpINode(instof, intcon(1)));
+  Node* bool_instof = _gvn.transform(new BoolNode(cmp_instof, BoolTest::ne));
+  Node* instof_false = generate_guard(bool_instof, NULL, PROB_MIN);
+
+  return instof_false; // even if it is NULL
+}
+
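In Java terms, the guard this predicate emits corresponds to the following (a hedged paraphrase of the pseudo code in the comment above, not actual JDK source):

    // Hedged paraphrase of the predicate's fast-path test:
    if (embeddedCipher instanceof com.sun.crypto.provider.AESCrypt) {
        // fast path: control falls through to the intrinsic stub call
    } else {
        // slow path: take the regular Java CounterMode.crypt code
    }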
 //------------------------------inline_ghash_processBlocks
 bool LibraryCallKit::inline_ghash_processBlocks() {
   address stubAddr;
--- a/hotspot/src/share/vm/opto/runtime.cpp	Mon Dec 28 10:10:37 2015 -1000
+++ b/hotspot/src/share/vm/opto/runtime.cpp	Mon Dec 28 23:11:01 2015 -0800
@@ -948,6 +948,35 @@
   return TypeFunc::make(domain, range);
 }
 
+// for counterMode calls of aescrypt encrypt/decrypt, six pointers and a length, returning int
+const TypeFunc* OptoRuntime::counterMode_aescrypt_Type() {
+  // create input type (domain)
+  int num_args = 7;
+  if (Matcher::pass_original_key_for_aes()) {
+    num_args = 8;
+  }
+  int argcnt = num_args;
+  const Type** fields = TypeTuple::fields(argcnt);
+  int argp = TypeFunc::Parms;
+  fields[argp++] = TypePtr::NOTNULL; // src
+  fields[argp++] = TypePtr::NOTNULL; // dest
+  fields[argp++] = TypePtr::NOTNULL; // k array
+  fields[argp++] = TypePtr::NOTNULL; // counter array
+  fields[argp++] = TypeInt::INT; // src len
+  fields[argp++] = TypePtr::NOTNULL; // saved_encCounter
+  fields[argp++] = TypePtr::NOTNULL; // saved used addr
+  if (Matcher::pass_original_key_for_aes()) {
+    fields[argp++] = TypePtr::NOTNULL; // original k array
+  }
+  assert(argp == TypeFunc::Parms + argcnt, "correct decoding");
+  const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms + argcnt, fields);
+  // returning cipher len (int)
+  fields = TypeTuple::fields(1);
+  fields[TypeFunc::Parms + 0] = TypeInt::INT;
+  const TypeTuple* range = TypeTuple::make(TypeFunc::Parms + 1, fields);
+  return TypeFunc::make(domain, range);
+}
+
 /*
  * void implCompress(byte[] buf, int ofs)
  */
--- a/hotspot/src/share/vm/opto/runtime.hpp	Mon Dec 28 10:10:37 2015 -1000
+++ b/hotspot/src/share/vm/opto/runtime.hpp	Mon Dec 28 23:11:01 2015 -0800
@@ -287,6 +287,7 @@
 
   static const TypeFunc* aescrypt_block_Type();
   static const TypeFunc* cipherBlockChaining_aescrypt_Type();
+  static const TypeFunc* counterMode_aescrypt_Type();
 
   static const TypeFunc* sha_implCompress_Type();
   static const TypeFunc* digestBase_implCompressMB_Type();
--- a/hotspot/src/share/vm/runtime/globals.hpp	Mon Dec 28 10:10:37 2015 -1000
+++ b/hotspot/src/share/vm/runtime/globals.hpp	Mon Dec 28 23:11:01 2015 -0800
@@ -836,6 +836,9 @@
   product(bool, UseAESIntrinsics, false,                                    \
           "Use intrinsics for AES versions of crypto")                      \
                                                                             \
+  product(bool, UseAESCTRIntrinsics, false,                                 \
+          "Use intrinsics for the paralleled version of AES/CTR crypto")    \
+                                                                            \
   product(bool, UseSHA1Intrinsics, false,                                   \
           "Use intrinsics for SHA-1 crypto hash function. "                 \
           "Requires that UseSHA is enabled.")                               \
--- a/hotspot/src/share/vm/runtime/stubRoutines.cpp	Mon Dec 28 10:10:37 2015 -1000
+++ b/hotspot/src/share/vm/runtime/stubRoutines.cpp	Mon Dec 28 23:11:01 2015 -0800
@@ -127,6 +127,7 @@
 address StubRoutines::_aescrypt_decryptBlock               = NULL;
 address StubRoutines::_cipherBlockChaining_encryptAESCrypt = NULL;
 address StubRoutines::_cipherBlockChaining_decryptAESCrypt = NULL;
+address StubRoutines::_counterMode_AESCrypt                = NULL;
 address StubRoutines::_ghash_processBlocks                 = NULL;
 
 address StubRoutines::_sha1_implCompress     = NULL;
--- a/hotspot/src/share/vm/runtime/stubRoutines.hpp	Mon Dec 28 10:10:37 2015 -1000
+++ b/hotspot/src/share/vm/runtime/stubRoutines.hpp	Mon Dec 28 23:11:01 2015 -0800
@@ -186,6 +186,7 @@
   static address _aescrypt_decryptBlock;
   static address _cipherBlockChaining_encryptAESCrypt;
   static address _cipherBlockChaining_decryptAESCrypt;
+  static address _counterMode_AESCrypt;
   static address _ghash_processBlocks;
 
   static address _sha1_implCompress;
@@ -359,6 +360,7 @@
   static address aescrypt_decryptBlock()                { return _aescrypt_decryptBlock; }
   static address cipherBlockChaining_encryptAESCrypt()  { return _cipherBlockChaining_encryptAESCrypt; }
   static address cipherBlockChaining_decryptAESCrypt()  { return _cipherBlockChaining_decryptAESCrypt; }
+  static address counterMode_AESCrypt() { return _counterMode_AESCrypt; }
   static address ghash_processBlocks() { return _ghash_processBlocks; }
 
   static address sha1_implCompress()     { return _sha1_implCompress; }
--- a/hotspot/src/share/vm/runtime/vmStructs.cpp	Mon Dec 28 10:10:37 2015 -1000
+++ b/hotspot/src/share/vm/runtime/vmStructs.cpp	Mon Dec 28 23:11:01 2015 -0800
@@ -850,6 +850,7 @@
      static_field(StubRoutines,                _aescrypt_decryptBlock,                        address)                               \
      static_field(StubRoutines,                _cipherBlockChaining_encryptAESCrypt,          address)                               \
      static_field(StubRoutines,                _cipherBlockChaining_decryptAESCrypt,          address)                               \
+     static_field(StubRoutines,                _counterMode_AESCrypt,                         address)                               \
      static_field(StubRoutines,                _ghash_processBlocks,                          address)                               \
      static_field(StubRoutines,                _updateBytesCRC32,                             address)                               \
      static_field(StubRoutines,                _crc_table_adr,                                address)                               \
--- a/hotspot/test/compiler/codegen/7184394/TestAESBase.java	Mon Dec 28 10:10:37 2015 -1000
+++ b/hotspot/test/compiler/codegen/7184394/TestAESBase.java	Mon Dec 28 23:11:01 2015 -0800
@@ -104,8 +104,8 @@
       cipher = Cipher.getInstance(algorithm + "/" + mode + "/" + paddingStr, "SunJCE");
       dCipher = Cipher.getInstance(algorithm + "/" + mode + "/" + paddingStr, "SunJCE");
 
-      // CBC init
-      if (mode.equals("CBC")) {
+      // CBC or CTR init
+      if (mode.equals("CBC") || mode.equals("CTR")) {
         IvParameterSpec initVector = new IvParameterSpec(iv);
         cipher.init(Cipher.ENCRYPT_MODE, key, initVector);
         algParams = cipher.getParameters();
--- a/hotspot/test/compiler/codegen/7184394/TestAESMain.java	Mon Dec 28 10:10:37 2015 -1000
+++ b/hotspot/test/compiler/codegen/7184394/TestAESMain.java	Mon Dec 28 23:11:01 2015 -0800
@@ -51,6 +51,13 @@
  * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 TestAESMain
  * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 TestAESMain
  * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 -DpaddingStr=NoPadding -DmsgSize=640 TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CTR TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CTR -DencInputOffset=1 TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CTR -DencOutputOffset=1 TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CTR -DdecOutputOffset=1 TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CTR -DencInputOffset=1 -DencOutputOffset=1 TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CTR -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CTR -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 -DpaddingStr=NoPadding -DmsgSize=640 TestAESMain
  *
  * @author Tom Deneau
  */
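The added @run lines mirror the existing CBC/GCM matrix for CTR. Any one of the new configurations can also be launched by hand, e.g. (illustrative invocation outside jtreg; assumes TestAESMain and its helpers are on the class path):

    java -Xbatch -DcheckOutput=true -Dmode=CTR -DencInputOffset=1 -DencOutputOffset=1 TestAESMain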