8143355: Update for addition of vectorizedMismatch intrinsic for x86
author kvn
date Mon, 07 Dec 2015 16:35:07 -0800
changeset 35110 f19bcdf40799
parent 35095 4ca2192f9709
child 35111 d72f2f2e084e
8143355: Update for addition of vectorizedMismatch intrinsic for x86
Reviewed-by: kvn
Contributed-by: vivek.r.deshpande@intel.com, liqi.yi@intel.com
hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp
hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp
hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp
hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp
hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp
hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp
hotspot/src/cpu/x86/vm/vm_version_x86.cpp
hotspot/src/share/vm/classfile/vmSymbols.cpp
hotspot/src/share/vm/classfile/vmSymbols.hpp
hotspot/src/share/vm/opto/c2compiler.cpp
hotspot/src/share/vm/opto/escape.cpp
hotspot/src/share/vm/opto/library_call.cpp
hotspot/src/share/vm/opto/runtime.cpp
hotspot/src/share/vm/opto/runtime.hpp
hotspot/src/share/vm/runtime/globals.hpp
hotspot/src/share/vm/runtime/stubRoutines.cpp
hotspot/src/share/vm/runtime/stubRoutines.hpp
hotspot/src/share/vm/runtime/vmStructs.cpp
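
For context, the method being intrinsified is java.util.ArraysSupport.vectorizedMismatch with descriptor (Ljava/lang/Object;JLjava/lang/Object;JII)I (see the vmSymbols changes below). A minimal scalar sketch of the behavior the new x86 stub appears to implement follows; it is illustrative only, substitutes byte[] indexing for the Unsafe-addressed access of the real method, and assumes the stub's convention of returning the first mismatching element index, or -1 when the compared regions are identical (the SAME_TILL_END case).

    // Illustrative model only -- not JDK source. Assumes byte[] access and the
    // "-1 when equal" convention seen at SAME_TILL_END in the x86 stub below.
    class VectorizedMismatchModel {
        static int vectorizedMismatch(byte[] a, int aOff, byte[] b, int bOff,
                                      int length, int log2ArrayIndexScale) {
            long byteLength = (long) length << log2ArrayIndexScale; // shlq(length)
            for (long i = 0; i < byteLength; i++) {
                if (a[aOff + (int) i] != b[bOff + (int) i]) {
                    return (int) (i >> log2ArrayIndexScale);        // shrq(result) by scale
                }
            }
            return -1;
        }
    }
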
--- a/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp	Mon Dec 07 15:00:46 2015 +0000
+++ b/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp	Mon Dec 07 16:35:07 2015 -0800
@@ -182,6 +182,11 @@
     FLAG_SET_DEFAULT(UseAdler32Intrinsics, true);
   }
 
+  if (UseVectorizedMismatchIntrinsic) {
+    warning("UseVectorizedMismatchIntrinsic specified, but not available on this CPU.");
+    FLAG_SET_DEFAULT(UseVectorizedMismatchIntrinsic, false);
+  }
+
   if (auxv & HWCAP_AES) {
     UseAES = UseAES || FLAG_IS_DEFAULT(UseAES);
     UseAESIntrinsics =
--- a/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp	Mon Dec 07 15:00:46 2015 +0000
+++ b/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp	Mon Dec 07 16:35:07 2015 -0800
@@ -223,6 +223,11 @@
     UseMultiplyToLenIntrinsic = true;
   }
 
+  if (UseVectorizedMismatchIntrinsic) {
+    warning("UseVectorizedMismatchIntrinsic specified, but not available on this CPU.");
+    FLAG_SET_DEFAULT(UseVectorizedMismatchIntrinsic, false);
+  }
+
   // Adjust RTM (Restricted Transactional Memory) flags.
   if (!has_tcheck() && UseRTMLocking) {
     // Can't continue because UseRTMLocking affects UseBiasedLocking flag
--- a/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp	Mon Dec 07 15:00:46 2015 +0000
+++ b/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp	Mon Dec 07 16:35:07 2015 -0800
@@ -356,6 +356,11 @@
     FLAG_SET_DEFAULT(UseCRC32Intrinsics, false);
   }
 
+  if (UseVectorizedMismatchIntrinsic) {
+    warning("UseVectorizedMismatchIntrinsic specified, but not available on this CPU.");
+    FLAG_SET_DEFAULT(UseVectorizedMismatchIntrinsic, false);
+  }
+
   if (FLAG_IS_DEFAULT(ContendedPaddingWidth) &&
     (cache_line_size > ContendedPaddingWidth))
     ContendedPaddingWidth = cache_line_size;
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp	Mon Dec 07 15:00:46 2015 +0000
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp	Mon Dec 07 16:35:07 2015 -0800
@@ -9439,13 +9439,184 @@
   pop(tmp1);
 }
 
+void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
+  Register result, Register tmp1, Register tmp2, XMMRegister rymm0, XMMRegister rymm1, XMMRegister rymm2){
+  assert(UseSSE42Intrinsics, "SSE4.2 must be enabled.");
+  Label VECTOR32_LOOP, VECTOR16_LOOP, VECTOR8_LOOP, VECTOR4_LOOP;
+  Label VECTOR16_TAIL, VECTOR8_TAIL, VECTOR4_TAIL;
+  Label VECTOR32_NOT_EQUAL, VECTOR16_NOT_EQUAL, VECTOR8_NOT_EQUAL, VECTOR4_NOT_EQUAL;
+  Label SAME_TILL_END, DONE;
+  Label BYTES_LOOP, BYTES_TAIL, BYTES_NOT_EQUAL;
+
+  //scale is in rcx in both Win64 and Unix
+  ShortBranchVerifier sbv(this);
+
+  shlq(length);
+  xorq(result, result);
+
+  cmpq(length, 8);
+  jcc(Assembler::equal, VECTOR8_LOOP);
+  jcc(Assembler::less, VECTOR4_TAIL);
+
+  if (UseAVX >= 2){
+
+    cmpq(length, 16);
+    jcc(Assembler::equal, VECTOR16_LOOP);
+    jcc(Assembler::less, VECTOR8_LOOP);
+
+    cmpq(length, 32);
+    jccb(Assembler::less, VECTOR16_TAIL);
+
+    subq(length, 32);
+    bind(VECTOR32_LOOP);
+    vmovdqu(rymm0, Address(obja, result));
+    vmovdqu(rymm1, Address(objb, result));
+    vpxor(rymm2, rymm0, rymm1, Assembler::AVX_256bit);
+    vptest(rymm2, rymm2);
+    jcc(Assembler::notZero, VECTOR32_NOT_EQUAL);//mismatch found
+    addq(result, 32);
+    subq(length, 32);
+    jccb(Assembler::greaterEqual, VECTOR32_LOOP);
+    addq(length, 32);
+    jcc(Assembler::equal, SAME_TILL_END);
+    //falling through if less than 32 bytes left //close the branch here.
+
+    bind(VECTOR16_TAIL);
+    cmpq(length, 16);
+    jccb(Assembler::less, VECTOR8_TAIL);
+    bind(VECTOR16_LOOP);
+    movdqu(rymm0, Address(obja, result));
+    movdqu(rymm1, Address(objb, result));
+    vpxor(rymm2, rymm0, rymm1, Assembler::AVX_128bit);
+    ptest(rymm2, rymm2);
+    jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
+    addq(result, 16);
+    subq(length, 16);
+    jcc(Assembler::equal, SAME_TILL_END);
+    //falling through if less than 16 bytes left
+  } else { //SSE4.2 (non-AVX2) path
+
+    cmpq(length, 16);
+    jccb(Assembler::less, VECTOR8_TAIL);
+
+    subq(length, 16);
+    bind(VECTOR16_LOOP);
+    movdqu(rymm0, Address(obja, result));
+    movdqu(rymm1, Address(objb, result));
+    pxor(rymm0, rymm1);
+    ptest(rymm0, rymm0);
+    jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
+    addq(result, 16);
+    subq(length, 16);
+    jccb(Assembler::greaterEqual, VECTOR16_LOOP);
+    addq(length, 16);
+    jcc(Assembler::equal, SAME_TILL_END);
+    //falling through if less than 16 bytes left
+  }
+
+  bind(VECTOR8_TAIL);
+  cmpq(length, 8);
+  jccb(Assembler::less, VECTOR4_TAIL);
+  bind(VECTOR8_LOOP);
+  movq(tmp1, Address(obja, result));
+  movq(tmp2, Address(objb, result));
+  xorq(tmp1, tmp2);
+  testq(tmp1, tmp1);
+  jcc(Assembler::notZero, VECTOR8_NOT_EQUAL);//mismatch found
+  addq(result, 8);
+  subq(length, 8);
+  jcc(Assembler::equal, SAME_TILL_END);
+  //falling through if less than 8 bytes left
+
+  bind(VECTOR4_TAIL);
+  cmpq(length, 4);
+  jccb(Assembler::less, BYTES_TAIL);
+  bind(VECTOR4_LOOP);
+  movl(tmp1, Address(obja, result));
+  xorl(tmp1, Address(objb, result));
+  testl(tmp1, tmp1);
+  jcc(Assembler::notZero, VECTOR4_NOT_EQUAL);//mismatch found
+  addq(result, 4);
+  subq(length, 4);
+  jcc(Assembler::equal, SAME_TILL_END);
+  //falling through if less than 4 bytes left
+
+  bind(BYTES_TAIL);
+  bind(BYTES_LOOP);
+  load_unsigned_byte(tmp1, Address(obja, result));
+  load_unsigned_byte(tmp2, Address(objb, result));
+  xorl(tmp1, tmp2);
+  testl(tmp1, tmp1);
+  jccb(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
+  decq(length);
+  jccb(Assembler::zero, SAME_TILL_END);
+  incq(result);
+  load_unsigned_byte(tmp1, Address(obja, result));
+  load_unsigned_byte(tmp2, Address(objb, result));
+  xorl(tmp1, tmp2);
+  testl(tmp1, tmp1);
+  jccb(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
+  decq(length);
+  jccb(Assembler::zero, SAME_TILL_END);
+  incq(result);
+  load_unsigned_byte(tmp1, Address(obja, result));
+  load_unsigned_byte(tmp2, Address(objb, result));
+  xorl(tmp1, tmp2);
+  testl(tmp1, tmp1);
+  jccb(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
+  jmpb(SAME_TILL_END);
+
+  if (UseAVX >= 2){
+    bind(VECTOR32_NOT_EQUAL);
+    vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_256bit);
+    vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_256bit);
+    vpxor(rymm0, rymm0, rymm2, Assembler::AVX_256bit);
+    vpmovmskb(tmp1, rymm0);
+    bsfq(tmp1, tmp1);
+    addq(result, tmp1);
+    shrq(result);
+    jmpb(DONE);
+  }
+
+  bind(VECTOR16_NOT_EQUAL);
+  if (UseAVX >= 2){
+    vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_128bit);
+    vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_128bit);
+    pxor(rymm0, rymm2);
+  } else {
+    pcmpeqb(rymm2, rymm2);
+    pxor(rymm0, rymm1);
+    pcmpeqb(rymm0, rymm1);
+    pxor(rymm0, rymm2);
+  }
+  pmovmskb(tmp1, rymm0);
+  bsfq(tmp1, tmp1);
+  addq(result, tmp1);
+  shrq(result);
+  jmpb(DONE);
+
+  bind(VECTOR8_NOT_EQUAL);
+  bind(VECTOR4_NOT_EQUAL);
+  bsfq(tmp1, tmp1);
+  shrq(tmp1, 3);
+  addq(result, tmp1);
+  bind(BYTES_NOT_EQUAL);
+  shrq(result);
+  jmpb(DONE);
+
+  bind(SAME_TILL_END);
+  mov64(result, -1);
+
+  bind(DONE);
+}
+
+
 //Helper functions for square_to_len()
 
 /**
  * Store the squares of x[], right shifted one bit (divided by 2) into z[]
  * Preserves x and z and modifies rest of the registers.
  */
-
 void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
   // Perform square and right shift by 1
   // Handle odd xlen case first, then for even xlen do the following
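
The vectorized_mismatch routine added above steps down through 32-byte (AVX2), 16-byte, 8-byte and 4-byte compares before a byte loop, accumulating a byte offset in result and finally shifting it right by the scale held in rcx/cl to recover an element index. The sketch below is an assumption-laden Java model rather than JDK code; it shows only the 8-byte stage and the locate step used at VECTOR8_NOT_EQUAL, where, with little-endian loads, Long.numberOfTrailingZeros(diff) >> 3 plays the role of bsfq followed by shrq(tmp1, 3).

    // Sketch of the 8-byte compare stage over byte[] inputs (little-endian reads);
    // the real code adds the wider AVX2/SSE stages and converts the byte offset to
    // an element index with shrq(result) by the scale in cl.
    import java.nio.ByteBuffer;
    import java.nio.ByteOrder;

    class MismatchByteOffsetSketch {
        static int firstMismatchingByte(byte[] a, byte[] b, int byteLength) {
            ByteBuffer ba = ByteBuffer.wrap(a).order(ByteOrder.LITTLE_ENDIAN);
            ByteBuffer bb = ByteBuffer.wrap(b).order(ByteOrder.LITTLE_ENDIAN);
            int i = 0;
            for (; i + 8 <= byteLength; i += 8) {            // VECTOR8_LOOP
                long diff = ba.getLong(i) ^ bb.getLong(i);   // movq + xorq(tmp1, tmp2)
                if (diff != 0) {                             // VECTOR8_NOT_EQUAL
                    return i + (Long.numberOfTrailingZeros(diff) >> 3);
                }
            }
            for (; i < byteLength; i++) {                    // BYTES_LOOP tail
                if (a[i] != b[i]) return i;
            }
            return -1;                                       // SAME_TILL_END
        }
    }
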
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp	Mon Dec 07 15:00:46 2015 +0000
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp	Mon Dec 07 16:35:07 2015 -0800
@@ -1346,7 +1346,6 @@
                                Register carry2);
   void multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
                        Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5);
-
   void square_rshift(Register x, Register len, Register z, Register tmp1, Register tmp3,
                      Register tmp4, Register tmp5, Register rdxReg, Register raxReg);
   void multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry,
@@ -1365,6 +1364,9 @@
   void mul_add(Register out, Register in, Register offset, Register len, Register k, Register tmp1,
                Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg,
                Register raxReg);
+  void vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
+                           Register result, Register tmp1, Register tmp2,
+                           XMMRegister vec1, XMMRegister vec2, XMMRegister vec3);
 #endif
 
   // CRC32 code for java.util.zip.CRC32::updateBytes() intrinsic.
--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Mon Dec 07 15:00:46 2015 +0000
+++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Mon Dec 07 16:35:07 2015 -0800
@@ -4054,6 +4054,54 @@
     return start;
   }
 
+  /**
+  *  Arguments:
+  *
+  *  Input:
+  *    c_rarg0   - obja     address
+  *    c_rarg1   - objb     address
+  *    c_rarg2   - length   length (in array elements)
+  *    c_rarg3   - scale    log2_array_indxscale
+  */
+  address generate_vectorizedMismatch() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "vectorizedMismatch");
+    address start = __ pc();
+
+    BLOCK_COMMENT("Entry:");
+    __ enter();
+
+#ifdef _WIN64  // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
+    const Register scale = c_rarg0;  //rcx, will exchange with r9
+    const Register objb = c_rarg1;   //rdx
+    const Register length = c_rarg2; //r8
+    const Register obja = c_rarg3;   //r9
+    __ xchgq(obja, scale);  //now obja and scale contain the correct contents
+
+    const Register tmp1 = r10;
+    const Register tmp2 = r11;
+#endif
+#ifndef _WIN64 // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
+    const Register obja = c_rarg0;   //U:rdi
+    const Register objb = c_rarg1;   //U:rsi
+    const Register length = c_rarg2; //U:rdx
+    const Register scale = c_rarg3;  //U:rcx
+    const Register tmp1 = r8;
+    const Register tmp2 = r9;
+#endif
+    const Register result = rax; //return value
+    const XMMRegister vec0 = xmm0;
+    const XMMRegister vec1 = xmm1;
+    const XMMRegister vec2 = xmm2;
+
+    __ vectorized_mismatch(obja, objb, length, scale, result, tmp1, tmp2, vec0, vec1, vec2);
+
+    __ leave();
+    __ ret(0);
+
+    return start;
+  }
+
 /**
    *  Arguments:
    *
@@ -4505,7 +4553,9 @@
     if (UseMulAddIntrinsic) {
       StubRoutines::_mulAdd = generate_mulAdd();
     }
-
+    if (UseVectorizedMismatchIntrinsic) {
+      StubRoutines::_vectorizedMismatch = generate_vectorizedMismatch();
+    }
 #ifndef _WINDOWS
     if (UseMontgomeryMultiplyIntrinsic) {
       StubRoutines::_montgomeryMultiply
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.cpp	Mon Dec 07 15:00:46 2015 +0000
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.cpp	Mon Dec 07 16:35:07 2015 -0800
@@ -1041,6 +1041,25 @@
     }
   }
 
+#ifdef _LP64
+  if (UseSSE42Intrinsics) {
+    if (FLAG_IS_DEFAULT(UseVectorizedMismatchIntrinsic)) {
+      UseVectorizedMismatchIntrinsic = true;
+    }
+  } else if (UseVectorizedMismatchIntrinsic) {
+    if (!FLAG_IS_DEFAULT(UseVectorizedMismatchIntrinsic))
+      warning("vectorizedMismatch intrinsics are not available on this CPU");
+    FLAG_SET_DEFAULT(UseVectorizedMismatchIntrinsic, false);
+  }
+#else
+  if (UseVectorizedMismatchIntrinsic) {
+    if (!FLAG_IS_DEFAULT(UseVectorizedMismatchIntrinsic)) {
+      warning("vectorizedMismatch intrinsic is not available in 32-bit VM");
+    }
+    FLAG_SET_DEFAULT(UseVectorizedMismatchIntrinsic, false);
+  }
+#endif // _LP64
+
   // Use count leading zeros count instruction if available.
   if (supports_lzcnt()) {
     if (FLAG_IS_DEFAULT(UseCountLeadingZerosInstruction)) {
--- a/hotspot/src/share/vm/classfile/vmSymbols.cpp	Mon Dec 07 15:00:46 2015 +0000
+++ b/hotspot/src/share/vm/classfile/vmSymbols.cpp	Mon Dec 07 16:35:07 2015 -0800
@@ -681,6 +681,9 @@
   case vmIntrinsics::_montgomerySquare:
     if (!UseMontgomerySquareIntrinsic) return true;
     break;
+  case vmIntrinsics::_vectorizedMismatch:
+    if (!UseVectorizedMismatchIntrinsic) return true;
+    break;
   case vmIntrinsics::_addExactI:
   case vmIntrinsics::_addExactL:
   case vmIntrinsics::_decrementExactI:
--- a/hotspot/src/share/vm/classfile/vmSymbols.hpp	Mon Dec 07 15:00:46 2015 +0000
+++ b/hotspot/src/share/vm/classfile/vmSymbols.hpp	Mon Dec 07 16:35:07 2015 -0800
@@ -957,6 +957,11 @@
    do_name(     montgomerySquare_name,                             "implMontgomerySquare")                              \
    do_signature(montgomerySquare_signature,                        "([I[IIJ[I)[I")                                      \
                                                                                                                         \
+  do_class(java_util_ArraysSupport, "java/util/ArraysSupport")                                                          \
+  do_intrinsic(_vectorizedMismatch, java_util_ArraysSupport, vectorizedMismatch_name, vectorizedMismatch_signature, F_S)\
+   do_name(vectorizedMismatch_name, "vectorizedMismatch")                                                               \
+   do_signature(vectorizedMismatch_signature, "(Ljava/lang/Object;JLjava/lang/Object;JII)I")                            \
+                                                                                                                        \
   /* java/lang/ref/Reference */                                                                                         \
   do_intrinsic(_Reference_get,            java_lang_ref_Reference, get_name,    void_object_signature, F_R)             \
                                                                                                                         \
--- a/hotspot/src/share/vm/opto/c2compiler.cpp	Mon Dec 07 15:00:46 2015 +0000
+++ b/hotspot/src/share/vm/opto/c2compiler.cpp	Mon Dec 07 16:35:07 2015 -0800
@@ -441,6 +441,7 @@
   case vmIntrinsics::_mulAdd:
   case vmIntrinsics::_montgomeryMultiply:
   case vmIntrinsics::_montgomerySquare:
+  case vmIntrinsics::_vectorizedMismatch:
   case vmIntrinsics::_ghash_processBlocks:
   case vmIntrinsics::_updateCRC32:
   case vmIntrinsics::_updateBytesCRC32:
--- a/hotspot/src/share/vm/opto/escape.cpp	Mon Dec 07 15:00:46 2015 +0000
+++ b/hotspot/src/share/vm/opto/escape.cpp	Mon Dec 07 16:35:07 2015 -0800
@@ -987,7 +987,8 @@
                   strcmp(call->as_CallLeaf()->_name, "squareToLen") == 0 ||
                   strcmp(call->as_CallLeaf()->_name, "mulAdd") == 0 ||
                   strcmp(call->as_CallLeaf()->_name, "montgomery_multiply") == 0 ||
-                  strcmp(call->as_CallLeaf()->_name, "montgomery_square") == 0)
+                  strcmp(call->as_CallLeaf()->_name, "montgomery_square") == 0 ||
+                  strcmp(call->as_CallLeaf()->_name, "vectorizedMismatch") == 0)
                  ))) {
             call->dump();
             fatal("EA unexpected CallLeaf %s", call->as_CallLeaf()->_name);
--- a/hotspot/src/share/vm/opto/library_call.cpp	Mon Dec 07 15:00:46 2015 +0000
+++ b/hotspot/src/share/vm/opto/library_call.cpp	Mon Dec 07 16:35:07 2015 -0800
@@ -312,6 +312,7 @@
   bool inline_mulAdd();
   bool inline_montgomeryMultiply();
   bool inline_montgomerySquare();
+  bool inline_vectorizedMismatch();
 
   bool inline_profileBoolean();
   bool inline_isCompileConstant();
@@ -720,6 +721,9 @@
   case vmIntrinsics::_montgomerySquare:
     return inline_montgomerySquare();
 
+  case vmIntrinsics::_vectorizedMismatch:
+    return inline_vectorizedMismatch();
+
   case vmIntrinsics::_ghash_processBlocks:
     return inline_ghash_processBlocks();
 
@@ -5581,6 +5585,50 @@
   return true;
 }
 
+//-------------inline_vectorizedMismatch------------------------------
+bool LibraryCallKit::inline_vectorizedMismatch() {
+  assert(UseVectorizedMismatchIntrinsic, "not implemented on this platform");
+
+  address stubAddr = StubRoutines::vectorizedMismatch();
+  if (stubAddr == NULL) {
+    return false; // Intrinsic's stub is not implemented on this platform
+  }
+  const char* stubName = "vectorizedMismatch";
+  int size_l = callee()->signature()->size();
+  assert(size_l == 8, "vectorizedMismatch has 6 parameters (the two long offsets take two slots each)");
+
+  Node* obja = argument(0);
+  Node* aoffset = argument(1);
+  Node* objb = argument(3);
+  Node* boffset = argument(4);
+  Node* length = argument(6);
+  Node* scale = argument(7);
+
+  const Type* a_type = obja->Value(&_gvn);
+  const Type* b_type = objb->Value(&_gvn);
+  const TypeAryPtr* top_a = a_type->isa_aryptr();
+  const TypeAryPtr* top_b = b_type->isa_aryptr();
+  if (top_a == NULL || top_a->klass() == NULL ||
+    top_b == NULL || top_b->klass() == NULL) {
+    // failed array check
+    return false;
+  }
+
+  Node* call;
+  jvms()->set_should_reexecute(true);
+
+  Node* obja_adr = make_unsafe_address(obja, aoffset);
+  Node* objb_adr = make_unsafe_address(objb, boffset);
+
+  call = make_runtime_call(RC_LEAF,
+    OptoRuntime::vectorizedMismatch_Type(),
+    stubAddr, stubName, TypePtr::BOTTOM,
+    obja_adr, objb_adr, length, scale);
+
+  Node* result = _gvn.transform(new ProjNode(call, TypeFunc::Parms));
+  set_result(result);
+  return true;
+}
 
 /**
  * Calculate CRC32 for byte.
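
In inline_vectorizedMismatch() above, the argument slots 0, 1, 3, 4, 6 and 7 follow from the JVM rule that a long occupies two argument slots, which is also why callee()->signature()->size() is 8 for a 6-parameter method. A hypothetical, comment-annotated view of the intrinsified signature (body elided, not JDK source) makes the layout explicit:

    // Hypothetical slot annotation -- illustration only.
    class SlotLayoutSketch {
        static int vectorizedMismatch(Object a, long aOffset,   // slots 0, 1-2
                                      Object b, long bOffset,   // slots 3, 4-5
                                      int length,               // slot 6
                                      int log2ArrayIndexScale)  // slot 7
        {
            throw new UnsupportedOperationException("illustration only");
        }
    }
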
--- a/hotspot/src/share/vm/opto/runtime.cpp	Mon Dec 07 15:00:46 2015 +0000
+++ b/hotspot/src/share/vm/opto/runtime.cpp	Mon Dec 07 16:35:07 2015 -0800
@@ -1103,6 +1103,26 @@
   return TypeFunc::make(domain, range);
 }
 
+const TypeFunc* OptoRuntime::vectorizedMismatch_Type() {
+  // create input type (domain)
+  int num_args = 4;
+  int argcnt = num_args;
+  const Type** fields = TypeTuple::fields(argcnt);
+  int argp = TypeFunc::Parms;
+  fields[argp++] = TypePtr::NOTNULL;    // obja
+  fields[argp++] = TypePtr::NOTNULL;    // objb
+  fields[argp++] = TypeInt::INT;        // length, number of elements
+  fields[argp++] = TypeInt::INT;        // log2scale, log2 of element size
+  assert(argp == TypeFunc::Parms + argcnt, "correct decoding");
+  const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms + argcnt, fields);
+
+  //return mismatch index (int)
+  fields = TypeTuple::fields(1);
+  fields[TypeFunc::Parms + 0] = TypeInt::INT;
+  const TypeTuple* range = TypeTuple::make(TypeFunc::Parms + 1, fields);
+  return TypeFunc::make(domain, range);
+}
+
 // GHASH block processing
 const TypeFunc* OptoRuntime::ghash_processBlocks_Type() {
     int argcnt = 4;
--- a/hotspot/src/share/vm/opto/runtime.hpp	Mon Dec 07 15:00:46 2015 +0000
+++ b/hotspot/src/share/vm/opto/runtime.hpp	Mon Dec 07 16:35:07 2015 -0800
@@ -299,6 +299,8 @@
 
   static const TypeFunc* mulAdd_Type();
 
+  static const TypeFunc* vectorizedMismatch_Type();
+
   static const TypeFunc* ghash_processBlocks_Type();
 
   static const TypeFunc* updateBytesCRC32_Type();
--- a/hotspot/src/share/vm/runtime/globals.hpp	Mon Dec 07 15:00:46 2015 +0000
+++ b/hotspot/src/share/vm/runtime/globals.hpp	Mon Dec 07 16:35:07 2015 -0800
@@ -855,6 +855,9 @@
   product(bool, UseAdler32Intrinsics, false,                                \
           "use intrinsics for java.util.zip.Adler32")                       \
                                                                             \
+  product(bool, UseVectorizedMismatchIntrinsic, false,                      \
+          "Enables intrinsification of ArraysSupport.vectorizedMismatch()") \
+                                                                            \
   diagnostic(ccstrlist, DisableIntrinsic, "",                               \
          "do not expand intrinsics whose (internal) names appear here")     \
                                                                             \
--- a/hotspot/src/share/vm/runtime/stubRoutines.cpp	Mon Dec 07 15:00:46 2015 +0000
+++ b/hotspot/src/share/vm/runtime/stubRoutines.cpp	Mon Dec 07 16:35:07 2015 -0800
@@ -148,6 +148,8 @@
 address StubRoutines::_montgomeryMultiply = NULL;
 address StubRoutines::_montgomerySquare = NULL;
 
+address StubRoutines::_vectorizedMismatch = NULL;
+
 address StubRoutines::_dexp = NULL;
 address StubRoutines::_dlog = NULL;
 
--- a/hotspot/src/share/vm/runtime/stubRoutines.hpp	Mon Dec 07 15:00:46 2015 +0000
+++ b/hotspot/src/share/vm/runtime/stubRoutines.hpp	Mon Dec 07 16:35:07 2015 -0800
@@ -207,6 +207,8 @@
   static address _montgomeryMultiply;
   static address _montgomerySquare;
 
+  static address _vectorizedMismatch;
+
   static address _dexp;
   static address _dlog;
 
@@ -376,6 +378,8 @@
   static address montgomeryMultiply()  { return _montgomeryMultiply; }
   static address montgomerySquare()    { return _montgomerySquare; }
 
+  static address vectorizedMismatch()  { return _vectorizedMismatch; }
+
   static address dexp()                { return _dexp; }
   static address dlog()                { return _dlog; }
 
--- a/hotspot/src/share/vm/runtime/vmStructs.cpp	Mon Dec 07 15:00:46 2015 +0000
+++ b/hotspot/src/share/vm/runtime/vmStructs.cpp	Mon Dec 07 16:35:07 2015 -0800
@@ -860,6 +860,7 @@
      static_field(StubRoutines,                _mulAdd,                                       address)                               \
      static_field(StubRoutines,                _dexp,                                         address)                               \
      static_field(StubRoutines,                _dlog,                                         address)                               \
+     static_field(StubRoutines,                _vectorizedMismatch,                           address)                               \
      static_field(StubRoutines,                _jbyte_arraycopy,                              address)                               \
      static_field(StubRoutines,                _jshort_arraycopy,                             address)                               \
      static_field(StubRoutines,                _jint_arraycopy,                               address)                               \