Merge
authornever
Tue, 31 Mar 2009 19:20:34 -0700
changeset 2350 0c1e6e520065
parent 2349 d438a6c62f88 (current diff)
parent 2348 4e71ed4c2709 (diff)
child 2351 260a3a5a1bed
Merge
--- a/hotspot/src/cpu/sparc/vm/sparc.ad	Tue Mar 31 15:09:45 2009 -0700
+++ b/hotspot/src/cpu/sparc/vm/sparc.ad	Tue Mar 31 19:20:34 2009 -0700
@@ -3003,6 +3003,202 @@
     __ bind(Ldone);
   %}
 
+enc_class enc_String_Equals(o0RegP str1, o1RegP str2, g3RegP tmp1, g4RegP tmp2, notemp_iRegI result) %{
+    Label Lword, Lword_loop, Lpost_word, Lchar, Lchar_loop, Ldone;
+    MacroAssembler _masm(&cbuf);
+
+    Register   str1_reg = reg_to_register_object($str1$$reg);
+    Register   str2_reg = reg_to_register_object($str2$$reg);
+    Register   tmp1_reg = reg_to_register_object($tmp1$$reg);
+    Register   tmp2_reg = reg_to_register_object($tmp2$$reg);
+    Register result_reg = reg_to_register_object($result$$reg);
+
+    // Get the first character position in both strings
+    //         [8] char array, [12] offset, [16] count
+    int  value_offset = java_lang_String:: value_offset_in_bytes();
+    int offset_offset = java_lang_String::offset_offset_in_bytes();
+    int  count_offset = java_lang_String:: count_offset_in_bytes();
+
+    // load str1 (jchar*) base address into tmp1_reg
+    __ load_heap_oop(Address(str1_reg, 0,  value_offset), tmp1_reg);
+    __ ld(Address(str1_reg, 0, offset_offset), result_reg);
+    __ add(tmp1_reg, arrayOopDesc::base_offset_in_bytes(T_CHAR), tmp1_reg);
+    __    ld(Address(str1_reg, 0, count_offset), str1_reg); // hoisted
+    __ sll(result_reg, exact_log2(sizeof(jchar)), result_reg);
+    __    load_heap_oop(Address(str2_reg, 0,  value_offset), tmp2_reg); // hoisted
+    __ add(result_reg, tmp1_reg, tmp1_reg);
+
+    // load str2 (jchar*) base address into tmp2_reg
+    // __ ld_ptr(Address(str2_reg, 0,  value_offset), tmp2_reg); // hoisted
+    __ ld(Address(str2_reg, 0, offset_offset), result_reg);
+    __ add(tmp2_reg, arrayOopDesc::base_offset_in_bytes(T_CHAR), tmp2_reg);
+    __    ld(Address(str2_reg, 0, count_offset), str2_reg); // hoisted
+    __ sll(result_reg, exact_log2(sizeof(jchar)), result_reg);
+    __   cmp(str1_reg, str2_reg); // hoisted
+    __ add(result_reg, tmp2_reg, tmp2_reg);
+
+    __ sll(str1_reg, exact_log2(sizeof(jchar)), str1_reg);
+    __ br(Assembler::notEqual, true, Assembler::pt, Ldone);
+    __ delayed()->mov(G0, result_reg);    // not equal
+
+    __ br_zero(Assembler::equal, true, Assembler::pn, str1_reg, Ldone);
+    __ delayed()->add(G0, 1, result_reg); //equals
+
+    __ cmp(tmp1_reg, tmp2_reg); //same string ?
+    __ brx(Assembler::equal, true, Assembler::pn, Ldone);
+    __ delayed()->add(G0, 1, result_reg);
+
+    //rename registers
+    Register limit_reg =   str1_reg;
+    Register  chr2_reg =   str2_reg;
+    Register  chr1_reg = result_reg;
+    // tmp{12} are the base pointers
+
+    //check for alignment and position the pointers to the ends
+    __ or3(tmp1_reg, tmp2_reg, chr1_reg);
+    __ andcc(chr1_reg, 0x3, chr1_reg); // notZero means at least one not 4-byte aligned
+    __ br(Assembler::notZero, false, Assembler::pn, Lchar);
+    __ delayed()->nop();
+
+    __ bind(Lword);
+    __ and3(limit_reg, 0x2, O7); //remember the remainder (either 0 or 2)
+    __ andn(limit_reg, 0x3, limit_reg);
+    __ br_zero(Assembler::zero, false, Assembler::pn, limit_reg, Lpost_word);
+    __ delayed()->nop();
+
+    __ add(tmp1_reg, limit_reg, tmp1_reg);
+    __ add(tmp2_reg, limit_reg, tmp2_reg);
+    __ neg(limit_reg);
+
+    __ lduw(tmp1_reg, limit_reg, chr1_reg);
+    __ bind(Lword_loop);
+    __ lduw(tmp2_reg, limit_reg, chr2_reg);
+    __ cmp(chr1_reg, chr2_reg);
+    __ br(Assembler::notEqual, true, Assembler::pt, Ldone);
+    __ delayed()->mov(G0, result_reg);
+    __ inccc(limit_reg, 2*sizeof(jchar));
+    // annul LDUW if branch i  s not taken to prevent access past end of string
+    __ br(Assembler::notZero, true, Assembler::pt, Lword_loop); //annul on taken
+    __ delayed()->lduw(tmp1_reg, limit_reg, chr1_reg); // hoisted
+
+    __ bind(Lpost_word);
+    __ br_zero(Assembler::zero, true, Assembler::pt, O7, Ldone);
+    __ delayed()->add(G0, 1, result_reg);
+
+    __ lduh(tmp1_reg, 0, chr1_reg);
+    __ lduh(tmp2_reg, 0, chr2_reg);
+    __ cmp (chr1_reg, chr2_reg);
+    __ br(Assembler::notEqual, true, Assembler::pt, Ldone);
+    __ delayed()->mov(G0, result_reg);
+    __ ba(false,Ldone);
+    __ delayed()->add(G0, 1, result_reg);
+
+    __ bind(Lchar);
+    __ add(tmp1_reg, limit_reg, tmp1_reg);
+    __ add(tmp2_reg, limit_reg, tmp2_reg);
+    __ neg(limit_reg); //negate count
+
+    __ lduh(tmp1_reg, limit_reg, chr1_reg);
+    __ bind(Lchar_loop);
+    __ lduh(tmp2_reg, limit_reg, chr2_reg);
+    __ cmp(chr1_reg, chr2_reg);
+    __ br(Assembler::notEqual, true, Assembler::pt, Ldone);
+    __ delayed()->mov(G0, result_reg); //not equal
+    __ inccc(limit_reg, sizeof(jchar));
+    // annul LDUH if branch is not taken to prevent access past end of string
+    __ br(Assembler::notZero, true, Assembler::pt, Lchar_loop); //annul on taken
+    __ delayed()->lduh(tmp1_reg, limit_reg, chr1_reg); // hoisted
+
+    __ add(G0, 1, result_reg);  //equal
+
+    __ bind(Ldone);
+  %}
+
+enc_class enc_Array_Equals(o0RegP ary1, o1RegP ary2, g3RegP tmp1, g4RegP tmp2, notemp_iRegI result) %{
+    Label Lvector, Ldone, Lloop;
+    MacroAssembler _masm(&cbuf);
+
+    Register   ary1_reg = reg_to_register_object($ary1$$reg);
+    Register   ary2_reg = reg_to_register_object($ary2$$reg);
+    Register   tmp1_reg = reg_to_register_object($tmp1$$reg);
+    Register   tmp2_reg = reg_to_register_object($tmp2$$reg);
+    Register result_reg = reg_to_register_object($result$$reg);
+
+    int length_offset  = arrayOopDesc::length_offset_in_bytes();
+    int base_offset    = arrayOopDesc::base_offset_in_bytes(T_CHAR);
+
+    // return true if the same array
+    __ cmp(ary1_reg, ary2_reg);
+    __ br(Assembler::equal, true, Assembler::pn, Ldone);
+    __ delayed()->add(G0, 1, result_reg); // equal
+
+    __ br_null(ary1_reg, true, Assembler::pn, Ldone);
+    __ delayed()->mov(G0, result_reg);    // not equal
+
+    __ br_null(ary2_reg, true, Assembler::pn, Ldone);
+    __ delayed()->mov(G0, result_reg);    // not equal
+
+    //load the lengths of arrays
+    __ ld(Address(ary1_reg, 0, length_offset), tmp1_reg);
+    __ ld(Address(ary2_reg, 0, length_offset), tmp2_reg);
+
+    // return false if the two arrays are not equal length
+    __ cmp(tmp1_reg, tmp2_reg);
+    __ br(Assembler::notEqual, true, Assembler::pn, Ldone);
+    __ delayed()->mov(G0, result_reg);     // not equal
+
+    __ br_zero(Assembler::zero, true, Assembler::pn, tmp1_reg, Ldone);
+    __ delayed()->add(G0, 1, result_reg); // zero-length arrays are equal
+
+    // load array addresses
+    __ add(ary1_reg, base_offset, ary1_reg);
+    __ add(ary2_reg, base_offset, ary2_reg);
+
+    // renaming registers
+    Register chr1_reg  =  tmp2_reg;   // for characters in ary1
+    Register chr2_reg  =  result_reg; // for characters in ary2
+    Register limit_reg =  tmp1_reg;   // length
+
+    // set byte count
+    __ sll(limit_reg, exact_log2(sizeof(jchar)), limit_reg);
+    __ andcc(limit_reg, 0x2, chr1_reg); //trailing character ?
+    __ br(Assembler::zero, false, Assembler::pt, Lvector);
+    __ delayed()->nop();
+
+    //compare the trailing char
+    __ sub(limit_reg, sizeof(jchar), limit_reg);
+    __ lduh(ary1_reg, limit_reg, chr1_reg);
+    __ lduh(ary2_reg, limit_reg, chr2_reg);
+    __ cmp(chr1_reg, chr2_reg);
+    __ br(Assembler::notEqual, true, Assembler::pt, Ldone);
+    __ delayed()->mov(G0, result_reg);     // not equal
+
+    // only one char ?
+    __ br_zero(Assembler::zero, true, Assembler::pn, limit_reg, Ldone);
+    __ delayed()->add(G0, 1, result_reg); // zero-length arrays are equal
+
+    __ bind(Lvector);
+    // Shift ary1_reg and ary2_reg to the end of the arrays, negate limit
+    __ add(ary1_reg, limit_reg, ary1_reg);
+    __ add(ary2_reg, limit_reg, ary2_reg);
+    __ neg(limit_reg, limit_reg);
+
+    __ lduw(ary1_reg, limit_reg, chr1_reg);
+    __ bind(Lloop);
+    __ lduw(ary2_reg, limit_reg, chr2_reg);
+    __ cmp(chr1_reg, chr2_reg);
+    __ br(Assembler::notEqual, false, Assembler::pt, Ldone);
+    __ delayed()->mov(G0, result_reg);     // not equal
+    __ inccc(limit_reg, 2*sizeof(jchar));
+    // annul LDUW if branch is not taken to prevent access past end of string
+    __ br(Assembler::notZero, true, Assembler::pt, Lloop); //annul on taken
+    __ delayed()->lduw(ary1_reg, limit_reg, chr1_reg); // hoisted
+
+    __ add(G0, 1, result_reg); // equals
+
+    __ bind(Ldone);
+  %}
+
   enc_class enc_rethrow() %{
     cbuf.set_inst_mark();
     Register temp_reg = G3;
@@ -9015,6 +9211,25 @@
   ins_pipe(long_memory_op);
 %}
 
+instruct string_equals(o0RegP str1, o1RegP str2, g3RegP tmp1, g4RegP tmp2, notemp_iRegI result,
+                       o7RegI tmp3, flagsReg ccr) %{
+  match(Set result (StrEquals str1 str2));
+  effect(USE_KILL str1, USE_KILL str2, KILL tmp1, KILL tmp2, KILL ccr, KILL tmp3);
+  ins_cost(300);
+  format %{ "String Equals $str1,$str2 -> $result" %}
+  ins_encode( enc_String_Equals(str1, str2, tmp1, tmp2, result) );
+  ins_pipe(long_memory_op);
+%}
+
+instruct array_equals(o0RegP ary1, o1RegP ary2, g3RegP tmp1, g4RegP tmp2, notemp_iRegI result,
+                        flagsReg ccr) %{
+  match(Set result (AryEq ary1 ary2));
+  effect(USE_KILL ary1, USE_KILL ary2, KILL tmp1, KILL tmp2, KILL ccr);
+  ins_cost(300);
+  format %{ "Array Equals $ary1,$ary2 -> $result" %}
+  ins_encode( enc_Array_Equals(ary1, ary2, tmp1, tmp2, result));
+  ins_pipe(long_memory_op);
+%}
 
 //---------- Population Count Instructions -------------------------------------
 
--- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp	Tue Mar 31 15:09:45 2009 -0700
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp	Tue Mar 31 19:20:34 2009 -0700
@@ -2173,6 +2173,31 @@
   emit_arith(0x0B, 0xC0, dst, src);
 }
 
+void Assembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
+  assert(VM_Version::supports_sse4_2(), "");
+
+  InstructionMark im(this);
+  emit_byte(0x66);
+  prefix(src, dst);
+  emit_byte(0x0F);
+  emit_byte(0x3A);
+  emit_byte(0x61);
+  emit_operand(dst, src);
+  emit_byte(imm8);
+}
+
+void Assembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
+  assert(VM_Version::supports_sse4_2(), "");
+
+  emit_byte(0x66);
+  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
+  emit_byte(0x0F);
+  emit_byte(0x3A);
+  emit_byte(0x61);
+  emit_byte(0xC0 | encode);
+  emit_byte(imm8);
+}
+
 // generic
 void Assembler::pop(Register dst) {
   int encode = prefix_and_encode(dst->encoding());
@@ -2330,6 +2355,29 @@
   emit_byte(shift);
 }
 
+void Assembler::ptest(XMMRegister dst, Address src) {
+  assert(VM_Version::supports_sse4_1(), "");
+
+  InstructionMark im(this);
+  emit_byte(0x66);
+  prefix(src, dst);
+  emit_byte(0x0F);
+  emit_byte(0x38);
+  emit_byte(0x17);
+  emit_operand(dst, src);
+}
+
+void Assembler::ptest(XMMRegister dst, XMMRegister src) {
+  assert(VM_Version::supports_sse4_1(), "");
+
+  emit_byte(0x66);
+  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
+  emit_byte(0x0F);
+  emit_byte(0x38);
+  emit_byte(0x17);
+  emit_byte(0xC0 | encode);
+}
+
 void Assembler::punpcklbw(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_byte(0x66);
--- a/hotspot/src/cpu/x86/vm/assembler_x86.hpp	Tue Mar 31 15:09:45 2009 -0700
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.hpp	Tue Mar 31 19:20:34 2009 -0700
@@ -1226,6 +1226,10 @@
   void orq(Register dst, Address src);
   void orq(Register dst, Register src);
 
+  // SSE4.2 string instructions
+  void pcmpestri(XMMRegister xmm1, XMMRegister xmm2, int imm8);
+  void pcmpestri(XMMRegister xmm1, Address src, int imm8);
+
   void popl(Address dst);
 
 #ifdef _LP64
@@ -1260,6 +1264,10 @@
   // Shift Right Logical Quadword Immediate
   void psrlq(XMMRegister dst, int shift);
 
+  // Logical Compare Double Quadword
+  void ptest(XMMRegister dst, XMMRegister src);
+  void ptest(XMMRegister dst, Address src);
+
   // Interleave Low Bytes
   void punpcklbw(XMMRegister dst, XMMRegister src);
 
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.cpp	Tue Mar 31 15:09:45 2009 -0700
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.cpp	Tue Mar 31 19:20:34 2009 -0700
@@ -408,6 +408,11 @@
           UseUnalignedLoadStores = true; // use movdqu on newest Intel cpus
         }
       }
+      if( supports_sse4_2() && UseSSE >= 4 ) {
+        if( FLAG_IS_DEFAULT(UseSSE42Intrinsics)) {
+          UseSSE42Intrinsics = true;
+        }
+      }
     }
   }
 
--- a/hotspot/src/cpu/x86/vm/x86_32.ad	Tue Mar 31 15:09:45 2009 -0700
+++ b/hotspot/src/cpu/x86/vm/x86_32.ad	Tue Mar 31 19:20:34 2009 -0700
@@ -3694,12 +3694,16 @@
     }
   %}
 
-  enc_class enc_String_Compare() %{
+  enc_class enc_String_Compare(eDIRegP str1, eSIRegP str2, regXD tmp1, regXD tmp2,
+                        eAXRegI tmp3, eBXRegI tmp4, eCXRegI result) %{
     Label ECX_GOOD_LABEL, LENGTH_DIFF_LABEL,
           POP_LABEL, DONE_LABEL, CONT_LABEL,
           WHILE_HEAD_LABEL;
     MacroAssembler masm(&cbuf);
 
+    XMMRegister tmp1Reg   = as_XMMRegister($tmp1$$reg);
+    XMMRegister tmp2Reg   = as_XMMRegister($tmp2$$reg);
+
     // Get the first character position in both strings
     //         [8] char array, [12] offset, [16] count
     int value_offset  = java_lang_String::value_offset_in_bytes();
@@ -3717,7 +3721,6 @@
     // Compute the minimum of the string lengths(rsi) and the
     // difference of the string lengths (stack)
 
-
     if (VM_Version::supports_cmov()) {
       masm.movl(rdi, Address(rdi, count_offset));
       masm.movl(rsi, Address(rsi, count_offset));
@@ -3731,7 +3734,7 @@
       masm.movl(rsi, rdi);
       masm.subl(rdi, rcx);
       masm.push(rdi);
-      masm.jcc(Assembler::lessEqual, ECX_GOOD_LABEL);
+      masm.jccb(Assembler::lessEqual, ECX_GOOD_LABEL);
       masm.movl(rsi, rcx);
       // rsi holds min, rcx is unused
     }
@@ -3756,7 +3759,7 @@
       Label LSkip2;
       // Check if the strings start at same location
       masm.cmpptr(rbx,rax);
-      masm.jcc(Assembler::notEqual, LSkip2);
+      masm.jccb(Assembler::notEqual, LSkip2);
 
       // Check if the length difference is zero (from stack)
       masm.cmpl(Address(rsp, 0), 0x0);
@@ -3766,9 +3769,52 @@
       masm.bind(LSkip2);
     }
 
-    // Shift rax, and rbx, to the end of the arrays, negate min
-    masm.lea(rax, Address(rax, rsi, Address::times_2, 2));
-    masm.lea(rbx, Address(rbx, rsi, Address::times_2, 2));
+   // Advance to next character
+    masm.addptr(rax, 2);
+    masm.addptr(rbx, 2);
+
+    if (UseSSE42Intrinsics) {
+      // With SSE4.2, use double quad vector compare
+      Label COMPARE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
+      // Setup to compare 16-byte vectors
+      masm.movl(rdi, rsi);
+      masm.andl(rsi, 0xfffffff8); // rsi holds the vector count
+      masm.andl(rdi, 0x00000007); // rdi holds the tail count
+      masm.testl(rsi, rsi);
+      masm.jccb(Assembler::zero, COMPARE_TAIL);
+
+      masm.lea(rax, Address(rax, rsi, Address::times_2));
+      masm.lea(rbx, Address(rbx, rsi, Address::times_2));
+      masm.negl(rsi);
+
+      masm.bind(COMPARE_VECTORS);
+      masm.movdqu(tmp1Reg, Address(rax, rsi, Address::times_2));
+      masm.movdqu(tmp2Reg, Address(rbx, rsi, Address::times_2));
+      masm.pxor(tmp1Reg, tmp2Reg);
+      masm.ptest(tmp1Reg, tmp1Reg);
+      masm.jccb(Assembler::notZero, VECTOR_NOT_EQUAL);
+      masm.addl(rsi, 8);
+      masm.jcc(Assembler::notZero, COMPARE_VECTORS);
+      masm.jmpb(COMPARE_TAIL);
+
+      // Mismatched characters in the vectors
+      masm.bind(VECTOR_NOT_EQUAL);
+      masm.lea(rax, Address(rax, rsi, Address::times_2));
+      masm.lea(rbx, Address(rbx, rsi, Address::times_2));
+      masm.movl(rdi, 8);
+
+      // Compare tail (< 8 chars), or rescan last vectors to
+      // find 1st mismatched characters
+      masm.bind(COMPARE_TAIL);
+      masm.testl(rdi, rdi);
+      masm.jccb(Assembler::zero, LENGTH_DIFF_LABEL);
+      masm.movl(rsi, rdi);
+      // Fallthru to tail compare
+    }
+
+    //Shift rax, and rbx, to the end of the arrays, negate min
+    masm.lea(rax, Address(rax, rsi, Address::times_2, 0));
+    masm.lea(rbx, Address(rbx, rsi, Address::times_2, 0));
     masm.negl(rsi);
 
     // Compare the rest of the characters
@@ -3776,93 +3822,329 @@
     masm.load_unsigned_short(rcx, Address(rbx, rsi, Address::times_2, 0));
     masm.load_unsigned_short(rdi, Address(rax, rsi, Address::times_2, 0));
     masm.subl(rcx, rdi);
-    masm.jcc(Assembler::notZero, POP_LABEL);
+    masm.jccb(Assembler::notZero, POP_LABEL);
     masm.incrementl(rsi);
     masm.jcc(Assembler::notZero, WHILE_HEAD_LABEL);
 
     // Strings are equal up to min length.  Return the length difference.
     masm.bind(LENGTH_DIFF_LABEL);
     masm.pop(rcx);
-    masm.jmp(DONE_LABEL);
+    masm.jmpb(DONE_LABEL);
 
     // Discard the stored length difference
     masm.bind(POP_LABEL);
     masm.addptr(rsp, 4);
-       
+
     // That's it
     masm.bind(DONE_LABEL);
   %}
 
-  enc_class enc_Array_Equals(eDIRegP ary1, eSIRegP ary2, eAXRegI tmp1, eBXRegI tmp2, eCXRegI result) %{
-    Label TRUE_LABEL, FALSE_LABEL, DONE_LABEL, COMPARE_LOOP_HDR, COMPARE_LOOP;
+ enc_class enc_String_Equals(eDIRegP str1, eSIRegP str2, regXD tmp1, regXD tmp2,
+                       eBXRegI tmp3, eCXRegI tmp4, eAXRegI result) %{
+    Label RET_TRUE, RET_FALSE, DONE, COMPARE_VECTORS, COMPARE_CHAR;
+    MacroAssembler masm(&cbuf);
+
+    XMMRegister tmp1Reg   = as_XMMRegister($tmp1$$reg);
+    XMMRegister tmp2Reg   = as_XMMRegister($tmp2$$reg);
+
+    int value_offset  = java_lang_String::value_offset_in_bytes();
+    int offset_offset = java_lang_String::offset_offset_in_bytes();
+    int count_offset  = java_lang_String::count_offset_in_bytes();
+    int base_offset   = arrayOopDesc::base_offset_in_bytes(T_CHAR);
+
+    // does source == target string?
+    masm.cmpptr(rdi, rsi);
+    masm.jcc(Assembler::equal, RET_TRUE);
+
+    // get and compare counts
+    masm.movl(rcx, Address(rdi, count_offset));
+    masm.movl(rax, Address(rsi, count_offset));
+    masm.cmpl(rcx, rax);
+    masm.jcc(Assembler::notEqual, RET_FALSE);
+    masm.testl(rax, rax);
+    masm.jcc(Assembler::zero, RET_TRUE);
+
+    // get source string offset and value
+    masm.movptr(rbx, Address(rsi, value_offset));
+    masm.movl(rax, Address(rsi, offset_offset));
+    masm.leal(rsi, Address(rbx, rax, Address::times_2, base_offset));
+
+    // get compare string offset and value
+    masm.movptr(rbx, Address(rdi, value_offset));
+    masm.movl(rax, Address(rdi, offset_offset));
+    masm.leal(rdi, Address(rbx, rax, Address::times_2, base_offset));
+
+    // Set byte count
+    masm.shll(rcx, 1);
+    masm.movl(rax, rcx);
+
+    if (UseSSE42Intrinsics) {
+      // With SSE4.2, use double quad vector compare
+      Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
+      // Compare 16-byte vectors
+      masm.andl(rcx, 0xfffffff0);  // vector count (in bytes)
+      masm.andl(rax, 0x0000000e);  // tail count (in bytes)
+      masm.testl(rcx, rcx);
+      masm.jccb(Assembler::zero, COMPARE_TAIL);
+      masm.lea(rdi, Address(rdi, rcx, Address::times_1));
+      masm.lea(rsi, Address(rsi, rcx, Address::times_1));
+      masm.negl(rcx);
+
+      masm.bind(COMPARE_WIDE_VECTORS);
+      masm.movdqu(tmp1Reg, Address(rdi, rcx, Address::times_1));
+      masm.movdqu(tmp2Reg, Address(rsi, rcx, Address::times_1));
+      masm.pxor(tmp1Reg, tmp2Reg);
+      masm.ptest(tmp1Reg, tmp1Reg);
+      masm.jccb(Assembler::notZero, RET_FALSE);
+      masm.addl(rcx, 16);
+      masm.jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
+      masm.bind(COMPARE_TAIL);
+      masm.movl(rcx, rax);
+      // Fallthru to tail compare
+    }
+
+    // Compare 4-byte vectors
+    masm.andl(rcx, 0xfffffffc);  // vector count (in bytes)
+    masm.andl(rax, 0x00000002);  // tail char (in bytes)
+    masm.testl(rcx, rcx);
+    masm.jccb(Assembler::zero, COMPARE_CHAR);
+    masm.lea(rdi, Address(rdi, rcx, Address::times_1));
+    masm.lea(rsi, Address(rsi, rcx, Address::times_1));
+    masm.negl(rcx);
+
+    masm.bind(COMPARE_VECTORS);
+    masm.movl(rbx, Address(rdi, rcx, Address::times_1));
+    masm.cmpl(rbx, Address(rsi, rcx, Address::times_1));
+    masm.jccb(Assembler::notEqual, RET_FALSE);
+    masm.addl(rcx, 4);
+    masm.jcc(Assembler::notZero, COMPARE_VECTORS);
+
+    // Compare trailing char (final 2 bytes), if any
+    masm.bind(COMPARE_CHAR);
+    masm.testl(rax, rax);
+    masm.jccb(Assembler::zero, RET_TRUE);
+    masm.load_unsigned_short(rbx, Address(rdi, 0));
+    masm.load_unsigned_short(rcx, Address(rsi, 0));
+    masm.cmpl(rbx, rcx);
+    masm.jccb(Assembler::notEqual, RET_FALSE);
+
+    masm.bind(RET_TRUE);
+    masm.movl(rax, 1);   // return true
+    masm.jmpb(DONE);
+
+    masm.bind(RET_FALSE);
+    masm.xorl(rax, rax); // return false
+
+    masm.bind(DONE);
+    %}
+
+ enc_class enc_String_IndexOf(eSIRegP str1, eDIRegP str2, regXD tmp1, eAXRegI tmp2,
+                        eCXRegI tmp3, eDXRegI tmp4, eBXRegI result) %{
+    // SSE4.2 version
+    Label LOAD_SUBSTR, PREP_FOR_SCAN, SCAN_TO_SUBSTR,
+          SCAN_SUBSTR, RET_NEG_ONE, RET_NOT_FOUND, CLEANUP, DONE;
     MacroAssembler masm(&cbuf);
 
-    Register ary1Reg   = as_Register($ary1$$reg);
-    Register ary2Reg   = as_Register($ary2$$reg);
-    Register tmp1Reg   = as_Register($tmp1$$reg);
-    Register tmp2Reg   = as_Register($tmp2$$reg);
-    Register resultReg = as_Register($result$$reg);
+    XMMRegister tmp1Reg   = as_XMMRegister($tmp1$$reg);
+
+    // Get the first character position in both strings
+    //         [8] char array, [12] offset, [16] count
+    int value_offset  = java_lang_String::value_offset_in_bytes();
+    int offset_offset = java_lang_String::offset_offset_in_bytes();
+    int count_offset  = java_lang_String::count_offset_in_bytes();
+    int base_offset   = arrayOopDesc::base_offset_in_bytes(T_CHAR);
+
+    // Get counts for string and substr
+    masm.movl(rdx, Address(rsi, count_offset));
+    masm.movl(rax, Address(rdi, count_offset));
+    // Check for substr count > string count
+    masm.cmpl(rax, rdx);
+    masm.jcc(Assembler::greater, RET_NEG_ONE);
+
+    // Start the indexOf operation
+    // Get start addr of string
+    masm.movptr(rbx, Address(rsi, value_offset));
+    masm.movl(rcx, Address(rsi, offset_offset));
+    masm.lea(rsi, Address(rbx, rcx, Address::times_2, base_offset));
+    masm.push(rsi);
+
+    // Get start addr of substr
+    masm.movptr(rbx, Address(rdi, value_offset));
+    masm.movl(rcx, Address(rdi, offset_offset));
+    masm.lea(rdi, Address(rbx, rcx, Address::times_2, base_offset));
+    masm.push(rdi);
+    masm.push(rax);
+    masm.jmpb(PREP_FOR_SCAN);
+
+    // Substr count saved at sp
+    // Substr saved at sp+4
+    // String saved at sp+8
+
+    // Prep to load substr for scan
+    masm.bind(LOAD_SUBSTR);
+    masm.movptr(rdi, Address(rsp, 4));
+    masm.movl(rax, Address(rsp, 0));
+
+    // Load substr
+    masm.bind(PREP_FOR_SCAN);
+    masm.movdqu(tmp1Reg, Address(rdi, 0));
+    masm.addl(rdx, 8);        // prime the loop
+    masm.subptr(rsi, 16);
+
+    // Scan string for substr in 16-byte vectors
+    masm.bind(SCAN_TO_SUBSTR);
+    masm.subl(rdx, 8);
+    masm.addptr(rsi, 16);
+    masm.pcmpestri(tmp1Reg, Address(rsi, 0), 0x0d);
+    masm.jcc(Assembler::above, SCAN_TO_SUBSTR);     // CF == 0 && ZF == 0
+    masm.jccb(Assembler::aboveEqual, RET_NOT_FOUND); // CF == 0
+
+    // Fallthru: found a potential substr
+
+    // Make sure string is still long enough
+    masm.subl(rdx, rcx);
+    masm.cmpl(rdx, rax);
+    masm.jccb(Assembler::negative, RET_NOT_FOUND);
+    // Compute start addr of substr
+    masm.lea(rsi, Address(rsi, rcx, Address::times_2));
+    masm.movptr(rbx, rsi);
+
+    // Compare potential substr
+    masm.addl(rdx, 8);        // prime the loop
+    masm.addl(rax, 8);
+    masm.subptr(rsi, 16);
+    masm.subptr(rdi, 16);
+
+    // Scan 16-byte vectors of string and substr
+    masm.bind(SCAN_SUBSTR);
+    masm.subl(rax, 8);
+    masm.subl(rdx, 8);
+    masm.addptr(rsi, 16);
+    masm.addptr(rdi, 16);
+    masm.movdqu(tmp1Reg, Address(rdi, 0));
+    masm.pcmpestri(tmp1Reg, Address(rsi, 0), 0x0d);
+    masm.jcc(Assembler::noOverflow, LOAD_SUBSTR);   // OF == 0
+    masm.jcc(Assembler::positive, SCAN_SUBSTR);     // SF == 0
+
+    // Compute substr offset
+    masm.movptr(rsi, Address(rsp, 8));
+    masm.subptr(rbx, rsi);
+    masm.shrl(rbx, 1);
+    masm.jmpb(CLEANUP);
+
+    masm.bind(RET_NEG_ONE);
+    masm.movl(rbx, -1);
+    masm.jmpb(DONE);
+
+    masm.bind(RET_NOT_FOUND);
+    masm.movl(rbx, -1);
+
+    masm.bind(CLEANUP);
+    masm.addptr(rsp, 12);
+
+    masm.bind(DONE);
+  %}
+
+  enc_class enc_Array_Equals(eDIRegP ary1, eSIRegP ary2, regXD tmp1, regXD tmp2,
+                             eBXRegI tmp3, eDXRegI tmp4, eAXRegI result) %{
+    Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR;
+    MacroAssembler masm(&cbuf);
+
+    XMMRegister tmp1Reg   = as_XMMRegister($tmp1$$reg);
+    XMMRegister tmp2Reg   = as_XMMRegister($tmp2$$reg);
+    Register ary1Reg      = as_Register($ary1$$reg);
+    Register ary2Reg      = as_Register($ary2$$reg);
+    Register tmp3Reg      = as_Register($tmp3$$reg);
+    Register tmp4Reg      = as_Register($tmp4$$reg);
+    Register resultReg    = as_Register($result$$reg);
 
     int length_offset  = arrayOopDesc::length_offset_in_bytes();
     int base_offset    = arrayOopDesc::base_offset_in_bytes(T_CHAR);
 
     // Check the input args
-    masm.cmpl(ary1Reg, ary2Reg);
+    masm.cmpptr(ary1Reg, ary2Reg);
     masm.jcc(Assembler::equal, TRUE_LABEL);
-    masm.testl(ary1Reg, ary1Reg);
+    masm.testptr(ary1Reg, ary1Reg);
     masm.jcc(Assembler::zero, FALSE_LABEL);
-    masm.testl(ary2Reg, ary2Reg);
+    masm.testptr(ary2Reg, ary2Reg);
     masm.jcc(Assembler::zero, FALSE_LABEL);
 
     // Check the lengths
-    masm.movl(tmp2Reg, Address(ary1Reg, length_offset));
+    masm.movl(tmp4Reg, Address(ary1Reg, length_offset));
     masm.movl(resultReg, Address(ary2Reg, length_offset));
-    masm.cmpl(tmp2Reg, resultReg);
+    masm.cmpl(tmp4Reg, resultReg);
     masm.jcc(Assembler::notEqual, FALSE_LABEL);
     masm.testl(resultReg, resultReg);
     masm.jcc(Assembler::zero, TRUE_LABEL);
 
-    // Get the number of 4 byte vectors to compare
-    masm.shrl(resultReg, 1);
-
-    // Check for odd-length arrays
-    masm.andl(tmp2Reg, 1);
-    masm.testl(tmp2Reg, tmp2Reg);
-    masm.jcc(Assembler::zero, COMPARE_LOOP_HDR);
-
-    // Compare 2-byte "tail" at end of arrays
-    masm.load_unsigned_short(tmp1Reg, Address(ary1Reg, resultReg, Address::times_4, base_offset));
-    masm.load_unsigned_short(tmp2Reg, Address(ary2Reg, resultReg, Address::times_4, base_offset));
-    masm.cmpl(tmp1Reg, tmp2Reg);
-    masm.jcc(Assembler::notEqual, FALSE_LABEL);
+    // Load array addrs
+    masm.lea(ary1Reg, Address(ary1Reg, base_offset));
+    masm.lea(ary2Reg, Address(ary2Reg, base_offset));
+
+    // Set byte count
+    masm.shll(tmp4Reg, 1);
+    masm.movl(resultReg, tmp4Reg);
+
+    if (UseSSE42Intrinsics) {
+      // With SSE4.2, use double quad vector compare
+      Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
+      // Compare 16-byte vectors
+      masm.andl(tmp4Reg, 0xfffffff0);    // vector count (in bytes)
+      masm.andl(resultReg, 0x0000000e);  // tail count (in bytes)
+      masm.testl(tmp4Reg, tmp4Reg);
+      masm.jccb(Assembler::zero, COMPARE_TAIL);
+      masm.lea(ary1Reg, Address(ary1Reg, tmp4Reg, Address::times_1));
+      masm.lea(ary2Reg, Address(ary2Reg, tmp4Reg, Address::times_1));
+      masm.negl(tmp4Reg);
+
+      masm.bind(COMPARE_WIDE_VECTORS);
+      masm.movdqu(tmp1Reg, Address(ary1Reg, tmp4Reg, Address::times_1));
+      masm.movdqu(tmp2Reg, Address(ary2Reg, tmp4Reg, Address::times_1));
+      masm.pxor(tmp1Reg, tmp2Reg);
+      masm.ptest(tmp1Reg, tmp1Reg);
+
+      masm.jccb(Assembler::notZero, FALSE_LABEL);
+      masm.addl(tmp4Reg, 16);
+      masm.jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
+      masm.bind(COMPARE_TAIL);
+      masm.movl(tmp4Reg, resultReg);
+      // Fallthru to tail compare
+    }
+
+    // Compare 4-byte vectors
+    masm.andl(tmp4Reg, 0xfffffffc);    // vector count (in bytes)
+    masm.andl(resultReg, 0x00000002);  // tail char (in bytes)
+    masm.testl(tmp4Reg, tmp4Reg);
+    masm.jccb(Assembler::zero, COMPARE_CHAR);
+    masm.lea(ary1Reg, Address(ary1Reg, tmp4Reg, Address::times_1));
+    masm.lea(ary2Reg, Address(ary2Reg, tmp4Reg, Address::times_1));
+    masm.negl(tmp4Reg);
+
+    masm.bind(COMPARE_VECTORS);
+    masm.movl(tmp3Reg, Address(ary1Reg, tmp4Reg, Address::times_1));
+    masm.cmpl(tmp3Reg, Address(ary2Reg, tmp4Reg, Address::times_1));
+    masm.jccb(Assembler::notEqual, FALSE_LABEL);
+    masm.addl(tmp4Reg, 4);
+    masm.jcc(Assembler::notZero, COMPARE_VECTORS);
+
+    // Compare trailing char (final 2 bytes), if any
+    masm.bind(COMPARE_CHAR);
     masm.testl(resultReg, resultReg);
-    masm.jcc(Assembler::zero, TRUE_LABEL);
-
-    // Setup compare loop
-    masm.bind(COMPARE_LOOP_HDR);
-    // Shift tmp1Reg and tmp2Reg to the last 4-byte boundary of the arrays
-    masm.leal(tmp1Reg, Address(ary1Reg, resultReg, Address::times_4, base_offset));
-    masm.leal(tmp2Reg, Address(ary2Reg, resultReg, Address::times_4, base_offset));
-    masm.negl(resultReg);
-
-    // 4-byte-wide compare loop
-    masm.bind(COMPARE_LOOP);
-    masm.movl(ary1Reg, Address(tmp1Reg, resultReg, Address::times_4, 0));
-    masm.movl(ary2Reg, Address(tmp2Reg, resultReg, Address::times_4, 0));
-    masm.cmpl(ary1Reg, ary2Reg);
-    masm.jcc(Assembler::notEqual, FALSE_LABEL);
-    masm.increment(resultReg);
-    masm.jcc(Assembler::notZero, COMPARE_LOOP);
+    masm.jccb(Assembler::zero, TRUE_LABEL);
+    masm.load_unsigned_short(tmp3Reg, Address(ary1Reg, 0));
+    masm.load_unsigned_short(tmp4Reg, Address(ary2Reg, 0));
+    masm.cmpl(tmp3Reg, tmp4Reg);
+    masm.jccb(Assembler::notEqual, FALSE_LABEL);
 
     masm.bind(TRUE_LABEL);
     masm.movl(resultReg, 1);   // return true
-    masm.jmp(DONE_LABEL);
+    masm.jmpb(DONE);
 
     masm.bind(FALSE_LABEL);
     masm.xorl(resultReg, resultReg); // return false
 
     // That's it
-    masm.bind(DONE_LABEL);
+    masm.bind(DONE);
   %}
 
   enc_class enc_pop_rdx() %{
@@ -12074,11 +12356,8 @@
   ins_pipe( fpu_reg_reg );
 %}
 
-
-
 // =======================================================================
 // fast clearing of an array
-
 instruct rep_stos(eCXRegI cnt, eDIRegP base, eAXRegI zero, Universe dummy, eFlagsReg cr) %{
   match(Set dummy (ClearArray cnt base));
   effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
@@ -12092,24 +12371,48 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct string_compare(eDIRegP str1, eSIRegP str2, eAXRegI tmp1, eBXRegI tmp2, eCXRegI result, eFlagsReg cr) %{
+instruct string_compare(eDIRegP str1, eSIRegP str2, regXD tmp1, regXD tmp2,
+                        eAXRegI tmp3, eBXRegI tmp4, eCXRegI result, eFlagsReg cr) %{
   match(Set result (StrComp str1 str2));
-  effect(USE_KILL str1, USE_KILL str2, KILL tmp1, KILL tmp2, KILL cr);
+  effect(TEMP tmp1, TEMP tmp2, USE_KILL str1, USE_KILL str2, KILL tmp3, KILL tmp4, KILL cr);
   //ins_cost(300);
 
   format %{ "String Compare $str1,$str2 -> $result    // KILL EAX, EBX" %}
-  ins_encode( enc_String_Compare() );
+  ins_encode( enc_String_Compare(str1, str2, tmp1, tmp2, tmp3, tmp4, result) );
+  ins_pipe( pipe_slow );
+%}
+
+// fast string equals
+instruct string_equals(eDIRegP str1, eSIRegP str2, regXD tmp1, regXD tmp2,
+                       eBXRegI tmp3, eCXRegI tmp4, eAXRegI result, eFlagsReg cr) %{
+  match(Set result (StrEquals str1 str2));
+  effect(TEMP tmp1, TEMP tmp2, USE_KILL str1, USE_KILL str2, KILL tmp3, KILL tmp4, KILL cr);
+
+  format %{ "String Equals $str1,$str2 -> $result    // KILL EBX, ECX" %}
+  ins_encode( enc_String_Equals(tmp1, tmp2, str1, str2, tmp3, tmp4, result) );
+  ins_pipe( pipe_slow );
+%}
+
+instruct string_indexof(eSIRegP str1, eDIRegP str2, regXD tmp1, eAXRegI tmp2,
+                        eCXRegI tmp3, eDXRegI tmp4, eBXRegI result, eFlagsReg cr) %{
+  predicate(UseSSE42Intrinsics);
+  match(Set result (StrIndexOf str1 str2));
+  effect(TEMP tmp1, USE_KILL str1, USE_KILL str2, KILL tmp2, KILL tmp3, KILL tmp4, KILL cr);
+
+  format %{ "String IndexOf $str1,$str2 -> $result    // KILL EAX, ECX, EDX" %}
+  ins_encode( enc_String_IndexOf(str1, str2, tmp1, tmp2, tmp3, tmp4, result) );
   ins_pipe( pipe_slow );
 %}
 
 // fast array equals
-instruct array_equals(eDIRegP ary1, eSIRegP ary2, eAXRegI tmp1, eBXRegI tmp2, eCXRegI result, eFlagsReg cr) %{
+instruct array_equals(eDIRegP ary1, eSIRegP ary2, regXD tmp1, regXD tmp2, eBXRegI tmp3,
+                      eDXRegI tmp4, eAXRegI result, eFlagsReg cr) %{
   match(Set result (AryEq ary1 ary2));
-  effect(USE_KILL ary1, USE_KILL ary2, KILL tmp1, KILL tmp2, KILL cr);
+  effect(TEMP tmp1, TEMP tmp2, USE_KILL ary1, USE_KILL ary2, KILL tmp3, KILL tmp4, KILL cr);
   //ins_cost(300);
 
-  format %{ "Array Equals $ary1,$ary2 -> $result    // KILL EAX, EBX" %}
-  ins_encode( enc_Array_Equals(ary1, ary2, tmp1, tmp2, result) );
+  format %{ "Array Equals $ary1,$ary2 -> $result    // KILL EBX, EDX" %}
+  ins_encode( enc_Array_Equals(ary1, ary2, tmp1, tmp2, tmp3, tmp4, result) );
   ins_pipe( pipe_slow );
 %}
 
--- a/hotspot/src/cpu/x86/vm/x86_64.ad	Tue Mar 31 15:09:45 2009 -0700
+++ b/hotspot/src/cpu/x86/vm/x86_64.ad	Tue Mar 31 19:20:34 2009 -0700
@@ -3694,13 +3694,16 @@
     }
   %}
 
-  enc_class enc_String_Compare()
-  %{
+  enc_class enc_String_Compare(rdi_RegP str1, rsi_RegP str2, regD tmp1, regD tmp2,
+                        rax_RegI tmp3, rbx_RegI tmp4, rcx_RegI result) %{
     Label RCX_GOOD_LABEL, LENGTH_DIFF_LABEL,
           POP_LABEL, DONE_LABEL, CONT_LABEL,
           WHILE_HEAD_LABEL;
     MacroAssembler masm(&cbuf);
 
+    XMMRegister tmp1Reg   = as_XMMRegister($tmp1$$reg);
+    XMMRegister tmp2Reg   = as_XMMRegister($tmp2$$reg);
+
     // Get the first character position in both strings
     //         [8] char array, [12] offset, [16] count
     int value_offset  = java_lang_String::value_offset_in_bytes();
@@ -3718,6 +3721,7 @@
     // Compute the minimum of the string lengths(rsi) and the
     // difference of the string lengths (stack)
 
+    // do the conditional move stuff
     masm.movl(rdi, Address(rdi, count_offset));
     masm.movl(rsi, Address(rsi, count_offset));
     masm.movl(rcx, rdi);
@@ -3745,7 +3749,7 @@
       Label LSkip2;
       // Check if the strings start at same location
       masm.cmpptr(rbx, rax);
-      masm.jcc(Assembler::notEqual, LSkip2);
+      masm.jccb(Assembler::notEqual, LSkip2);
 
       // Check if the length difference is zero (from stack)
       masm.cmpl(Address(rsp, 0), 0x0);
@@ -3755,9 +3759,52 @@
       masm.bind(LSkip2);
     }
 
+    // Advance to next character
+    masm.addptr(rax, 2);
+    masm.addptr(rbx, 2);
+
+    if (UseSSE42Intrinsics) {
+      // With SSE4.2, use double quad vector compare
+      Label COMPARE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
+      // Setup to compare 16-byte vectors
+      masm.movl(rdi, rsi);
+      masm.andl(rsi, 0xfffffff8); // rsi holds the vector count
+      masm.andl(rdi, 0x00000007); // rdi holds the tail count
+      masm.testl(rsi, rsi);
+      masm.jccb(Assembler::zero, COMPARE_TAIL);
+
+      masm.lea(rax, Address(rax, rsi, Address::times_2));
+      masm.lea(rbx, Address(rbx, rsi, Address::times_2));
+      masm.negptr(rsi);
+
+      masm.bind(COMPARE_VECTORS);
+      masm.movdqu(tmp1Reg, Address(rax, rsi, Address::times_2));
+      masm.movdqu(tmp2Reg, Address(rbx, rsi, Address::times_2));
+      masm.pxor(tmp1Reg, tmp2Reg);
+      masm.ptest(tmp1Reg, tmp1Reg);
+      masm.jccb(Assembler::notZero, VECTOR_NOT_EQUAL);
+      masm.addptr(rsi, 8);
+      masm.jcc(Assembler::notZero, COMPARE_VECTORS);
+      masm.jmpb(COMPARE_TAIL);
+
+      // Mismatched characters in the vectors
+      masm.bind(VECTOR_NOT_EQUAL);
+      masm.lea(rax, Address(rax, rsi, Address::times_2));
+      masm.lea(rbx, Address(rbx, rsi, Address::times_2));
+      masm.movl(rdi, 8);
+
+      // Compare tail (< 8 chars), or rescan last vectors to
+      // find 1st mismatched characters
+      masm.bind(COMPARE_TAIL);
+      masm.testl(rdi, rdi);
+      masm.jccb(Assembler::zero, LENGTH_DIFF_LABEL);
+      masm.movl(rsi, rdi);
+      // Fallthru to tail compare
+    }
+
     // Shift RAX and RBX to the end of the arrays, negate min
-    masm.lea(rax, Address(rax, rsi, Address::times_2, 2));
-    masm.lea(rbx, Address(rbx, rsi, Address::times_2, 2));
+    masm.lea(rax, Address(rax, rsi, Address::times_2, 0));
+    masm.lea(rbx, Address(rbx, rsi, Address::times_2, 0));
     masm.negptr(rsi);
 
     // Compare the rest of the characters
@@ -3765,93 +3812,329 @@
     masm.load_unsigned_short(rcx, Address(rbx, rsi, Address::times_2, 0));
     masm.load_unsigned_short(rdi, Address(rax, rsi, Address::times_2, 0));
     masm.subl(rcx, rdi);
-    masm.jcc(Assembler::notZero, POP_LABEL);
+    masm.jccb(Assembler::notZero, POP_LABEL);
     masm.increment(rsi);
     masm.jcc(Assembler::notZero, WHILE_HEAD_LABEL);
 
     // Strings are equal up to min length.  Return the length difference.
     masm.bind(LENGTH_DIFF_LABEL);
     masm.pop(rcx);
-    masm.jmp(DONE_LABEL);
+    masm.jmpb(DONE_LABEL);
 
     // Discard the stored length difference
     masm.bind(POP_LABEL);
     masm.addptr(rsp, 8);
-       
+
     // That's it
     masm.bind(DONE_LABEL);
   %}
 
-  enc_class enc_Array_Equals(rdi_RegP ary1, rsi_RegP ary2, rax_RegI tmp1, rbx_RegI tmp2, rcx_RegI result) %{
-    Label TRUE_LABEL, FALSE_LABEL, DONE_LABEL, COMPARE_LOOP_HDR, COMPARE_LOOP;
+ enc_class enc_String_IndexOf(rsi_RegP str1, rdi_RegP str2, regD tmp1, rax_RegI tmp2,
+                        rcx_RegI tmp3, rdx_RegI tmp4, rbx_RegI result) %{
+    // SSE4.2 version
+    Label LOAD_SUBSTR, PREP_FOR_SCAN, SCAN_TO_SUBSTR,
+          SCAN_SUBSTR, RET_NEG_ONE, RET_NOT_FOUND, CLEANUP, DONE;
     MacroAssembler masm(&cbuf);
 
-    Register ary1Reg   = as_Register($ary1$$reg);
-    Register ary2Reg   = as_Register($ary2$$reg);
-    Register tmp1Reg   = as_Register($tmp1$$reg);
-    Register tmp2Reg   = as_Register($tmp2$$reg);
-    Register resultReg = as_Register($result$$reg);
+    XMMRegister tmp1Reg   = as_XMMRegister($tmp1$$reg);
+
+    // Get the first character position in both strings
+    //         [8] char array, [12] offset, [16] count
+    int value_offset  = java_lang_String::value_offset_in_bytes();
+    int offset_offset = java_lang_String::offset_offset_in_bytes();
+    int count_offset  = java_lang_String::count_offset_in_bytes();
+    int base_offset   = arrayOopDesc::base_offset_in_bytes(T_CHAR);
+
+    // Get counts for string and substr
+    masm.movl(rdx, Address(rsi, count_offset));
+    masm.movl(rax, Address(rdi, count_offset));
+    // Check for substr count > string count
+    masm.cmpl(rax, rdx);
+    masm.jcc(Assembler::greater, RET_NEG_ONE);
+
+    // Start the indexOf operation
+    // Get start addr of string
+    masm.load_heap_oop(rbx, Address(rsi, value_offset));
+    masm.movl(rcx, Address(rsi, offset_offset));
+    masm.lea(rsi, Address(rbx, rcx, Address::times_2, base_offset));
+    masm.push(rsi);
+
+    // Get start addr of substr
+    masm.load_heap_oop(rbx, Address(rdi, value_offset));
+    masm.movl(rcx, Address(rdi, offset_offset));
+    masm.lea(rdi, Address(rbx, rcx, Address::times_2, base_offset));
+    masm.push(rdi);
+    masm.push(rax);
+    masm.jmpb(PREP_FOR_SCAN);
+
+    // Substr count saved at sp
+    // Substr saved at sp+8
+    // String saved at sp+16
+
+    // Prep to load substr for scan
+    masm.bind(LOAD_SUBSTR);
+    masm.movptr(rdi, Address(rsp, 8));
+    masm.movl(rax, Address(rsp, 0));
+
+    // Load substr
+    masm.bind(PREP_FOR_SCAN);
+    masm.movdqu(tmp1Reg, Address(rdi, 0));
+    masm.addq(rdx, 8);    // prime the loop
+    masm.subptr(rsi, 16);
+
+    // Scan string for substr in 16-byte vectors
+    masm.bind(SCAN_TO_SUBSTR);
+    masm.subq(rdx, 8);
+    masm.addptr(rsi, 16);
+    masm.pcmpestri(tmp1Reg, Address(rsi, 0), 0x0d);
+    masm.jcc(Assembler::above, SCAN_TO_SUBSTR);
+    masm.jccb(Assembler::aboveEqual, RET_NOT_FOUND);
+
+    // Fallthru: found a potential substr
+
+    //Make sure string is still long enough
+    masm.subl(rdx, rcx);
+    masm.cmpl(rdx, rax);
+    masm.jccb(Assembler::negative, RET_NOT_FOUND);
+    // Compute start addr of substr
+    masm.lea(rsi, Address(rsi, rcx, Address::times_2));
+    masm.movptr(rbx, rsi);
+
+    // Compare potential substr
+    masm.addq(rdx, 8);        // prime the loop
+    masm.addq(rax, 8);
+    masm.subptr(rsi, 16);
+    masm.subptr(rdi, 16);
+
+    // Scan 16-byte vectors of string and substr
+    masm.bind(SCAN_SUBSTR);
+    masm.subq(rax, 8);
+    masm.subq(rdx, 8);
+    masm.addptr(rsi, 16);
+    masm.addptr(rdi, 16);
+    masm.movdqu(tmp1Reg, Address(rdi, 0));
+    masm.pcmpestri(tmp1Reg, Address(rsi, 0), 0x0d);
+    masm.jcc(Assembler::noOverflow, LOAD_SUBSTR);   // OF == 0
+    masm.jcc(Assembler::positive, SCAN_SUBSTR);     // SF == 0
+
+    // Compute substr offset
+    masm.movptr(rsi, Address(rsp, 16));
+    masm.subptr(rbx, rsi);
+    masm.shrl(rbx, 1);
+    masm.jmpb(CLEANUP);
+
+    masm.bind(RET_NEG_ONE);
+    masm.movl(rbx, -1);
+    masm.jmpb(DONE);
+
+    masm.bind(RET_NOT_FOUND);
+    masm.movl(rbx, -1);
+
+    masm.bind(CLEANUP);
+    masm.addptr(rsp, 24);
+
+    masm.bind(DONE);
+  %}
+
+  enc_class enc_String_Equals(rdi_RegP str1, rsi_RegP str2, regD tmp1, regD tmp2,
+                              rbx_RegI tmp3, rcx_RegI tmp2, rax_RegI result) %{
+    Label RET_TRUE, RET_FALSE, DONE, COMPARE_VECTORS, COMPARE_CHAR;
+    MacroAssembler masm(&cbuf);
+
+    XMMRegister tmp1Reg   = as_XMMRegister($tmp1$$reg);
+    XMMRegister tmp2Reg   = as_XMMRegister($tmp2$$reg);
+
+    int value_offset  = java_lang_String::value_offset_in_bytes();
+    int offset_offset = java_lang_String::offset_offset_in_bytes();
+    int count_offset  = java_lang_String::count_offset_in_bytes();
+    int base_offset   = arrayOopDesc::base_offset_in_bytes(T_CHAR);
+
+    // does source == target string?
+    masm.cmpptr(rdi, rsi);
+    masm.jcc(Assembler::equal, RET_TRUE);
+
+    // get and compare counts
+    masm.movl(rcx, Address(rdi, count_offset));
+    masm.movl(rax, Address(rsi, count_offset));
+    masm.cmpl(rcx, rax);
+    masm.jcc(Assembler::notEqual, RET_FALSE);
+    masm.testl(rax, rax);
+    masm.jcc(Assembler::zero, RET_TRUE);
+
+    // get source string offset and value
+    masm.load_heap_oop(rbx, Address(rsi, value_offset));
+    masm.movl(rax, Address(rsi, offset_offset));
+    masm.lea(rsi, Address(rbx, rax, Address::times_2, base_offset));
+
+    // get compare string offset and value
+    masm.load_heap_oop(rbx, Address(rdi, value_offset));
+    masm.movl(rax, Address(rdi, offset_offset));
+    masm.lea(rdi, Address(rbx, rax, Address::times_2, base_offset));
+
+    // Set byte count
+    masm.shll(rcx, 1);
+    masm.movl(rax, rcx);
+
+    if (UseSSE42Intrinsics) {
+      // With SSE4.2, use double quad vector compare
+      Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
+      // Compare 16-byte vectors
+      masm.andl(rcx, 0xfffffff0);  // vector count (in bytes)
+      masm.andl(rax, 0x0000000e);  // tail count (in bytes)
+      masm.testl(rcx, rcx);
+      masm.jccb(Assembler::zero, COMPARE_TAIL);
+      masm.lea(rdi, Address(rdi, rcx, Address::times_1));
+      masm.lea(rsi, Address(rsi, rcx, Address::times_1));
+      masm.negptr(rcx);
+
+      masm.bind(COMPARE_WIDE_VECTORS);
+      masm.movdqu(tmp1Reg, Address(rdi, rcx, Address::times_1));
+      masm.movdqu(tmp2Reg, Address(rsi, rcx, Address::times_1));
+      masm.pxor(tmp1Reg, tmp2Reg);
+      masm.ptest(tmp1Reg, tmp1Reg);
+      masm.jccb(Assembler::notZero, RET_FALSE);
+      masm.addptr(rcx, 16);
+      masm.jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
+      masm.bind(COMPARE_TAIL);
+      masm.movl(rcx, rax);
+      // Fallthru to tail compare
+    }
+
+    // Compare 4-byte vectors
+    masm.andl(rcx, 0xfffffffc);  // vector count (in bytes)
+    masm.andl(rax, 0x00000002);  // tail char (in bytes)
+    masm.testl(rcx, rcx);
+    masm.jccb(Assembler::zero, COMPARE_CHAR);
+    masm.lea(rdi, Address(rdi, rcx, Address::times_1));
+    masm.lea(rsi, Address(rsi, rcx, Address::times_1));
+    masm.negptr(rcx);
+
+    masm.bind(COMPARE_VECTORS);
+    masm.movl(rbx, Address(rdi, rcx, Address::times_1));
+    masm.cmpl(rbx, Address(rsi, rcx, Address::times_1));
+    masm.jccb(Assembler::notEqual, RET_FALSE);
+    masm.addptr(rcx, 4);
+    masm.jcc(Assembler::notZero, COMPARE_VECTORS);
+
+    // Compare trailing char (final 2 bytes), if any
+    masm.bind(COMPARE_CHAR);
+    masm.testl(rax, rax);
+    masm.jccb(Assembler::zero, RET_TRUE);
+    masm.load_unsigned_short(rbx, Address(rdi, 0));
+    masm.load_unsigned_short(rcx, Address(rsi, 0));
+    masm.cmpl(rbx, rcx);
+    masm.jccb(Assembler::notEqual, RET_FALSE);
+
+    masm.bind(RET_TRUE);
+    masm.movl(rax, 1);   // return true
+    masm.jmpb(DONE);
+
+    masm.bind(RET_FALSE);
+    masm.xorl(rax, rax); // return false
+
+    masm.bind(DONE);
+  %}
+
+  enc_class enc_Array_Equals(rdi_RegP ary1, rsi_RegP ary2, regD tmp1, regD tmp2,
+                             rax_RegI tmp3, rbx_RegI tmp4, rcx_RegI result) %{
+    Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR;
+    MacroAssembler masm(&cbuf);
+
+    XMMRegister tmp1Reg   = as_XMMRegister($tmp1$$reg);
+    XMMRegister tmp2Reg   = as_XMMRegister($tmp2$$reg);
+    Register ary1Reg      = as_Register($ary1$$reg);
+    Register ary2Reg      = as_Register($ary2$$reg);
+    Register tmp3Reg      = as_Register($tmp3$$reg);
+    Register tmp4Reg      = as_Register($tmp4$$reg);
+    Register resultReg    = as_Register($result$$reg);
 
     int length_offset  = arrayOopDesc::length_offset_in_bytes();
     int base_offset    = arrayOopDesc::base_offset_in_bytes(T_CHAR);
 
     // Check the input args
-    masm.cmpq(ary1Reg, ary2Reg);                        
+    masm.cmpq(ary1Reg, ary2Reg);
     masm.jcc(Assembler::equal, TRUE_LABEL);
-    masm.testq(ary1Reg, ary1Reg);                       
+    masm.testq(ary1Reg, ary1Reg);
     masm.jcc(Assembler::zero, FALSE_LABEL);
-    masm.testq(ary2Reg, ary2Reg);                       
+    masm.testq(ary2Reg, ary2Reg);
     masm.jcc(Assembler::zero, FALSE_LABEL);
 
     // Check the lengths
-    masm.movl(tmp2Reg, Address(ary1Reg, length_offset));
+    masm.movl(tmp4Reg, Address(ary1Reg, length_offset));
     masm.movl(resultReg, Address(ary2Reg, length_offset));
-    masm.cmpl(tmp2Reg, resultReg);
+    masm.cmpl(tmp4Reg, resultReg);
     masm.jcc(Assembler::notEqual, FALSE_LABEL);
     masm.testl(resultReg, resultReg);
     masm.jcc(Assembler::zero, TRUE_LABEL);
 
-    // Get the number of 4 byte vectors to compare
-    masm.shrl(resultReg, 1);
-
-    // Check for odd-length arrays
-    masm.andl(tmp2Reg, 1);
-    masm.testl(tmp2Reg, tmp2Reg);
-    masm.jcc(Assembler::zero, COMPARE_LOOP_HDR);
-
-    // Compare 2-byte "tail" at end of arrays
-    masm.load_unsigned_short(tmp1Reg, Address(ary1Reg, resultReg, Address::times_4, base_offset));
-    masm.load_unsigned_short(tmp2Reg, Address(ary2Reg, resultReg, Address::times_4, base_offset));
-    masm.cmpl(tmp1Reg, tmp2Reg);
-    masm.jcc(Assembler::notEqual, FALSE_LABEL);
+    //load array address
+    masm.lea(ary1Reg, Address(ary1Reg, base_offset));
+    masm.lea(ary2Reg, Address(ary2Reg, base_offset));
+
+    //set byte count
+    masm.shll(tmp4Reg, 1);
+    masm.movl(resultReg,tmp4Reg);
+
+    if (UseSSE42Intrinsics){
+      // With SSE4.2, use double quad vector compare
+      Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
+      // Compare 16-byte vectors
+      masm.andl(tmp4Reg, 0xfffffff0);    // vector count (in bytes)
+      masm.andl(resultReg, 0x0000000e);  // tail count (in bytes)
+      masm.testl(tmp4Reg, tmp4Reg);
+      masm.jccb(Assembler::zero, COMPARE_TAIL);
+      masm.lea(ary1Reg, Address(ary1Reg, tmp4Reg, Address::times_1));
+      masm.lea(ary2Reg, Address(ary2Reg, tmp4Reg, Address::times_1));
+      masm.negptr(tmp4Reg);
+
+      masm.bind(COMPARE_WIDE_VECTORS);
+      masm.movdqu(tmp1Reg, Address(ary1Reg, tmp4Reg, Address::times_1));
+      masm.movdqu(tmp2Reg, Address(ary2Reg, tmp4Reg, Address::times_1));
+      masm.pxor(tmp1Reg, tmp2Reg);
+      masm.ptest(tmp1Reg, tmp1Reg);
+
+      masm.jccb(Assembler::notZero, FALSE_LABEL);
+      masm.addptr(tmp4Reg, 16);
+      masm.jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
+      masm.bind(COMPARE_TAIL);
+      masm.movl(tmp4Reg, resultReg);
+      // Fallthru to tail compare
+    }
+
+   // Compare 4-byte vectors
+    masm.andl(tmp4Reg, 0xfffffffc);    // vector count (in bytes)
+    masm.andl(resultReg, 0x00000002);  // tail char (in bytes)
+    masm.testl(tmp4Reg, tmp4Reg); //if tmp2 == 0, only compare char
+    masm.jccb(Assembler::zero, COMPARE_CHAR);
+    masm.lea(ary1Reg, Address(ary1Reg, tmp4Reg, Address::times_1));
+    masm.lea(ary2Reg, Address(ary2Reg, tmp4Reg, Address::times_1));
+    masm.negptr(tmp4Reg);
+
+    masm.bind(COMPARE_VECTORS);
+    masm.movl(tmp3Reg, Address(ary1Reg, tmp4Reg, Address::times_1));
+    masm.cmpl(tmp3Reg, Address(ary2Reg, tmp4Reg, Address::times_1));
+    masm.jccb(Assembler::notEqual, FALSE_LABEL);
+    masm.addptr(tmp4Reg, 4);
+    masm.jcc(Assembler::notZero, COMPARE_VECTORS);
+
+    // Compare trailing char (final 2 bytes), if any
+    masm.bind(COMPARE_CHAR);
     masm.testl(resultReg, resultReg);
-    masm.jcc(Assembler::zero, TRUE_LABEL);
-
-    // Setup compare loop
-    masm.bind(COMPARE_LOOP_HDR);
-    // Shift tmp1Reg and tmp2Reg to the last 4-byte boundary of the arrays
-    masm.leaq(tmp1Reg, Address(ary1Reg, resultReg, Address::times_4, base_offset));
-    masm.leaq(tmp2Reg, Address(ary2Reg, resultReg, Address::times_4, base_offset));
-    masm.negq(resultReg);
-
-    // 4-byte-wide compare loop
-    masm.bind(COMPARE_LOOP);
-    masm.movl(ary1Reg, Address(tmp1Reg, resultReg, Address::times_4, 0));
-    masm.movl(ary2Reg, Address(tmp2Reg, resultReg, Address::times_4, 0));
-    masm.cmpl(ary1Reg, ary2Reg);
-    masm.jcc(Assembler::notEqual, FALSE_LABEL);
-    masm.incrementq(resultReg);
-    masm.jcc(Assembler::notZero, COMPARE_LOOP);
+    masm.jccb(Assembler::zero, TRUE_LABEL);
+    masm.load_unsigned_short(tmp3Reg, Address(ary1Reg, 0));
+    masm.load_unsigned_short(tmp4Reg, Address(ary2Reg, 0));
+    masm.cmpl(tmp3Reg, tmp4Reg);
+    masm.jccb(Assembler::notEqual, FALSE_LABEL);
 
     masm.bind(TRUE_LABEL);
     masm.movl(resultReg, 1);   // return true
-    masm.jmp(DONE_LABEL);
+    masm.jmpb(DONE);
 
     masm.bind(FALSE_LABEL);
     masm.xorl(resultReg, resultReg); // return false
 
     // That's it
-    masm.bind(DONE_LABEL);
+    masm.bind(DONE);
   %}
 
   enc_class enc_rethrow()
@@ -5087,7 +5370,7 @@
 %}
 
 // Double register operands
-operand regD()
+operand regD() 
 %{
   constraint(ALLOC_IN_RC(double_reg));
   match(RegD);
@@ -11540,27 +11823,52 @@
   ins_pipe(pipe_slow);
 %}
 
-instruct string_compare(rdi_RegP str1, rsi_RegP str2, rax_RegI tmp1,
-                        rbx_RegI tmp2, rcx_RegI result, rFlagsReg cr)
+instruct string_compare(rdi_RegP str1, rsi_RegP str2, regD tmp1, regD tmp2,
+                        rax_RegI tmp3, rbx_RegI tmp4, rcx_RegI result, rFlagsReg cr)
 %{
   match(Set result (StrComp str1 str2));
-  effect(USE_KILL str1, USE_KILL str2, KILL tmp1, KILL tmp2, KILL cr);
+  effect(TEMP tmp1, TEMP tmp2, USE_KILL str1, USE_KILL str2, KILL tmp3, KILL tmp4, KILL cr);
   //ins_cost(300);
 
   format %{ "String Compare $str1, $str2 -> $result    // XXX KILL RAX, RBX" %}
-  ins_encode( enc_String_Compare() );
+  ins_encode( enc_String_Compare(str1, str2, tmp1, tmp2, tmp3, tmp4, result) );
+  ins_pipe( pipe_slow );
+%}
+
+instruct string_indexof(rsi_RegP str1, rdi_RegP str2, regD tmp1, rax_RegI tmp2,
+                        rcx_RegI tmp3, rdx_RegI tmp4, rbx_RegI result, rFlagsReg cr)
+%{
+  predicate(UseSSE42Intrinsics);
+  match(Set result (StrIndexOf str1 str2));
+  effect(TEMP tmp1, USE_KILL str1, USE_KILL str2, KILL tmp2, KILL tmp3, KILL tmp4, KILL cr);
+
+  format %{ "String IndexOf $str1,$str2 -> $result   // KILL RAX, RCX, RDX" %}
+  ins_encode( enc_String_IndexOf(str1, str2, tmp1, tmp2, tmp3, tmp4, result) );
+  ins_pipe( pipe_slow );
+%}
+
+// fast string equals
+instruct string_equals(rdi_RegP str1, rsi_RegP str2, regD tmp1, regD tmp2, rbx_RegI tmp3,
+                       rcx_RegI tmp4, rax_RegI result, rFlagsReg cr)
+%{
+  match(Set result (StrEquals str1 str2));
+  effect(TEMP tmp1, TEMP tmp2, USE_KILL str1, USE_KILL str2, KILL tmp3, KILL tmp4, KILL cr);
+
+  format %{ "String Equals $str1,$str2 -> $result    // KILL RBX, RCX" %}
+  ins_encode( enc_String_Equals(str1, str2, tmp1, tmp2, tmp3, tmp4, result) );
   ins_pipe( pipe_slow );
 %}
 
 // fast array equals
-instruct array_equals(rdi_RegP ary1, rsi_RegP ary2, rax_RegI tmp1, 
-                      rbx_RegI tmp2, rcx_RegI result, rFlagsReg cr) %{
+instruct array_equals(rdi_RegP ary1, rsi_RegP ary2, regD tmp1, regD tmp2, rax_RegI tmp3,
+                      rbx_RegI tmp4, rcx_RegI result, rFlagsReg cr)
+%{
   match(Set result (AryEq ary1 ary2));
-  effect(USE_KILL ary1, USE_KILL ary2, KILL tmp1, KILL tmp2, KILL cr);
+  effect(TEMP tmp1, TEMP tmp2, USE_KILL ary1, USE_KILL ary2, KILL tmp3, KILL tmp4, KILL cr);
   //ins_cost(300);
 
-  format %{ "Array Equals $ary1,$ary2 -> $result    // KILL RAX, RBX" %}
-  ins_encode( enc_Array_Equals(ary1, ary2, tmp1, tmp2, result) );
+  format %{ "Array Equals $ary1,$ary2 -> $result   // KILL RAX, RBX" %}
+  ins_encode( enc_Array_Equals(ary1, ary2, tmp1, tmp2, tmp3, tmp4, result) );
   ins_pipe( pipe_slow );
 %}
 
--- a/hotspot/src/share/vm/adlc/formssel.cpp	Tue Mar 31 15:09:45 2009 -0700
+++ b/hotspot/src/share/vm/adlc/formssel.cpp	Tue Mar 31 19:20:34 2009 -0700
@@ -574,9 +574,13 @@
   // TEMPORARY
   // if( is_simple_chain_rule(globals) )  return false;
 
-  // String-compare uses many memorys edges, but writes none
+  // String.(compareTo/equals/indexOf) and Arrays.equals use many memorys edges,
+  // but writes none
   if( _matrule && _matrule->_rChild &&
-      strcmp(_matrule->_rChild->_opType,"StrComp")==0 )
+      ( strcmp(_matrule->_rChild->_opType,"StrComp"    )==0 ||
+        strcmp(_matrule->_rChild->_opType,"StrEquals"  )==0 ||
+        strcmp(_matrule->_rChild->_opType,"StrIndexOf" )==0 ||
+        strcmp(_matrule->_rChild->_opType,"AryEq"      )==0 ))
     return true;
 
   // Check if instruction has a USE of a memory operand class, but no defs
@@ -815,8 +819,10 @@
     return AdlcVMDeps::Parms;   // Skip the machine-state edges
 
   if( _matrule->_rChild &&
-          strcmp(_matrule->_rChild->_opType,"StrComp")==0 ) {
-        // String compare takes 1 control and 4 memory edges.
+      ( strcmp(_matrule->_rChild->_opType,"StrComp"   )==0 ||
+        strcmp(_matrule->_rChild->_opType,"StrEquals" )==0 ||
+        strcmp(_matrule->_rChild->_opType,"StrIndexOf")==0 )) {
+        // String.(compareTo/equals/indexOf) take 1 control and 4 memory edges.
     return 5;
   }
 
--- a/hotspot/src/share/vm/classfile/vmSymbols.hpp	Tue Mar 31 15:09:45 2009 -0700
+++ b/hotspot/src/share/vm/classfile/vmSymbols.hpp	Tue Mar 31 19:20:34 2009 -0700
@@ -288,6 +288,7 @@
   template(stringCacheEnabled_name,                   "stringCacheEnabled")                       \
   template(bitCount_name,                             "bitCount")                                 \
   template(profile_name,                              "profile")                                  \
+  template(equals_name,                               "equals")                                   \
                                                                                                   \
   /* non-intrinsic name/signature pairs: */                                                       \
   template(register_method_name,                      "register")                                 \
@@ -579,7 +580,6 @@
    do_signature(copyOfRange_signature,        "([Ljava/lang/Object;IILjava/lang/Class;)[Ljava/lang/Object;")            \
                                                                                                                         \
   do_intrinsic(_equalsC,                  java_util_Arrays,       equals_name,    equalsC_signature,             F_S)   \
-   do_name(     equals_name,                                     "equals")                                              \
    do_signature(equalsC_signature,                               "([C[C)Z")                                             \
                                                                                                                         \
   do_intrinsic(_invoke,                   java_lang_reflect_Method, invoke_name, object_array_object_object_signature, F_R) \
@@ -589,6 +589,7 @@
    do_name(     compareTo_name,                                  "compareTo")                                           \
   do_intrinsic(_indexOf,                  java_lang_String,       indexOf_name, string_int_signature,            F_R)   \
    do_name(     indexOf_name,                                    "indexOf")                                             \
+  do_intrinsic(_equals,                   java_lang_String,       equals_name, object_boolean_signature,         F_R)   \
                                                                                                                         \
   do_class(java_nio_Buffer,               "java/nio/Buffer")                                                            \
   do_intrinsic(_checkIndex,               java_nio_Buffer,        checkIndex_name, int_int_signature,            F_R)   \
--- a/hotspot/src/share/vm/opto/classes.hpp	Tue Mar 31 15:09:45 2009 -0700
+++ b/hotspot/src/share/vm/opto/classes.hpp	Tue Mar 31 19:20:34 2009 -0700
@@ -218,6 +218,8 @@
 macro(StoreP)
 macro(StoreN)
 macro(StrComp)
+macro(StrEquals)
+macro(StrIndexOf)
 macro(SubD)
 macro(SubF)
 macro(SubI)
--- a/hotspot/src/share/vm/opto/gcm.cpp	Tue Mar 31 15:09:45 2009 -0700
+++ b/hotspot/src/share/vm/opto/gcm.cpp	Tue Mar 31 19:20:34 2009 -0700
@@ -438,6 +438,12 @@
 #endif
   assert(load_alias_idx || (load->is_Mach() && load->as_Mach()->ideal_Opcode() == Op_StrComp),
          "String compare is only known 'load' that does not conflict with any stores");
+  assert(load_alias_idx || (load->is_Mach() && load->as_Mach()->ideal_Opcode() == Op_StrEquals),
+         "String equals is a 'load' that does not conflict with any stores");
+  assert(load_alias_idx || (load->is_Mach() && load->as_Mach()->ideal_Opcode() == Op_StrIndexOf),
+         "String indexOf is a 'load' that does not conflict with any stores");
+  assert(load_alias_idx || (load->is_Mach() && load->as_Mach()->ideal_Opcode() == Op_AryEq),
+         "Arrays equals is a 'load' that do not conflict with any stores");
 
   if (!C->alias_type(load_alias_idx)->is_rewritable()) {
     // It is impossible to spoil this load by putting stores before it,
--- a/hotspot/src/share/vm/opto/lcm.cpp	Tue Mar 31 15:09:45 2009 -0700
+++ b/hotspot/src/share/vm/opto/lcm.cpp	Tue Mar 31 19:20:34 2009 -0700
@@ -137,6 +137,8 @@
       if( mach->in(2) != val ) continue;
       break;                    // Found a memory op?
     case Op_StrComp:
+    case Op_StrEquals:
+    case Op_StrIndexOf:
     case Op_AryEq:
       // Not a legit memory op for implicit null check regardless of
       // embedded loads
--- a/hotspot/src/share/vm/opto/library_call.cpp	Tue Mar 31 15:09:45 2009 -0700
+++ b/hotspot/src/share/vm/opto/library_call.cpp	Tue Mar 31 19:20:34 2009 -0700
@@ -136,6 +136,7 @@
   bool inline_string_compareTo();
   bool inline_string_indexOf();
   Node* string_indexOf(Node* string_object, ciTypeArray* target_array, jint offset, jint cache_i, jint md2_i);
+  bool inline_string_equals();
   Node* pop_math_arg();
   bool runtime_math(const TypeFunc* call_type, address funcAddr, const char* funcName);
   bool inline_math_native(vmIntrinsics::ID id);
@@ -261,6 +262,7 @@
     switch (id) {
     case vmIntrinsics::_indexOf:
     case vmIntrinsics::_compareTo:
+    case vmIntrinsics::_equals:
     case vmIntrinsics::_equalsC:
       break;  // InlineNatives does not control String.compareTo
     default:
@@ -275,6 +277,9 @@
   case vmIntrinsics::_indexOf:
     if (!SpecialStringIndexOf)  return NULL;
     break;
+  case vmIntrinsics::_equals:
+    if (!SpecialStringEquals)  return NULL;
+    break;
   case vmIntrinsics::_equalsC:
     if (!SpecialArraysEquals)  return NULL;
     break;
@@ -442,6 +447,8 @@
     return inline_string_compareTo();
   case vmIntrinsics::_indexOf:
     return inline_string_indexOf();
+  case vmIntrinsics::_equals:
+    return inline_string_equals();
 
   case vmIntrinsics::_getObject:
     return inline_unsafe_access(!is_native_ptr, !is_store, T_OBJECT, false);
@@ -793,6 +800,8 @@
 //------------------------------inline_string_compareTo------------------------
 bool LibraryCallKit::inline_string_compareTo() {
 
+  if (!Matcher::has_match_rule(Op_StrComp)) return false;
+
   const int value_offset = java_lang_String::value_offset_in_bytes();
   const int count_offset = java_lang_String::count_offset_in_bytes();
   const int offset_offset = java_lang_String::offset_offset_in_bytes();
@@ -830,6 +839,82 @@
   return true;
 }
 
+//------------------------------inline_string_equals------------------------
+bool LibraryCallKit::inline_string_equals() {
+
+  if (!Matcher::has_match_rule(Op_StrEquals)) return false;
+
+  const int value_offset = java_lang_String::value_offset_in_bytes();
+  const int count_offset = java_lang_String::count_offset_in_bytes();
+  const int offset_offset = java_lang_String::offset_offset_in_bytes();
+
+  _sp += 2;
+  Node* argument = pop();  // pop non-receiver first:  it was pushed second
+  Node* receiver = pop();
+
+  // Null check on self without removing any arguments.  The argument
+  // null check technically happens in the wrong place, which can lead to
+  // invalid stack traces when string compare is inlined into a method
+  // which handles NullPointerExceptions.
+  _sp += 2;
+  receiver = do_null_check(receiver, T_OBJECT);
+  //should not do null check for argument for String.equals(), because spec
+  //allows to specify NULL as argument.
+  _sp -= 2;
+
+  if (stopped()) {
+    return true;
+  }
+
+  // get String klass for instanceOf
+  ciInstanceKlass* klass = env()->String_klass();
+
+  // two paths (plus control) merge
+  RegionNode* region = new (C, 3) RegionNode(3);
+  Node* phi = new (C, 3) PhiNode(region, TypeInt::BOOL);
+
+  Node* inst = gen_instanceof(argument, makecon(TypeKlassPtr::make(klass)));
+  Node* cmp  = _gvn.transform(new (C, 3) CmpINode(inst, intcon(1)));
+  Node* bol  = _gvn.transform(new (C, 2) BoolNode(cmp, BoolTest::eq));
+
+  IfNode* iff = create_and_map_if(control(), bol, PROB_MAX, COUNT_UNKNOWN);
+
+  Node* if_true  = _gvn.transform(new (C, 1) IfTrueNode(iff));
+  set_control(if_true);
+
+  const TypeInstPtr* string_type =
+    TypeInstPtr::make(TypePtr::BotPTR, klass, false, NULL, 0);
+
+  // instanceOf == true
+  Node* equals =
+    _gvn.transform(new (C, 7) StrEqualsNode(
+                        control(),
+                        memory(TypeAryPtr::CHARS),
+                        memory(string_type->add_offset(value_offset)),
+                        memory(string_type->add_offset(count_offset)),
+                        memory(string_type->add_offset(offset_offset)),
+                        receiver,
+                        argument));
+
+  phi->init_req(1, _gvn.transform(equals));
+  region->init_req(1, if_true);
+
+  //instanceOf == false, fallthrough
+  Node* if_false = _gvn.transform(new (C, 1) IfFalseNode(iff));
+  set_control(if_false);
+
+  phi->init_req(2, _gvn.transform(intcon(0)));
+  region->init_req(2, if_false);
+
+  // post merge
+  set_control(_gvn.transform(region));
+  record_for_igvn(region);
+
+  push(_gvn.transform(phi));
+
+  return true;
+}
+
 //------------------------------inline_array_equals----------------------------
 bool LibraryCallKit::inline_array_equals() {
 
@@ -994,80 +1079,115 @@
   return result;
 }
 
-
 //------------------------------inline_string_indexOf------------------------
 bool LibraryCallKit::inline_string_indexOf() {
 
-  _sp += 2;
-  Node *argument = pop();  // pop non-receiver first:  it was pushed second
-  Node *receiver = pop();
-
-  // don't intrinsify if argument isn't a constant string.
-  if (!argument->is_Con()) {
-    return false;
-  }
-  const TypeOopPtr* str_type = _gvn.type(argument)->isa_oopptr();
-  if (str_type == NULL) {
-    return false;
-  }
-  ciInstanceKlass* klass = env()->String_klass();
-  ciObject* str_const = str_type->const_oop();
-  if (str_const == NULL || str_const->klass() != klass) {
-    return false;
-  }
-  ciInstance* str = str_const->as_instance();
-  assert(str != NULL, "must be instance");
-
   const int value_offset  = java_lang_String::value_offset_in_bytes();
   const int count_offset  = java_lang_String::count_offset_in_bytes();
   const int offset_offset = java_lang_String::offset_offset_in_bytes();
 
-  ciObject* v = str->field_value_by_offset(value_offset).as_object();
-  int       o = str->field_value_by_offset(offset_offset).as_int();
-  int       c = str->field_value_by_offset(count_offset).as_int();
-  ciTypeArray* pat = v->as_type_array(); // pattern (argument) character array
-
-  // constant strings have no offset and count == length which
-  // simplifies the resulting code somewhat so lets optimize for that.
-  if (o != 0 || c != pat->length()) {
-    return false;
-  }
-
-  // Null check on self without removing any arguments.  The argument
-  // null check technically happens in the wrong place, which can lead to
-  // invalid stack traces when string compare is inlined into a method
-  // which handles NullPointerExceptions.
   _sp += 2;
-  receiver = do_null_check(receiver, T_OBJECT);
-  // No null check on the argument is needed since it's a constant String oop.
-  _sp -= 2;
-  if (stopped()) {
-    return true;
+  Node *argument = pop();  // pop non-receiver first:  it was pushed second
+  Node *receiver = pop();
+
+  Node* result;
+  if (Matcher::has_match_rule(Op_StrIndexOf) &&
+      UseSSE42Intrinsics) {
+    // Generate SSE4.2 version of indexOf
+    // We currently only have match rules that use SSE4.2
+
+    // Null check on self without removing any arguments.  The argument
+    // null check technically happens in the wrong place, which can lead to
+    // invalid stack traces when string compare is inlined into a method
+    // which handles NullPointerExceptions.
+    _sp += 2;
+    receiver = do_null_check(receiver, T_OBJECT);
+    argument = do_null_check(argument, T_OBJECT);
+    _sp -= 2;
+
+    if (stopped()) {
+      return true;
+    }
+
+    ciInstanceKlass* klass = env()->String_klass();
+    const TypeInstPtr* string_type =
+      TypeInstPtr::make(TypePtr::BotPTR, klass, false, NULL, 0);
+
+    result =
+      _gvn.transform(new (C, 7)
+                     StrIndexOfNode(control(),
+                                    memory(TypeAryPtr::CHARS),
+                                    memory(string_type->add_offset(value_offset)),
+                                    memory(string_type->add_offset(count_offset)),
+                                    memory(string_type->add_offset(offset_offset)),
+                                    receiver,
+                                    argument));
+  } else { //Use LibraryCallKit::string_indexOf
+    // don't intrinsify is argument isn't a constant string.
+    if (!argument->is_Con()) {
+     return false;
+    }
+    const TypeOopPtr* str_type = _gvn.type(argument)->isa_oopptr();
+    if (str_type == NULL) {
+      return false;
+    }
+    ciInstanceKlass* klass = env()->String_klass();
+    ciObject* str_const = str_type->const_oop();
+    if (str_const == NULL || str_const->klass() != klass) {
+      return false;
+    }
+    ciInstance* str = str_const->as_instance();
+    assert(str != NULL, "must be instance");
+
+    ciObject* v = str->field_value_by_offset(value_offset).as_object();
+    int       o = str->field_value_by_offset(offset_offset).as_int();
+    int       c = str->field_value_by_offset(count_offset).as_int();
+    ciTypeArray* pat = v->as_type_array(); // pattern (argument) character array
+
+    // constant strings have no offset and count == length which
+    // simplifies the resulting code somewhat so lets optimize for that.
+    if (o != 0 || c != pat->length()) {
+     return false;
+    }
+
+    // Null check on self without removing any arguments.  The argument
+    // null check technically happens in the wrong place, which can lead to
+    // invalid stack traces when string compare is inlined into a method
+    // which handles NullPointerExceptions.
+    _sp += 2;
+    receiver = do_null_check(receiver, T_OBJECT);
+    // No null check on the argument is needed since it's a constant String oop.
+    _sp -= 2;
+    if (stopped()) {
+     return true;
+    }
+
+    // The null string as a pattern always returns 0 (match at beginning of string)
+    if (c == 0) {
+      push(intcon(0));
+      return true;
+    }
+
+    // Generate default indexOf
+    jchar lastChar = pat->char_at(o + (c - 1));
+    int cache = 0;
+    int i;
+    for (i = 0; i < c - 1; i++) {
+      assert(i < pat->length(), "out of range");
+      cache |= (1 << (pat->char_at(o + i) & (sizeof(cache) * BitsPerByte - 1)));
+    }
+
+    int md2 = c;
+    for (i = 0; i < c - 1; i++) {
+      assert(i < pat->length(), "out of range");
+      if (pat->char_at(o + i) == lastChar) {
+        md2 = (c - 1) - i;
+      }
+    }
+
+    result = string_indexOf(receiver, pat, o, cache, md2);
   }
 
-  // The null string as a pattern always returns 0 (match at beginning of string)
-  if (c == 0) {
-    push(intcon(0));
-    return true;
-  }
-
-  jchar lastChar = pat->char_at(o + (c - 1));
-  int cache = 0;
-  int i;
-  for (i = 0; i < c - 1; i++) {
-    assert(i < pat->length(), "out of range");
-    cache |= (1 << (pat->char_at(o + i) & (sizeof(cache) * BitsPerByte - 1)));
-  }
-
-  int md2 = c;
-  for (i = 0; i < c - 1; i++) {
-    assert(i < pat->length(), "out of range");
-    if (pat->char_at(o + i) == lastChar) {
-      md2 = (c - 1) - i;
-    }
-  }
-
-  Node* result = string_indexOf(receiver, pat, o, cache, md2);
   push(result);
   return true;
 }
--- a/hotspot/src/share/vm/opto/loopnode.cpp	Tue Mar 31 15:09:45 2009 -0700
+++ b/hotspot/src/share/vm/opto/loopnode.cpp	Tue Mar 31 19:20:34 2009 -0700
@@ -2668,6 +2668,8 @@
     case Op_LoadD_unaligned:
     case Op_LoadL_unaligned:
     case Op_StrComp:            // Does a bunch of load-like effects
+    case Op_StrEquals:
+    case Op_StrIndexOf:
     case Op_AryEq:
       pinned = false;
     }
--- a/hotspot/src/share/vm/opto/matcher.cpp	Tue Mar 31 15:09:45 2009 -0700
+++ b/hotspot/src/share/vm/opto/matcher.cpp	Tue Mar 31 19:20:34 2009 -0700
@@ -746,6 +746,8 @@
   if (nidx == Compile::AliasIdxBot && midx == Compile::AliasIdxTop) {
     switch (n->Opcode()) {
     case Op_StrComp:
+    case Op_StrEquals:
+    case Op_StrIndexOf:
     case Op_AryEq:
     case Op_MemBarVolatile:
     case Op_MemBarCPUOrder: // %%% these ideals should have narrower adr_type?
@@ -1788,6 +1790,8 @@
         mstack.push(n->in(0), Pre_Visit);     // Visit Control input
         continue;                             // while (mstack.is_nonempty())
       case Op_StrComp:
+      case Op_StrEquals:
+      case Op_StrIndexOf:
       case Op_AryEq:
         set_shared(n); // Force result into register (it will be anyways)
         break;
--- a/hotspot/src/share/vm/opto/memnode.cpp	Tue Mar 31 15:09:45 2009 -0700
+++ b/hotspot/src/share/vm/opto/memnode.cpp	Tue Mar 31 19:20:34 2009 -0700
@@ -2481,6 +2481,31 @@
   return remove_dead_region(phase, can_reshape) ? this : NULL;
 }
 
+// Do we match on this edge? No memory edges
+uint StrEqualsNode::match_edge(uint idx) const {
+  return idx == 5 || idx == 6;
+}
+
+//------------------------------Ideal------------------------------------------
+// Return a node which is more "ideal" than the current node.  Strip out
+// control copies
+Node *StrEqualsNode::Ideal(PhaseGVN *phase, bool can_reshape){
+  return remove_dead_region(phase, can_reshape) ? this : NULL;
+}
+
+//=============================================================================
+// Do we match on this edge? No memory edges
+uint StrIndexOfNode::match_edge(uint idx) const {
+  return idx == 5 || idx == 6;
+}
+
+//------------------------------Ideal------------------------------------------
+// Return a node which is more "ideal" than the current node.  Strip out
+// control copies
+Node *StrIndexOfNode::Ideal(PhaseGVN *phase, bool can_reshape){
+  return remove_dead_region(phase, can_reshape) ? this : NULL;
+}
+
 //------------------------------Ideal------------------------------------------
 // Return a node which is more "ideal" than the current node.  Strip out
 // control copies
@@ -2488,7 +2513,6 @@
   return remove_dead_region(phase, can_reshape) ? this : NULL;
 }
 
-
 //=============================================================================
 MemBarNode::MemBarNode(Compile* C, int alias_idx, Node* precedent)
   : MultiNode(TypeFunc::Parms + (precedent == NULL? 0: 1)),
--- a/hotspot/src/share/vm/opto/memnode.hpp	Tue Mar 31 15:09:45 2009 -0700
+++ b/hotspot/src/share/vm/opto/memnode.hpp	Tue Mar 31 19:20:34 2009 -0700
@@ -765,6 +765,54 @@
   virtual Node *Ideal(PhaseGVN *phase, bool can_reshape);
 };
 
+//------------------------------StrEquals-------------------------------------
+class StrEqualsNode: public Node {
+public:
+  StrEqualsNode(Node *control,
+                Node* char_array_mem,
+                Node* value_mem,
+                Node* count_mem,
+                Node* offset_mem,
+                Node* s1, Node* s2): Node(control,
+                                          char_array_mem,
+                                          value_mem,
+                                          count_mem,
+                                          offset_mem,
+                                          s1, s2) {};
+  virtual int Opcode() const;
+  virtual bool depends_only_on_test() const { return false; }
+  virtual const Type* bottom_type() const { return TypeInt::BOOL; }
+  // a StrEqualsNode (conservatively) aliases with everything:
+  virtual const TypePtr* adr_type() const { return TypePtr::BOTTOM; }
+  virtual uint match_edge(uint idx) const;
+  virtual uint ideal_reg() const { return Op_RegI; }
+  virtual Node *Ideal(PhaseGVN *phase, bool can_reshape);
+};
+
+//------------------------------StrIndexOf-------------------------------------
+class StrIndexOfNode: public Node {
+public:
+  StrIndexOfNode(Node *control,
+                 Node* char_array_mem,
+                 Node* value_mem,
+                 Node* count_mem,
+                 Node* offset_mem,
+                 Node* s1, Node* s2): Node(control,
+                                           char_array_mem,
+                                           value_mem,
+                                           count_mem,
+                                           offset_mem,
+                                           s1, s2) {};
+  virtual int Opcode() const;
+  virtual bool depends_only_on_test() const { return false; }
+  virtual const Type* bottom_type() const { return TypeInt::INT; }
+  // a StrIndexOfNode (conservatively) aliases with everything:
+  virtual const TypePtr* adr_type() const { return TypePtr::BOTTOM; }
+  virtual uint match_edge(uint idx) const;
+  virtual uint ideal_reg() const { return Op_RegI; }
+  virtual Node *Ideal(PhaseGVN *phase, bool can_reshape);
+};
+
 //------------------------------AryEq---------------------------------------
 class AryEqNode: public Node {
 public:
--- a/hotspot/src/share/vm/runtime/arguments.cpp	Tue Mar 31 15:09:45 2009 -0700
+++ b/hotspot/src/share/vm/runtime/arguments.cpp	Tue Mar 31 19:20:34 2009 -0700
@@ -1366,9 +1366,6 @@
   if (AggressiveOpts && FLAG_IS_DEFAULT(DoEscapeAnalysis)) {
     FLAG_SET_DEFAULT(DoEscapeAnalysis, true);
   }
-  if (AggressiveOpts && FLAG_IS_DEFAULT(SpecialArraysEquals)) {
-    FLAG_SET_DEFAULT(SpecialArraysEquals, true);
-  }
   if (AggressiveOpts && FLAG_IS_DEFAULT(BiasedLockingStartupDelay)) {
     FLAG_SET_DEFAULT(BiasedLockingStartupDelay, 500);
   }
--- a/hotspot/src/share/vm/runtime/globals.hpp	Tue Mar 31 15:09:45 2009 -0700
+++ b/hotspot/src/share/vm/runtime/globals.hpp	Tue Mar 31 19:20:34 2009 -0700
@@ -491,9 +491,15 @@
   develop(bool, SpecialStringIndexOf, true,                                 \
           "special version of string indexOf")                              \
                                                                             \
-  product(bool, SpecialArraysEquals, false,                                 \
+  develop(bool, SpecialStringEquals, true,                                  \
+          "special version of string equals")                               \
+                                                                            \
+  develop(bool, SpecialArraysEquals, true,                                  \
           "special version of Arrays.equals(char[],char[])")                \
                                                                             \
+  product(bool, UseSSE42Intrinsics, false,                                  \
+          "SSE4.2 versions of intrinsics")                                  \
+                                                                            \
   develop(bool, TraceCallFixup, false,                                      \
           "traces all call fixups")                                         \
                                                                             \