6937112: String.lastIndexOf confused by unpaired trailing surrogate
authormartin
Wed, 30 Jun 2010 16:11:32 -0700
changeset 5987 caec61968454
parent 5986 04eb44085c00
child 5988 2c984724db38
6937112: String.lastIndexOf confused by unpaired trailing surrogate Summary: Rewrite lastIndexOf for performance and correctness Reviewed-by: sherman Contributed-by: Reviewed by Ulf Zibis <ulf.zibis@gmx.de>
jdk/src/share/classes/java/lang/String.java
jdk/test/java/lang/String/Supplementary.java
jdk/test/java/lang/StringBuffer/Supplementary.java
jdk/test/java/lang/StringBuilder/Supplementary.java
--- a/jdk/src/share/classes/java/lang/String.java	Wed Jun 30 16:11:32 2010 -0700
+++ b/jdk/src/share/classes/java/lang/String.java	Wed Jun 30 16:11:32 2010 -0700
@@ -37,6 +37,7 @@
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 import java.util.regex.PatternSyntaxException;
+import sun.nio.cs.Surrogate;
 
 
 /**
@@ -1575,9 +1576,6 @@
      *          if the character does not occur.
      */
     public int indexOf(int ch, int fromIndex) {
-        int max = offset + count;
-        char v[] = value;
-
         if (fromIndex < 0) {
             fromIndex = 0;
         } else if (fromIndex >= count) {
@@ -1585,29 +1583,36 @@
             return -1;
         }
 
-        int i = offset + fromIndex;
         if (ch < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
             // handle most cases here (ch is a BMP code point or a
             // negative value (invalid code point))
-            for (; i < max ; i++) {
-                if (v[i] == ch) {
+            final char[] value = this.value;
+            final int offset = this.offset;
+            final int max = offset + count;
+            for (int i = offset + fromIndex; i < max ; i++) {
+                if (value[i] == ch) {
                     return i - offset;
                 }
             }
             return -1;
+        } else {
+            return indexOfSupplementary(ch, fromIndex);
         }
+    }
 
-        if (ch <= Character.MAX_CODE_POINT) {
-            // handle supplementary characters here
-            char[] surrogates = Character.toChars(ch);
-            for (; i < max; i++) {
-                if (v[i] == surrogates[0]) {
-                    if (i + 1 == max) {
-                        break;
-                    }
-                    if (v[i+1] == surrogates[1]) {
-                        return i - offset;
-                    }
+    /**
+     * Handles (rare) calls of indexOf with a supplementary character.
+     */
+    private int indexOfSupplementary(int ch, int fromIndex) {
+        if (Character.isValidCodePoint(ch)) {
+            final char[] value = this.value;
+            final int offset = this.offset;
+            final char hi = Surrogate.high(ch);
+            final char lo = Surrogate.low(ch);
+            final int max = offset + count - 1;
+            for (int i = offset + fromIndex; i < max; i++) {
+                if (value[i] == hi && value[i+1] == lo) {
+                    return i - offset;
                 }
             }
         }
@@ -1676,34 +1681,36 @@
      *          if the character does not occur before that point.
      */
     public int lastIndexOf(int ch, int fromIndex) {
-        int min = offset;
-        char v[] = value;
-
-        int i = offset + ((fromIndex >= count) ? count - 1 : fromIndex);
-
         if (ch < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
             // handle most cases here (ch is a BMP code point or a
             // negative value (invalid code point))
-            for (; i >= min ; i--) {
-                if (v[i] == ch) {
+            final char[] value = this.value;
+            final int offset = this.offset;
+            int i = offset + Math.min(fromIndex, count - 1);
+            for (; i >= offset ; i--) {
+                if (value[i] == ch) {
                     return i - offset;
                 }
             }
             return -1;
+        } else {
+            return lastIndexOfSupplementary(ch, fromIndex);
         }
+    }
 
-        int max = offset + count;
-        if (ch <= Character.MAX_CODE_POINT) {
-            // handle supplementary characters here
-            char[] surrogates = Character.toChars(ch);
-            for (; i >= min; i--) {
-                if (v[i] == surrogates[0]) {
-                    if (i + 1 == max) {
-                        break;
-                    }
-                    if (v[i+1] == surrogates[1]) {
-                        return i - offset;
-                    }
+    /**
+     * Handles (rare) calls of lastIndexOf with a supplementary character.
+     */
+    private int lastIndexOfSupplementary(int ch, int fromIndex) {
+        if (Character.isValidCodePoint(ch)) {
+            final char[] value = this.value;
+            final int offset = this.offset;
+            char hi = Surrogate.high(ch);
+            char lo = Surrogate.low(ch);
+            int i = offset + Math.min(fromIndex, count - 2);
+            for (; i >= offset; i--) {
+                if (value[i] == hi && value[i+1] == lo) {
+                    return i - offset;
                 }
             }
         }
--- a/jdk/test/java/lang/String/Supplementary.java	Wed Jun 30 16:11:32 2010 -0700
+++ b/jdk/test/java/lang/String/Supplementary.java	Wed Jun 30 16:11:32 2010 -0700
@@ -62,7 +62,7 @@
          0     1     2345     678     9     012     345678     9 01     2     */
         "\uD800\uDC00!#$\uD800%&\uD800\uDC00;+\uDC00<>;=^\uDC00\\@\uD800\uDC00",
 
-        // includes an undefined supprementary characters in Unicode 4.0.0
+        // includes an undefined supplementary character in Unicode 4.0.0
       /*                                    1     11     1     1111     1
          0     1     2345     6     789     0     12     3     4567     8     */
         "\uDB40\uDE00abc\uDE01\uDB40de\uDB40\uDE02f\uDB40\uDE03ghi\uDB40\uDE02",
@@ -168,7 +168,7 @@
      * string in input[m].
      *
      * The meaning of each element in golden3[][n]
-     *   golden3[][0]: characater which is searched.
+     *   golden3[][0]: character which is searched.
      *   golden3[][2]: the golden data for indexOf(int ch)
      *   From golden3[][2] to golden3[][n-1]:
      *       the golden data for indexOf(int ch, int fromIndex)
@@ -201,17 +201,17 @@
             /*
              * Normal case
              */
-            testIndexOf(First, s, golden3[i][0], golden3[i][2]);
+            testIndexOf(s, golden3[i][0], golden3[i][2]);
 
             /*
              * Abnormal case - char which isn't included in the string.
              */
-            testIndexOf(First, s, 'Z', -1);
-            testIndexOf(First, s, 0xDB98, -1);
-            testIndexOf(First, s, 0xDE76, -1);
-            testIndexOf(First, s, 0x12345, -1);
-            testIndexOf(First, s, -1, -1);
-            testIndexOf(First, s, 0x110000, -1);
+            testIndexOf(s, 'Z', -1);
+            testIndexOf(s, 0xDB98, -1);
+            testIndexOf(s, 0xDE76, -1);
+            testIndexOf(s, 0x12345, -1);
+            testIndexOf(s, -1, -1);
+            testIndexOf(s, 0x110000, -1);
         }
     }
 
@@ -229,7 +229,7 @@
              */
             int fromIndex = 0;
             for (int j = 2; j < golden3[i].length; j++) {
-                fromIndex = testIndexOf(First, s, fromIndex, ch,
+                fromIndex = testIndexOf(s, fromIndex, ch,
                                         golden3[i][j]) + 1;
             }
 
@@ -237,19 +237,19 @@
              * Abnormal case1 - char is included in the string but fromIndex
              *                  is incorrect.
              */
-            testIndexOf(First, s, -1, ch, golden3[i][2]);
-            testIndexOf(First, s, s.length(), ch,
+            testIndexOf(s, -1, ch, golden3[i][2]);
+            testIndexOf(s, s.length(), ch,
                         golden3[i][golden3[i].length-1]);
 
             /*
              * Abnormal case2 - char which isn't included in the string.
              */
-            testIndexOf(First, s, 0, 'Z', -1);
-            testIndexOf(First, s, 0, 0xDB98, -1);
-            testIndexOf(First, s, 0, 0xDE76, -1);
-            testIndexOf(First, s, 0, 0x12345, -1);
-            testIndexOf(First, s, 0, -1, -1);
-            testIndexOf(First, s, 0, 0x110000, -1);
+            testIndexOf(s, 0, 'Z', -1);
+            testIndexOf(s, 0, 0xDB98, -1);
+            testIndexOf(s, 0, 0xDE76, -1);
+            testIndexOf(s, 0, 0x12345, -1);
+            testIndexOf(s, 0, -1, -1);
+            testIndexOf(s, 0, 0x110000, -1);
         }
     }
 
@@ -264,18 +264,18 @@
             /*
              * Normal case
              */
-            testIndexOf(Last, s, golden3[i][0],
+            testLastIndexOf(s, golden3[i][0],
                         golden3[i][golden3[i].length-2]);
 
             /*
              * Abnormal case - char which isn't included in the string.
              */
-            testIndexOf(Last, s, 'Z', -1);
-            testIndexOf(Last, s, 0xDB98, -1);
-            testIndexOf(Last, s, 0xDE76, -1);
-            testIndexOf(Last, s, 0x12345, -1);
-            testIndexOf(Last, s, -1, -1);
-            testIndexOf(Last, s, 0x110000, -1);
+            testLastIndexOf(s, 'Z', -1);
+            testLastIndexOf(s, 0xDB98, -1);
+            testLastIndexOf(s, 0xDE76, -1);
+            testLastIndexOf(s, 0x12345, -1);
+            testLastIndexOf(s, -1, -1);
+            testLastIndexOf(s, 0x110000, -1);
         }
     }
 
@@ -294,7 +294,7 @@
              */
             int fromIndex = len - 1;
             for (int j = golden3[i].length - 2; j > 0; j--) {
-                fromIndex = testIndexOf(Last, s, fromIndex, ch,
+                fromIndex = testLastIndexOf(s, fromIndex, ch,
                                         golden3[i][j]) - 1;
             }
 
@@ -302,18 +302,18 @@
              * Abnormal case1 - char is included in the string but fromIndex
              *                  is incorrect.
              */
-            testIndexOf(Last, s, -1, ch, golden3[i][1]);
-            testIndexOf(Last, s, len, ch, golden3[i][golden3[i].length-2]);
+            testLastIndexOf(s, -1, ch, golden3[i][1]);
+            testLastIndexOf(s, len, ch, golden3[i][golden3[i].length-2]);
 
             /*
              * Abnormal case2 - char which isn't included in the string.
              */
-            testIndexOf(Last, s, len, 'Z', -1);
-            testIndexOf(Last, s, len, 0xDB98, -1);
-            testIndexOf(Last, s, len, 0xDE76, -1);
-            testIndexOf(Last, s, len, 0x12345, -1);
-            testIndexOf(Last, s, len, -1, -1);
-            testIndexOf(Last, s, len, 0x110000, -1);
+            testLastIndexOf(s, len, 'Z', -1);
+            testLastIndexOf(s, len, 0xDB98, -1);
+            testLastIndexOf(s, len, 0xDE76, -1);
+            testLastIndexOf(s, len, 0x12345, -1);
+            testLastIndexOf(s, len, -1, -1);
+            testLastIndexOf(s, len, 0x110000, -1);
         }
     }
 
@@ -471,7 +471,7 @@
                       result, expected);
                 result = str.offsetByCodePoints(j, -nCodePoints);
                 check(result != 0,
-                      "offsetBycodePoints(input["+i+"], "+j+", "+(-nCodePoints)+")",
+                      "offsetByCodePoints(input["+i+"], "+j+", "+(-nCodePoints)+")",
                       result, 0);
             }
 
@@ -531,7 +531,7 @@
                       result, expected);
                 result = str.offsetByCodePoints(j, -nCodePoints);
                 check(result != 0,
-                      "offsetBycodePoints(input["+i+"], "+j+", "+(-nCodePoints)+")",
+                      "offsetByCodePoints(input["+i+"], "+j+", "+(-nCodePoints)+")",
                       result, 0);
             }
         }
@@ -539,7 +539,7 @@
 
 
     static final boolean At = true, Before = false;
-    static final boolean First = true, Last = false;
+    static final boolean FIRST = true, LAST = false;
 
     static void testCodePoint(boolean isAt, String s, int index, int expected) {
         int c = isAt ? s.codePointAt(index) : s.codePointBefore(index);
@@ -563,22 +563,72 @@
               + s + "> should throw StringIndexOutOfBoundsPointerException.");
     }
 
-    static void testIndexOf(boolean isFirst, String s, int c, int expected) {
-        int index = isFirst ? s.indexOf(c) : s.lastIndexOf(c);
+    static void testIndexOf(String s, int c, int expected) {
+        testIndexOf2(s, c, expected);
+        if (s.indexOf(c) != -1) {
+            testIndexOf2(s + (char) c, c, expected);
+            if (Character.isSupplementaryCodePoint(c)) {
+                char[] surrogates = Character.toChars(c);
+                testIndexOf2(s + new String(surrogates), c, expected);
+                testIndexOf2(s + surrogates[0], c, expected);
+                testIndexOf2(s + surrogates[1], c, expected);
+                testIndexOf2(new String(surrogates) + s, c, 0);
+                testIndexOf2(surrogates[0] + s, c, expected + 1);
+                testIndexOf2(surrogates[1] + s, c, expected + 1);
+            }
+        }
+    }
+
+    static void testIndexOf2(String s, int c, int expected) {
+        int index = s.indexOf(c);
 
         check(index != expected,
-              (isFirst ? "i" : "lastI") + "ndexOf(" + toHexString(c)
-              + ") for <" + s + ">", index, expected);
+              "indexOf(" + toHexString(c) + ") for <" + s + ">",
+              index, expected);
     }
 
-    static int testIndexOf(boolean isFirst, String s, int fromIndex, int c,
-                           int expected) {
-        int index = isFirst ? s.indexOf(c, fromIndex) :
-                              s.lastIndexOf(c, fromIndex);
+    static void testLastIndexOf(String s, int c, int expected) {
+        testLastIndexOf2(s, c, expected);
+        if (s.lastIndexOf(c) != -1) {
+            testLastIndexOf2((char) c + s, c, expected + 1);
+            if (Character.isSupplementaryCodePoint(c)) {
+                char[] surrogates = Character.toChars(c);
+                testLastIndexOf2(s + new String(surrogates), c, s.length());
+                testLastIndexOf2(s + surrogates[0], c, expected);
+                testLastIndexOf2(s + surrogates[1], c, expected);
+                testLastIndexOf2(new String(surrogates) + s, c, expected + 2);
+                testLastIndexOf2(surrogates[0] + s, c, expected + 1);
+                testLastIndexOf2(surrogates[1] + s, c, expected + 1);
+            }
+        }
+    }
+
+    static void testLastIndexOf2(String s, int c, int expected) {
+        int index = s.lastIndexOf(c);
 
         check(index != expected,
-              (isFirst ? "i" : "lastI") + "ndexOf(" + toHexString(c) + ", "
-              + fromIndex + ") for <" + s + ">", index, expected);
+              "lastIndexOf(" + toHexString(c) + ") for <" + s + ">",
+              index, expected);
+    }
+
+    static int testIndexOf(String s, int fromIndex, int c, int expected) {
+        int index = s.indexOf(c, fromIndex);
+
+        check(index != expected,
+              "indexOf(" + toHexString(c) + ", "
+              + fromIndex + ") for <" + s + ">",
+              index, expected);
+
+        return index;
+    }
+
+    static int testLastIndexOf(String s, int fromIndex, int c, int expected) {
+        int index = s.lastIndexOf(c, fromIndex);
+
+        check(index != expected,
+              "lastIndexOf(" + toHexString(c) + ", "
+              + fromIndex + ") for <" + s + ">",
+              index, expected);
 
         return index;
     }
--- a/jdk/test/java/lang/StringBuffer/Supplementary.java	Wed Jun 30 16:11:32 2010 -0700
+++ b/jdk/test/java/lang/StringBuffer/Supplementary.java	Wed Jun 30 16:11:32 2010 -0700
@@ -24,7 +24,7 @@
 /*
  *
  * @test
- * @bug 4533872 4915683 4985217 5017280
+ * @bug 4533872 4915683 4985217 5017280 6937112
  * @summary Unit tests for supplementary character support (JSR-204)
  */
 
@@ -57,7 +57,7 @@
          0     1     2345     678     9     012     345678     9 01     2     */
         "\uD800\uDC00!#$\uD800%&\uD800\uDC00;+\uDC00<>;=^\uDC00\\@\uD800\uDC00",
 
-        // includes an undefined supprementary characters in Unicode 4.0.0
+        // includes an undefined supplementary character in Unicode 4.0.0
       /*                                    1     11     1     1111     1
          0     1     2345     6     789     0     12     3     4567     8     */
         "\uDB40\uDE00abc\uDE01\uDB40de\uDB40\uDE02f\uDB40\uDE03ghi\uDB40\uDE02",
@@ -151,7 +151,7 @@
         "\uD800on\uDC00ml\uDC00\uDC00ki9\uD800\uDC00hgfe\uDBFF\uDFFFdcba\uDC00",
         "\uD800\uDC00@\\\uDC00^=;><\uDC00+;\uD800\uDC00&%\uD800$#!\uD800\uDC00",
 
-        // includes an undefined supprementary characters in Unicode 4.0.0
+        // includes an undefined supplementary character in Unicode 4.0.0
         "\uDB40\uDE02ihg\uDB40\uDE03f\uDB40\uDE02ed\uDB40\uDE01cba\uDB40\uDE00",
     };
 
--- a/jdk/test/java/lang/StringBuilder/Supplementary.java	Wed Jun 30 16:11:32 2010 -0700
+++ b/jdk/test/java/lang/StringBuilder/Supplementary.java	Wed Jun 30 16:11:32 2010 -0700
@@ -57,7 +57,7 @@
          0     1     2345     678     9     012     345678     9 01     2     */
         "\uD800\uDC00!#$\uD800%&\uD800\uDC00;+\uDC00<>;=^\uDC00\\@\uD800\uDC00",
 
-        // includes an undefined supprementary characters in Unicode 4.0.0
+        // includes an undefined supplementary character in Unicode 4.0.0
       /*                                    1     11     1     1111     1
          0     1     2345     6     789     0     12     3     4567     8     */
         "\uDB40\uDE00abc\uDE01\uDB40de\uDB40\uDE02f\uDB40\uDE03ghi\uDB40\uDE02",
@@ -151,7 +151,7 @@
         "\uD800on\uDC00ml\uDC00\uDC00ki9\uD800\uDC00hgfe\uDBFF\uDFFFdcba\uDC00",
         "\uD800\uDC00@\\\uDC00^=;><\uDC00+;\uD800\uDC00&%\uD800$#!\uD800\uDC00",
 
-        // includes an undefined supprementary characters in Unicode 4.0.0
+        // includes an undefined supplementary character in Unicode 4.0.0
         "\uDB40\uDE02ihg\uDB40\uDE03f\uDB40\uDE02ed\uDB40\uDE01cba\uDB40\uDE00",
     };