8027645: Pattern.split() with positive lookahead
author sherman
Wed, 13 Nov 2013 11:26:01 -0800
changeset 21668 b62ce4a9635f
parent 21667 e4c61eb98292
child 21669 582457f85c7b
8027645: Pattern.split() with positive lookahead; 6559590: Pattern.compile(".*").split("") returns incorrect result. Summary: updated spec/impl for these two corner cases. Reviewed-by: alanb, psandoz
jdk/src/share/classes/java/lang/String.java
jdk/src/share/classes/java/util/regex/Pattern.java
jdk/test/java/lang/String/Split.java
jdk/test/java/util/regex/RegExTest.java
--- a/jdk/src/share/classes/java/lang/String.java	Wed Nov 13 11:06:57 2013 -0800
+++ b/jdk/src/share/classes/java/lang/String.java	Wed Nov 13 11:26:01 2013 -0800
@@ -2235,7 +2235,13 @@
      * expression or is terminated by the end of the string.  The substrings in
      * the array are in the order in which they occur in this string.  If the
      * expression does not match any part of the input then the resulting array
-     * has just one element, namely this string.
+     * has just one element, namely this string. A zero-length input sequence
+     * always results in a zero-length array.
+     *
+     * <p> When there is a positive-width match at the beginning of this
+     * string then an empty leading substring is included at the beginning
+     * of the resulting array. A zero-width match at the beginning, however,
+     * never produces such an empty leading substring.
      *
      * <p> The {@code limit} parameter controls the number of times the
      * pattern is applied and therefore affects the length of the resulting
@@ -2325,6 +2331,8 @@
             (ch < Character.MIN_HIGH_SURROGATE ||
              ch > Character.MAX_LOW_SURROGATE))
         {
+            if (value.length == 0)
+                return new String[0];
             int off = 0;
             int next = 0;
             boolean limited = limit > 0;
--- a/jdk/src/share/classes/java/util/regex/Pattern.java	Wed Nov 13 11:06:57 2013 -0800
+++ b/jdk/src/share/classes/java/util/regex/Pattern.java	Wed Nov 13 11:26:01 2013 -0800
@@ -1142,9 +1142,15 @@
      * input sequence that is terminated by another subsequence that matches
      * this pattern or is terminated by the end of the input sequence.  The
      * substrings in the array are in the order in which they occur in the
-     * input.  If this pattern does not match any subsequence of the input then
+     * input. If this pattern does not match any subsequence of the input then
      * the resulting array has just one element, namely the input sequence in
-     * string form.
+     * string form. A zero-length input sequence always results in a
+     * zero-length array.
+     *
+     * <p> When there is a positive-width match at the beginning of the input
+     * sequence then an empty leading substring is included at the beginning
+     * of the resulting array. A zero-width match at the beginning, however,
+     * never produces such an empty leading substring.
      *
      * <p> The <tt>limit</tt> parameter controls the number of times the
      * pattern is applied and therefore affects the length of the resulting
@@ -1185,7 +1191,6 @@
      *     <td><tt>{ "b", "", ":and:f" }</tt></td></tr>
      * </table></blockquote>
      *
-     *
      * @param  input
      *         The character sequence to be split
      *
@@ -1196,6 +1201,8 @@
      *          around matches of this pattern
      */
     public String[] split(CharSequence input, int limit) {
+        if (input.length() == 0)
+            return new String[0];
         int index = 0;
         boolean matchLimited = limit > 0;
         ArrayList<String> matchList = new ArrayList<>();
@@ -1204,6 +1211,11 @@
         // Add segments before each match found
         while(m.find()) {
             if (!matchLimited || matchList.size() < limit - 1) {
+                if (index == 0 && index == m.start() && m.start() == m.end()) {
+                    // no empty leading substring included for zero-width match
+                    // at the beginning of the input char sequence.
+                    continue;
+                }
                 String match = input.subSequence(index, m.start()).toString();
                 matchList.add(match);
                 index = m.end();
@@ -5762,6 +5774,13 @@
      * the resulting stream has just one element, namely the input sequence in
      * string form.
      *
+     * <p> A zero-length input sequence always results in an empty stream.
+     *
+     * <p> When there is a positive-width match at the beginning of the input
+     * sequence then an empty leading substring is included at the beginning
+     * of the stream. A zero-width match at the beginning, however, never
+     * produces such an empty leading substring.
+     *
      * <p> If the input sequence is mutable, it must remain constant during the
      * execution of the terminal stream operation.  Otherwise, the result of the
      * terminal stream operation is undefined.
@@ -5817,7 +5836,8 @@
                     current = matcher.end();
                     if (!nextElement.isEmpty()) {
                         return true;
-                    } else {
+                    } else if (current > 0) { // no empty leading substring for zero-width
+                                              // match at the beginning of the input
                         emptyElementCount++;
                     }
                 }
--- a/jdk/test/java/lang/String/Split.java	Wed Nov 13 11:06:57 2013 -0800
+++ b/jdk/test/java/lang/String/Split.java	Wed Nov 13 11:26:01 2013 -0800
@@ -23,7 +23,7 @@
 
 /**
  * @test
- * @bug 6840246
+ * @bug 6840246 6559590
  * @summary test String.split()
  */
 import java.util.Arrays;
@@ -78,12 +78,11 @@
                 throw new RuntimeException("String.split failure 7");
         }
         // Check the case for limit == 0, source = "";
+        // split() now returns a zero-length array for an empty source ""; see #6559590
         source = "";
         String[] result = source.split("e", 0);
-        if (result.length != 1)
+        if (result.length != 0)
             throw new RuntimeException("String.split failure 8");
-        if (!result[0].equals(source))
-            throw new RuntimeException("String.split failure 9");
 
         // check fastpath of String.split()
         source = "0123456789abcdefgABCDEFG";
--- a/jdk/test/java/util/regex/RegExTest.java	Wed Nov 13 11:06:57 2013 -0800
+++ b/jdk/test/java/util/regex/RegExTest.java	Wed Nov 13 11:26:01 2013 -0800
@@ -33,7 +33,8 @@
  * 5013885 5003322 4988891 5098443 5110268 6173522 4829857 5027748 6376940
  * 6358731 6178785 6284152 6231989 6497148 6486934 6233084 6504326 6635133
  * 6350801 6676425 6878475 6919132 6931676 6948903 6990617 7014645 7039066
- * 7067045 7014640 7189363 8007395 8013252 8013254 8012646 8023647
+ * 7067045 7014640 7189363 8007395 8013252 8013254 8012646 8023647 6559590
+ * 8027645
  */
 
 import java.util.regex.*;
@@ -148,6 +149,7 @@
         groupCurlyNotFoundSuppTest();
         groupCurlyBackoffTest();
         patternAsPredicate();
+
         if (failure) {
             throw new
                 RuntimeException("RegExTest failed, 1st failure: " +
@@ -1776,13 +1778,68 @@
                 failCount++;
         }
         // Check the case for limit == 0, source = "";
+        // split() now returns a zero-length array for an empty source ""; see #6559590
         source = "";
         result = source.split("e", 0);
-        if (result.length != 1)
-            failCount++;
-        if (!result[0].equals(source))
-            failCount++;
-
+        if (result.length != 0)
+            failCount++;
+
+        // Check both split() and splitAsStream(), especially for zero-length
+        // input and zero-length match cases
+        String[][] input = new String[][] {
+            { " ",           "Abc Efg Hij" },   // normal non-zero-match
+            { " ",           " Abc Efg Hij" },  // leading empty str for non-zero-match
+            { " ",           "Abc  Efg Hij" },  // non-zero-match in the middle
+            { "(?=\\p{Lu})", "AbcEfgHij" },     // no leading empty str for zero-match
+            { "(?=\\p{Lu})", "AbcEfg" },
+            { "(?=\\p{Lu})", "Abc" },
+            { " ",           "" },              // zero-length input
+            { ".*",          "" },
+
+            // some tests from PatternStreamTest.java
+            { "4",       "awgqwefg1fefw4vssv1vvv1" },
+            { "\u00a3a", "afbfq\u00a3abgwgb\u00a3awngnwggw\u00a3a\u00a3ahjrnhneerh" },
+            { "1",       "awgqwefg1fefw4vssv1vvv1" },
+            { "1",       "a\u4ebafg1fefw\u4eba4\u9f9cvssv\u9f9c1v\u672c\u672cvv" },
+            { "\u56da",  "1\u56da23\u56da456\u56da7890" },
+            { "\u56da",  "1\u56da23\u9f9c\u672c\u672c\u56da456\u56da\u9f9c\u672c7890" },
+            { "\u56da",  "" },
+            { "[ \t,:.]","This is,testing: with\tdifferent separators." }, //multiple septs
+            { "o",       "boo:and:foo" },
+            { "o",       "booooo:and:fooooo" },
+            { "o",       "fooooo:" },
+        };
+
+        String[][] expected = new String[][] {
+            { "Abc", "Efg", "Hij" },
+            { "", "Abc", "Efg", "Hij" },
+            { "Abc", "", "Efg", "Hij" },
+            { "Abc", "Efg", "Hij" },
+            { "Abc", "Efg" },
+            { "Abc" },
+            {},
+            {},
+
+            { "awgqwefg1fefw", "vssv1vvv1" },
+            { "afbfq", "bgwgb", "wngnwggw", "", "hjrnhneerh" },
+            { "awgqwefg", "fefw4vssv", "vvv" },
+            { "a\u4ebafg", "fefw\u4eba4\u9f9cvssv\u9f9c", "v\u672c\u672cvv" },
+            { "1", "23", "456", "7890" },
+            { "1", "23\u9f9c\u672c\u672c", "456", "\u9f9c\u672c7890" },
+            {},
+            { "This", "is", "testing", "", "with", "different", "separators" },
+            { "b", "", ":and:f" },
+            { "b", "", "", "", "", ":and:f" },
+            { "f", "", "", "", "", ":" },
+        };
+        for (int i = 0; i < input.length; i++) {
+            pattern = Pattern.compile(input[i][0]);
+            if (!Arrays.equals(pattern.split(input[i][1]), expected[i]))
+                failCount++;
+            if (!Arrays.equals(pattern.splitAsStream(input[i][1]).toArray(),
+                               expected[i]))
+                failCount++;
+        }
         report("Split");
     }