8039124: j.u.regex.Matcher.appendReplace/Tail() should support StringBuilder variant
author sherman
Wed, 09 Apr 2014 09:36:19 -0700
changeset 23734 439905b27f94
parent 23733 b9b80421cfa7
child 23735 34d83df87817
child 23736 6f77d7e5aa63
8039124: j.u.regex.Matcher.appendReplace/Tail() should support StringBuilder variant Summary: to add the StringBuilder variant Reviewed-by: alanb, sherman Contributed-by: jeremymanson@google.com, peter.levart@gmail.com
jdk/src/share/classes/java/util/regex/Matcher.java
jdk/test/java/util/regex/RegExTest.java
--- a/jdk/src/share/classes/java/util/regex/Matcher.java	Wed Apr 09 12:49:51 2014 +0000
+++ b/jdk/src/share/classes/java/util/regex/Matcher.java	Wed Apr 09 09:36:19 2014 -0700
@@ -65,9 +65,10 @@
  * new strings whose contents can, if desired, be computed from the match
  * result.  The {@link #appendReplacement appendReplacement} and {@link
  * #appendTail appendTail} methods can be used in tandem in order to collect
- * the result into an existing string buffer, or the more convenient {@link
- * #replaceAll replaceAll} method can be used to create a string in which every
- * matching subsequence in the input sequence is replaced.
+ * the result into an existing string buffer or string builder. Alternatively,
+ * the more convenient {@link #replaceAll replaceAll} method can be used to
+ * create a string in which every matching subsequence in the input sequence
+ * is replaced.
  *
  * <p> The explicit state of a matcher includes the start and end indices of
  * the most recent successful match.  It also includes the start and end
@@ -792,15 +793,115 @@
      *          that does not exist in the pattern
      */
     public Matcher appendReplacement(StringBuffer sb, String replacement) {
-
         // If no match, return error
         if (first < 0)
             throw new IllegalStateException("No match available");
+        StringBuilder result = new StringBuilder();
+        appendExpandedReplacement(replacement, result);
+        // Append the intervening text
+        sb.append(text, lastAppendPosition, first);
+        // Append the match substitution
+        sb.append(result);
+        lastAppendPosition = last;
+        return this;
+    }
 
-        // Process substitution string to replace group references with groups
+    /**
+     * Implements a non-terminal append-and-replace step.
+     *
+     * <p> This method performs the following actions: </p>
+     *
+     * <ol>
+     *
+     *   <li><p> It reads characters from the input sequence, starting at the
+     *   append position, and appends them to the given string builder.  It
+     *   stops after reading the last character preceding the previous match,
+     *   that is, the character at index {@link
+     *   #start()}&nbsp;<tt>-</tt>&nbsp;<tt>1</tt>.  </p></li>
+     *
+     *   <li><p> It appends the given replacement string to the string builder.
+     *   </p></li>
+     *
+     *   <li><p> It sets the append position of this matcher to the index of
+     *   the last character matched, plus one, that is, to {@link #end()}.
+     *   </p></li>
+     *
+     * </ol>
+     *
+     * <p> The replacement string may contain references to subsequences
+     * captured during the previous match: Each occurrence of
+     * <tt>$</tt><i>g</i> will be replaced by the result of
+     * evaluating {@link #group(int) group}<tt>(</tt><i>g</i><tt>)</tt>.
+     * The first number after the <tt>$</tt> is always treated as part of
+     * the group reference. Subsequent numbers are incorporated into g if
+     * they would form a legal group reference. Only the numerals '0'
+     * through '9' are considered as potential components of the group
+     * reference. If the second group matched the string <tt>"foo"</tt>, for
+     * example, then passing the replacement string <tt>"$2bar"</tt> would
+     * cause <tt>"foobar"</tt> to be appended to the string builder. A dollar
+     * sign (<tt>$</tt>) may be included as a literal in the replacement
+     * string by preceding it with a backslash (<tt>\$</tt>).
+     *
+     * <p> Note that backslashes (<tt>\</tt>) and dollar signs (<tt>$</tt>) in
+     * the replacement string may cause the results to be different than if it
+     * were being treated as a literal replacement string. Dollar signs may be
+     * treated as references to captured subsequences as described above, and
+     * backslashes are used to escape literal characters in the replacement
+     * string.
+     *
+     * <p> This method is intended to be used in a loop together with the
+     * {@link #appendTail appendTail} and {@link #find find} methods.  The
+     * following code, for example, writes <tt>one dog two dogs in the
+     * yard</tt> to the standard-output stream: </p>
+     *
+     * <blockquote><pre>
+     * Pattern p = Pattern.compile("cat");
+     * Matcher m = p.matcher("one cat two cats in the yard");
+     * StringBuilder sb = new StringBuilder();
+     * while (m.find()) {
+     *     m.appendReplacement(sb, "dog");
+     * }
+     * m.appendTail(sb);
+     * System.out.println(sb.toString());</pre></blockquote>
+     *
+     * @param  sb
+     *         The target string builder
+     * @param  replacement
+     *         The replacement string
+     * @return  This matcher
+     *
+     * @throws  IllegalStateException
+     *          If no match has yet been attempted,
+     *          or if the previous match operation failed
+     * @throws  IllegalArgumentException
+     *          If the replacement string refers to a named-capturing
+     *          group that does not exist in the pattern
+     * @throws  IndexOutOfBoundsException
+     *          If the replacement string refers to a capturing group
+     *          that does not exist in the pattern
+     * @since 1.9
+     */
+    public Matcher appendReplacement(StringBuilder sb, String replacement) {
+        // If no match, return error
+        if (first < 0)
+            throw new IllegalStateException("No match available");
+        StringBuilder result = new StringBuilder();
+        appendExpandedReplacement(replacement, result);
+        // Append the intervening text
+        sb.append(text, lastAppendPosition, first);
+        // Append the match substitution
+        sb.append(result);
+        lastAppendPosition = last;
+        return this;
+    }
+
+    /**
+     * Expands group references in the replacement string into the text of
+     * the corresponding groups, appending to and returning {@code result}.
+     */
+    private StringBuilder appendExpandedReplacement(
+        String replacement, StringBuilder result) {
         int cursor = 0;
-        StringBuilder result = new StringBuilder();
-
         while (cursor < replacement.length()) {
             char nextChar = replacement.charAt(cursor);
             if (nextChar == '\\') {
@@ -852,8 +953,8 @@
                     cursor++;
                 } else {
                     // The first number is always a group
-                    refNum = (int)nextChar - '0';
-                    if ((refNum < 0)||(refNum > 9))
+                    refNum = nextChar - '0';
+                    if ((refNum < 0) || (refNum > 9))
                         throw new IllegalArgumentException(
                             "Illegal group reference");
                     cursor++;
@@ -864,7 +965,7 @@
                             break;
                         }
                         int nextDigit = replacement.charAt(cursor) - '0';
-                        if ((nextDigit < 0)||(nextDigit > 9)) { // not a number
+                        if ((nextDigit < 0) || (nextDigit > 9)) { // not a number
                             break;
                         }
                         int newRefNum = (refNum * 10) + nextDigit;
@@ -884,13 +985,7 @@
                 cursor++;
             }
         }
-        // Append the intervening text
-        sb.append(text, lastAppendPosition, first);
-        // Append the match substitution
-        sb.append(result);
-
-        lastAppendPosition = last;
-        return this;
+        return result;
     }
 
     /**
@@ -913,6 +1008,27 @@
     }
 
     /**
+     * Implements a terminal append-and-replace step.
+     *
+     * <p> This method reads characters from the input sequence, starting at
+     * the append position, and appends them to the given string builder.  It is
+     * intended to be invoked after one or more invocations of the {@link
+     * #appendReplacement appendReplacement} method in order to copy the
+     * remainder of the input sequence.  </p>
+     *
+     * @param  sb
+     *         The target string builder
+     *
+     * @return  The target string builder
+     *
+     * @since 1.9
+     */
+    public StringBuilder appendTail(StringBuilder sb) {
+        sb.append(text, lastAppendPosition, getTextLength());
+        return sb;
+    }
+
+    /**
      * Replaces every subsequence of the input sequence that matches the
      * pattern with the given replacement string.
      *
@@ -950,7 +1066,7 @@
         reset();
         boolean result = find();
         if (result) {
-            StringBuffer sb = new StringBuffer();
+            StringBuilder sb = new StringBuilder();
             do {
                 appendReplacement(sb, replacement);
                 result = find();
@@ -1000,7 +1116,7 @@
         reset();
         if (!find())
             return text.toString();
-        StringBuffer sb = new StringBuffer();
+        StringBuilder sb = new StringBuilder();
         appendReplacement(sb, replacement);
         appendTail(sb);
         return sb.toString();
--- a/jdk/test/java/util/regex/RegExTest.java	Wed Apr 09 12:49:51 2014 +0000
+++ b/jdk/test/java/util/regex/RegExTest.java	Wed Apr 09 09:36:19 2014 -0700
@@ -32,7 +32,7 @@
  * 6358731 6178785 6284152 6231989 6497148 6486934 6233084 6504326 6635133
  * 6350801 6676425 6878475 6919132 6931676 6948903 6990617 7014645 7039066
  * 7067045 7014640 7189363 8007395 8013252 8013254 8012646 8023647 6559590
- * 8027645 8035076
+ * 8027645 8035076 8039124
  */
 
 import java.util.regex.*;
@@ -75,7 +75,10 @@
         // Substitition tests on randomly generated sequences
         globalSubstitute();
         stringbufferSubstitute();
+        stringbuilderSubstitute();
+
         substitutionBasher();
+        substitutionBasher2();
 
         // Canonical Equivalence
         ceTest();
@@ -296,10 +299,12 @@
 
         final Matcher m = Pattern.compile("xyz").matcher("xyz");
         m.matches();
-        check(new Runnable() { public void run() { m.appendTail(null);}});
+        check(new Runnable() { public void run() { m.appendTail((StringBuffer)null);}});
+        check(new Runnable() { public void run() { m.appendTail((StringBuilder)null);}});
         check(new Runnable() { public void run() { m.replaceAll(null);}});
         check(new Runnable() { public void run() { m.replaceFirst(null);}});
-        check(new Runnable() { public void run() { m.appendReplacement(null, null);}});
+        check(new Runnable() { public void run() { m.appendReplacement((StringBuffer)null, null);}});
+        check(new Runnable() { public void run() { m.appendReplacement((StringBuilder)null, null);}});
         check(new Runnable() { public void run() { m.reset(null);}});
         check(new Runnable() { public void run() { Matcher.quoteReplacement(null);}});
         //check(new Runnable() { public void run() { m.usePattern(null);}});
@@ -2973,6 +2978,286 @@
         report("SB Substitution");
     }
 
+    /**
+     * Tests the StringBuilder variants of Matcher.appendReplacement() and
+     * Matcher.appendTail() with literal and group substitutions.
+     */
+    private static void stringbuilderSubstitute() throws Exception {
+        // SB substitution with literal
+        String blah = "zzzblahzzz";
+        Pattern p = Pattern.compile("blah");
+        Matcher m = p.matcher(blah);
+        StringBuilder result = new StringBuilder();
+        try {
+            m.appendReplacement(result, "blech");
+            failCount++;
+        } catch (IllegalStateException e) {
+        }
+        m.find();
+        m.appendReplacement(result, "blech");
+        if (!result.toString().equals("zzzblech"))
+            failCount++;
+
+        m.appendTail(result);
+        if (!result.toString().equals("zzzblechzzz"))
+            failCount++;
+
+        // SB substitution with groups
+        blah = "zzzabcdzzz";
+        p = Pattern.compile("(ab)(cd)*");
+        m = p.matcher(blah);
+        result = new StringBuilder();
+        try {
+            m.appendReplacement(result, "$1");
+            failCount++;
+        } catch (IllegalStateException e) {
+        }
+        m.find();
+        m.appendReplacement(result, "$1");
+        if (!result.toString().equals("zzzab"))
+            failCount++;
+
+        m.appendTail(result);
+        if (!result.toString().equals("zzzabzzz"))
+            failCount++;
+
+        // SB substitution with 3 groups
+        blah = "zzzabcdcdefzzz";
+        p = Pattern.compile("(ab)(cd)*(ef)");
+        m = p.matcher(blah);
+        result = new StringBuilder();
+        try {
+            m.appendReplacement(result, "$1w$2w$3");
+            failCount++;
+        } catch (IllegalStateException e) {
+        }
+        m.find();
+        m.appendReplacement(result, "$1w$2w$3");
+        if (!result.toString().equals("zzzabwcdwef"))
+            failCount++;
+
+        m.appendTail(result);
+        if (!result.toString().equals("zzzabwcdwefzzz"))
+            failCount++;
+
+        // SB substitution with groups and three matches
+        // skipping middle match
+        blah = "zzzabcdzzzabcddzzzabcdzzz";
+        p = Pattern.compile("(ab)(cd*)");
+        m = p.matcher(blah);
+        result = new StringBuilder();
+        try {
+            m.appendReplacement(result, "$1");
+            failCount++;
+        } catch (IllegalStateException e) {
+        }
+        m.find();
+        m.appendReplacement(result, "$1");
+        if (!result.toString().equals("zzzab"))
+            failCount++;
+
+        m.find();
+        m.find();
+        m.appendReplacement(result, "$2");
+        if (!result.toString().equals("zzzabzzzabcddzzzcd"))
+            failCount++;
+
+        m.appendTail(result);
+        if (!result.toString().equals("zzzabzzzabcddzzzcdzzz"))
+            failCount++;
+
+        // Check to make sure escaped $ is ignored
+        blah = "zzzabcdcdefzzz";
+        p = Pattern.compile("(ab)(cd)*(ef)");
+        m = p.matcher(blah);
+        result = new StringBuilder();
+        m.find();
+        m.appendReplacement(result, "$1w\\$2w$3");
+        if (!result.toString().equals("zzzabw$2wef"))
+            failCount++;
+
+        m.appendTail(result);
+        if (!result.toString().equals("zzzabw$2wefzzz"))
+            failCount++;
+
+        // Check to make sure a reference to nonexistent group causes error
+        blah = "zzzabcdcdefzzz";
+        p = Pattern.compile("(ab)(cd)*(ef)");
+        m = p.matcher(blah);
+        result = new StringBuilder();
+        m.find();
+        try {
+            m.appendReplacement(result, "$1w$5w$3");
+            failCount++;
+        } catch (IndexOutOfBoundsException ioobe) {
+            // Correct result
+        }
+
+        // Check double digit group references
+        blah = "zzz123456789101112zzz";
+        p = Pattern.compile("(1)(2)(3)(4)(5)(6)(7)(8)(9)(10)(11)");
+        m = p.matcher(blah);
+        result = new StringBuilder();
+        m.find();
+        m.appendReplacement(result, "$1w$11w$3");
+        if (!result.toString().equals("zzz1w11w3"))
+            failCount++;
+
+        // Check to make sure it backs off $15 to $1 if only three groups
+        blah = "zzzabcdcdefzzz";
+        p = Pattern.compile("(ab)(cd)*(ef)");
+        m = p.matcher(blah);
+        result = new StringBuilder();
+        m.find();
+        m.appendReplacement(result, "$1w$15w$3");
+        if (!result.toString().equals("zzzabwab5wef"))
+            failCount++;
+
+
+        // Supplementary character test
+        // SB substitution with literal
+        blah = toSupplementaries("zzzblahzzz");
+        p = Pattern.compile(toSupplementaries("blah"));
+        m = p.matcher(blah);
+        result = new StringBuilder();
+        try {
+            m.appendReplacement(result, toSupplementaries("blech"));
+            failCount++;
+        } catch (IllegalStateException e) {
+        }
+        m.find();
+        m.appendReplacement(result, toSupplementaries("blech"));
+        if (!result.toString().equals(toSupplementaries("zzzblech")))
+            failCount++;
+        m.appendTail(result);
+        if (!result.toString().equals(toSupplementaries("zzzblechzzz")))
+            failCount++;
+
+        // SB substitution with groups
+        blah = toSupplementaries("zzzabcdzzz");
+        p = Pattern.compile(toSupplementaries("(ab)(cd)*"));
+        m = p.matcher(blah);
+        result = new StringBuilder();
+        try {
+            m.appendReplacement(result, "$1");
+            failCount++;
+        } catch (IllegalStateException e) {
+        }
+        m.find();
+        m.appendReplacement(result, "$1");
+        if (!result.toString().equals(toSupplementaries("zzzab")))
+            failCount++;
+
+        m.appendTail(result);
+        if (!result.toString().equals(toSupplementaries("zzzabzzz")))
+            failCount++;
+
+        // SB substitution with 3 groups
+        blah = toSupplementaries("zzzabcdcdefzzz");
+        p = Pattern.compile(toSupplementaries("(ab)(cd)*(ef)"));
+        m = p.matcher(blah);
+        result = new StringBuilder();
+        try {
+            m.appendReplacement(result, toSupplementaries("$1w$2w$3"));
+            failCount++;
+        } catch (IllegalStateException e) {
+        }
+        m.find();
+        m.appendReplacement(result, toSupplementaries("$1w$2w$3"));
+        if (!result.toString().equals(toSupplementaries("zzzabwcdwef")))
+            failCount++;
+
+        m.appendTail(result);
+        if (!result.toString().equals(toSupplementaries("zzzabwcdwefzzz")))
+            failCount++;
+
+        // SB substitution with groups and three matches
+        // skipping middle match
+        blah = toSupplementaries("zzzabcdzzzabcddzzzabcdzzz");
+        p = Pattern.compile(toSupplementaries("(ab)(cd*)"));
+        m = p.matcher(blah);
+        result = new StringBuilder();
+        try {
+            m.appendReplacement(result, "$1");
+            failCount++;
+        } catch (IllegalStateException e) {
+        }
+        m.find();
+        m.appendReplacement(result, "$1");
+        if (!result.toString().equals(toSupplementaries("zzzab")))
+            failCount++;
+
+        m.find();
+        m.find();
+        m.appendReplacement(result, "$2");
+        if (!result.toString().equals(toSupplementaries("zzzabzzzabcddzzzcd")))
+            failCount++;
+
+        m.appendTail(result);
+        if (!result.toString().equals(toSupplementaries("zzzabzzzabcddzzzcdzzz")))
+            failCount++;
+
+        // Check to make sure escaped $ is ignored
+        blah = toSupplementaries("zzzabcdcdefzzz");
+        p = Pattern.compile(toSupplementaries("(ab)(cd)*(ef)"));
+        m = p.matcher(blah);
+        result = new StringBuilder();
+        m.find();
+        m.appendReplacement(result, toSupplementaries("$1w\\$2w$3"));
+        if (!result.toString().equals(toSupplementaries("zzzabw$2wef")))
+            failCount++;
+
+        m.appendTail(result);
+        if (!result.toString().equals(toSupplementaries("zzzabw$2wefzzz")))
+            failCount++;
+
+        // Check to make sure a reference to nonexistent group causes error
+        blah = toSupplementaries("zzzabcdcdefzzz");
+        p = Pattern.compile(toSupplementaries("(ab)(cd)*(ef)"));
+        m = p.matcher(blah);
+        result = new StringBuilder();
+        m.find();
+        try {
+            m.appendReplacement(result, toSupplementaries("$1w$5w$3"));
+            failCount++;
+        } catch (IndexOutOfBoundsException ioobe) {
+            // Correct result
+        }
+        // Check double digit group references
+        blah = toSupplementaries("zzz123456789101112zzz");
+        p = Pattern.compile("(1)(2)(3)(4)(5)(6)(7)(8)(9)(10)(11)");
+        m = p.matcher(blah);
+        result = new StringBuilder();
+        m.find();
+        m.appendReplacement(result, toSupplementaries("$1w$11w$3"));
+        if (!result.toString().equals(toSupplementaries("zzz1w11w3")))
+            failCount++;
+
+        // Check to make sure it backs off $15 to $1 if only three groups
+        blah = toSupplementaries("zzzabcdcdefzzz");
+        p = Pattern.compile(toSupplementaries("(ab)(cd)*(ef)"));
+        m = p.matcher(blah);
+        result = new StringBuilder();
+        m.find();
+        m.appendReplacement(result, toSupplementaries("$1w$15w$3"));
+        if (!result.toString().equals(toSupplementaries("zzzabwab5wef")))
+            failCount++;
+        // Check nothing has been appended into the output buffer if
+        // the replacement string triggers IllegalArgumentException.
+        p = Pattern.compile("(abc)");
+        m = p.matcher("abcd");
+        result = new StringBuilder();
+        m.find();
+        try {
+            m.appendReplacement(result, ("xyz$g"));
+            failCount++;
+        } catch (IllegalArgumentException iae) {
+            if (result.length() != 0)
+                failCount++;
+        }
+        report("SB Substitution 2");
+    }
+
     /*
      * 5 groups of characters are created to make a substitution string.
      * A base string will be created including random lead chars, the
@@ -3059,6 +3344,93 @@
         report("Substitution Basher");
     }
 
+    /*
+     * 5 groups of characters are created to make a substitution string.
+     * A base string will be created including random lead chars, the
+     * substitution string, and random trailing chars.
+     * A pattern containing the 5 groups is searched for and replaced with:
+     * random group + random string + random group.
+     * The results are checked for correctness.
+     */
+    private static void substitutionBasher2() {
+        for (int runs = 0; runs<1000; runs++) {
+            // Create a base string to work in
+            int leadingChars = generator.nextInt(10);
+            StringBuilder baseBuffer = new StringBuilder(100);
+            String leadingString = getRandomAlphaString(leadingChars);
+            baseBuffer.append(leadingString);
+
+            // Create 5 groups of random number of random chars
+            // Create the string to substitute
+            // Create the pattern string to search for
+            StringBuilder bufferToSub = new StringBuilder(25);
+            StringBuilder bufferToPat = new StringBuilder(50);
+            String[] groups = new String[5];
+            for(int i=0; i<5; i++) {
+                int aGroupSize = generator.nextInt(5)+1;
+                groups[i] = getRandomAlphaString(aGroupSize);
+                bufferToSub.append(groups[i]);
+                bufferToPat.append('(');
+                bufferToPat.append(groups[i]);
+                bufferToPat.append(')');
+            }
+            String stringToSub = bufferToSub.toString();
+            String pattern = bufferToPat.toString();
+
+            // Place sub string into working string at random index
+            baseBuffer.append(stringToSub);
+
+            // Append random chars to end
+            int trailingChars = generator.nextInt(10);
+            String trailingString = getRandomAlphaString(trailingChars);
+            baseBuffer.append(trailingString);
+            String baseString = baseBuffer.toString();
+
+            // Create test pattern and matcher
+            Pattern p = Pattern.compile(pattern);
+            Matcher m = p.matcher(baseString);
+
+            // Reject candidate if pattern happens to start early
+            m.find();
+            if (m.start() < leadingChars)
+                continue;
+
+            // Reject candidate if more than one match
+            if (m.find())
+                continue;
+
+            // Construct a replacement string with :
+            // random group + random string + random group
+            StringBuilder bufferToRep = new StringBuilder();
+            int groupIndex1 = generator.nextInt(5);
+            bufferToRep.append("$" + (groupIndex1 + 1));
+            String randomMidString = getRandomAlphaString(5);
+            bufferToRep.append(randomMidString);
+            int groupIndex2 = generator.nextInt(5);
+            bufferToRep.append("$" + (groupIndex2 + 1));
+            String replacement = bufferToRep.toString();
+
+            // Do the replacement
+            String result = m.replaceAll(replacement);
+
+            // Construct expected result
+            StringBuilder bufferToRes = new StringBuilder();
+            bufferToRes.append(leadingString);
+            bufferToRes.append(groups[groupIndex1]);
+            bufferToRes.append(randomMidString);
+            bufferToRes.append(groups[groupIndex2]);
+            bufferToRes.append(trailingString);
+            String expectedResult = bufferToRes.toString();
+
+            // Check results
+            if (!result.equals(expectedResult)) {
+                failCount++;
+            }
+        }
+
+        report("Substitution Basher 2");
+    }
+
     /**
      * Checks the handling of some escape sequences that the Pattern
      * class should process instead of the java compiler. These are