8013252: Regex Matcher .start and .end should be accessible by group name
author sherman
Mon, 06 May 2013 21:24:37 -0700
changeset 17434 4a04d7127e80
parent 17433 24c57ce3fec4
child 17435 ec797e955dca
8013252: Regex Matcher .start and .end should be accessible by group name
8013254: Constructor \w need update to add the support of \p{Join_Control}
Summary: added the requested methods and updated the \w constructor
Reviewed-by: mchung, alanb
jdk/src/share/classes/java/util/regex/Matcher.java
jdk/src/share/classes/java/util/regex/Pattern.java
jdk/src/share/classes/java/util/regex/UnicodeProp.java
jdk/test/java/util/regex/POSIX_Unicode.java
jdk/test/java/util/regex/RegExTest.java
--- a/jdk/src/share/classes/java/util/regex/Matcher.java	Mon May 06 20:54:54 2013 -0700
+++ b/jdk/src/share/classes/java/util/regex/Matcher.java	Mon May 06 21:24:37 2013 -0700
@@ -25,6 +25,7 @@
 
 package java.util.regex;
 
+import java.util.Objects;
 
 /**
  * An engine that performs match operations on a {@link java.lang.CharSequence
@@ -370,12 +371,37 @@
     public int start(int group) {
         if (first < 0)
             throw new IllegalStateException("No match available");
-        if (group > groupCount())
+        if (group < 0 || group > groupCount())
             throw new IndexOutOfBoundsException("No group " + group);
         return groups[group * 2];
     }
 
     /**
+     * Returns the start index of the subsequence captured by the given
+     * <a href="Pattern.html#groupname">named-capturing group</a> during the
+     * previous match operation.
+     *
+     * @param  name
+     *         The name of a named-capturing group in this matcher's pattern
+     *
+     * @return  The index of the first character captured by the group,
+     *          or {@code -1} if the match was successful but the group
+     *          itself did not match anything
+     *
+     * @throws  IllegalStateException
+     *          If no match has yet been attempted,
+     *          or if the previous match operation failed
+     *
+     * @throws  IllegalArgumentException
+     *          If the pattern has no capturing group with the given name
+     * @throws  NullPointerException  If {@code name} is {@code null}
+     * @since 1.8
+     */
+    public int start(String name) {
+        return groups[getMatchedGroupIndex(name) * 2];  // slot 2*i holds group i's start
+    }
+
+    /**
      * Returns the offset after the last character matched.  </p>
      *
      * @return  The offset after the last character matched
@@ -417,12 +443,37 @@
     public int end(int group) {
         if (first < 0)
             throw new IllegalStateException("No match available");
-        if (group > groupCount())
+        if (group < 0 || group > groupCount())
             throw new IndexOutOfBoundsException("No group " + group);
         return groups[group * 2 + 1];
     }
 
     /**
+     * Returns the offset after the last character of the subsequence
+     * captured by the given <a href="Pattern.html#groupname">named-capturing
+     * group</a> during the previous match operation.
+     *
+     * @param  name
+     *         The name of a named-capturing group in this matcher's pattern
+     *
+     * @return  The offset after the last character captured by the group,
+     *          or {@code -1} if the match was successful
+     *          but the group itself did not match anything
+     *
+     * @throws  IllegalStateException
+     *          If no match has yet been attempted,
+     *          or if the previous match operation failed
+     *
+     * @throws  IllegalArgumentException
+     *          If the pattern has no capturing group with the given name
+     * @throws  NullPointerException  If {@code name} is {@code null}
+     * @since 1.8
+     */
+    public int end(String name) {
+        return groups[getMatchedGroupIndex(name) * 2 + 1];  // slot 2*i+1 holds group i's end
+    }
+
+    /**
      * Returns the input subsequence matched by the previous match.
      *
      * <p> For a matcher <i>m</i> with input sequence <i>s</i>,
@@ -518,13 +569,7 @@
      * @since 1.7
      */
     public String group(String name) {
-        if (name == null)
-            throw new NullPointerException("Null group name");
-        if (first < 0)
-            throw new IllegalStateException("No match found");
-        if (!parentPattern.namedGroups().containsKey(name))
-            throw new IllegalArgumentException("No group with name <" + name + ">");
-        int group = parentPattern.namedGroups().get(name);
+        int group = getMatchedGroupIndex(name);
         if ((groups[group*2] == -1) || (groups[group*2+1] == -1))
             return null;
         return getSubSequence(groups[group * 2], groups[group * 2 + 1]).toString();
@@ -1257,4 +1302,17 @@
         return text.charAt(i);
     }
 
+    /**
+     * Resolves a group name to its index for group/start/end(String), checking
+     * in order: null name (NPE), no match available (ISE), unknown name (IAE).
+     * @return the index of the named-capturing group
+     */
+    int getMatchedGroupIndex(String name) {
+        Objects.requireNonNull(name, "Group name");  // NPE carries the message "Group name"
+        if (first < 0)  // negative first: no match result to read from
+            throw new IllegalStateException("No match found");
+        if (!parentPattern.namedGroups().containsKey(name))
+            throw new IllegalArgumentException("No group with name <" + name + ">");
+        return parentPattern.namedGroups().get(name);  // mapped value is unboxed to int
+    }
 }
--- a/jdk/src/share/classes/java/util/regex/Pattern.java	Mon May 06 20:54:54 2013 -0700
+++ b/jdk/src/share/classes/java/util/regex/Pattern.java	Mon May 06 21:24:37 2013 -0700
@@ -612,6 +612,7 @@
  *   <li> White_Space
  *   <li> Digit
  *   <li> Hex_Digit
+ *   <li> Join_Control
  *   <li> Noncharacter_Code_Point
  *   <li> Assigned
  * </ul>
@@ -662,7 +663,7 @@
  * <tr><td><tt>\S</tt></td>
  *     <td>A non-whitespace character: <tt>[^\s]</tt></td></tr>
  * <tr><td><tt>\w</tt></td>
- *     <td>A word character: <tt>[\p{Alpha}\p{gc=Mn}\p{gc=Me}\p{gc=Mc}\p{Digit}\p{gc=Pc}]</tt></td></tr>
+ *     <td>A word character: <tt>[\p{Alpha}\p{gc=Mn}\p{gc=Me}\p{gc=Mc}\p{Digit}\p{gc=Pc}\p{IsJoin_Control}]</tt></td></tr>
  * <tr><td><tt>\W</tt></td>
  *     <td>A non-word character: <tt>[^\w]</tt></td></tr>
  * </table>
--- a/jdk/src/share/classes/java/util/regex/UnicodeProp.java	Mon May 06 20:54:54 2013 -0700
+++ b/jdk/src/share/classes/java/util/regex/UnicodeProp.java	Mon May 06 21:24:37 2013 -0700
@@ -181,6 +181,7 @@
         //  \p{gc=Mark}
         //  \p{digit}
         //  \p{gc=Connector_Punctuation}
+        //  \p{Join_Control}    200C..200D
 
         public boolean is(int ch) {
             return ALPHABETIC.is(ch) ||
@@ -189,7 +190,15 @@
                       (1 << Character.COMBINING_SPACING_MARK) |
                       (1 << Character.DECIMAL_DIGIT_NUMBER) |
                       (1 << Character.CONNECTOR_PUNCTUATION)) >> Character.getType(ch)) & 1)
-                   != 0;
+                   != 0 ||
+                   JOIN_CONTROL.is(ch);
+        }
+    },
+
+    JOIN_CONTROL {
+        //  Join_Control (PropList.txt): code points 200C..200D
+        public boolean is(int ch) {
+           return (ch == 0x200C || ch == 0x200D);  // true only for U+200C and U+200D
         }
     };
 
@@ -212,6 +221,7 @@
         aliases.put("WHITESPACE", "WHITE_SPACE");
         aliases.put("HEXDIGIT","HEX_DIGIT");
         aliases.put("NONCHARACTERCODEPOINT", "NONCHARACTER_CODE_POINT");
+        aliases.put("JOINCONTROL", "JOIN_CONTROL");
     }
 
     public static UnicodeProp forName(String propName) {
--- a/jdk/test/java/util/regex/POSIX_Unicode.java	Mon May 06 20:54:54 2013 -0700
+++ b/jdk/test/java/util/regex/POSIX_Unicode.java	Mon May 06 21:24:37 2013 -0700
@@ -125,6 +125,10 @@
         return (ch & 0xfffe) == 0xfffe || (ch >= 0xfdd0 && ch <= 0xfdef);
     }
 
+    public static boolean isJoinControl(int ch) {
+        return (ch == 0x200C || ch == 0x200D);  // Join_Control: U+200C and U+200D only
+    }
+
     //  \p{alpha}
     //  \p{gc=Mark}
     //  \p{digit}
@@ -136,6 +140,7 @@
                   (1 << Character.COMBINING_SPACING_MARK) |
                   (1 << Character.CONNECTOR_PUNCTUATION)) >> Character.getType(ch)) & 1)
                != 0 ||
-               isDigit(ch);
+               isDigit(ch) ||
+               isJoinControl(ch);
     }
 }
--- a/jdk/test/java/util/regex/RegExTest.java	Mon May 06 20:54:54 2013 -0700
+++ b/jdk/test/java/util/regex/RegExTest.java	Mon May 06 21:24:37 2013 -0700
@@ -33,7 +33,7 @@
  * 5013885 5003322 4988891 5098443 5110268 6173522 4829857 5027748 6376940
  * 6358731 6178785 6284152 6231989 6497148 6486934 6233084 6504326 6635133
  * 6350801 6676425 6878475 6919132 6931676 6948903 6990617 7014645 7039066
- * 7067045 7014640 7189363 8007395
+ * 7067045 7014640 7189363 8007395 8013252 8013254
  */
 
 import java.util.regex.*;
@@ -3390,7 +3390,9 @@
     private static void check(Pattern p, String s, String g, String expected) {
         Matcher m = p.matcher(s);
         m.find();
-        if (!m.group(g).equals(expected))
+        if (!m.group(g).equals(expected) ||
+            s.charAt(m.start(g)) != expected.charAt(0) ||
+            s.charAt(m.end(g) - 1) != expected.charAt(expected.length() - 1))
             failCount++;
     }
 
@@ -3420,19 +3422,42 @@
         failCount++;
     }
 
-    private static void checkExpectedFail(Matcher m, String g) {
+    private static void checkExpectedIAE(Matcher m, String g) {  // passes only if group, start and end all throw IAE for g
         m.find();
         try {
             m.group(g);
-        } catch (IllegalArgumentException iae) {
+        } catch (IllegalArgumentException x) {
             //iae.printStackTrace();
-            return;
-        } catch (NullPointerException npe) {
-            return;
+            try {
+                m.start(g);
+            } catch (IllegalArgumentException xx) {
+                try {
+                    m.end(g);  // was a second m.start(g); end(String) went unchecked
+                } catch (IllegalArgumentException xxx) {
+                    return;  // all three accessors threw IAE
+                }
+            }
         }
         failCount++;
     }
 
+    private static void checkExpectedNPE(Matcher m) {  // passes only if group, start and end all reject a null name
+        m.find();
+        try {
+            m.group(null);
+        } catch (NullPointerException x) {
+            try {
+                m.start(null);
+            } catch (NullPointerException xx) {
+                try {
+                    m.end(null);
+                } catch (NullPointerException xxx) {
+                    return;  // all three accessors threw NPE
+                }
+            }
+        }
+        failCount++;  // reached when any of the three calls returned normally
+    }
 
     private static void namedGroupCaptureTest() throws Exception {
         check(Pattern.compile("x+(?<gname>y+)z+"),
@@ -3559,10 +3584,9 @@
         checkExpectedFail("(?<6groupnamestartswithdigit>abc)(def)");
         checkExpectedFail("(?<gname>abc)(def)\\k<gnameX>");
         checkExpectedFail("(?<gname>abc)(?<gname>def)\\k<gnameX>");
-        checkExpectedFail(Pattern.compile("(?<gname>abc)(def)").matcher("abcdef"),
-                          "gnameX");
-        checkExpectedFail(Pattern.compile("(?<gname>abc)(def)").matcher("abcdef"),
-                          null);
+        checkExpectedIAE(Pattern.compile("(?<gname>abc)(def)").matcher("abcdef"),
+                         "gnameX");
+        checkExpectedNPE(Pattern.compile("(?<gname>abc)(def)").matcher("abcdef"));
         report("NamedGroupCapture");
     }
 
@@ -3759,6 +3783,7 @@
         Matcher spaceP  = Pattern.compile("\\p{IsWhiteSpace}").matcher("");
         Matcher definedP = Pattern.compile("\\p{IsAssigned}").matcher("");
         Matcher nonCCPP = Pattern.compile("\\p{IsNoncharacterCodePoint}").matcher("");
+        Matcher joinCrtl = Pattern.compile("\\p{IsJoinControl}").matcher("");
 
         // javaMethod
         Matcher lowerJ  = Pattern.compile("\\p{javaLowerCase}").matcher("");
@@ -3829,7 +3854,8 @@
                 Character.isIdeographic(cp) != ideogP.reset(str).matches() ||
                 Character.isIdeographic(cp) != ideogJ.reset(str).matches() ||
                 (Character.UNASSIGNED == type) == definedP.reset(str).matches() ||
-                POSIX_Unicode.isNoncharacterCodePoint(cp) != nonCCPP.reset(str).matches())
+                POSIX_Unicode.isNoncharacterCodePoint(cp) != nonCCPP.reset(str).matches() ||
+                POSIX_Unicode.isJoinControl(cp) != joinCrtl.reset(str).matches())
                 failCount++;
         }