7014640: To add a metachar \R for line ending and character classes for vertical/horizontal ws \v \V \h \H
Summary: added proposed constructs
Reviewed-by: alanb
--- a/jdk/src/share/classes/java/util/regex/Pattern.java Tue May 08 02:59:10 2012 -0400
+++ b/jdk/src/share/classes/java/util/regex/Pattern.java Tue May 08 10:57:13 2012 -0700
@@ -152,15 +152,24 @@
* <td headers="matches">A digit: <tt>[0-9]</tt></td></tr>
* <tr><td valign="top" headers="construct predef"><tt>\D</tt></td>
* <td headers="matches">A non-digit: <tt>[^0-9]</tt></td></tr>
+ * <tr><td valign="top" headers="construct predef"><tt>\h</tt></td>
+ * <td headers="matches">A horizontal whitespace character:
+ * <tt>[ \t\xA0\u005cu1680\u005cu180e\u005cu2000-\u005cu200a\u005cu202f\u005cu205f\u005cu3000]</tt></td></tr>
+ * <tr><td valign="top" headers="construct predef"><tt>\H</tt></td>
+ * <td headers="matches">A non-horizontal whitespace character: <tt>[^\h]</tt></td></tr>
* <tr><td valign="top" headers="construct predef"><tt>\s</tt></td>
* <td headers="matches">A whitespace character: <tt>[ \t\n\x0B\f\r]</tt></td></tr>
* <tr><td valign="top" headers="construct predef"><tt>\S</tt></td>
* <td headers="matches">A non-whitespace character: <tt>[^\s]</tt></td></tr>
+ * <tr><td valign="top" headers="construct predef"><tt>\v</tt></td>
+ * <td headers="matches">A vertical whitespace character: <tt>[\n\x0B\f\r\x85\u005cu2028\u005cu2029]</tt>
+ * </td></tr>
+ * <tr><td valign="top" headers="construct predef"><tt>\V</tt></td>
+ * <td headers="matches">A non-vertical whitespace character: <tt>[^\v]</tt></td></tr>
* <tr><td valign="top" headers="construct predef"><tt>\w</tt></td>
* <td headers="matches">A word character: <tt>[a-zA-Z_0-9]</tt></td></tr>
* <tr><td valign="top" headers="construct predef"><tt>\W</tt></td>
* <td headers="matches">A non-word character: <tt>[^\w]</tt></td></tr>
- *
* <tr><th> </th></tr>
* <tr align="left"><th colspan="2" id="posix">POSIX character classes</b> (US-ASCII only)<b></th></tr>
*
@@ -244,6 +253,13 @@
* <td headers="matches">The end of the input</td></tr>
*
* <tr><th> </th></tr>
+ * <tr align="left"><th colspan="2" id="lineending">Linebreak matcher</th></tr>
+ * <tr><td valign="top" headers="construct lineending"><tt>\R</tt></td>
+ * <td headers="matches">Any Unicode linebreak sequence; equivalent to
+ * <tt>\u005cu000D\u005cu000A|[\u005cu000A\u005cu000B\u005cu000C\u005cu000D\u005cu0085\u005cu2028\u005cu2029]
+ * </tt></td></tr>
+ *
+ * <tr><th> </th></tr>
* <tr align="left"><th colspan="2" id="greedy">Greedy quantifiers</th></tr>
*
* <tr><td valign="top" headers="construct greedy"><i>X</i><tt>?</tt></td>
@@ -599,11 +615,9 @@
* <li> Noncharacter_Code_Point
* <li> Assigned
* </ul>
-
-
* <p>
- * <b>Predefined Character classes</b> and <b>POSIX character classes</b> are in
- * conformance with the recommendation of <i>Annex C: Compatibility Properties</i>
+ * The following <b>Predefined Character classes</b> and <b>POSIX character classes</b>
+ * are in conformance with the recommendation of <i>Annex C: Compatibility Properties</i>
* of <a href="http://www.unicode.org/reports/tr18/"><i>Unicode Regular Expression
* </i></a>, when {@link #UNICODE_CHARACTER_CLASS} flag is specified.
* <p>
@@ -668,12 +682,6 @@
*
* <ul>
* <li><p> Predefined character classes (Unicode character)
- * <p><tt>\h </tt>A horizontal whitespace
- * <p><tt>\H </tt>A non horizontal whitespace
- * <p><tt>\v </tt>A vertical whitespace
- * <p><tt>\V </tt>A non vertical whitespace
- * <p><tt>\R </tt>Any Unicode linebreak sequence
- * <tt>\u005cu000D\u005cu000A|[\u005cu000A\u005cu000B\u005cu000C\u005cu000D\u005cu0085\u005cu2028\u005cu2029]</tt>
* <p><tt>\X </tt>Match Unicode
* <a href="http://www.unicode.org/reports/tr18/#Default_Grapheme_Clusters">
* <i>extended grapheme cluster</i></a>
@@ -2178,7 +2186,7 @@
}
unread();
prev = cursor;
- ch = escape(false, first == 0);
+ ch = escape(false, first == 0, false);
if (ch >= 0) {
append(ch, first);
first++;
@@ -2276,7 +2284,7 @@
* If the returned value is greater than zero, it is the value that
* matches the escape sequence.
*/
- private int escape(boolean inclass, boolean create) {
+ private int escape(boolean inclass, boolean create, boolean isrange) {
int ch = skip();
switch (ch) {
case '0':
@@ -2318,6 +2326,8 @@
if (create) root = new LastMatch();
return -1;
case 'H':
+ if (create) root = new HorizWS().complement();
+ return -1;
case 'I':
case 'J':
case 'K':
@@ -2327,8 +2337,11 @@
case 'O':
case 'P':
case 'Q':
+ break;
case 'R':
- break;
+ if (inclass) break;
+ if (create) root = new LineEnding();
+ return -1;
case 'S':
if (create) root = has(UNICODE_CHARACTER_CLASS)
? new Utype(UnicodeProp.WHITE_SPACE).complement()
@@ -2336,8 +2349,10 @@
return -1;
case 'T':
case 'U':
+ break;
case 'V':
- break;
+ if (create) root = new VertWS().complement();
+ return -1;
case 'W':
if (create) root = has(UNICODE_CHARACTER_CLASS)
? new Utype(UnicodeProp.WORD).complement()
@@ -2373,7 +2388,10 @@
case 'f':
return '\f';
case 'g':
+ break;
case 'h':
+ if (create) root = new HorizWS();
+ return -1;
case 'i':
case 'j':
break;
@@ -2413,7 +2431,18 @@
case 'u':
return u();
case 'v':
- return '\013';
+ // '\v' was implemented as VT/0x0B in releases < 1.8 (though
+ // undocumented). In JDK8 '\v' is specified as a predefined
+ // character class for all vertical whitespace characters.
+ // So [-1, root=VertWS node] pair is returned (instead of a
+ // single 0x0B). This breaks the range if '\v' is used as
+ // the start or end value, such as [\v-...] or [...-\v], in
+ // which a single definite value (0x0B) is expected. For
+ // compatibility reasons, '\013'/0x0B is returned if isrange.
+ if (isrange)
+ return '\013';
+ if (create) root = new VertWS();
+ return -1;
case 'w':
if (create) root = has(UNICODE_CHARACTER_CLASS)
? new Utype(UnicodeProp.WORD)
@@ -2590,13 +2619,14 @@
oneLetter = false;
return family(oneLetter, comp);
} else { // ordinary escape
+ boolean isrange = temp[cursor+1] == '-';
unread();
- ch = escape(true, true);
+ ch = escape(true, true, isrange);
if (ch == -1)
return (CharProperty) root;
}
} else {
- ch = single();
+ next();
}
if (ch >= 0) {
if (peek() == '-') {
@@ -2606,9 +2636,15 @@
}
if (endRange != ']') {
next();
- int m = single();
- if (m < ch)
+ int m = peek();
+ if (m == '\\') {
+ m = escape(true, false, true);
+ } else {
+ next();
+ }
+ if (m < ch) {
throw error("Illegal character range");
+ }
if (has(CASE_INSENSITIVE))
return caseInsensitiveRangeFor(ch, m);
else
@@ -2620,17 +2656,6 @@
throw error("Unexpected character '"+((char)ch)+"'");
}
- private int single() {
- int ch = peek();
- switch (ch) {
- case '\\':
- return escape(true, false);
- default:
- next();
- return ch;
- }
- }
-
/**
* Parses a Unicode character family and returns its representative node.
*/
@@ -3695,6 +3720,35 @@
}
/**
+ * Node class that matches a Unicode line ending '\R'
+ */
+ static final class LineEnding extends Node {
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ // (u+000Du+000A|[u+000Au+000Bu+000Cu+000Du+0085u+2028u+2029])
+ if (i < matcher.to) {
+ int ch = seq.charAt(i);
+ if (ch == 0x0A || ch == 0x0B || ch == 0x0C ||
+ ch == 0x85 || ch == 0x2028 || ch == 0x2029)
+ return next.match(matcher, i + 1, seq);
+ if (ch == 0x0D) {
+ i++;
+ if (i < matcher.to && seq.charAt(i) == 0x0A)
+ i++;
+ return next.match(matcher, i, seq);
+ }
+ } else {
+ matcher.hitEnd = true;
+ }
+ return false;
+ }
+ boolean study(TreeInfo info) {
+ info.minLength++;
+ info.maxLength += 2;
+ return next.study(info);
+ }
+ }
+
+ /**
* Abstract node class to match one character satisfying some
* boolean property.
*/
@@ -3789,7 +3843,6 @@
}
}
-
/**
* Node class that matches a Unicode block.
*/
@@ -3838,7 +3891,6 @@
}
}
-
/**
* Node class that matches a POSIX type.
*/
@@ -3851,6 +3903,28 @@
}
/**
+ * Node class that matches a Perl vertical whitespace
+ */
+ static final class VertWS extends BmpCharProperty {
+ boolean isSatisfiedBy(int cp) {
+ return (cp >= 0x0A && cp <= 0x0D) ||
+ cp == 0x85 || cp == 0x2028 || cp == 0x2029;
+ }
+ }
+
+ /**
+ * Node class that matches a Perl horizontal whitespace
+ */
+ static final class HorizWS extends BmpCharProperty {
+ boolean isSatisfiedBy(int cp) {
+ return cp == 0x09 || cp == 0x20 || cp == 0xa0 ||
+ cp == 0x1680 || cp == 0x180e ||
+ cp >= 0x2000 && cp <= 0x200a ||
+ cp == 0x202f || cp == 0x205f || cp == 0x3000;
+ }
+ }
+
+ /**
* Base class for all Slice nodes
*/
static class SliceNode extends Node {
--- a/jdk/test/java/util/regex/RegExTest.java Tue May 08 02:59:10 2012 -0400
+++ b/jdk/test/java/util/regex/RegExTest.java Tue May 08 10:57:13 2012 -0700
@@ -33,7 +33,7 @@
* 5013885 5003322 4988891 5098443 5110268 6173522 4829857 5027748 6376940
* 6358731 6178785 6284152 6231989 6497148 6486934 6233084 6504326 6635133
* 6350801 6676425 6878475 6919132 6931676 6948903 6990617 7014645 7039066
- * 7067045
+ * 7067045 7014640
*/
import java.util.regex.*;
@@ -141,6 +141,8 @@
unicodePropertiesTest();
unicodeHexNotationTest();
unicodeClassesTest();
+ horizontalAndVerticalWSTest();
+ linebreakTest();
if (failure) {
throw new
RuntimeException("RegExTest failed, 1st failure: " +
@@ -857,13 +859,18 @@
// in replacement string
try {
"\uac00".replaceAll("\uac00", "$");
+ failCount++;
+ } catch (IllegalArgumentException iie) {
+ } catch (Exception e) {
+ failCount++;
+ }
+ try {
"\uac00".replaceAll("\uac00", "\\");
failCount++;
} catch (IllegalArgumentException iie) {
} catch (Exception e) {
failCount++;
}
-
report("Literal replacement");
}
@@ -3838,4 +3845,77 @@
failCount++;
report("unicodePredefinedClasses");
}
+
+ private static void horizontalAndVerticalWSTest() throws Exception {
+ String hws = new String (new char[] {
+ 0x09, 0x20, 0xa0, 0x1680, 0x180e,
+ 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005,
+ 0x2006, 0x2007, 0x2008, 0x2009, 0x200a,
+ 0x202f, 0x205f, 0x3000 });
+ String vws = new String (new char[] {
+ 0x0a, 0x0b, 0x0c, 0x0d, 0x85, 0x2028, 0x2029 });
+ if (!Pattern.compile("\\h+").matcher(hws).matches() ||
+ !Pattern.compile("[\\h]+").matcher(hws).matches())
+ failCount++;
+ if (Pattern.compile("\\H").matcher(hws).find() ||
+ Pattern.compile("[\\H]").matcher(hws).find())
+ failCount++;
+ if (!Pattern.compile("\\v+").matcher(vws).matches() ||
+ !Pattern.compile("[\\v]+").matcher(vws).matches())
+ failCount++;
+ if (Pattern.compile("\\V").matcher(vws).find() ||
+ Pattern.compile("[\\V]").matcher(vws).find())
+ failCount++;
+ String prefix = "abcd";
+ String suffix = "efgh";
+ String ng = "A";
+ for (int i = 0; i < hws.length(); i++) {
+ String c = String.valueOf(hws.charAt(i));
+ Matcher m = Pattern.compile("\\h").matcher(prefix + c + suffix);
+ if (!m.find() || !c.equals(m.group()))
+ failCount++;
+ m = Pattern.compile("[\\h]").matcher(prefix + c + suffix);
+ if (!m.find() || !c.equals(m.group()))
+ failCount++;
+
+ m = Pattern.compile("\\H").matcher(hws.substring(0, i) + ng + hws.substring(i));
+ if (!m.find() || !ng.equals(m.group()))
+ failCount++;
+ m = Pattern.compile("[\\H]").matcher(hws.substring(0, i) + ng + hws.substring(i));
+ if (!m.find() || !ng.equals(m.group()))
+ failCount++;
+ }
+ for (int i = 0; i < vws.length(); i++) {
+ String c = String.valueOf(vws.charAt(i));
+ Matcher m = Pattern.compile("\\v").matcher(prefix + c + suffix);
+ if (!m.find() || !c.equals(m.group()))
+ failCount++;
+ m = Pattern.compile("[\\v]").matcher(prefix + c + suffix);
+ if (!m.find() || !c.equals(m.group()))
+ failCount++;
+
+ m = Pattern.compile("\\V").matcher(vws.substring(0, i) + ng + vws.substring(i));
+ if (!m.find() || !ng.equals(m.group()))
+ failCount++;
+ m = Pattern.compile("[\\V]").matcher(vws.substring(0, i) + ng + vws.substring(i));
+ if (!m.find() || !ng.equals(m.group()))
+ failCount++;
+ }
+ // \v in range is interpreted as 0x0B. This is the undocumented behavior
+ if (!Pattern.compile("[\\v-\\v]").matcher(String.valueOf((char)0x0B)).matches())
+ failCount++;
+ report("horizontalAndVerticalWSTest");
+ }
+
+ private static void linebreakTest() throws Exception {
+ String linebreaks = new String (new char[] {
+ 0x0A, 0x0B, 0x0C, 0x0D, 0x85, 0x2028, 0x2029 });
+ String crnl = "\r\n";
+ if (!Pattern.compile("\\R+").matcher(linebreaks).matches() ||
+ !Pattern.compile("\\R").matcher(crnl).matches() ||
+ Pattern.compile("\\R\\R").matcher(crnl).matches())
+ failCount++;
+ report("linebreakTest");
+ }
+
}