7014645: Support perl style Unicode hex notation \x{...}
Summary: Added the construct \x{...} for Unicode hex notation support
Reviewed-by: alanb, okutsu
--- a/jdk/src/share/classes/java/util/regex/Pattern.java Thu Feb 03 19:09:05 2011 +0000
+++ b/jdk/src/share/classes/java/util/regex/Pattern.java Thu Feb 03 13:49:25 2011 -0800
@@ -101,6 +101,11 @@
* <td headers="matches">The character with hexadecimal value <tt>0x</tt><i>hh</i></td></tr>
* <tr><td valign="top" headers="construct characters"><tt>&#92;u</tt><i>hhhh</i></td>
* <td headers="matches">The character with hexadecimal value <tt>0x</tt><i>hhhh</i></td></tr>
+ * <tr><td valign="top" headers="construct characters"><tt>\x</tt><i>{h...h}</i></td>
+ * <td headers="matches">The character with hexadecimal value <tt>0x</tt><i>h...h</i>
+ * ({@link java.lang.Character#MIN_CODE_POINT Character.MIN_CODE_POINT}
+ * &lt;= <tt>0x</tt><i>h...h</i> &lt;=
+ * {@link java.lang.Character#MAX_CODE_POINT Character.MAX_CODE_POINT})</td></tr>
* <tr><td valign="top" headers="matches"><tt>\t</tt></td>
* <td headers="matches">The tab character (<tt>'\u0009'</tt>)</td></tr>
* <tr><td valign="top" headers="construct characters"><tt>\n</tt></td>
@@ -529,6 +534,13 @@
* while not equal, compile into the same pattern, which matches the character
* with hexadecimal value <tt>0x2014</tt>.
*
+ * <p> A Unicode character can also be represented in a regular expression by
+ * using its hexadecimal code point value directly, as described for the construct
+ * <tt>\x{...}</tt>. For example, the supplementary character U+2011F
+ * can be specified as <tt>\x{2011F}</tt> instead of the two consecutive
+ * Unicode escape sequences of its surrogate pair,
+ * <tt>&#92;uD840</tt><tt>&#92;uDD1F</tt>.
+ *
* <a name="ubc">
* <p>Unicode scripts, blocks and categories are written with the <tt>\p</tt> and
* <tt>\P</tt> constructs as in Perl. <tt>\p{</tt><i>prop</i><tt>}</tt> matches if
@@ -2993,6 +3005,16 @@
if (ASCII.isHexDigit(m)) {
return ASCII.toDigit(n) * 16 + ASCII.toDigit(m);
}
+ } else if (n == '{' && ASCII.isHexDigit(peek())) {
+ int ch = 0;
+ while (ASCII.isHexDigit(n = read())) {
+ ch = (ch << 4) + ASCII.toDigit(n);
+ if (ch > Character.MAX_CODE_POINT)
+ throw error("Hexadecimal codepoint is too big");
+ }
+ if (n != '}')
+ throw error("Unclosed hexadecimal escape sequence");
+ return ch;
}
throw error("Illegal hexadecimal escape sequence");
}
--- a/jdk/test/java/util/regex/RegExTest.java Thu Feb 03 19:09:05 2011 +0000
+++ b/jdk/test/java/util/regex/RegExTest.java Thu Feb 03 13:49:25 2011 -0800
@@ -32,7 +32,7 @@
* 4872664 4803179 4892980 4900747 4945394 4938995 4979006 4994840 4997476
* 5013885 5003322 4988891 5098443 5110268 6173522 4829857 5027748 6376940
* 6358731 6178785 6284152 6231989 6497148 6486934 6233084 6504326 6635133
- * 6350801 6676425 6878475 6919132 6931676 6948903
+ * 6350801 6676425 6878475 6919132 6931676 6948903 7014645
*/
import java.util.regex.*;
@@ -136,6 +136,7 @@
namedGroupCaptureTest();
nonBmpClassComplementTest();
unicodePropertiesTest();
+ unicodeHexNotationTest();
if (failure)
throw new RuntimeException("Failure in the RE handling.");
else
@@ -161,18 +162,19 @@
private static void check(Matcher m, String result, boolean expected) {
m.find();
- if (m.group().equals(result))
- failCount += (expected) ? 0 : 1;
- else
- failCount += (expected) ? 1 : 0;
+ if (m.group().equals(result) != expected)
+ failCount++; // same outcome as the removed if/else: one failure exactly when "group equals result" disagrees with expected
}
private static void check(Pattern p, String s, boolean expected) {
- Matcher matcher = p.matcher(s);
- if (matcher.find())
- failCount += (expected) ? 0 : 1;
- else
- failCount += (expected) ? 1 : 0;
+ if (p.matcher(s).find() != expected)
+ failCount++; // same outcome as the removed if/else: one failure exactly when find() disagrees with expected
+ }
+
+ private static void check(String p, String s, boolean expected) { // new overload: takes the pattern as text
+ Matcher matcher = Pattern.compile(p).matcher(s); // compiles p on every call
+ if (matcher.find() != expected)
+ failCount++; // one failure when find() on s disagrees with expected
}
private static void check(String p, char c, boolean expected) {
@@ -3614,4 +3616,45 @@
}
report("unicodeProperties");
}
+
+ private static void unicodeHexNotationTest() throws Exception {
+ // Exercises the \x{h...h} hexadecimal code point escape (7014645).
+ // negative: malformed escapes (checkExpectedFail is defined outside this hunk; presumably it expects compilation to fail -- confirm)
+ checkExpectedFail("\\x{-23}"); // '-' is not a hex digit
+ checkExpectedFail("\\x{110000}"); // one past Character.MAX_CODE_POINT (0x10FFFF)
+ checkExpectedFail("\\x{}"); // no hex digits between the braces
+ checkExpectedFail("\\x{AB[ef]"); // hex digits not terminated by '}'
+
+ // codepoint: the input is the surrogate pair D800 DF3C, i.e. the single code point U+1033C
+ check("^\\x{1033c}$", "\uD800\uDF3C", true); // the code point escape matches the supplementary character
+ check("^\\xF0\\x90\\x8C\\xBC$", "\uD800\uDF3C", false); // F0 90 8C BC are the UTF-8 bytes of U+1033C; four \xhh escapes must not match it
+ check("^\\x{D800}\\x{DF3c}+$", "\uD800\uDF3C", false); // two separate surrogate escapes must not match the paired input
+ check("^\\xF0\\x90\\x8C\\xBC$", "\uD800\uDF3C", false); // NOTE(review): exact duplicate of the check two lines above
+
+ // in class: the same escapes used as character-class members
+ check("^[\\x{D800}\\x{DF3c}]+$", "\uD800\uDF3C", false); // surrogate members must not match the paired input
+ check("^[\\xF0\\x90\\x8C\\xBC]+$", "\uD800\uDF3C", false); // UTF-8 byte values as members must not match either
+ check("^[\\x{D800}\\x{DF3C}]+$", "\uD800\uDF3C", false); // NOTE(review): same as two lines above except for hex-digit case
+ check("^[\\x{DF3C}\\x{D800}]+$", "\uD800\uDF3C", false); // member order makes no difference
+ check("^[\\x{D800}\\x{DF3C}]+$", "\uDF3C\uD800", true); // input is low surrogate then high, so two unpaired surrogates, each matched
+ check("^[\\x{DF3C}\\x{D800}]+$", "\uDF3C\uD800", true); // same input, members listed in the other order
+ // Exhaustive: each code point, written as UTF-16 escape(s) or as \x{...}, must match itself both bare and inside a class.
+ for (int cp = 0; cp <= 0x10FFFF; cp++) { // 0x110000 iterations, four Pattern.matches calls (each compiles a pattern) per iteration
+ String s = "A" + new String(Character.toChars(cp)) + "B"; // input: the code point between literal A and B
+ String hexUTF16 = (cp <= 0xFFFF)? String.format("\\u%04x", cp) // BMP: one backslash-u escape
+ : String.format("\\u%04x\\u%04x", // supplementary: two backslash-u escapes, high then low surrogate
+ (int) Character.toChars(cp)[0],
+ (int) Character.toChars(cp)[1]);
+ String hexCodePoint = "\\x{" + Integer.toHexString(cp) + "}"; // the new notation, lowercase hex without padding
+ if (!Pattern.matches("A" + hexUTF16 + "B", s))
+ failCount++;
+ if (!Pattern.matches("A[" + hexUTF16 + "]B", s))
+ failCount++;
+ if (!Pattern.matches("A" + hexCodePoint + "B", s))
+ failCount++;
+ if (!Pattern.matches("A[" + hexCodePoint + "]B", s))
+ failCount++;
+ }
+ report("unicodeHexNotation"); // report is defined outside this hunk
+ }
}