7014645: Support perl style Unicode hex notation \x{...}
authorsherman
Thu, 03 Feb 2011 13:49:25 -0800
changeset 8173 a3a39b98e05a
parent 8172 3f25c3770191
child 8174 89e3a22d4cd7
7014645: Support perl style Unicode hex notation \x{...} Summary: Added the construct \x{...} for Unicode hex notation support Reviewed-by: alanb, okutsu
jdk/src/share/classes/java/util/regex/Pattern.java
jdk/test/java/util/regex/RegExTest.java
--- a/jdk/src/share/classes/java/util/regex/Pattern.java	Thu Feb 03 19:09:05 2011 +0000
+++ b/jdk/src/share/classes/java/util/regex/Pattern.java	Thu Feb 03 13:49:25 2011 -0800
@@ -101,6 +101,11 @@
  *     <td headers="matches">The character with hexadecimal&nbsp;value&nbsp;<tt>0x</tt><i>hh</i></td></tr>
  * <tr><td valign="top" headers="construct characters"><tt>&#92;u</tt><i>hhhh</i></td>
  *     <td headers="matches">The character with hexadecimal&nbsp;value&nbsp;<tt>0x</tt><i>hhhh</i></td></tr>
+ * <tr><td valign="top" headers="construct characters"><tt>&#92;x</tt><i>{h...h}</i></td>
+ *     <td headers="matches">The character with hexadecimal&nbsp;value&nbsp;<tt>0x</tt><i>h...h</i>
+ *         ({@link java.lang.Character#MIN_CODE_POINT Character.MIN_CODE_POINT}
+ *         &nbsp;&lt;=&nbsp;<tt>0x</tt><i>h...h</i>&nbsp;&lt;=&nbsp
+ *          {@link java.lang.Character#MAX_CODE_POINT Character.MAX_CODE_POINT})</td></tr>
  * <tr><td valign="top" headers="matches"><tt>\t</tt></td>
  *     <td headers="matches">The tab character (<tt>'&#92;u0009'</tt>)</td></tr>
  * <tr><td valign="top" headers="construct characters"><tt>\n</tt></td>
@@ -529,6 +534,13 @@
  * while not equal, compile into the same pattern, which matches the character
  * with hexadecimal value <tt>0x2014</tt>.
  *
+ * <p> A Unicode character can also be represented in a regular-expression by
+ * using its hexadecimal code point value directly as described in construct
+ * <tt>&#92;x{...}</tt>, for example a supplementary character U+2011F
+ * can be specified as <tt>&#92;x{2011F}</tt>, instead of two consecutive
+ * Unicode escape sequences of the surrogate pair
+ * <tt>&#92;uD840</tt><tt>&#92;uDD1F</tt>.
+ *
  * <a name="ubc">
  * <p>Unicode scripts, blocks and categories are written with the <tt>\p</tt> and
  * <tt>\P</tt> constructs as in Perl. <tt>\p{</tt><i>prop</i><tt>}</tt> matches if
@@ -2993,6 +3005,16 @@
             if (ASCII.isHexDigit(m)) {
                 return ASCII.toDigit(n) * 16 + ASCII.toDigit(m);
             }
+        } else if (n == '{' && ASCII.isHexDigit(peek())) {
+            int ch = 0;
+            while (ASCII.isHexDigit(n = read())) {
+                ch = (ch << 4) + ASCII.toDigit(n);
+                if (ch > Character.MAX_CODE_POINT)
+                    throw error("Hexadecimal codepoint is too big");
+            }
+            if (n != '}')
+                throw error("Unclosed hexadecimal escape sequence");
+            return ch;
         }
         throw error("Illegal hexadecimal escape sequence");
     }
--- a/jdk/test/java/util/regex/RegExTest.java	Thu Feb 03 19:09:05 2011 +0000
+++ b/jdk/test/java/util/regex/RegExTest.java	Thu Feb 03 13:49:25 2011 -0800
@@ -32,7 +32,7 @@
  * 4872664 4803179 4892980 4900747 4945394 4938995 4979006 4994840 4997476
  * 5013885 5003322 4988891 5098443 5110268 6173522 4829857 5027748 6376940
  * 6358731 6178785 6284152 6231989 6497148 6486934 6233084 6504326 6635133
- * 6350801 6676425 6878475 6919132 6931676 6948903
+ * 6350801 6676425 6878475 6919132 6931676 6948903 7014645
  */
 
 import java.util.regex.*;
@@ -136,6 +136,7 @@
         namedGroupCaptureTest();
         nonBmpClassComplementTest();
         unicodePropertiesTest();
+        unicodeHexNotationTest();
         if (failure)
             throw new RuntimeException("Failure in the RE handling.");
         else
@@ -161,18 +162,19 @@
 
     private static void check(Matcher m, String result, boolean expected) {
         m.find();
-        if (m.group().equals(result))
-            failCount += (expected) ? 0 : 1;
-        else
-            failCount += (expected) ? 1 : 0;
+        if (m.group().equals(result) != expected)
+            failCount++;
     }
 
     private static void check(Pattern p, String s, boolean expected) {
-        Matcher matcher = p.matcher(s);
-        if (matcher.find())
-            failCount += (expected) ? 0 : 1;
-        else
-            failCount += (expected) ? 1 : 0;
+        if (p.matcher(s).find() != expected)
+            failCount++;
+    }
+
+    private static void check(String p, String s, boolean expected) {
+        Matcher matcher = Pattern.compile(p).matcher(s);
+        if (matcher.find() != expected)
+            failCount++;
     }
 
     private static void check(String p, char c, boolean expected) {
@@ -3614,4 +3616,45 @@
         }
         report("unicodeProperties");
     }
+
+    private static void unicodeHexNotationTest() throws Exception {
+
+        // negative
+        checkExpectedFail("\\x{-23}");
+        checkExpectedFail("\\x{110000}");
+        checkExpectedFail("\\x{}");
+        checkExpectedFail("\\x{AB[ef]");
+
+        // codepoint
+        check("^\\x{1033c}$",              "\uD800\uDF3C", true);
+        check("^\\xF0\\x90\\x8C\\xBC$",    "\uD800\uDF3C", false);
+        check("^\\x{D800}\\x{DF3c}+$",     "\uD800\uDF3C", false);
+        check("^\\xF0\\x90\\x8C\\xBC$",    "\uD800\uDF3C", false);
+
+        // in class
+        check("^[\\x{D800}\\x{DF3c}]+$",   "\uD800\uDF3C", false);
+        check("^[\\xF0\\x90\\x8C\\xBC]+$", "\uD800\uDF3C", false);
+        check("^[\\x{D800}\\x{DF3C}]+$",   "\uD800\uDF3C", false);
+        check("^[\\x{DF3C}\\x{D800}]+$",   "\uD800\uDF3C", false);
+        check("^[\\x{D800}\\x{DF3C}]+$",   "\uDF3C\uD800", true);
+        check("^[\\x{DF3C}\\x{D800}]+$",   "\uDF3C\uD800", true);
+
+        for (int cp = 0; cp <= 0x10FFFF; cp++) {
+             String s = "A" + new String(Character.toChars(cp)) + "B";
+             String hexUTF16 = (cp <= 0xFFFF)? String.format("\\u%04x", cp)
+                                             : String.format("\\u%04x\\u%04x",
+                                               (int) Character.toChars(cp)[0],
+                                               (int) Character.toChars(cp)[1]);
+             String hexCodePoint = "\\x{" + Integer.toHexString(cp) + "}";
+             if (!Pattern.matches("A" + hexUTF16 + "B", s))
+                 failCount++;
+             if (!Pattern.matches("A[" + hexUTF16 + "]B", s))
+                 failCount++;
+             if (!Pattern.matches("A" + hexCodePoint + "B", s))
+                 failCount++;
+             if (!Pattern.matches("A[" + hexCodePoint + "]B", s))
+                 failCount++;
+         }
+         report("unicodeHexNotation");
+     }
 }