7039066: j.u.regex does not match TR18 RL1.4 Simple Word Boundaries and RL1.2 Properties
Summary: updated the regex Unicode property support
Reviewed-by: alanb
--- a/jdk/src/share/classes/java/util/regex/Pattern.java Thu Apr 28 20:18:57 2011 -0700
+++ b/jdk/src/share/classes/java/util/regex/Pattern.java Thu Apr 28 20:48:36 2011 -0700
@@ -206,13 +206,15 @@
* <td>Equivalent to java.lang.Character.isMirrored()</td></tr>
*
* <tr><th> </th></tr>
- * <tr align="left"><th colspan="2" id="unicode">Classes for Unicode scripts, blocks and categories</th></tr>
+ * <tr align="left"><th colspan="2" id="unicode">Classes for Unicode scripts, blocks, categories and binary properties</th></tr>
* * <tr><td valign="top" headers="construct unicode"><tt>\p{IsLatin}</tt></td>
- * <td headers="matches">A Latin script character (simple <a href="#ubc">script</a>)</td></tr>
+ * <td headers="matches">A Latin script character (<a href="#usc">script</a>)</td></tr>
* <tr><td valign="top" headers="construct unicode"><tt>\p{InGreek}</tt></td>
- * <td headers="matches">A character in the Greek block (simple <a href="#ubc">block</a>)</td></tr>
+ * <td headers="matches">A character in the Greek block (<a href="#ubc">block</a>)</td></tr>
* <tr><td valign="top" headers="construct unicode"><tt>\p{Lu}</tt></td>
- * <td headers="matches">An uppercase letter (simple <a href="#ubc">category</a>)</td></tr>
+ * <td headers="matches">An uppercase letter (<a href="#ucc">category</a>)</td></tr>
+ * <tr><td valign="top" headers="construct unicode"><tt>\p{IsAlphabetic}</tt></td>
+ * <td headers="matches">An alphabetic character (<a href="#ubpc">binary property</a>)</td></tr>
* <tr><td valign="top" headers="construct unicode"><tt>\p{Sc}</tt></td>
* <td headers="matches">A currency symbol</td></tr>
* <tr><td valign="top" headers="construct unicode"><tt>\P{InGreek}</tt></td>
@@ -328,10 +330,11 @@
* <td headers="matches"><i>X</i>, as a named-capturing group</td></tr>
* <tr><td valign="top" headers="construct special"><tt>(?:</tt><i>X</i><tt>)</tt></td>
* <td headers="matches"><i>X</i>, as a non-capturing group</td></tr>
- * <tr><td valign="top" headers="construct special"><tt>(?idmsux-idmsux) </tt></td>
+ * <tr><td valign="top" headers="construct special"><tt>(?idmsuxU-idmsuxU) </tt></td>
* <td headers="matches">Nothing, but turns match flags <a href="#CASE_INSENSITIVE">i</a>
* <a href="#UNIX_LINES">d</a> <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a>
- * <a href="#UNICODE_CASE">u</a> <a href="#COMMENTS">x</a> on - off</td></tr>
+ * <a href="#UNICODE_CASE">u</a> <a href="#COMMENTS">x</a> <a href="#UNICODE_CHARACTER_CLASS">U</a>
+ * on - off</td></tr>
* <tr><td valign="top" headers="construct special"><tt>(?idmsux-idmsux:</tt><i>X</i><tt>)</tt> </td>
* <td headers="matches"><i>X</i>, as a <a href="#cg">non-capturing group</a> with the
* given flags <a href="#CASE_INSENSITIVE">i</a> <a href="#UNIX_LINES">d</a>
@@ -518,61 +521,140 @@
*
* <p> This class is in conformance with Level 1 of <a
* href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical
- * Standard #18: Unicode Regular Expression Guidelines</i></a>, plus RL2.1
+ * Standard #18: Unicode Regular Expressions</i></a>, plus RL2.1
* Canonical Equivalents.
- *
- * <p> Unicode escape sequences such as <tt>\u2014</tt> in Java source code
+ * <p>
+ * <b>Unicode escape sequences</b> such as <tt>\u2014</tt> in Java source code
* are processed as described in section 3.3 of
* <cite>The Java™ Language Specification</cite>.
- * Such escape sequences are also
- * implemented directly by the regular-expression parser so that Unicode
- * escapes can be used in expressions that are read from files or from the
- * keyboard. Thus the strings <tt>"\u2014"</tt> and <tt>"\\u2014"</tt>,
- * while not equal, compile into the same pattern, which matches the character
- * with hexadecimal value <tt>0x2014</tt>.
- *
- * <p> A Unicode character can also be represented in a regular-expression by
- * using its hexadecimal code point value directly as described in construct
+ * Such escape sequences are also implemented directly by the regular-expression
+ * parser so that Unicode escapes can be used in expressions that are read from
+ * files or from the keyboard. Thus the strings <tt>"\u2014"</tt> and
+ * <tt>"\\u2014"</tt>, while not equal, compile into the same pattern, which
+ * matches the character with hexadecimal value <tt>0x2014</tt>.
+ * <p>
+ * A Unicode character can also be represented in a regular-expression by
+ * using its <b>Hex notation</b> (hexadecimal code point value) directly as described in construct
* <tt>\x{...}</tt>, for example a supplementary character U+2011F
* can be specified as <tt>\x{2011F}</tt>, instead of two consecutive
* Unicode escape sequences of the surrogate pair
* <tt>\uD840</tt><tt>\uDD1F</tt>.
- *
- * <a name="ubc">
- * <p>Unicode scripts, blocks and categories are written with the <tt>\p</tt> and
- * <tt>\P</tt> constructs as in Perl. <tt>\p{</tt><i>prop</i><tt>}</tt> matches if
+ * <p>
+ * Unicode scripts, blocks, categories and binary properties are written with
+ * the <tt>\p</tt> and <tt>\P</tt> constructs as in Perl.
+ * <tt>\p{</tt><i>prop</i><tt>}</tt> matches if
* the input has the property <i>prop</i>, while <tt>\P{</tt><i>prop</i><tt>}</tt>
* does not match if the input has that property.
* <p>
- * Scripts are specified either with the prefix {@code Is}, as in
+ * Scripts, blocks, categories and binary properties can be used both inside
+ * and outside of a character class.
+ * <a name="usc">
+ * <p>
+ * <b>Scripts</b> are specified either with the prefix {@code Is}, as in
* {@code IsHiragana}, or by using the {@code script} keyword (or its short
* form {@code sc})as in {@code script=Hiragana} or {@code sc=Hiragana}.
* <p>
- * Blocks are specified with the prefix {@code In}, as in
+ * The script names supported by <code>Pattern</code> are the valid script names
+ * accepted and defined by
+ * {@link java.lang.Character.UnicodeScript#forName(String) UnicodeScript.forName}.
+ * <a name="ubc">
+ * <p>
+ * <b>Blocks</b> are specified with the prefix {@code In}, as in
* {@code InMongolian}, or by using the keyword {@code block} (or its short
* form {@code blk}) as in {@code block=Mongolian} or {@code blk=Mongolian}.
* <p>
- * Categories may be specified with the optional prefix {@code Is}:
+ * The block names supported by <code>Pattern</code> are the valid block names
+ * accepted and defined by
+ * {@link java.lang.Character.UnicodeBlock#forName(String) UnicodeBlock.forName}.
+ * <p>
+ * <a name="ucc">
+ * <b>Categories</b> may be specified with the optional prefix {@code Is}:
* Both {@code \p{L}} and {@code \p{IsL}} denote the category of Unicode
* letters. Same as scripts and blocks, categories can also be specified
* by using the keyword {@code general_category} (or its short form
* {@code gc}) as in {@code general_category=Lu} or {@code gc=Lu}.
* <p>
- * Scripts, blocks and categories can be used both inside and outside of a
- * character class.
- * <p> The supported categories are those of
+ * The supported categories are those of
* <a href="http://www.unicode.org/unicode/standard/standard.html">
* <i>The Unicode Standard</i></a> in the version specified by the
* {@link java.lang.Character Character} class. The category names are those
* defined in the Standard, both normative and informative.
- * The script names supported by <code>Pattern</code> are the valid script names
- * accepted and defined by
- * {@link java.lang.Character.UnicodeScript#forName(String) UnicodeScript.forName}.
- * The block names supported by <code>Pattern</code> are the valid block names
- * accepted and defined by
- * {@link java.lang.Character.UnicodeBlock#forName(String) UnicodeBlock.forName}.
+ * <p>
+ * <a name="ubpc">
+ * <b>Binary properties</b> are specified with the prefix {@code Is}, as in
+ * {@code IsAlphabetic}. The binary properties supported by <code>Pattern</code>
+ * are
+ * <ul>
+ * <li> Alphabetic
+ * <li> Ideographic
+ * <li> Letter
+ * <li> Lowercase
+ * <li> Uppercase
+ * <li> Titlecase
+ * <li> Punctuation
+ * <li> Control
+ * <li> White_Space
+ * <li> Digit
+ * <li> Hex_Digit
+ * <li> Noncharacter_Code_Point
+ * <li> Assigned
+ * </ul>
+
+
+ * <p>
+ * <b>Predefined Character classes</b> and <b>POSIX character classes</b> are in
+ * conformance with the recommendation of <i>Annex C: Compatibility Properties</i>
+ * of <a href="http://www.unicode.org/reports/tr18/"><i>Unicode Regular Expressions
+ * </i></a>, when the {@link #UNICODE_CHARACTER_CLASS} flag is specified.
* <p>
- * <a name="jcc"> <p>Categories that behave like the java.lang.Character
+ * <table border="0" cellpadding="1" cellspacing="0"
+ * summary="predefined and posix character classes in Unicode mode">
+ * <tr align="left">
+ * <th bgcolor="#CCCCFF" align="left" id="classes">Classes</th>
+ * <th bgcolor="#CCCCFF" align="left" id="matches">Matches</th>
+ *</tr>
+ * <tr><td><tt>\p{Lower}</tt></td>
+ * <td>A lowercase character:<tt>\p{IsLowercase}</tt></td></tr>
+ * <tr><td><tt>\p{Upper}</tt></td>
+ * <td>An uppercase character:<tt>\p{IsUppercase}</tt></td></tr>
+ * <tr><td><tt>\p{ASCII}</tt></td>
+ * <td>All ASCII:<tt>[\x00-\x7F]</tt></td></tr>
+ * <tr><td><tt>\p{Alpha}</tt></td>
+ * <td>An alphabetic character:<tt>\p{IsAlphabetic}</tt></td></tr>
+ * <tr><td><tt>\p{Digit}</tt></td>
+ * <td>A decimal digit character:<tt>\p{IsDigit}</tt></td></tr>
+ * <tr><td><tt>\p{Alnum}</tt></td>
+ * <td>An alphanumeric character:<tt>[\p{IsAlphabetic}\p{IsDigit}]</tt></td></tr>
+ * <tr><td><tt>\p{Punct}</tt></td>
+ * <td>A punctuation character:<tt>\p{IsPunctuation}</tt></td></tr>
+ * <tr><td><tt>\p{Graph}</tt></td>
+ * <td>A visible character: <tt>[^\p{IsWhite_Space}\p{gc=Cc}\p{gc=Cs}\p{gc=Cn}]</tt></td></tr>
+ * <tr><td><tt>\p{Print}</tt></td>
+ * <td>A printable character: <tt>[\p{Graph}\p{Blank}&&[^\p{Cntrl}]]</tt></td></tr>
+ * <tr><td><tt>\p{Blank}</tt></td>
+ * <td>A space or a tab: <tt>[\p{IsWhite_Space}&&[^\p{gc=Zl}\p{gc=Zp}\x0a\x0b\x0c\x0d\x85]]</tt></td></tr>
+ * <tr><td><tt>\p{Cntrl}</tt></td>
+ * <td>A control character: <tt>\p{gc=Cc}</tt></td></tr>
+ * <tr><td><tt>\p{XDigit}</tt></td>
+ * <td>A hexadecimal digit: <tt>[\p{gc=Nd}\p{IsHex_Digit}]</tt></td></tr>
+ * <tr><td><tt>\p{Space}</tt></td>
+ * <td>A whitespace character:<tt>\p{IsWhite_Space}</tt></td></tr>
+ * <tr><td><tt>\d</tt></td>
+ * <td>A digit: <tt>\p{IsDigit}</tt></td></tr>
+ * <tr><td><tt>\D</tt></td>
+ * <td>A non-digit: <tt>[^\d]</tt></td></tr>
+ * <tr><td><tt>\s</tt></td>
+ * <td>A whitespace character: <tt>\p{IsWhite_Space}</tt></td></tr>
+ * <tr><td><tt>\S</tt></td>
+ * <td>A non-whitespace character: <tt>[^\s]</tt></td></tr>
+ * <tr><td><tt>\w</tt></td>
+ * <td>A word character: <tt>[\p{Alpha}\p{gc=Mn}\p{gc=Me}\p{gc=Mc}\p{Digit}\p{gc=Pc}]</tt></td></tr>
+ * <tr><td><tt>\W</tt></td>
+ * <td>A non-word character: <tt>[^\w]</tt></td></tr>
+ * </table>
+ * <p>
+ * <a name="jcc">
+ * Categories that behave like the java.lang.Character
* boolean is<i>methodname</i> methods (except for the deprecated ones) are
* available through the same <tt>\p{</tt><i>prop</i><tt>}</tt> syntax where
* the specified property has the name <tt>java<i>methodname</i></tt>.
@@ -796,6 +878,28 @@
*/
public static final int CANON_EQ = 0x80;
+ /**
+ * Enables the Unicode version of <i>Predefined character classes</i> and
+ * <i>POSIX character classes</i>.
+ *
+ * <p> When this flag is specified then the (US-ASCII only)
+ * <i>Predefined character classes</i> and <i>POSIX character classes</i>
+ * are in conformance with
+ * <a href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical
+ * Standard #18: Unicode Regular Expressions</i></a>
+ * <i>Annex C: Compatibility Properties</i>.
+ * <p>
+ * The UNICODE_CHARACTER_CLASS mode can also be enabled via the embedded
+ * flag expression <tt>(?U)</tt>.
+ * <p>
+ * The flag implies UNICODE_CASE, that is, it enables Unicode-aware case
+ * folding.
+ * <p>
+ * Specifying this flag may impose a performance penalty. </p>
+ * @since 1.7
+ */
+ public static final int UNICODE_CHARACTER_CLASS = 0x100;
+
/* Pattern has only two serialized components: The pattern string
* and the flags, which are all that is needed to recompile the pattern
* when it is deserialized.
@@ -918,7 +1022,8 @@
* Match flags, a bit mask that may include
* {@link #CASE_INSENSITIVE}, {@link #MULTILINE}, {@link #DOTALL},
* {@link #UNICODE_CASE}, {@link #CANON_EQ}, {@link #UNIX_LINES},
- * {@link #LITERAL} and {@link #COMMENTS}
+ * {@link #LITERAL}, {@link #UNICODE_CHARACTER_CLASS}
+ * and {@link #COMMENTS}
*
* @throws IllegalArgumentException
* If bit values other than those corresponding to the defined
@@ -1209,6 +1314,10 @@
pattern = p;
flags = f;
+ // UNICODE_CHARACTER_CLASS implies UNICODE_CASE, so turn it on as well
+ if ((flags & UNICODE_CHARACTER_CLASS) != 0)
+ flags |= UNICODE_CASE;
+
// Reset group index count
capturingGroupCount = 1;
localCount = 0;
@@ -2164,12 +2273,14 @@
return -1;
case 'B':
if (inclass) break;
- if (create) root = new Bound(Bound.NONE);
+ if (create) root = new Bound(Bound.NONE, has(UNICODE_CHARACTER_CLASS));
return -1;
case 'C':
break;
case 'D':
- if (create) root = new Ctype(ASCII.DIGIT).complement();
+ if (create) root = has(UNICODE_CHARACTER_CLASS)
+ ? new Utype(UnicodeProp.DIGIT).complement()
+ : new Ctype(ASCII.DIGIT).complement();
return -1;
case 'E':
case 'F':
@@ -2191,14 +2302,18 @@
case 'R':
break;
case 'S':
- if (create) root = new Ctype(ASCII.SPACE).complement();
+ if (create) root = has(UNICODE_CHARACTER_CLASS)
+ ? new Utype(UnicodeProp.WHITE_SPACE).complement()
+ : new Ctype(ASCII.SPACE).complement();
return -1;
case 'T':
case 'U':
case 'V':
break;
case 'W':
- if (create) root = new Ctype(ASCII.WORD).complement();
+ if (create) root = has(UNICODE_CHARACTER_CLASS)
+ ? new Utype(UnicodeProp.WORD).complement()
+ : new Ctype(ASCII.WORD).complement();
return -1;
case 'X':
case 'Y':
@@ -2216,12 +2331,14 @@
return '\007';
case 'b':
if (inclass) break;
- if (create) root = new Bound(Bound.BOTH);
+ if (create) root = new Bound(Bound.BOTH, has(UNICODE_CHARACTER_CLASS));
return -1;
case 'c':
return c();
case 'd':
- if (create) root = new Ctype(ASCII.DIGIT);
+ if (create) root = has(UNICODE_CHARACTER_CLASS)
+ ? new Utype(UnicodeProp.DIGIT)
+ : new Ctype(ASCII.DIGIT);
return -1;
case 'e':
return '\033';
@@ -2259,7 +2376,9 @@
case 'r':
return '\r';
case 's':
- if (create) root = new Ctype(ASCII.SPACE);
+ if (create) root = has(UNICODE_CHARACTER_CLASS)
+ ? new Utype(UnicodeProp.WHITE_SPACE)
+ : new Ctype(ASCII.SPACE);
return -1;
case 't':
return '\t';
@@ -2268,7 +2387,9 @@
case 'v':
return '\013';
case 'w':
- if (create) root = new Ctype(ASCII.WORD);
+ if (create) root = has(UNICODE_CHARACTER_CLASS)
+ ? new Utype(UnicodeProp.WORD)
+ : new Ctype(ASCII.WORD);
return -1;
case 'x':
return x();
@@ -2490,7 +2611,7 @@
{
next();
String name;
- CharProperty node;
+ CharProperty node = null;
if (singleLetter) {
int c = temp[cursor];
@@ -2536,11 +2657,21 @@
} else if (name.startsWith("Is")) {
// \p{isGeneralCategory} and \p{isScriptName}
name = name.substring(2);
- node = CharPropertyNames.charPropertyFor(name);
+ UnicodeProp uprop = UnicodeProp.forName(name);
+ if (uprop != null)
+ node = new Utype(uprop);
+ if (node == null)
+ node = CharPropertyNames.charPropertyFor(name);
if (node == null)
node = unicodeScriptPropertyFor(name);
} else {
- node = charPropertyNodeFor(name);
+ if (has(UNICODE_CHARACTER_CLASS)) {
+ UnicodeProp uprop = UnicodeProp.forPOSIXName(name);
+ if (uprop != null)
+ node = new Utype(uprop);
+ }
+ if (node == null)
+ node = charPropertyNodeFor(name);
}
}
if (maybeComplement) {
@@ -2822,6 +2953,9 @@
case 'x':
flags |= COMMENTS;
break;
+ case 'U':
+ flags |= (UNICODE_CHARACTER_CLASS | UNICODE_CASE);
+ break;
case '-': // subFlag then fall through
ch = next();
subFlag();
@@ -2861,6 +2995,9 @@
case 'x':
flags &= ~COMMENTS;
break;
+ case 'U':
+ flags &= ~(UNICODE_CHARACTER_CLASS | UNICODE_CASE);
+ break; // like the other flags: keep scanning, more "off" flags may follow 'U'
default:
return;
}
@@ -3664,6 +3800,18 @@
}
/**
+ * Node class that matches a code point accepted by a {@link UnicodeProp}.
+ */
+ static final class Utype extends CharProperty {
+ final UnicodeProp uprop; // the property whose is(int) decides the match
+ Utype(UnicodeProp uprop) { this.uprop = uprop; }
+ boolean isSatisfiedBy(int ch) {
+ return uprop.is(ch); // delegate the test to the property constant
+ }
+ }
+
+
+ /**
* Node class that matches a POSIX type.
*/
static final class Ctype extends BmpCharProperty {
@@ -5025,9 +5173,17 @@
static int BOTH = 0x3;
static int NONE = 0x4;
int type;
- Bound(int n) {
+ boolean useUWORD;
+ Bound(int n, boolean useUWORD) {
type = n;
- }
+ this.useUWORD = useUWORD;
+ }
+
+ boolean isWord(int ch) { // word test applied by check() to the code points around position i
+ return useUWORD ? UnicodeProp.WORD.is(ch) // Unicode word definition when useUWORD is set
+ : (ch == '_' || Character.isLetterOrDigit(ch)); // otherwise '_' or a letter/digit
+ }
+
int check(Matcher matcher, int i, CharSequence seq) {
int ch;
boolean left = false;
@@ -5039,14 +5195,14 @@
}
if (i > startIndex) {
ch = Character.codePointBefore(seq, i);
- left = (ch == '_' || Character.isLetterOrDigit(ch) ||
+ left = (isWord(ch) ||
((Character.getType(ch) == Character.NON_SPACING_MARK)
&& hasBaseCharacter(matcher, i-1, seq)));
}
boolean right = false;
if (i < endIndex) {
ch = Character.codePointAt(seq, i);
- right = (ch == '_' || Character.isLetterOrDigit(ch) ||
+ right = (isWord(ch) ||
((Character.getType(ch) == Character.NON_SPACING_MARK)
&& hasBaseCharacter(matcher, i, seq)));
} else {
@@ -5428,6 +5584,12 @@
defClone("javaUpperCase", new CloneableProperty() {
boolean isSatisfiedBy(int ch) {
return Character.isUpperCase(ch);}});
+ defClone("javaAlphabetic", new CloneableProperty() {
+ boolean isSatisfiedBy(int ch) {
+ return Character.isAlphabetic(ch);}});
+ defClone("javaIdeographic", new CloneableProperty() {
+ boolean isSatisfiedBy(int ch) {
+ return Character.isIdeographic(ch);}});
defClone("javaTitleCase", new CloneableProperty() {
boolean isSatisfiedBy(int ch) {
return Character.isTitleCase(ch);}});
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/jdk/src/share/classes/java/util/regex/UnicodeProp.java Thu Apr 28 20:48:36 2011 -0700
@@ -0,0 +1,236 @@
+/*
+ * Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation. Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+package java.util.regex;
+
+import java.util.HashMap;
+import java.util.Locale;
+
+enum UnicodeProp {
+ // Code point predicates for Unicode properties and POSIX classes; each constant overrides is(int).
+ ALPHABETIC {
+ public boolean is(int ch) {
+ return Character.isAlphabetic(ch);
+ }
+ },
+
+ LETTER {
+ public boolean is(int ch) {
+ return Character.isLetter(ch);
+ }
+ },
+
+ IDEOGRAPHIC {
+ public boolean is(int ch) {
+ return Character.isIdeographic(ch);
+ }
+ },
+
+ LOWERCASE {
+ public boolean is(int ch) {
+ return Character.isLowerCase(ch);
+ }
+ },
+
+ UPPERCASE {
+ public boolean is(int ch) {
+ return Character.isUpperCase(ch);
+ }
+ },
+
+ TITLECASE {
+ public boolean is(int ch) {
+ return Character.isTitleCase(ch);
+ }
+ },
+ // Idiom used below: (mask >> Character.getType(ch)) & 1 tests whether ch's general category is in the mask.
+ WHITE_SPACE {
+ // \p{Whitespace}: gc=Zs, Zl or Zp, plus U+0009..U+000D and U+0085
+ public boolean is(int ch) {
+ return ((((1 << Character.SPACE_SEPARATOR) |
+ (1 << Character.LINE_SEPARATOR) |
+ (1 << Character.PARAGRAPH_SEPARATOR)) >> Character.getType(ch)) & 1)
+ != 0 || (ch >= 0x9 && ch <= 0xd) || (ch == 0x85);
+ }
+ },
+
+ CONTROL {
+ // \p{gc=Control}
+ public boolean is(int ch) {
+ return Character.getType(ch) == Character.CONTROL;
+ }
+ },
+
+ PUNCTUATION {
+ // \p{gc=Punctuation}: any of Pc, Pd, Ps, Pe, Po, Pi, Pf
+ public boolean is(int ch) {
+ return ((((1 << Character.CONNECTOR_PUNCTUATION) |
+ (1 << Character.DASH_PUNCTUATION) |
+ (1 << Character.START_PUNCTUATION) |
+ (1 << Character.END_PUNCTUATION) |
+ (1 << Character.OTHER_PUNCTUATION) |
+ (1 << Character.INITIAL_QUOTE_PUNCTUATION) |
+ (1 << Character.FINAL_QUOTE_PUNCTUATION)) >> Character.getType(ch)) & 1)
+ != 0;
+ }
+ },
+
+ HEX_DIGIT {
+ // Union of \p{gc=Decimal_Number} (via DIGIT, so broader than Hex_Digit alone) and
+ // \p{Hex_Digit} -> PropList.txt: Hex_Digit (ASCII and fullwidth 0-9, A-F, a-f)
+ public boolean is(int ch) {
+ return DIGIT.is(ch) ||
+ (ch >= 0x0030 && ch <= 0x0039) ||
+ (ch >= 0x0041 && ch <= 0x0046) ||
+ (ch >= 0x0061 && ch <= 0x0066) ||
+ (ch >= 0xFF10 && ch <= 0xFF19) ||
+ (ch >= 0xFF21 && ch <= 0xFF26) ||
+ (ch >= 0xFF41 && ch <= 0xFF46);
+ }
+ },
+
+ ASSIGNED {
+ public boolean is(int ch) {
+ return Character.getType(ch) != Character.UNASSIGNED;
+ }
+ },
+
+ NONCHARACTER_CODE_POINT {
+ // PropList.txt:Noncharacter_Code_Point: low 16 bits are FFFE/FFFF, or U+FDD0..U+FDEF
+ public boolean is(int ch) {
+ return (ch & 0xfffe) == 0xfffe || (ch >= 0xfdd0 && ch <= 0xfdef);
+ }
+ },
+
+ DIGIT {
+ // \p{gc=Decimal_Number}
+ public boolean is(int ch) {
+ return Character.isDigit(ch);
+ }
+ },
+
+ ALNUM {
+ // \p{alpha}
+ // \p{digit}
+ public boolean is(int ch) {
+ return ALPHABETIC.is(ch) || DIGIT.is(ch);
+ }
+ },
+
+ BLANK {
+ // \p{Whitespace} --
+ // [\N{LF} \N{VT} \N{FF} \N{CR} \N{NEL} -> 0xa, 0xb, 0xc, 0xd, 0x85
+ // \p{gc=Line_Separator}
+ // \p{gc=Paragraph_Separator}]
+ public boolean is(int ch) {
+ return Character.getType(ch) == Character.SPACE_SEPARATOR ||
+ ch == 0x9; // \N{HT}
+ }
+ },
+
+ GRAPH {
+ // [^
+ // \p{space}
+ // \p{gc=Control}
+ // \p{gc=Surrogate}
+ // \p{gc=Unassigned}]
+ public boolean is(int ch) {
+ return ((((1 << Character.SPACE_SEPARATOR) |
+ (1 << Character.LINE_SEPARATOR) |
+ (1 << Character.PARAGRAPH_SEPARATOR) |
+ (1 << Character.CONTROL) |
+ (1 << Character.SURROGATE) |
+ (1 << Character.UNASSIGNED)) >> Character.getType(ch)) & 1)
+ == 0;
+ }
+ },
+
+ PRINT {
+ // \p{graph}
+ // \p{blank}
+ // -- \p{cntrl}
+ public boolean is(int ch) {
+ return (GRAPH.is(ch) || BLANK.is(ch)) && !CONTROL.is(ch);
+ }
+ },
+
+ WORD {
+ // \p{alpha}
+ // \p{gc=Mark}
+ // \p{digit}
+ // \p{gc=Connector_Punctuation}
+
+ public boolean is(int ch) {
+ return ALPHABETIC.is(ch) ||
+ ((((1 << Character.NON_SPACING_MARK) |
+ (1 << Character.ENCLOSING_MARK) |
+ (1 << Character.COMBINING_SPACING_MARK) |
+ (1 << Character.DECIMAL_DIGIT_NUMBER) |
+ (1 << Character.CONNECTOR_PUNCTUATION)) >> Character.getType(ch)) & 1)
+ != 0;
+ }
+ };
+ // Upper-cased POSIX class names and alternate spellings, each mapped to a constant name for valueOf.
+ private final static HashMap<String, String> posix = new HashMap<>();
+ private final static HashMap<String, String> aliases = new HashMap<>();
+ static {
+ posix.put("ALPHA", "ALPHABETIC");
+ posix.put("LOWER", "LOWERCASE");
+ posix.put("UPPER", "UPPERCASE");
+ posix.put("SPACE", "WHITE_SPACE");
+ posix.put("PUNCT", "PUNCTUATION");
+ posix.put("XDIGIT","HEX_DIGIT");
+ posix.put("ALNUM", "ALNUM");
+ posix.put("CNTRL", "CONTROL");
+ posix.put("DIGIT", "DIGIT");
+ posix.put("BLANK", "BLANK");
+ posix.put("GRAPH", "GRAPH");
+ posix.put("PRINT", "PRINT");
+
+ aliases.put("WHITESPACE", "WHITE_SPACE");
+ aliases.put("HEXDIGIT","HEX_DIGIT");
+ aliases.put("NONCHARACTERCODEPOINT", "NONCHARACTER_CODE_POINT");
+ }
+ // Case-insensitive lookup by constant name or alias; returns null when the name is not recognized.
+ public static UnicodeProp forName(String propName) {
+ propName = propName.toUpperCase(Locale.ENGLISH);
+ String alias = aliases.get(propName);
+ if (alias != null)
+ propName = alias;
+ try {
+ return valueOf (propName);
+ } catch (IllegalArgumentException x) {} // unknown name: fall through and return null
+ return null;
+ }
+ // Case-insensitive lookup by POSIX class name through the posix map; returns null when unmapped.
+ public static UnicodeProp forPOSIXName(String propName) {
+ propName = posix.get(propName.toUpperCase(Locale.ENGLISH));
+ if (propName == null)
+ return null;
+ return valueOf (propName);
+ }
+ // Returns whether ch satisfies this property; implemented by every constant above.
+ public abstract boolean is(int ch);
+}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/jdk/test/java/util/regex/POSIX_ASCII.java Thu Apr 28 20:48:36 2011 -0700
@@ -0,0 +1,247 @@
+/*
+ * Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+
+final class POSIX_ASCII {
+ // Single-bit type flags OR-ed into the entries of the ctype table below.
+ static final int UPPER = 0x00000100;
+
+ static final int LOWER = 0x00000200;
+
+ static final int DIGIT = 0x00000400;
+
+ static final int SPACE = 0x00000800;
+
+ static final int PUNCT = 0x00001000;
+
+ static final int CNTRL = 0x00002000;
+
+ static final int BLANK = 0x00004000;
+
+ static final int HEX = 0x00008000;
+
+ static final int UNDER = 0x00010000;
+ // Mask covering the eight flags UPPER..HEX; UNDER lies outside it.
+ static final int ASCII = 0x0000FF00;
+ // Composite classes built from the single-bit flags.
+ static final int ALPHA = (UPPER|LOWER);
+
+ static final int ALNUM = (UPPER|LOWER|DIGIT);
+
+ static final int GRAPH = (PUNCT|UPPER|LOWER|DIGIT);
+
+ static final int WORD = (UPPER|LOWER|UNDER|DIGIT);
+
+ static final int XDIGIT = (HEX);
+ // Type flags for 0x00-0x7F; digits and letters also add 0-35 in the low bits (presumably a radix-36 digit value).
+ private static final int[] ctype = new int[] {
+ CNTRL, /* 00 (NUL) */
+ CNTRL, /* 01 (SOH) */
+ CNTRL, /* 02 (STX) */
+ CNTRL, /* 03 (ETX) */
+ CNTRL, /* 04 (EOT) */
+ CNTRL, /* 05 (ENQ) */
+ CNTRL, /* 06 (ACK) */
+ CNTRL, /* 07 (BEL) */
+ CNTRL, /* 08 (BS) */
+ SPACE+CNTRL+BLANK, /* 09 (HT) */
+ SPACE+CNTRL, /* 0A (LF) */
+ SPACE+CNTRL, /* 0B (VT) */
+ SPACE+CNTRL, /* 0C (FF) */
+ SPACE+CNTRL, /* 0D (CR) */
+ CNTRL, /* 0E (SI) */
+ CNTRL, /* 0F (SO) */
+ CNTRL, /* 10 (DLE) */
+ CNTRL, /* 11 (DC1) */
+ CNTRL, /* 12 (DC2) */
+ CNTRL, /* 13 (DC3) */
+ CNTRL, /* 14 (DC4) */
+ CNTRL, /* 15 (NAK) */
+ CNTRL, /* 16 (SYN) */
+ CNTRL, /* 17 (ETB) */
+ CNTRL, /* 18 (CAN) */
+ CNTRL, /* 19 (EM) */
+ CNTRL, /* 1A (SUB) */
+ CNTRL, /* 1B (ESC) */
+ CNTRL, /* 1C (FS) */
+ CNTRL, /* 1D (GS) */
+ CNTRL, /* 1E (RS) */
+ CNTRL, /* 1F (US) */
+ SPACE+BLANK, /* 20 SPACE */
+ PUNCT, /* 21 ! */
+ PUNCT, /* 22 " */
+ PUNCT, /* 23 # */
+ PUNCT, /* 24 $ */
+ PUNCT, /* 25 % */
+ PUNCT, /* 26 & */
+ PUNCT, /* 27 ' */
+ PUNCT, /* 28 ( */
+ PUNCT, /* 29 ) */
+ PUNCT, /* 2A * */
+ PUNCT, /* 2B + */
+ PUNCT, /* 2C , */
+ PUNCT, /* 2D - */
+ PUNCT, /* 2E . */
+ PUNCT, /* 2F / */
+ DIGIT+HEX+0, /* 30 0 */
+ DIGIT+HEX+1, /* 31 1 */
+ DIGIT+HEX+2, /* 32 2 */
+ DIGIT+HEX+3, /* 33 3 */
+ DIGIT+HEX+4, /* 34 4 */
+ DIGIT+HEX+5, /* 35 5 */
+ DIGIT+HEX+6, /* 36 6 */
+ DIGIT+HEX+7, /* 37 7 */
+ DIGIT+HEX+8, /* 38 8 */
+ DIGIT+HEX+9, /* 39 9 */
+ PUNCT, /* 3A : */
+ PUNCT, /* 3B ; */
+ PUNCT, /* 3C < */
+ PUNCT, /* 3D = */
+ PUNCT, /* 3E > */
+ PUNCT, /* 3F ? */
+ PUNCT, /* 40 @ */
+ UPPER+HEX+10, /* 41 A */
+ UPPER+HEX+11, /* 42 B */
+ UPPER+HEX+12, /* 43 C */
+ UPPER+HEX+13, /* 44 D */
+ UPPER+HEX+14, /* 45 E */
+ UPPER+HEX+15, /* 46 F */
+ UPPER+16, /* 47 G */
+ UPPER+17, /* 48 H */
+ UPPER+18, /* 49 I */
+ UPPER+19, /* 4A J */
+ UPPER+20, /* 4B K */
+ UPPER+21, /* 4C L */
+ UPPER+22, /* 4D M */
+ UPPER+23, /* 4E N */
+ UPPER+24, /* 4F O */
+ UPPER+25, /* 50 P */
+ UPPER+26, /* 51 Q */
+ UPPER+27, /* 52 R */
+ UPPER+28, /* 53 S */
+ UPPER+29, /* 54 T */
+ UPPER+30, /* 55 U */
+ UPPER+31, /* 56 V */
+ UPPER+32, /* 57 W */
+ UPPER+33, /* 58 X */
+ UPPER+34, /* 59 Y */
+ UPPER+35, /* 5A Z */
+ PUNCT, /* 5B [ */
+ PUNCT, /* 5C \ */
+ PUNCT, /* 5D ] */
+ PUNCT, /* 5E ^ */
+ PUNCT|UNDER, /* 5F _ */
+ PUNCT, /* 60 ` */
+ LOWER+HEX+10, /* 61 a */
+ LOWER+HEX+11, /* 62 b */
+ LOWER+HEX+12, /* 63 c */
+ LOWER+HEX+13, /* 64 d */
+ LOWER+HEX+14, /* 65 e */
+ LOWER+HEX+15, /* 66 f */
+ LOWER+16, /* 67 g */
+ LOWER+17, /* 68 h */
+ LOWER+18, /* 69 i */
+ LOWER+19, /* 6A j */
+ LOWER+20, /* 6B k */
+ LOWER+21, /* 6C l */
+ LOWER+22, /* 6D m */
+ LOWER+23, /* 6E n */
+ LOWER+24, /* 6F o */
+ LOWER+25, /* 70 p */
+ LOWER+26, /* 71 q */
+ LOWER+27, /* 72 r */
+ LOWER+28, /* 73 s */
+ LOWER+29, /* 74 t */
+ LOWER+30, /* 75 u */
+ LOWER+31, /* 76 v */
+ LOWER+32, /* 77 w */
+ LOWER+33, /* 78 x */
+ LOWER+34, /* 79 y */
+ LOWER+35, /* 7A z */
+ PUNCT, /* 7B { */
+ PUNCT, /* 7C | */
+ PUNCT, /* 7D } */
+ PUNCT, /* 7E ~ */
+ CNTRL, /* 7F (DEL) */
+ };
+ // Table entry for ch, or 0 when ch is outside 0x00-0x7F.
+ static int getType(int ch) {
+ return ((ch & 0xFFFFFF80) == 0 ? ctype[ch] : 0);
+ }
+ // True when the table entry for ch has at least one of the bits in type.
+ static boolean isType(int ch, int type) {
+ return (getType(ch) & type) != 0;
+ }
+
+ static boolean isAscii(int ch) {
+ return ((ch & 0xFFFFFF80) == 0);
+ }
+
+ static boolean isAlpha(int ch) {
+ return isType(ch, ALPHA);
+ }
+ // Range test via the sign bit: the OR is non-negative only when both differences are, i.e. '0' <= ch <= '9'.
+ static boolean isDigit(int ch) {
+ return ((ch-'0')|('9'-ch)) >= 0;
+ }
+
+ static boolean isAlnum(int ch) {
+ return isType(ch, ALNUM);
+ }
+
+ static boolean isGraph(int ch) {
+ return isType(ch, GRAPH);
+ }
+ // Same sign-bit range test, for 0x20 (space) through 0x7E (~).
+ static boolean isPrint(int ch) {
+ return ((ch-0x20)|(0x7E-ch)) >= 0;
+ }
+
+ static boolean isPunct(int ch) {
+ return isType(ch, PUNCT);
+ }
+
+ static boolean isSpace(int ch) {
+ return isType(ch, SPACE);
+ }
+
+ static boolean isHexDigit(int ch) {
+ return isType(ch, HEX);
+ }
+
+ static boolean isCntrl(int ch) {
+ return isType(ch, CNTRL);
+ }
+
+ static boolean isLower(int ch) {
+ return ((ch-'a')|('z'-ch)) >= 0;
+ }
+
+ static boolean isUpper(int ch) {
+ return ((ch-'A')|('Z'-ch)) >= 0;
+ }
+
+ static boolean isWord(int ch) {
+ return isType(ch, WORD);
+ }
+}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/jdk/test/java/util/regex/POSIX_Unicode.java Thu Apr 28 20:48:36 2011 -0700
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+import java.util.HashMap;
+import java.util.Locale;
+
+public final class POSIX_Unicode {
+    // Reference predicates for the Unicode flavour of the POSIX classes, built only on java.lang.Character.
+    public static boolean isAlpha(int ch) { // alpha := Character.isAlphabetic
+        return Character.isAlphabetic(ch);
+    }
+
+    public static boolean isLower(int ch) { // lower := Character.isLowerCase
+        return Character.isLowerCase(ch);
+    }
+
+    public static boolean isUpper(int ch) { // upper := Character.isUpperCase
+        return Character.isUpperCase(ch);
+    }
+
+    // space := \p{Whitespace}, i.e. gc=Zs/Zl/Zp plus U+0009..U+000D and U+0085
+    public static boolean isSpace(int ch) {
+        return ((((1 << Character.SPACE_SEPARATOR) |
+                  (1 << Character.LINE_SEPARATOR) |
+                  (1 << Character.PARAGRAPH_SEPARATOR)) >> Character.getType(ch)) & 1)
+                   != 0 ||
+               (ch >= 0x9 && ch <= 0xd) ||
+               (ch == 0x85);
+    }
+
+    // cntrl := \p{gc=Control}
+    public static boolean isCntrl(int ch) {
+        return Character.getType(ch) == Character.CONTROL;
+    }
+
+    // punct := \p{gc=Punctuation} (the seven *_PUNCTUATION general categories)
+    public static boolean isPunct(int ch) {
+        return ((((1 << Character.CONNECTOR_PUNCTUATION) |
+                  (1 << Character.DASH_PUNCTUATION) |
+                  (1 << Character.START_PUNCTUATION) |
+                  (1 << Character.END_PUNCTUATION) |
+                  (1 << Character.OTHER_PUNCTUATION) |
+                  (1 << Character.INITIAL_QUOTE_PUNCTUATION) |
+                  (1 << Character.FINAL_QUOTE_PUNCTUATION)) >> Character.getType(ch)) & 1)
+                   != 0;
+    }
+
+    // xdigit := \p{gc=Decimal_Number}
+    //           \p{Hex_Digit} -> PropList.txt: Hex_Digit (the six literal ranges listed below)
+    public static boolean isHexDigit(int ch) {
+        return Character.isDigit(ch) ||
+               (ch >= 0x0030 && ch <= 0x0039) ||
+               (ch >= 0x0041 && ch <= 0x0046) ||
+               (ch >= 0x0061 && ch <= 0x0066) ||
+               (ch >= 0xFF10 && ch <= 0xFF19) ||
+               (ch >= 0xFF21 && ch <= 0xFF26) ||
+               (ch >= 0xFF41 && ch <= 0xFF46);
+    }
+
+    // digit := \p{gc=Decimal_Number}
+    public static boolean isDigit(int ch) {
+        return Character.isDigit(ch);
+    }
+
+    // alnum := \p{alpha}
+    //          \p{digit}
+    public static boolean isAlnum(int ch) {
+        return Character.isAlphabetic(ch) || Character.isDigit(ch);
+    }
+
+    // blank := \p{Whitespace} --
+    //          [\N{LF} \N{VT} \N{FF} \N{CR} \N{NEL}    (0xa, 0xb, 0xc, 0xd, 0x85)
+    //           \p{gc=Line_Separator}
+    //           \p{gc=Paragraph_Separator}]
+    public static boolean isBlank(int ch) {
+        int type = Character.getType(ch);
+        return isSpace(ch) &&
+               ch != 0xa && ch != 0xb && ch != 0xc && ch != 0xd && ch != 0x85 &&
+               type != Character.LINE_SEPARATOR &&
+               type != Character.PARAGRAPH_SEPARATOR;
+    }
+
+    // graph := [^
+    //           \p{space}
+    //           \p{gc=Control}
+    //           \p{gc=Surrogate}
+    //           \p{gc=Unassigned}]
+    public static boolean isGraph(int ch) {
+        int type = Character.getType(ch);
+        return !(isSpace(ch) ||
+                 Character.CONTROL == type ||
+                 Character.SURROGATE == type ||
+                 Character.UNASSIGNED == type);
+    }
+
+    // print := \p{graph}
+    //          \p{blank}
+    //          -- \p{cntrl}
+    public static boolean isPrint(int ch) {
+        return (isGraph(ch) || isBlank(ch)) && !isCntrl(ch);
+    }
+
+    // PropList.txt:Noncharacter_Code_Point (low 16 bits FFFE or FFFF, or U+FDD0..U+FDEF)
+    public static boolean isNoncharacterCodePoint(int ch) {
+        return (ch & 0xfffe) == 0xfffe || (ch >= 0xfdd0 && ch <= 0xfdef);
+    }
+
+    // word := \p{alpha}
+    //         \p{gc=Mark}
+    //         \p{digit}
+    //         \p{gc=Connector_Punctuation}
+    public static boolean isWord(int ch) {
+        return isAlpha(ch) ||
+               ((((1 << Character.NON_SPACING_MARK) |
+                  (1 << Character.ENCLOSING_MARK) |
+                  (1 << Character.COMBINING_SPACING_MARK) |
+                  (1 << Character.CONNECTOR_PUNCTUATION)) >> Character.getType(ch)) & 1)
+                   != 0 ||
+               isDigit(ch);
+    }
+}
--- a/jdk/test/java/util/regex/RegExTest.java Thu Apr 28 20:18:57 2011 -0700
+++ b/jdk/test/java/util/regex/RegExTest.java Thu Apr 28 20:48:36 2011 -0700
@@ -32,7 +32,7 @@
* 4872664 4803179 4892980 4900747 4945394 4938995 4979006 4994840 4997476
* 5013885 5003322 4988891 5098443 5110268 6173522 4829857 5027748 6376940
* 6358731 6178785 6284152 6231989 6497148 6486934 6233084 6504326 6635133
- * 6350801 6676425 6878475 6919132 6931676 6948903 7014645
+ * 6350801 6676425 6878475 6919132 6931676 6948903 7014645 7039066
*/
import java.util.regex.*;
@@ -137,6 +137,7 @@
nonBmpClassComplementTest();
unicodePropertiesTest();
unicodeHexNotationTest();
+ unicodeClassesTest();
if (failure)
throw new RuntimeException("Failure in the RE handling.");
else
@@ -3656,5 +3657,146 @@
failCount++;
}
report("unicodeHexNotation");
- }
+ }
+
+    private static void unicodeClassesTest() throws Exception {
+        // Checks each predefined/POSIX class and property below against a reference predicate, code point by code point.
+        Matcher lower  = Pattern.compile("\\p{Lower}").matcher("");
+        Matcher upper  = Pattern.compile("\\p{Upper}").matcher("");
+        Matcher ASCII  = Pattern.compile("\\p{ASCII}").matcher("");
+        Matcher alpha  = Pattern.compile("\\p{Alpha}").matcher("");
+        Matcher digit  = Pattern.compile("\\p{Digit}").matcher("");
+        Matcher alnum  = Pattern.compile("\\p{Alnum}").matcher("");
+        Matcher punct  = Pattern.compile("\\p{Punct}").matcher("");
+        Matcher graph  = Pattern.compile("\\p{Graph}").matcher("");
+        Matcher print  = Pattern.compile("\\p{Print}").matcher("");
+        Matcher blank  = Pattern.compile("\\p{Blank}").matcher("");
+        Matcher cntrl  = Pattern.compile("\\p{Cntrl}").matcher("");
+        Matcher xdigit = Pattern.compile("\\p{XDigit}").matcher("");
+        Matcher space  = Pattern.compile("\\p{Space}").matcher("");
+        Matcher bound  = Pattern.compile("\\b").matcher("");
+        Matcher word   = Pattern.compile("\\w++").matcher("");
+        // UNICODE_CHARACTER_CLASS
+        Matcher lowerU  = Pattern.compile("\\p{Lower}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
+        Matcher upperU  = Pattern.compile("\\p{Upper}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
+        Matcher ASCIIU  = Pattern.compile("\\p{ASCII}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
+        Matcher alphaU  = Pattern.compile("\\p{Alpha}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
+        Matcher digitU  = Pattern.compile("\\p{Digit}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
+        Matcher alnumU  = Pattern.compile("\\p{Alnum}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
+        Matcher punctU  = Pattern.compile("\\p{Punct}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
+        Matcher graphU  = Pattern.compile("\\p{Graph}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
+        Matcher printU  = Pattern.compile("\\p{Print}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
+        Matcher blankU  = Pattern.compile("\\p{Blank}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
+        Matcher cntrlU  = Pattern.compile("\\p{Cntrl}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
+        Matcher xdigitU = Pattern.compile("\\p{XDigit}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
+        Matcher spaceU  = Pattern.compile("\\p{Space}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
+        Matcher boundU  = Pattern.compile("\\b", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
+        Matcher wordU   = Pattern.compile("\\w", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
+        // embedded flag (?U) only: no compile-time flag here, so the inline flag itself is what gets exercised
+        Matcher lowerEU = Pattern.compile("(?U)\\p{Lower}").matcher("");
+        Matcher graphEU = Pattern.compile("(?U)\\p{Graph}").matcher("");
+        Matcher wordEU  = Pattern.compile("(?U)\\w").matcher("");
+
+        Matcher bwb   = Pattern.compile("\\b\\w\\b").matcher("");
+        Matcher bwbU  = Pattern.compile("\\b\\w++\\b", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
+        Matcher bwbEU = Pattern.compile("(?U)\\b\\w++\\b").matcher("");
+        // properties
+        Matcher lowerP   = Pattern.compile("\\p{IsLowerCase}").matcher("");
+        Matcher upperP   = Pattern.compile("\\p{IsUpperCase}").matcher("");
+        Matcher titleP   = Pattern.compile("\\p{IsTitleCase}").matcher("");
+        Matcher letterP  = Pattern.compile("\\p{IsLetter}").matcher("");
+        Matcher alphaP   = Pattern.compile("\\p{IsAlphabetic}").matcher("");
+        Matcher ideogP   = Pattern.compile("\\p{IsIdeographic}").matcher("");
+        Matcher cntrlP   = Pattern.compile("\\p{IsControl}").matcher("");
+        Matcher spaceP   = Pattern.compile("\\p{IsWhiteSpace}").matcher("");
+        Matcher definedP = Pattern.compile("\\p{IsAssigned}").matcher("");
+        Matcher nonCCPP  = Pattern.compile("\\p{IsNoncharacterCodePoint}").matcher("");
+
+        // javaMethod
+        Matcher lowerJ = Pattern.compile("\\p{javaLowerCase}").matcher("");
+        Matcher upperJ = Pattern.compile("\\p{javaUpperCase}").matcher("");
+        Matcher alphaJ = Pattern.compile("\\p{javaAlphabetic}").matcher("");
+        Matcher ideogJ = Pattern.compile("\\p{javaIdeographic}").matcher("");
+
+        for (int cp = 1; cp < 0x30000; cp++) {
+            String str = new String(Character.toChars(cp));
+            int type = Character.getType(cp);
+            if (// lower
+                POSIX_ASCII.isLower(cp)   != lower.reset(str).matches()  ||
+                Character.isLowerCase(cp) != lowerU.reset(str).matches() ||
+                Character.isLowerCase(cp) != lowerP.reset(str).matches() ||
+                Character.isLowerCase(cp) != lowerEU.reset(str).matches()||
+                Character.isLowerCase(cp) != lowerJ.reset(str).matches()||
+                // upper
+                POSIX_ASCII.isUpper(cp)   != upper.reset(str).matches()  ||
+                POSIX_Unicode.isUpper(cp) != upperU.reset(str).matches() ||
+                Character.isUpperCase(cp) != upperP.reset(str).matches() ||
+                Character.isUpperCase(cp) != upperJ.reset(str).matches() ||
+                // alpha
+                POSIX_ASCII.isAlpha(cp)   != alpha.reset(str).matches()  ||
+                POSIX_Unicode.isAlpha(cp) != alphaU.reset(str).matches() ||
+                Character.isAlphabetic(cp)!= alphaP.reset(str).matches() ||
+                Character.isAlphabetic(cp)!= alphaJ.reset(str).matches() ||
+                // digit
+                POSIX_ASCII.isDigit(cp)   != digit.reset(str).matches()  ||
+                Character.isDigit(cp)     != digitU.reset(str).matches() ||
+                // alnum
+                POSIX_ASCII.isAlnum(cp)   != alnum.reset(str).matches()  ||
+                POSIX_Unicode.isAlnum(cp) != alnumU.reset(str).matches() ||
+                // punct
+                POSIX_ASCII.isPunct(cp)   != punct.reset(str).matches()  ||
+                POSIX_Unicode.isPunct(cp) != punctU.reset(str).matches() ||
+                // graph
+                POSIX_ASCII.isGraph(cp)   != graph.reset(str).matches()  ||
+                POSIX_Unicode.isGraph(cp) != graphU.reset(str).matches() ||
+                POSIX_Unicode.isGraph(cp) != graphEU.reset(str).matches()||
+                // blank
+                POSIX_ASCII.isType(cp, POSIX_ASCII.BLANK)
+                                          != blank.reset(str).matches()  ||
+                POSIX_Unicode.isBlank(cp) != blankU.reset(str).matches() ||
+                // print
+                POSIX_ASCII.isPrint(cp)   != print.reset(str).matches()  ||
+                POSIX_Unicode.isPrint(cp) != printU.reset(str).matches() ||
+                // cntrl
+                POSIX_ASCII.isCntrl(cp)   != cntrl.reset(str).matches()  ||
+                POSIX_Unicode.isCntrl(cp) != cntrlU.reset(str).matches() ||
+                (Character.CONTROL == type) != cntrlP.reset(str).matches() ||
+                // hexdigit
+                POSIX_ASCII.isHexDigit(cp)   != xdigit.reset(str).matches()  ||
+                POSIX_Unicode.isHexDigit(cp) != xdigitU.reset(str).matches() ||
+                // space
+                POSIX_ASCII.isSpace(cp)   != space.reset(str).matches()  ||
+                POSIX_Unicode.isSpace(cp) != spaceU.reset(str).matches() ||
+                POSIX_Unicode.isSpace(cp) != spaceP.reset(str).matches() ||
+                // word
+                POSIX_ASCII.isWord(cp)   != word.reset(str).matches()  ||
+                POSIX_Unicode.isWord(cp) != wordU.reset(str).matches() ||
+                POSIX_Unicode.isWord(cp) != wordEU.reset(str).matches()||
+                // bwordb
+                POSIX_ASCII.isWord(cp)   != bwb.reset(str).matches() ||
+                POSIX_Unicode.isWord(cp) != bwbU.reset(str).matches() ||
+                // properties
+                Character.isTitleCase(cp)   != titleP.reset(str).matches() ||
+                Character.isLetter(cp)      != letterP.reset(str).matches()||
+                Character.isIdeographic(cp) != ideogP.reset(str).matches() ||
+                Character.isIdeographic(cp) != ideogJ.reset(str).matches() ||
+                (Character.UNASSIGNED == type) == definedP.reset(str).matches() ||
+                POSIX_Unicode.isNoncharacterCodePoint(cp) != nonCCPP.reset(str).matches())
+                failCount++;
+        }
+
+        // bounds/word align
+        twoFindIndexes(" \u0180sherman\u0400 ", bound, 1, 10);
+        if (!bwbU.reset("\u0180sherman\u0400").matches())
+            failCount++;
+        twoFindIndexes(" \u0180sh\u0345erman\u0400 ", bound, 1, 11);
+        if (!bwbU.reset("\u0180sh\u0345erman\u0400").matches())
+            failCount++;
+        twoFindIndexes(" \u0724\u0739\u0724 ", bound, 1, 4);
+        if (!bwbU.reset("\u0724\u0739\u0724").matches())
+            failCount++;
+        if (!bwbEU.reset("\u0724\u0739\u0724").matches())
+            failCount++;
+        report("unicodePredefinedClasses");
+    }
}