jdk/src/java.base/share/classes/java/util/regex/Pattern.java
changeset 35783 2690535d72cc
parent 34774 03b4e6dc367b
child 37880 60ec48925dc6
equal deleted inserted replaced
35782:cce69c0777dc 35783:2690535d72cc
   107  * <tr><td valign="top" headers="construct characters"><code>&#92;x</code><i>{h...h}</i></td>
   107  * <tr><td valign="top" headers="construct characters"><code>&#92;x</code><i>{h...h}</i></td>
   108  *     <td headers="matches">The character with hexadecimal&nbsp;value&nbsp;{@code 0x}<i>h...h</i>
   108  *     <td headers="matches">The character with hexadecimal&nbsp;value&nbsp;{@code 0x}<i>h...h</i>
   109  *         ({@link java.lang.Character#MIN_CODE_POINT Character.MIN_CODE_POINT}
   109  *         ({@link java.lang.Character#MIN_CODE_POINT Character.MIN_CODE_POINT}
   110  *         &nbsp;&lt;=&nbsp;{@code 0x}<i>h...h</i>&nbsp;&lt;=&nbsp;
   110  *         &nbsp;&lt;=&nbsp;{@code 0x}<i>h...h</i>&nbsp;&lt;=&nbsp;
   111  *          {@link java.lang.Character#MAX_CODE_POINT Character.MAX_CODE_POINT})</td></tr>
   111  *          {@link java.lang.Character#MAX_CODE_POINT Character.MAX_CODE_POINT})</td></tr>
       
   112  * <tr><td valign="top" headers="construct characters"><code>&#92;N{</code><i>name</i><code>}</code></td>
       
   113  *     <td headers="matches">The character with Unicode character name <i>'name'</i></td></tr>
   112  * <tr><td valign="top" headers="construct characters">{@code \t}</td>
   114  * <tr><td valign="top" headers="construct characters">{@code \t}</td>
   113  *     <td headers="matches">The tab character (<code>'&#92;u0009'</code>)</td></tr>
   115  *     <td headers="matches">The tab character (<code>'&#92;u0009'</code>)</td></tr>
   114  * <tr><td valign="top" headers="construct characters">{@code \n}</td>
   116  * <tr><td valign="top" headers="construct characters">{@code \n}</td>
   115  *     <td headers="matches">The newline (line feed) character (<code>'&#92;u000A'</code>)</td></tr>
   117  *     <td headers="matches">The newline (line feed) character (<code>'&#92;u000A'</code>)</td></tr>
   116  * <tr><td valign="top" headers="construct characters">{@code \r}</td>
   118  * <tr><td valign="top" headers="construct characters">{@code \r}</td>
   241  *     <td headers="matches">The beginning of a line</td></tr>
   243  *     <td headers="matches">The beginning of a line</td></tr>
   242  * <tr><td valign="top" headers="construct bounds">{@code $}</td>
   244  * <tr><td valign="top" headers="construct bounds">{@code $}</td>
   243  *     <td headers="matches">The end of a line</td></tr>
   245  *     <td headers="matches">The end of a line</td></tr>
   244  * <tr><td valign="top" headers="construct bounds">{@code \b}</td>
   246  * <tr><td valign="top" headers="construct bounds">{@code \b}</td>
   245  *     <td headers="matches">A word boundary</td></tr>
   247  *     <td headers="matches">A word boundary</td></tr>
       
   248  * <tr><td valign="top" headers="construct bounds">{@code \b{g}}</td>
       
   249  *     <td headers="matches">A Unicode extended grapheme cluster boundary</td></tr>
   246  * <tr><td valign="top" headers="construct bounds">{@code \B}</td>
   250  * <tr><td valign="top" headers="construct bounds">{@code \B}</td>
   247  *     <td headers="matches">A non-word boundary</td></tr>
   251  *     <td headers="matches">A non-word boundary</td></tr>
   248  * <tr><td valign="top" headers="construct bounds">{@code \A}</td>
   252  * <tr><td valign="top" headers="construct bounds">{@code \A}</td>
   249  *     <td headers="matches">The beginning of the input</td></tr>
   253  *     <td headers="matches">The beginning of the input</td></tr>
   250  * <tr><td valign="top" headers="construct bounds">{@code \G}</td>
   254  * <tr><td valign="top" headers="construct bounds">{@code \G}</td>
   259  * <tr align="left"><th colspan="2" id="lineending">Linebreak matcher</th></tr>
   263  * <tr align="left"><th colspan="2" id="lineending">Linebreak matcher</th></tr>
   260  * <tr><td valign="top" headers="construct lineending">{@code \R}</td>
   264  * <tr><td valign="top" headers="construct lineending">{@code \R}</td>
   261  *     <td headers="matches">Any Unicode linebreak sequence; equivalent to
   265  *     <td headers="matches">Any Unicode linebreak sequence; equivalent to
   262  *     <code>&#92;u000D&#92;u000A|[&#92;u000A&#92;u000B&#92;u000C&#92;u000D&#92;u0085&#92;u2028&#92;u2029]
   266  *     <code>&#92;u000D&#92;u000A|[&#92;u000A&#92;u000B&#92;u000C&#92;u000D&#92;u0085&#92;u2028&#92;u2029]
   263  *     </code></td></tr>
   267  *     </code></td></tr>
       
   268  *
       
   269  * <tr><th>&nbsp;</th></tr>
       
   270  * <tr align="left"><th colspan="2" id="grapheme">Unicode Extended Grapheme matcher</th></tr>
       
   271  * <tr><td valign="top" headers="construct grapheme">{@code \X}</td>
       
   272  *     <td headers="matches">Any Unicode extended grapheme cluster</td></tr>
   264  *
   273  *
   265  * <tr><th>&nbsp;</th></tr>
   274  * <tr><th>&nbsp;</th></tr>
   266  * <tr align="left"><th colspan="2" id="greedy">Greedy quantifiers</th></tr>
   275  * <tr align="left"><th colspan="2" id="greedy">Greedy quantifiers</th></tr>
   267  *
   276  *
   268  * <tr><td valign="top" headers="construct greedy"><i>X</i>{@code ?}</td>
   277  * <tr><td valign="top" headers="construct greedy"><i>X</i>{@code ?}</td>
   544  * parser so that Unicode escapes can be used in expressions that are read from
   553  * parser so that Unicode escapes can be used in expressions that are read from
   545  * files or from the keyboard.  Thus the strings <code>"&#92;u2014"</code> and
   554  * files or from the keyboard.  Thus the strings <code>"&#92;u2014"</code> and
   546  * {@code "\\u2014"}, while not equal, compile into the same pattern, which
   555  * {@code "\\u2014"}, while not equal, compile into the same pattern, which
   547  * matches the character with hexadecimal value {@code 0x2014}.
   556  * matches the character with hexadecimal value {@code 0x2014}.
   548  * <p>
   557  * <p>
   549  * A Unicode character can also be represented in a regular-expression by
   558  * A Unicode character can also be represented by using its <b>Hex notation</b>
   550  * using its <b>Hex notation</b>(hexadecimal code point value) directly as described in construct
   559  * (hexadecimal code point value) directly as described in construct
   551  * <code>&#92;x{...}</code>, for example a supplementary character U+2011F
   560  * <code>&#92;x{...}</code>, for example a supplementary character U+2011F can be
   552  * can be specified as <code>&#92;x{2011F}</code>, instead of two consecutive
   561  * specified as <code>&#92;x{2011F}</code>, instead of two consecutive Unicode escape
   553  * Unicode escape sequences of the surrogate pair
   562  * sequences of the surrogate pair <code>&#92;uD840</code><code>&#92;uDD1F</code>.
   554  * <code>&#92;uD840</code><code>&#92;uDD1F</code>.
   563  * <p>
       
   564  * <b>Unicode character names</b> are supported by the named character construct
       
   565  * <code>\N{</code>...<code>}</code>, for example, <code>\N{WHITE SMILING FACE}</code>
       
   566  * specifies character <code>&#92;u263A</code>. The character names supported
       
   567  * by this class are the valid Unicode character names matched by
       
   568  * {@link java.lang.Character#codePointOf(String) Character.codePointOf(name)}.
       
   569  * <p>
       
   570  * <a href="http://www.unicode.org/reports/tr18/#Default_Grapheme_Clusters">
       
   571  * <b>Unicode extended grapheme clusters</b></a> are supported by the grapheme
       
   572  * cluster matcher {@code \X} and the corresponding boundary matcher {@code \b{g}}.
   555  * <p>
   573  * <p>
   556  * Unicode scripts, blocks, categories and binary properties are written with
   574  * Unicode scripts, blocks, categories and binary properties are written with
   557  * the {@code \p} and {@code \P} constructs as in Perl.
   575  * the {@code \p} and {@code \P} constructs as in Perl.
   558  * <code>\p{</code><i>prop</i><code>}</code> matches if
   576  * <code>\p{</code><i>prop</i><code>}</code> matches if
   559  * the input has the property <i>prop</i>, while <code>\P{</code><i>prop</i><code>}</code>
   577  * the input has the property <i>prop</i>, while <code>\P{</code><i>prop</i><code>}</code>
   677  * with ordered alternation as occurs in Perl 5.
   695  * with ordered alternation as occurs in Perl 5.
   678  *
   696  *
   679  * <p> Perl constructs not supported by this class: </p>
   697  * <p> Perl constructs not supported by this class: </p>
   680  *
   698  *
   681  * <ul>
   699  * <ul>
   682  *    <li><p> Predefined character classes (Unicode character)
       
   683  *    <p><code>\X&nbsp;&nbsp;&nbsp;&nbsp;</code>Match Unicode
       
   684  *    <a href="http://www.unicode.org/reports/tr18/#Default_Grapheme_Clusters">
       
   685  *    <i>extended grapheme cluster</i></a>
       
   686  *    </p></li>
       
   687  *
       
   688  *    <li><p> The backreference constructs, <code>\g{</code><i>n</i><code>}</code> for
   700  *    <li><p> The backreference constructs, <code>\g{</code><i>n</i><code>}</code> for
   689  *    the <i>n</i><sup>th</sup> <a href="#cg">capturing group</a> and
   701  *    the <i>n</i><sup>th</sup> <a href="#cg">capturing group</a> and
   690  *    <code>\g{</code><i>name</i><code>}</code> for
   702  *    <code>\g{</code><i>name</i><code>}</code> for
   691  *    <a href="#groupname">named-capturing group</a>.
   703  *    <a href="#groupname">named-capturing group</a>.
   692  *    </p></li>
       
   693  *
       
   694  *    <li><p> The named character construct, <code>\N{</code><i>name</i><code>}</code>
       
   695  *    for a Unicode character by its name.
       
   696  *    </p></li>
   704  *    </p></li>
   697  *
   705  *
   698  *    <li><p> The conditional constructs
   706  *    <li><p> The conditional constructs
   699  *    {@code (?(}<i>condition</i>{@code )}<i>X</i>{@code )} and
   707  *    {@code (?(}<i>condition</i>{@code )}<i>X</i>{@code )} and
   700  *    {@code (?(}<i>condition</i>{@code )}<i>X</i>{@code |}<i>Y</i>{@code )},
   708  *    {@code (?(}<i>condition</i>{@code )}<i>X</i>{@code |}<i>Y</i>{@code )},
  2355         case 'I':
  2363         case 'I':
  2356         case 'J':
  2364         case 'J':
  2357         case 'K':
  2365         case 'K':
  2358         case 'L':
  2366         case 'L':
  2359         case 'M':
  2367         case 'M':
       
  2368             break;
  2360         case 'N':
  2369         case 'N':
       
  2370             return N();
  2361         case 'O':
  2371         case 'O':
  2362         case 'P':
  2372         case 'P':
  2363         case 'Q':
  2373         case 'Q':
  2364             break;
  2374             break;
  2365         case 'R':
  2375         case 'R':
  2381             if (create) root = has(UNICODE_CHARACTER_CLASS)
  2391             if (create) root = has(UNICODE_CHARACTER_CLASS)
  2382                                ? new Utype(UnicodeProp.WORD).complement()
  2392                                ? new Utype(UnicodeProp.WORD).complement()
  2383                                : new Ctype(ASCII.WORD).complement();
  2393                                : new Ctype(ASCII.WORD).complement();
  2384             return -1;
  2394             return -1;
  2385         case 'X':
  2395         case 'X':
       
  2396             if (inclass) break;
       
  2397             if (create) {
       
  2398                 root = new XGrapheme();
       
  2399             }
       
  2400             return -1;
  2386         case 'Y':
  2401         case 'Y':
  2387             break;
  2402             break;
  2388         case 'Z':
  2403         case 'Z':
  2389             if (inclass) break;
  2404             if (inclass) break;
  2390             if (create) {
  2405             if (create) {
  2396             return -1;
  2411             return -1;
  2397         case 'a':
  2412         case 'a':
  2398             return '\007';
  2413             return '\007';
  2399         case 'b':
  2414         case 'b':
  2400             if (inclass) break;
  2415             if (inclass) break;
  2401             if (create) root = new Bound(Bound.BOTH, has(UNICODE_CHARACTER_CLASS));
  2416             if (create) {
       
  2417                 if (peek() == '{') {
       
  2418                     if (skip() == 'g') {
       
  2419                         if (read() == '}') {
       
  2420                             root = new GraphemeBound();
       
  2421                             return -1;
       
  2422                         }
       
  2423                         break;  // error missing trailing }
       
  2424                     }
       
  2425                     unread(); unread();
       
  2426                 }
       
  2427                 root = new Bound(Bound.BOTH, has(UNICODE_CHARACTER_CLASS));
       
  2428             }
  2402             return -1;
  2429             return -1;
  2403         case 'c':
  2430         case 'c':
  2404             return c();
  2431             return c();
  2405         case 'd':
  2432         case 'd':
  2406             if (create) root = has(UNICODE_CHARACTER_CLASS)
  2433             if (create) root = has(UNICODE_CHARACTER_CLASS)
  3273             setcursor(cur);
  3300             setcursor(cur);
  3274         }
  3301         }
  3275         return n;
  3302         return n;
  3276     }
  3303     }
  3277 
  3304 
       
  3305     private int N() {
       
  3306         if (read() == '{') {
       
  3307             int i = cursor;
       
  3308             while (cursor < patternLength && read() != '}') {}
       
  3309             if (cursor > patternLength)
       
  3310                 throw error("Unclosed character name escape sequence");
       
  3311             String name = new String(temp, i, cursor - i - 1);
       
  3312             try {
       
  3313                 return Character.codePointOf(name);
       
  3314             } catch (IllegalArgumentException x) {
       
  3315                 throw error("Unknown character name [" + name + "]");
       
  3316             }
       
  3317         }
       
  3318         throw error("Illegal character name escape sequence");
       
  3319     }
       
  3320 
  3278     //
  3321     //
  3279     // Utility methods for code point support
  3322     // Utility methods for code point support
  3280     //
  3323     //
  3281 
       
  3282     private static final int countChars(CharSequence seq, int index,
  3324     private static final int countChars(CharSequence seq, int index,
  3283                                         int lengthInCodePoints) {
  3325                                         int lengthInCodePoints) {
  3284         // optimization
  3326         // optimization
  3285         if (lengthInCodePoints == 1 && !Character.isHighSurrogate(seq.charAt(index))) {
  3327         if (lengthInCodePoints == 1 && !Character.isHighSurrogate(seq.charAt(index))) {
  3286             assert (index >= 0 && index < seq.length());
  3328             assert (index >= 0 && index < seq.length());
  3952         boolean isSatisfiedBy(int cp) {
  3994         boolean isSatisfiedBy(int cp) {
  3953             return cp == 0x09 || cp == 0x20 || cp == 0xa0 ||
  3995             return cp == 0x09 || cp == 0x20 || cp == 0xa0 ||
  3954                    cp == 0x1680 || cp == 0x180e ||
  3996                    cp == 0x1680 || cp == 0x180e ||
  3955                    cp >= 0x2000 && cp <= 0x200a ||
  3997                    cp >= 0x2000 && cp <= 0x200a ||
  3956                    cp == 0x202f || cp == 0x205f || cp == 0x3000;
  3998                    cp == 0x202f || cp == 0x205f || cp == 0x3000;
       
  3999         }
       
  4000     }
       
  4001 
       
  4002     /**
       
  4003      * Node class that matches an unicode extended grapheme cluster
       
  4004      */
       
  4005     static class XGrapheme extends Node {
       
  4006         boolean match(Matcher matcher, int i, CharSequence seq) {
       
  4007             if (i < matcher.to) {
       
  4008                 int ch0 = Character.codePointAt(seq, i);
       
  4009                     i += Character.charCount(ch0);
       
  4010                 while (i < matcher.to) {
       
  4011                     int ch1 = Character.codePointAt(seq, i);
       
  4012                     if (Grapheme.isBoundary(ch0, ch1))
       
  4013                         break;
       
  4014                     ch0 = ch1;
       
  4015                     i += Character.charCount(ch1);
       
  4016                 }
       
  4017                 return next.match(matcher, i, seq);
       
  4018             }
       
  4019             matcher.hitEnd = true;
       
  4020             return false;
       
  4021         }
       
  4022 
       
  4023         boolean study(TreeInfo info) {
       
  4024             info.minLength++;
       
  4025             info.deterministic = false;
       
  4026             return next.study(info);
       
  4027         }
       
  4028     }
       
  4029 
       
  4030     /**
       
  4031      * Node class that handles grapheme boundaries
       
  4032      */
       
  4033     static class GraphemeBound extends Node {
       
  4034         boolean match(Matcher matcher, int i, CharSequence seq) {
       
  4035             int startIndex = matcher.from;
       
  4036             int endIndex = matcher.to;
       
  4037             if (matcher.transparentBounds) {
       
  4038                 startIndex = 0;
       
  4039                 endIndex = matcher.getTextLength();
       
  4040             }
       
  4041             if (i == startIndex) {
       
  4042                 return next.match(matcher, i, seq);
       
  4043             }
       
  4044             if (i < endIndex) {
       
  4045                 if (Character.isSurrogatePair(seq.charAt(i-1), seq.charAt(i)) ||
       
  4046                     !Grapheme.isBoundary(Character.codePointBefore(seq, i),
       
  4047                                          Character.codePointAt(seq, i))) {
       
  4048                     return false;
       
  4049                 }
       
  4050             } else {
       
  4051                 matcher.hitEnd = true;
       
  4052                 matcher.requireEnd = true;
       
  4053             }
       
  4054             return next.match(matcher, i, seq);
  3957         }
  4055         }
  3958     }
  4056     }
  3959 
  4057 
  3960     /**
  4058     /**
  3961      * Base class for all Slice nodes
  4059      * Base class for all Slice nodes