107 * <tr><td valign="top" headers="construct characters"><code>\x</code><i>{h...h}</i></td> |
107 * <tr><td valign="top" headers="construct characters"><code>\x</code><i>{h...h}</i></td> |
108 * <td headers="matches">The character with hexadecimal value {@code 0x}<i>h...h</i> |
108 * <td headers="matches">The character with hexadecimal value {@code 0x}<i>h...h</i> |
109 * ({@link java.lang.Character#MIN_CODE_POINT Character.MIN_CODE_POINT} |
109 * ({@link java.lang.Character#MIN_CODE_POINT Character.MIN_CODE_POINT} |
110 * <= {@code 0x}<i>h...h</i> <= |
110 * <= {@code 0x}<i>h...h</i> <= |
111 * {@link java.lang.Character#MAX_CODE_POINT Character.MAX_CODE_POINT})</td></tr> |
111 * {@link java.lang.Character#MAX_CODE_POINT Character.MAX_CODE_POINT})</td></tr> |
|
112 * <tr><td valign="top" headers="construct characters"><code>\N{</code><i>name</i><code>}</code></td> |
|
113 * <td headers="matches">The character with Unicode character name <i>'name'</i></td></tr> |
112 * <tr><td valign="top" headers="construct characters">{@code \t}</td> |
114 * <tr><td valign="top" headers="construct characters">{@code \t}</td> |
113 * <td headers="matches">The tab character (<code>'\u0009'</code>)</td></tr> |
115 * <td headers="matches">The tab character (<code>'\u0009'</code>)</td></tr> |
114 * <tr><td valign="top" headers="construct characters">{@code \n}</td> |
116 * <tr><td valign="top" headers="construct characters">{@code \n}</td> |
115 * <td headers="matches">The newline (line feed) character (<code>'\u000A'</code>)</td></tr> |
117 * <td headers="matches">The newline (line feed) character (<code>'\u000A'</code>)</td></tr> |
116 * <tr><td valign="top" headers="construct characters">{@code \r}</td> |
118 * <tr><td valign="top" headers="construct characters">{@code \r}</td> |
241 * <td headers="matches">The beginning of a line</td></tr> |
243 * <td headers="matches">The beginning of a line</td></tr> |
242 * <tr><td valign="top" headers="construct bounds">{@code $}</td> |
244 * <tr><td valign="top" headers="construct bounds">{@code $}</td> |
243 * <td headers="matches">The end of a line</td></tr> |
245 * <td headers="matches">The end of a line</td></tr> |
244 * <tr><td valign="top" headers="construct bounds">{@code \b}</td> |
246 * <tr><td valign="top" headers="construct bounds">{@code \b}</td> |
245 * <td headers="matches">A word boundary</td></tr> |
247 * <td headers="matches">A word boundary</td></tr> |
|
248 * <tr><td valign="top" headers="construct bounds">{@code \b{g}}</td> |
|
249 * <td headers="matches">A Unicode extended grapheme cluster boundary</td></tr> |
246 * <tr><td valign="top" headers="construct bounds">{@code \B}</td> |
250 * <tr><td valign="top" headers="construct bounds">{@code \B}</td> |
247 * <td headers="matches">A non-word boundary</td></tr> |
251 * <td headers="matches">A non-word boundary</td></tr> |
248 * <tr><td valign="top" headers="construct bounds">{@code \A}</td> |
252 * <tr><td valign="top" headers="construct bounds">{@code \A}</td> |
249 * <td headers="matches">The beginning of the input</td></tr> |
253 * <td headers="matches">The beginning of the input</td></tr> |
250 * <tr><td valign="top" headers="construct bounds">{@code \G}</td> |
254 * <tr><td valign="top" headers="construct bounds">{@code \G}</td> |
259 * <tr align="left"><th colspan="2" id="lineending">Linebreak matcher</th></tr> |
263 * <tr align="left"><th colspan="2" id="lineending">Linebreak matcher</th></tr> |
260 * <tr><td valign="top" headers="construct lineending">{@code \R}</td> |
264 * <tr><td valign="top" headers="construct lineending">{@code \R}</td> |
261 * <td headers="matches">Any Unicode linebreak sequence; equivalent to |
265 * <td headers="matches">Any Unicode linebreak sequence; equivalent to |
262 * <code>\u000D\u000A|[\u000A\u000B\u000C\u000D\u0085\u2028\u2029] |
266 * <code>\u000D\u000A|[\u000A\u000B\u000C\u000D\u0085\u2028\u2029] |
263 * </code></td></tr> |
267 * </code></td></tr> |
|
268 * |
|
269 * <tr><th> </th></tr> |
|
270 * <tr align="left"><th colspan="2" id="grapheme">Unicode Extended Grapheme matcher</th></tr> |
|
271 * <tr><td valign="top" headers="construct grapheme">{@code \X}</td> |
|
272 * <td headers="matches">Any Unicode extended grapheme cluster</td></tr> |
264 * |
273 * |
265 * <tr><th> </th></tr> |
274 * <tr><th> </th></tr> |
266 * <tr align="left"><th colspan="2" id="greedy">Greedy quantifiers</th></tr> |
275 * <tr align="left"><th colspan="2" id="greedy">Greedy quantifiers</th></tr> |
267 * |
276 * |
268 * <tr><td valign="top" headers="construct greedy"><i>X</i>{@code ?}</td> |
277 * <tr><td valign="top" headers="construct greedy"><i>X</i>{@code ?}</td> |
544 * parser so that Unicode escapes can be used in expressions that are read from |
553 * parser so that Unicode escapes can be used in expressions that are read from |
545 * files or from the keyboard. Thus the strings <code>"\u2014"</code> and |
554 * files or from the keyboard. Thus the strings <code>"\u2014"</code> and |
546 * {@code "\\u2014"}, while not equal, compile into the same pattern, which |
555 * {@code "\\u2014"}, while not equal, compile into the same pattern, which |
547 * matches the character with hexadecimal value {@code 0x2014}. |
556 * matches the character with hexadecimal value {@code 0x2014}. |
548 * <p> |
557 * <p> |
549 * A Unicode character can also be represented in a regular-expression by |
558 * A Unicode character can also be represented by using its <b>Hex notation</b> |
550 * using its <b>Hex notation</b>(hexadecimal code point value) directly as described in construct |
559 * (hexadecimal code point value) directly as described in construct |
551 * <code>\x{...}</code>, for example a supplementary character U+2011F |
560 * <code>\x{...}</code>, for example a supplementary character U+2011F can be |
552 * can be specified as <code>\x{2011F}</code>, instead of two consecutive |
561 * specified as <code>\x{2011F}</code>, instead of two consecutive Unicode escape |
553 * Unicode escape sequences of the surrogate pair |
562 * sequences of the surrogate pair <code>\uD840</code><code>\uDD1F</code>. |
554 * <code>\uD840</code><code>\uDD1F</code>. |
563 * <p> |
|
564 * <b>Unicode character names</b> are supported by the named character construct |
|
565 * <code>\N{</code>...<code>}</code>, for example, <code>\N{WHITE SMILING FACE}</code> |
|
566 * specifies character <code>\u263A</code>. The character names supported |
|
567 * by this class are the valid Unicode character names matched by |
|
568 * {@link java.lang.Character#codePointOf(String) Character.codePointOf(name)}. |
|
569 * <p> |
|
570 * <a href="http://www.unicode.org/reports/tr18/#Default_Grapheme_Clusters"> |
|
571 * <b>Unicode extended grapheme clusters</b></a> are supported by the grapheme |
|
572 * cluster matcher {@code \X} and the corresponding boundary matcher {@code \b{g}}. |
555 * <p> |
573 * <p> |
556 * Unicode scripts, blocks, categories and binary properties are written with |
574 * Unicode scripts, blocks, categories and binary properties are written with |
557 * the {@code \p} and {@code \P} constructs as in Perl. |
575 * the {@code \p} and {@code \P} constructs as in Perl. |
558 * <code>\p{</code><i>prop</i><code>}</code> matches if |
576 * <code>\p{</code><i>prop</i><code>}</code> matches if |
559 * the input has the property <i>prop</i>, while <code>\P{</code><i>prop</i><code>}</code> |
577 * the input has the property <i>prop</i>, while <code>\P{</code><i>prop</i><code>}</code> |
677 * with ordered alternation as occurs in Perl 5. |
695 * with ordered alternation as occurs in Perl 5. |
678 * |
696 * |
679 * <p> Perl constructs not supported by this class: </p> |
697 * <p> Perl constructs not supported by this class: </p> |
680 * |
698 * |
681 * <ul> |
699 * <ul> |
682 * <li><p> Predefined character classes (Unicode character) |
|
683 * <p><code>\X </code>Match Unicode |
|
684 * <a href="http://www.unicode.org/reports/tr18/#Default_Grapheme_Clusters"> |
|
685 * <i>extended grapheme cluster</i></a> |
|
686 * </p></li> |
|
687 * |
|
688 * <li><p> The backreference constructs, <code>\g{</code><i>n</i><code>}</code> for |
700 * <li><p> The backreference constructs, <code>\g{</code><i>n</i><code>}</code> for |
689 * the <i>n</i><sup>th</sup><a href="#cg">capturing group</a> and |
701 * the <i>n</i><sup>th</sup><a href="#cg">capturing group</a> and |
690 * <code>\g{</code><i>name</i><code>}</code> for |
702 * <code>\g{</code><i>name</i><code>}</code> for |
691 * <a href="#groupname">named-capturing group</a>. |
703 * <a href="#groupname">named-capturing group</a>. |
692 * </p></li> |
|
693 * |
|
694 * <li><p> The named character construct, <code>\N{</code><i>name</i><code>}</code> |
|
695 * for a Unicode character by its name. |
|
696 * </p></li> |
704 * </p></li> |
697 * |
705 * |
698 * <li><p> The conditional constructs |
706 * <li><p> The conditional constructs |
699 * {@code (?(}<i>condition</i>{@code )}<i>X</i>{@code )} and |
707 * {@code (?(}<i>condition</i>{@code )}<i>X</i>{@code )} and |
700 * {@code (?(}<i>condition</i>{@code )}<i>X</i>{@code |}<i>Y</i>{@code )}, |
708 * {@code (?(}<i>condition</i>{@code )}<i>X</i>{@code |}<i>Y</i>{@code )}, |
3273 setcursor(cur); |
3300 setcursor(cur); |
3274 } |
3301 } |
3275 return n; |
3302 return n; |
3276 } |
3303 } |
3277 |
3304 |
|
3305 private int N() { |
|
3306 if (read() == '{') { |
|
3307 int i = cursor; |
|
3308 while (cursor < patternLength && read() != '}') {} |
|
3309 if (cursor > patternLength) |
|
3310 throw error("Unclosed character name escape sequence"); |
|
3311 String name = new String(temp, i, cursor - i - 1); |
|
3312 try { |
|
3313 return Character.codePointOf(name); |
|
3314 } catch (IllegalArgumentException x) { |
|
3315 throw error("Unknown character name [" + name + "]"); |
|
3316 } |
|
3317 } |
|
3318 throw error("Illegal character name escape sequence"); |
|
3319 } |
|
3320 |
3278 // |
3321 // |
3279 // Utility methods for code point support |
3322 // Utility methods for code point support |
3280 // |
3323 // |
3281 |
|
3282 private static final int countChars(CharSequence seq, int index, |
3324 private static final int countChars(CharSequence seq, int index, |
3283 int lengthInCodePoints) { |
3325 int lengthInCodePoints) { |
3284 // optimization |
3326 // optimization |
3285 if (lengthInCodePoints == 1 && !Character.isHighSurrogate(seq.charAt(index))) { |
3327 if (lengthInCodePoints == 1 && !Character.isHighSurrogate(seq.charAt(index))) { |
3286 assert (index >= 0 && index < seq.length()); |
3328 assert (index >= 0 && index < seq.length()); |
3952 boolean isSatisfiedBy(int cp) { |
3994 boolean isSatisfiedBy(int cp) { |
3953 return cp == 0x09 || cp == 0x20 || cp == 0xa0 || |
3995 return cp == 0x09 || cp == 0x20 || cp == 0xa0 || |
3954 cp == 0x1680 || cp == 0x180e || |
3996 cp == 0x1680 || cp == 0x180e || |
3955 cp >= 0x2000 && cp <= 0x200a || |
3997 cp >= 0x2000 && cp <= 0x200a || |
3956 cp == 0x202f || cp == 0x205f || cp == 0x3000; |
3998 cp == 0x202f || cp == 0x205f || cp == 0x3000; |
|
3999 } |
|
4000 } |
|
4001 |
|
4002 /** |
|
4003 * Node class that matches an unicode extended grapheme cluster |
|
4004 */ |
|
4005 static class XGrapheme extends Node { |
|
4006 boolean match(Matcher matcher, int i, CharSequence seq) { |
|
4007 if (i < matcher.to) { |
|
4008 int ch0 = Character.codePointAt(seq, i); |
|
4009 i += Character.charCount(ch0); |
|
4010 while (i < matcher.to) { |
|
4011 int ch1 = Character.codePointAt(seq, i); |
|
4012 if (Grapheme.isBoundary(ch0, ch1)) |
|
4013 break; |
|
4014 ch0 = ch1; |
|
4015 i += Character.charCount(ch1); |
|
4016 } |
|
4017 return next.match(matcher, i, seq); |
|
4018 } |
|
4019 matcher.hitEnd = true; |
|
4020 return false; |
|
4021 } |
|
4022 |
|
4023 boolean study(TreeInfo info) { |
|
4024 info.minLength++; |
|
4025 info.deterministic = false; |
|
4026 return next.study(info); |
|
4027 } |
|
4028 } |
|
4029 |
|
4030 /** |
|
4031 * Node class that handles grapheme boundaries |
|
4032 */ |
|
4033 static class GraphemeBound extends Node { |
|
4034 boolean match(Matcher matcher, int i, CharSequence seq) { |
|
4035 int startIndex = matcher.from; |
|
4036 int endIndex = matcher.to; |
|
4037 if (matcher.transparentBounds) { |
|
4038 startIndex = 0; |
|
4039 endIndex = matcher.getTextLength(); |
|
4040 } |
|
4041 if (i == startIndex) { |
|
4042 return next.match(matcher, i, seq); |
|
4043 } |
|
4044 if (i < endIndex) { |
|
4045 if (Character.isSurrogatePair(seq.charAt(i-1), seq.charAt(i)) || |
|
4046 !Grapheme.isBoundary(Character.codePointBefore(seq, i), |
|
4047 Character.codePointAt(seq, i))) { |
|
4048 return false; |
|
4049 } |
|
4050 } else { |
|
4051 matcher.hitEnd = true; |
|
4052 matcher.requireEnd = true; |
|
4053 } |
|
4054 return next.match(matcher, i, seq); |
3957 } |
4055 } |
3958 } |
4056 } |
3959 |
4057 |
3960 /** |
4058 /** |
3961 * Base class for all Slice nodes |
4059 * Base class for all Slice nodes |