jaxp/src/java.xml/share/classes/com/sun/org/apache/regexp/internal/RE.java
changeset 37947 f69560487686
parent 37946 e420b9f05aaf
parent 37936 428ebc487445
child 37948 caf97b37ebec
equal deleted inserted replaced
37946:e420b9f05aaf 37947:f69560487686
     1 /*
       
     2  * reserved comment block
       
     3  * DO NOT REMOVE OR ALTER!
       
     4  */
       
     5 /*
       
     6  * Copyright 1999-2004 The Apache Software Foundation.
       
     7  *
       
     8  * Licensed under the Apache License, Version 2.0 (the "License");
       
     9  * you may not use this file except in compliance with the License.
       
    10  * You may obtain a copy of the License at
       
    11  *
       
    12  *     http://www.apache.org/licenses/LICENSE-2.0
       
    13  *
       
    14  * Unless required by applicable law or agreed to in writing, software
       
    15  * distributed under the License is distributed on an "AS IS" BASIS,
       
    16  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
       
    17  * See the License for the specific language governing permissions and
       
    18  * limitations under the License.
       
    19  */
       
    20 
       
    21 package com.sun.org.apache.regexp.internal;
       
    22 
       
    23 import java.io.Serializable;
       
    24 import java.util.Vector;
       
    25 
       
    26 /**
       
    27  * RE is an efficient, lightweight regular expression evaluator/matcher
       
    28  * class. Regular expressions are pattern descriptions which enable
       
    29  * sophisticated matching of strings.  In addition to being able to
       
    30  * match a string against a pattern, you can also extract parts of the
       
    31  * match.  This is especially useful in text parsing! Details on the
       
    32  * syntax of regular expression patterns are given below.
       
    33  *
       
    34  * <p>
       
    35  * To compile a regular expression (RE), you can simply construct an RE
       
    36  * matcher object from the string specification of the pattern, like this:
       
    37  *
       
    38  * <pre>
       
    39  *  RE r = new RE("a*b");
       
    40  * </pre>
       
    41  *
       
    42  * <p>
       
    43  * Once you have done this, you can call either of the RE.match methods to
       
    44  * perform matching on a String.  For example:
       
    45  *
       
    46  * <pre>
       
    47  *  boolean matched = r.match("aaaab");
       
    48  * </pre>
       
    49  *
       
    50  * will cause the boolean matched to be set to true because the
       
    51  * pattern "a*b" matches the string "aaaab".
       
    52  *
       
    53  * <p>
       
    54  * If you were interested in the <i>number</i> of a's which matched the
       
    55  * first part of our example expression, you could change the expression to
       
    56  * "(a*)b".  Then when you compiled the expression and matched it against
       
    57  * something like "xaaaab", you would get results like this:
       
    58  *
       
    59  * <pre>
       
    60  *  RE r = new RE("(a*)b");                  // Compile expression
       
    61  *  boolean matched = r.match("xaaaab");     // Match against "xaaaab"
       
    62  *
       
    63  *  String wholeExpr = r.getParen(0);        // wholeExpr will be 'aaaab'
       
    64  *  String insideParens = r.getParen(1);     // insideParens will be 'aaaa'
       
    65  *
       
    66  *  int startWholeExpr = r.getParenStart(0); // startWholeExpr will be index 1
       
    67  *  int endWholeExpr = r.getParenEnd(0);     // endWholeExpr will be index 6
       
    68  *  int lenWholeExpr = r.getParenLength(0);  // lenWholeExpr will be 5
       
    69  *
       
    70  *  int startInside = r.getParenStart(1);    // startInside will be index 1
       
    71  *  int endInside = r.getParenEnd(1);        // endInside will be index 5
       
    72  *  int lenInside = r.getParenLength(1);     // lenInside will be 4
       
    73  * </pre>
       
    74  *
       
    75  * You can also refer to the contents of a parenthesized expression
       
    76  * within a regular expression itself.  This is called a
       
    77  * 'backreference'.  The first backreference in a regular expression is
       
    78  * denoted by \1, the second by \2 and so on.  So the expression:
       
    79  *
       
    80  * <pre>
       
    81  *  ([0-9]+)=\1
       
    82  * </pre>
       
    83  *
       
    84  * will match any string of the form n=n (like 0=0 or 2=2).
       
    85  *
       
    86  * <p>
       
    87  * The full regular expression syntax accepted by RE is described here:
       
    88  *
       
    89  * <pre>
       
    90  *
       
    91  *  <b><font face=times roman>Characters</font></b>
       
    92  *
       
    93  *    <i>unicodeChar</i>   Matches any identical unicode character
       
    94  *    \                    Used to quote a meta-character (like '*')
       
    95  *    \\                   Matches a single '\' character
       
    96  *    \0nnn                Matches a given octal character
       
    97  *    \xhh                 Matches a given 8-bit hexadecimal character
       
    98  *    \\uhhhh              Matches a given 16-bit hexadecimal character
       
    99  *    \t                   Matches an ASCII tab character
       
   100  *    \n                   Matches an ASCII newline character
       
   101  *    \r                   Matches an ASCII return character
       
   102  *    \f                   Matches an ASCII form feed character
       
   103  *
       
   104  *
       
   105  *  <b><font face=times roman>Character Classes</font></b>
       
   106  *
       
   107  *    [abc]                Simple character class
       
   108  *    [a-zA-Z]             Character class with ranges
       
   109  *    [^abc]               Negated character class
       
   110  * </pre>
       
   111  *
       
   112  * <b>NOTE:</b> Incomplete ranges will be interpreted as &quot;starts
       
   113  * from zero&quot; or &quot;ends with last character&quot;.
       
   114  * <br>
       
   115  * I.e. [-a] is the same as [\\u0000-a], and [a-] is the same as [a-\\uFFFF],
       
   116  * [-] means &quot;all characters&quot;.
       
   117  *
       
   118  * <pre>
       
   119  *
       
   120  *  <b><font face=times roman>Standard POSIX Character Classes</font></b>
       
   121  *
       
   122  *    [:alnum:]            Alphanumeric characters.
       
   123  *    [:alpha:]            Alphabetic characters.
       
   124  *    [:blank:]            Space and tab characters.
       
   125  *    [:cntrl:]            Control characters.
       
   126  *    [:digit:]            Numeric characters.
       
   127  *    [:graph:]            Characters that are printable and are also visible.
       
   128  *                         (A space is printable, but not visible, while an
       
   129  *                         `a' is both.)
       
   130  *    [:lower:]            Lower-case alphabetic characters.
       
   131  *    [:print:]            Printable characters (characters that are not
       
   132  *                         control characters.)
       
   133  *    [:punct:]            Punctuation characters (characters that are not letter,
       
   134  *                         digits, control characters, or space characters).
       
   135  *    [:space:]            Space characters (such as space, tab, and formfeed,
       
   136  *                         to name a few).
       
   137  *    [:upper:]            Upper-case alphabetic characters.
       
   138  *    [:xdigit:]           Characters that are hexadecimal digits.
       
   139  *
       
   140  *
       
   141  *  <b><font face=times roman>Non-standard POSIX-style Character Classes</font></b>
       
   142  *
       
   143  *    [:javastart:]        Start of a Java identifier
       
   144  *    [:javapart:]         Part of a Java identifier
       
   145  *
       
   146  *
       
   147  *  <b><font face=times roman>Predefined Classes</font></b>
       
   148  *
       
   149  *    .         Matches any character other than newline
       
   150  *    \w        Matches a "word" character (alphanumeric plus "_")
       
   151  *    \W        Matches a non-word character
       
   152  *    \s        Matches a whitespace character
       
   153  *    \S        Matches a non-whitespace character
       
   154  *    \d        Matches a digit character
       
   155  *    \D        Matches a non-digit character
       
   156  *
       
   157  *
       
   158  *  <b><font face=times roman>Boundary Matchers</font></b>
       
   159  *
       
   160  *    ^         Matches only at the beginning of a line
       
   161  *    $         Matches only at the end of a line
       
   162  *    \b        Matches only at a word boundary
       
   163  *    \B        Matches only at a non-word boundary
       
   164  *
       
   165  *
       
   166  *  <b><font face=times roman>Greedy Closures</font></b>
       
   167  *
       
   168  *    A*        Matches A 0 or more times (greedy)
       
   169  *    A+        Matches A 1 or more times (greedy)
       
   170  *    A?        Matches A 1 or 0 times (greedy)
       
   171  *    A{n}      Matches A exactly n times (greedy)
       
   172  *    A{n,}     Matches A at least n times (greedy)
       
   173  *    A{n,m}    Matches A at least n but not more than m times (greedy)
       
   174  *
       
   175  *
       
   176  *  <b><font face=times roman>Reluctant Closures</font></b>
       
   177  *
       
   178  *    A*?       Matches A 0 or more times (reluctant)
       
   179  *    A+?       Matches A 1 or more times (reluctant)
       
   180  *    A??       Matches A 0 or 1 times (reluctant)
       
   181  *
       
   182  *
       
   183  *  <b><font face=times roman>Logical Operators</font></b>
       
   184  *
       
   185  *    AB        Matches A followed by B
       
   186  *    A|B       Matches either A or B
       
   187  *    (A)       Used for subexpression grouping
       
   188  *   (?:A)      Used for subexpression clustering (just like grouping but
       
   189  *              no backrefs)
       
   190  *
       
   191  *
       
   192  *  <b><font face=times roman>Backreferences</font></b>
       
   193  *
       
   194  *    \1    Backreference to 1st parenthesized subexpression
       
   195  *    \2    Backreference to 2nd parenthesized subexpression
       
   196  *    \3    Backreference to 3rd parenthesized subexpression
       
   197  *    \4    Backreference to 4th parenthesized subexpression
       
   198  *    \5    Backreference to 5th parenthesized subexpression
       
   199  *    \6    Backreference to 6th parenthesized subexpression
       
   200  *    \7    Backreference to 7th parenthesized subexpression
       
   201  *    \8    Backreference to 8th parenthesized subexpression
       
   202  *    \9    Backreference to 9th parenthesized subexpression
       
   203  * </pre>
       
   204  *
       
   205  * <p>
       
   206  * All closure operators (+, *, ?, {m,n}) are greedy by default, meaning
       
   207  * that they match as many elements of the string as possible without
       
   208  * causing the overall match to fail.  If you want a closure to be
       
   209  * reluctant (non-greedy), you can simply follow it with a '?'.  A
       
   210  * reluctant closure will match as few elements of the string as
       
   211  * possible when finding matches.  {m,n} closures don't currently
       
   212  * support reluctancy.
       
   213  *
       
   214  * <p>
       
   215  * <b><font face="times roman">Line terminators</font></b>
       
   216  * <br>
       
   217  * A line terminator is a one- or two-character sequence that marks
       
   218  * the end of a line of the input character sequence. The following
       
   219  * are recognized as line terminators:
       
   220  * <ul>
       
   221  * <li>A newline (line feed) character ('\n'),</li>
       
   222  * <li>A carriage-return character followed immediately by a newline character ("\r\n"),</li>
       
   223  * <li>A standalone carriage-return character ('\r'),</li>
       
   224  * <li>A next-line character ('\u0085'),</li>
       
   225  * <li>A line-separator character ('\u2028'), or</li>
       
   226  * <li>A paragraph-separator character ('\u2029).</li>
       
   227  * </ul>
       
   228  *
       
   229  * <p>
       
   230  * RE runs programs compiled by the RECompiler class.  But the RE
       
   231  * matcher class does not include the actual regular expression compiler
       
   232  * for reasons of efficiency.  In fact, if you want to pre-compile one
       
   233  * or more regular expressions, the 'recompile' class can be invoked
       
   234  * from the command line to produce compiled output like this:
       
   235  *
       
   236  * <pre>
       
   237  *    // Pre-compiled regular expression "a*b"
       
   238  *    char[] re1Instructions =
       
   239  *    {
       
   240  *        0x007c, 0x0000, 0x001a, 0x007c, 0x0000, 0x000d, 0x0041,
       
   241  *        0x0001, 0x0004, 0x0061, 0x007c, 0x0000, 0x0003, 0x0047,
       
   242  *        0x0000, 0xfff6, 0x007c, 0x0000, 0x0003, 0x004e, 0x0000,
       
   243  *        0x0003, 0x0041, 0x0001, 0x0004, 0x0062, 0x0045, 0x0000,
       
   244  *        0x0000,
       
   245  *    };
       
   246  *
       
   247  *
       
   248  *    REProgram re1 = new REProgram(re1Instructions);
       
   249  * </pre>
       
   250  *
       
   251  * You can then construct a regular expression matcher (RE) object from
       
   252  * the pre-compiled expression re1 and thus avoid the overhead of
       
   253  * compiling the expression at runtime. If you require more dynamic
       
   254  * regular expressions, you can construct a single RECompiler object and
       
   255  * re-use it to compile each expression. Similarly, you can change the
       
   256  * program run by a given matcher object at any time. However, RE and
       
   257  * RECompiler are not threadsafe (for efficiency reasons, and because
       
   258  * requiring thread safety in this class is deemed to be a rare
       
   259  * requirement), so you will need to construct a separate compiler or
       
   260  * matcher object for each thread (unless you do thread synchronization
       
   261  * yourself). Once expression compiled into the REProgram object, REProgram
       
   262  * can be safely shared across multiple threads and RE objects.
       
   263  *
       
   264  * <br><p><br>
       
   265  *
       
   266  * <font color="red">
       
   267  * <i>ISSUES:</i>
       
   268  *
       
   269  * <ul>
       
   270  *  <li>com.weusours.util.re is not currently compatible with all
       
   271  *      standard POSIX regcomp flags</li>
       
   272  *  <li>com.weusours.util.re does not support POSIX equivalence classes
       
   273  *      ([=foo=] syntax) (I18N/locale issue)</li>
       
   274  *  <li>com.weusours.util.re does not support nested POSIX character
       
   275  *      classes (definitely should, but not completely trivial)</li>
       
   276  *  <li>com.weusours.util.re Does not support POSIX character collation
       
   277  *      concepts ([.foo.] syntax) (I18N/locale issue)</li>
       
   278  *  <li>Should there be different matching styles (simple, POSIX, Perl etc?)</li>
       
   279  *  <li>Should RE support character iterators (for backwards RE matching!)?</li>
       
   280  *  <li>Should RE support reluctant {m,n} closures (does anyone care)?</li>
       
   281  *  <li>Not *all* possibilities are considered for greediness when backreferences
       
   282  *      are involved (as POSIX suggests should be the case).  The POSIX RE
       
   283  *      "(ac*)c*d[ac]*\1", when matched against "acdacaa" should yield a match
       
   284  *      of acdacaa where \1 is "a".  This is not the case in this RE package,
       
   285  *      and actually Perl doesn't go to this extent either!  Until someone
       
   286  *      actually complains about this, I'm not sure it's worth "fixing".
       
   287  *      If it ever is fixed, test #137 in RETest.txt should be updated.</li>
       
   288  * </ul>
       
   289  *
       
   290  * </font>
       
   291  *
       
   292  * @see recompile
       
   293  * @see RECompiler
       
   294  *
       
   295  * @author <a href="mailto:jonl@muppetlabs.com">Jonathan Locke</a>
       
   296  * @author <a href="mailto:ts@sch-fer.de">Tobias Sch&auml;fer</a>
       
   297  */
       
   298 public class RE implements Serializable
       
   299 {
       
   300     /**
       
   301      * Specifies normal, case-sensitive matching behaviour.
       
   302      */
       
   303     public static final int MATCH_NORMAL          = 0x0000;
       
   304 
       
   305     /**
       
   306      * Flag to indicate that matching should be case-independent (folded)
       
   307      */
       
   308     public static final int MATCH_CASEINDEPENDENT = 0x0001;
       
   309 
       
   310     /**
       
   311      * Newlines should match as BOL/EOL (^ and $)
       
   312      */
       
   313     public static final int MATCH_MULTILINE       = 0x0002;
       
   314 
       
   315     /**
       
   316      * Consider all input a single body of text - newlines are matched by .
       
   317      */
       
   318     public static final int MATCH_SINGLELINE      = 0x0004;
       
   319 
       
   320     /************************************************
       
   321      *                                              *
       
   322      * The format of a node in a program is:        *
       
   323      *                                              *
       
   324      * [ OPCODE ] [ OPDATA ] [ OPNEXT ] [ OPERAND ] *
       
   325      *                                              *
       
   326      * char OPCODE - instruction                    *
       
   327      * char OPDATA - modifying data                 *
       
   328      * char OPNEXT - next node (relative offset)    *
       
   329      *                                              *
       
   330      ************************************************/
       
   331 
       
   332                  //   Opcode              Char       Opdata/Operand  Meaning
       
   333                  //   ----------          ---------- --------------- --------------------------------------------------
       
   334     static final char OP_END              = 'E';  //                 end of program
       
   335     static final char OP_BOL              = '^';  //                 match only if at beginning of line
       
   336     static final char OP_EOL              = '$';  //                 match only if at end of line
       
   337     static final char OP_ANY              = '.';  //                 match any single character except newline
       
   338     static final char OP_ANYOF            = '[';  // count/ranges    match any char in the list of ranges
       
   339     static final char OP_BRANCH           = '|';  // node            match this alternative or the next one
       
   340     static final char OP_ATOM             = 'A';  // length/string   length of string followed by string itself
       
   341     static final char OP_STAR             = '*';  // node            kleene closure
       
   342     static final char OP_PLUS             = '+';  // node            positive closure
       
   343     static final char OP_MAYBE            = '?';  // node            optional closure
       
   344     static final char OP_ESCAPE           = '\\'; // escape          special escape code char class (escape is E_* code)
       
   345     static final char OP_OPEN             = '(';  // number          nth opening paren
       
   346     static final char OP_OPEN_CLUSTER     = '<';  //                 opening cluster
       
   347     static final char OP_CLOSE            = ')';  // number          nth closing paren
       
   348     static final char OP_CLOSE_CLUSTER    = '>';  //                 closing cluster
       
   349     static final char OP_BACKREF          = '#';  // number          reference nth already matched parenthesized string
       
   350     static final char OP_GOTO             = 'G';  //                 nothing but a (back-)pointer
       
   351     static final char OP_NOTHING          = 'N';  //                 match null string such as in '(a|)'
       
   352     static final char OP_RELUCTANTSTAR    = '8';  // none/expr       reluctant '*' (mnemonic for char is unshifted '*')
       
   353     static final char OP_RELUCTANTPLUS    = '=';  // none/expr       reluctant '+' (mnemonic for char is unshifted '+')
       
   354     static final char OP_RELUCTANTMAYBE   = '/';  // none/expr       reluctant '?' (mnemonic for char is unshifted '?')
       
   355     static final char OP_POSIXCLASS       = 'P';  // classid         one of the posix character classes
       
   356 
       
   357     // Escape codes
       
   358     static final char E_ALNUM             = 'w';  // Alphanumeric
       
   359     static final char E_NALNUM            = 'W';  // Non-alphanumeric
       
   360     static final char E_BOUND             = 'b';  // Word boundary
       
   361     static final char E_NBOUND            = 'B';  // Non-word boundary
       
   362     static final char E_SPACE             = 's';  // Whitespace
       
   363     static final char E_NSPACE            = 'S';  // Non-whitespace
       
   364     static final char E_DIGIT             = 'd';  // Digit
       
   365     static final char E_NDIGIT            = 'D';  // Non-digit
       
   366 
       
   367     // Posix character classes
       
   368     static final char POSIX_CLASS_ALNUM   = 'w';  // Alphanumerics
       
   369     static final char POSIX_CLASS_ALPHA   = 'a';  // Alphabetics
       
   370     static final char POSIX_CLASS_BLANK   = 'b';  // Blanks
       
   371     static final char POSIX_CLASS_CNTRL   = 'c';  // Control characters
       
   372     static final char POSIX_CLASS_DIGIT   = 'd';  // Digits
       
   373     static final char POSIX_CLASS_GRAPH   = 'g';  // Graphic characters
       
   374     static final char POSIX_CLASS_LOWER   = 'l';  // Lowercase characters
       
   375     static final char POSIX_CLASS_PRINT   = 'p';  // Printable characters
       
   376     static final char POSIX_CLASS_PUNCT   = '!';  // Punctuation
       
   377     static final char POSIX_CLASS_SPACE   = 's';  // Spaces
       
   378     static final char POSIX_CLASS_UPPER   = 'u';  // Uppercase characters
       
   379     static final char POSIX_CLASS_XDIGIT  = 'x';  // Hexadecimal digits
       
   380     static final char POSIX_CLASS_JSTART  = 'j';  // Java identifier start
       
   381     static final char POSIX_CLASS_JPART   = 'k';  // Java identifier part
       
   382 
       
   383     // Limits
       
   384     static final int maxNode  = 65536;            // Maximum number of nodes in a program
       
   385     static final int MAX_PAREN = 16;              // Number of paren pairs (only 9 can be backrefs)
       
   386 
       
   387     // Node layout constants
       
   388     static final int offsetOpcode = 0;            // Opcode offset (first character)
       
   389     static final int offsetOpdata = 1;            // Opdata offset (second char)
       
   390     static final int offsetNext   = 2;            // Next index offset (third char)
       
   391     static final int nodeSize     = 3;            // Node size (in chars)
       
   392 
       
   393     // State of current program
       
   394     REProgram program;                            // Compiled regular expression 'program'
       
   395     transient CharacterIterator search;           // The string being matched against
       
   396     int matchFlags;                               // Match behaviour flags
       
   397     int maxParen = MAX_PAREN;
       
   398 
       
   399     // Parenthesized subexpressions
       
   400     transient int parenCount;                     // Number of subexpressions matched (num open parens + 1)
       
   401     transient int start0;                         // Cache of start[0]
       
   402     transient int end0;                           // Cache of start[0]
       
   403     transient int start1;                         // Cache of start[1]
       
   404     transient int end1;                           // Cache of start[1]
       
   405     transient int start2;                         // Cache of start[2]
       
   406     transient int end2;                           // Cache of start[2]
       
   407     transient int[] startn;                       // Lazy-alloced array of sub-expression starts
       
   408     transient int[] endn;                         // Lazy-alloced array of sub-expression ends
       
   409 
       
   410     // Backreferences
       
   411     transient int[] startBackref;                 // Lazy-alloced array of backref starts
       
   412     transient int[] endBackref;                   // Lazy-alloced array of backref ends
       
   413 
       
   414     /**
       
   415      * Constructs a regular expression matcher from a String by compiling it
       
   416      * using a new instance of RECompiler.  If you will be compiling many
       
   417      * expressions, you may prefer to use a single RECompiler object instead.
       
   418      *
       
   419      * @param pattern The regular expression pattern to compile.
       
   420      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
       
   421      * @see RECompiler
       
   422      * @see recompile
       
   423      */
       
   424     public RE(String pattern) throws RESyntaxException
       
   425     {
       
   426         this(pattern, MATCH_NORMAL);
       
   427     }
       
   428 
       
   429     /**
       
   430      * Constructs a regular expression matcher from a String by compiling it
       
   431      * using a new instance of RECompiler.  If you will be compiling many
       
   432      * expressions, you may prefer to use a single RECompiler object instead.
       
   433      *
       
   434      * @param pattern The regular expression pattern to compile.
       
   435      * @param matchFlags The matching style
       
   436      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
       
   437      * @see RECompiler
       
   438      * @see recompile
       
   439      */
       
   440     public RE(String pattern, int matchFlags) throws RESyntaxException
       
   441     {
       
   442         this(new RECompiler().compile(pattern));
       
   443         setMatchFlags(matchFlags);
       
   444     }
       
   445 
       
   446     /**
       
   447      * Construct a matcher for a pre-compiled regular expression from program
       
   448      * (bytecode) data.  Permits special flags to be passed in to modify matching
       
   449      * behaviour.
       
   450      *
       
   451      * @param program Compiled regular expression program (see RECompiler and/or recompile)
       
   452      * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*):
       
   453      *
       
   454      * <pre>
       
   455      *   MATCH_NORMAL              // Normal (case-sensitive) matching
       
   456      *   MATCH_CASEINDEPENDENT     // Case folded comparisons
       
   457      *   MATCH_MULTILINE           // Newline matches as BOL/EOL
       
   458      * </pre>
       
   459      *
       
   460      * @see RECompiler
       
   461      * @see REProgram
       
   462      * @see recompile
       
   463      */
       
   464     public RE(REProgram program, int matchFlags)
       
   465     {
       
   466         setProgram(program);
       
   467         setMatchFlags(matchFlags);
       
   468     }
       
   469 
       
   470     /**
       
   471      * Construct a matcher for a pre-compiled regular expression from program
       
   472      * (bytecode) data.
       
   473      *
       
   474      * @param program Compiled regular expression program
       
   475      * @see RECompiler
       
   476      * @see recompile
       
   477      */
       
   478     public RE(REProgram program)
       
   479     {
       
   480         this(program, MATCH_NORMAL);
       
   481     }
       
   482 
       
   483     /**
       
   484      * Constructs a regular expression matcher with no initial program.
       
   485      * This is likely to be an uncommon practice, but is still supported.
       
   486      */
       
   487     public RE()
       
   488     {
       
   489         this((REProgram)null, MATCH_NORMAL);
       
   490     }
       
   491 
       
   492     /**
       
   493      * Converts a 'simplified' regular expression to a full regular expression
       
   494      *
       
   495      * @param pattern The pattern to convert
       
   496      * @return The full regular expression
       
   497      */
       
   498     public static String simplePatternToFullRegularExpression(String pattern)
       
   499     {
       
   500         StringBuffer buf = new StringBuffer();
       
   501         for (int i = 0; i < pattern.length(); i++)
       
   502         {
       
   503             char c = pattern.charAt(i);
       
   504             switch (c)
       
   505             {
       
   506                 case '*':
       
   507                     buf.append(".*");
       
   508                     break;
       
   509 
       
   510                 case '.':
       
   511                 case '[':
       
   512                 case ']':
       
   513                 case '\\':
       
   514                 case '+':
       
   515                 case '?':
       
   516                 case '{':
       
   517                 case '}':
       
   518                 case '$':
       
   519                 case '^':
       
   520                 case '|':
       
   521                 case '(':
       
   522                 case ')':
       
   523                     buf.append('\\');
       
   524                 default:
       
   525                     buf.append(c);
       
   526                     break;
       
   527             }
       
   528         }
       
   529         return buf.toString();
       
   530     }
       
   531 
       
   532     /**
       
   533      * Sets match behaviour flags which alter the way RE does matching.
       
   534      * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*):
       
   535      *
       
   536      * <pre>
       
   537      *   MATCH_NORMAL              // Normal (case-sensitive) matching
       
   538      *   MATCH_CASEINDEPENDENT     // Case folded comparisons
       
   539      *   MATCH_MULTILINE           // Newline matches as BOL/EOL
       
   540      * </pre>
       
   541      */
       
   542     public void setMatchFlags(int matchFlags)
       
   543     {
       
   544         this.matchFlags = matchFlags;
       
   545     }
       
   546 
       
   547     /**
       
   548      * Returns the current match behaviour flags.
       
   549      * @return Current match behaviour flags (RE.MATCH_*).
       
   550      *
       
   551      * <pre>
       
   552      *   MATCH_NORMAL              // Normal (case-sensitive) matching
       
   553      *   MATCH_CASEINDEPENDENT     // Case folded comparisons
       
   554      *   MATCH_MULTILINE           // Newline matches as BOL/EOL
       
   555      * </pre>
       
   556      *
       
   557      * @see #setMatchFlags
       
   558      */
       
   559     public int getMatchFlags()
       
   560     {
       
   561         return matchFlags;
       
   562     }
       
   563 
       
   564     /**
       
   565      * Sets the current regular expression program used by this matcher object.
       
   566      *
       
   567      * @param program Regular expression program compiled by RECompiler.
       
   568      * @see RECompiler
       
   569      * @see REProgram
       
   570      * @see recompile
       
   571      */
       
   572     public void setProgram(REProgram program)
       
   573     {
       
   574         this.program = program;
       
   575         if (program != null && program.maxParens != -1) {
       
   576             this.maxParen = program.maxParens;
       
   577         } else {
       
   578             this.maxParen = MAX_PAREN;
       
   579         }
       
   580     }
       
   581 
       
   582     /**
       
   583      * Returns the current regular expression program in use by this matcher object.
       
   584      *
       
   585      * @return Regular expression program
       
   586      * @see #setProgram
       
   587      */
       
   588     public REProgram getProgram()
       
   589     {
       
   590         return program;
       
   591     }
       
   592 
       
   593     /**
       
   594      * Returns the number of parenthesized subexpressions available after a successful match.
       
   595      *
       
   596      * @return Number of available parenthesized subexpressions
       
   597      */
       
   598     public int getParenCount()
       
   599     {
       
   600         return parenCount;
       
   601     }
       
   602 
       
   603     /**
       
   604      * Gets the contents of a parenthesized subexpression after a successful match.
       
   605      *
       
   606      * @param which Nesting level of subexpression
       
   607      * @return String
       
   608      */
       
   609     public String getParen(int which)
       
   610     {
       
   611         int start;
       
   612         if (which < parenCount && (start = getParenStart(which)) >= 0)
       
   613         {
       
   614             return search.substring(start, getParenEnd(which));
       
   615         }
       
   616         return null;
       
   617     }
       
   618 
       
   619     /**
       
   620      * Returns the start index of a given paren level.
       
   621      *
       
   622      * @param which Nesting level of subexpression
       
   623      * @return String index
       
   624      */
       
   625     public final int getParenStart(int which)
       
   626     {
       
   627         if (which < parenCount)
       
   628         {
       
   629             switch (which)
       
   630             {
       
   631                 case 0:
       
   632                     return start0;
       
   633 
       
   634                 case 1:
       
   635                     return start1;
       
   636 
       
   637                 case 2:
       
   638                     return start2;
       
   639 
       
   640                 default:
       
   641                     if (startn == null)
       
   642                     {
       
   643                         allocParens();
       
   644                     }
       
   645                     return startn[which];
       
   646             }
       
   647         }
       
   648         return -1;
       
   649     }
       
   650 
       
   651     /**
       
   652      * Returns the end index of a given paren level.
       
   653      *
       
   654      * @param which Nesting level of subexpression
       
   655      * @return String index
       
   656      */
       
   657     public final int getParenEnd(int which)
       
   658     {
       
   659         if (which < parenCount)
       
   660         {
       
   661             switch (which)
       
   662             {
       
   663                 case 0:
       
   664                     return end0;
       
   665 
       
   666                 case 1:
       
   667                     return end1;
       
   668 
       
   669                 case 2:
       
   670                     return end2;
       
   671 
       
   672                 default:
       
   673                     if (endn == null)
       
   674                     {
       
   675                         allocParens();
       
   676                     }
       
   677                     return endn[which];
       
   678             }
       
   679         }
       
   680         return -1;
       
   681     }
       
   682 
       
   683     /**
       
   684      * Returns the length of a given paren level.
       
   685      *
       
   686      * @param which Nesting level of subexpression
       
   687      * @return Number of characters in the parenthesized subexpression
       
   688      */
       
   689     public final int getParenLength(int which)
       
   690     {
       
   691         if (which < parenCount)
       
   692         {
       
   693             return getParenEnd(which) - getParenStart(which);
       
   694         }
       
   695         return -1;
       
   696     }
       
   697 
       
   698     /**
       
   699      * Sets the start of a paren level
       
   700      *
       
   701      * @param which Which paren level
       
   702      * @param i Index in input array
       
   703      */
       
   704     protected final void setParenStart(int which, int i)
       
   705     {
       
   706         if (which < parenCount)
       
   707         {
       
   708             switch (which)
       
   709             {
       
   710                 case 0:
       
   711                     start0 = i;
       
   712                     break;
       
   713 
       
   714                 case 1:
       
   715                     start1 = i;
       
   716                     break;
       
   717 
       
   718                 case 2:
       
   719                     start2 = i;
       
   720                     break;
       
   721 
       
   722                 default:
       
   723                     if (startn == null)
       
   724                     {
       
   725                         allocParens();
       
   726                     }
       
   727                     startn[which] = i;
       
   728                     break;
       
   729             }
       
   730         }
       
   731     }
       
   732 
       
   733     /**
       
   734      * Sets the end of a paren level
       
   735      *
       
   736      * @param which Which paren level
       
   737      * @param i Index in input array
       
   738      */
       
   739     protected final void setParenEnd(int which, int i)
       
   740     {
       
   741         if (which < parenCount)
       
   742         {
       
   743             switch (which)
       
   744             {
       
   745                 case 0:
       
   746                     end0 = i;
       
   747                     break;
       
   748 
       
   749                 case 1:
       
   750                     end1 = i;
       
   751                     break;
       
   752 
       
   753                 case 2:
       
   754                     end2 = i;
       
   755                     break;
       
   756 
       
   757                 default:
       
   758                     if (endn == null)
       
   759                     {
       
   760                         allocParens();
       
   761                     }
       
   762                     endn[which] = i;
       
   763                     break;
       
   764             }
       
   765         }
       
   766     }
       
   767 
       
   768     /**
       
   769      * Throws an Error representing an internal error condition probably resulting
       
   770      * from a bug in the regular expression compiler (or possibly data corruption).
       
   771      * In practice, this should be very rare.
       
   772      *
       
   773      * @param s Error description
       
   774      */
       
   775     protected void internalError(String s) throws Error
       
   776     {
       
   777         throw new Error("RE internal error: " + s);
       
   778     }
       
   779 
       
   780     /**
       
   781      * Performs lazy allocation of subexpression arrays
       
   782      */
       
   783     private final void allocParens()
       
   784     {
       
   785         // Allocate arrays for subexpressions
       
   786         startn = new int[maxParen];
       
   787         endn = new int[maxParen];
       
   788 
       
   789         // Set sub-expression pointers to invalid values
       
   790         for (int i = 0; i < maxParen; i++)
       
   791         {
       
   792             startn[i] = -1;
       
   793             endn[i] = -1;
       
   794         }
       
   795     }
       
   796 
       
   797     /**
       
   798      * Try to match a string against a subset of nodes in the program
       
   799      *
       
   800      * @param firstNode Node to start at in program
       
   801      * @param lastNode  Last valid node (used for matching a subexpression without
       
   802      *                  matching the rest of the program as well).
       
   803      * @param idxStart  Starting position in character array
       
   804      * @return Final input array index if match succeeded.  -1 if not.
       
   805      */
       
   806     protected int matchNodes(int firstNode, int lastNode, int idxStart)
       
   807     {
       
   808         // Our current place in the string
       
   809         int idx = idxStart;
       
   810 
       
   811         // Loop while node is valid
       
   812         int next, opcode, opdata;
       
   813         int idxNew;
       
   814         char[] instruction = program.instruction;
       
   815         for (int node = firstNode; node < lastNode; )
       
   816         {
       
   817             opcode = instruction[node + offsetOpcode];
       
   818             next   = node + (short)instruction[node + offsetNext];
       
   819             opdata = instruction[node + offsetOpdata];
       
   820 
       
   821             switch (opcode)
       
   822             {
       
   823                 case OP_RELUCTANTMAYBE:
       
   824                     {
       
   825                         int once = 0;
       
   826                         do
       
   827                         {
       
   828                             // Try to match the rest without using the reluctant subexpr
       
   829                             if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
       
   830                             {
       
   831                                 return idxNew;
       
   832                             }
       
   833                         }
       
   834                         while ((once++ == 0) && (idx = matchNodes(node + nodeSize, next, idx)) != -1);
       
   835                         return -1;
       
   836                     }
       
   837 
       
   838                 case OP_RELUCTANTPLUS:
       
   839                     while ((idx = matchNodes(node + nodeSize, next, idx)) != -1)
       
   840                     {
       
   841                         // Try to match the rest without using the reluctant subexpr
       
   842                         if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
       
   843                         {
       
   844                             return idxNew;
       
   845                         }
       
   846                     }
       
   847                     return -1;
       
   848 
       
   849                 case OP_RELUCTANTSTAR:
       
   850                     do
       
   851                     {
       
   852                         // Try to match the rest without using the reluctant subexpr
       
   853                         if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
       
   854                         {
       
   855                             return idxNew;
       
   856                         }
       
   857                     }
       
   858                     while ((idx = matchNodes(node + nodeSize, next, idx)) != -1);
       
   859                     return -1;
       
   860 
       
   861                 case OP_OPEN:
       
   862 
       
   863                     // Match subexpression
       
   864                     if ((program.flags & REProgram.OPT_HASBACKREFS) != 0)
       
   865                     {
       
   866                         startBackref[opdata] = idx;
       
   867                     }
       
   868                     if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
       
   869                     {
       
   870                         // Increase valid paren count
       
   871                         if ((opdata + 1) > parenCount)
       
   872                         {
       
   873                             parenCount = opdata + 1;
       
   874                         }
       
   875 
       
   876                         // Don't set paren if already set later on
       
   877                         if (getParenStart(opdata) == -1)
       
   878                         {
       
   879                             setParenStart(opdata, idx);
       
   880                         }
       
   881                     }
       
   882                     return idxNew;
       
   883 
       
   884                 case OP_CLOSE:
       
   885 
       
   886                     // Done matching subexpression
       
   887                     if ((program.flags & REProgram.OPT_HASBACKREFS) != 0)
       
   888                     {
       
   889                         endBackref[opdata] = idx;
       
   890                     }
       
   891                     if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
       
   892                     {
       
   893                         // Increase valid paren count
       
   894                         if ((opdata + 1) > parenCount)
       
   895                         {
       
   896                             parenCount = opdata + 1;
       
   897                         }
       
   898 
       
   899                         // Don't set paren if already set later on
       
   900                         if (getParenEnd(opdata) == -1)
       
   901                         {
       
   902                             setParenEnd(opdata, idx);
       
   903                         }
       
   904                     }
       
   905                     return idxNew;
       
   906 
       
   907                 case OP_OPEN_CLUSTER:
       
   908                 case OP_CLOSE_CLUSTER:
       
   909                     // starting or ending the matching of a subexpression which has no backref.
       
   910                     return matchNodes( next, maxNode, idx );
       
   911 
       
   912                 case OP_BACKREF:
       
   913                     {
       
   914                         // Get the start and end of the backref
       
   915                         int s = startBackref[opdata];
       
   916                         int e = endBackref[opdata];
       
   917 
       
   918                         // We don't know the backref yet
       
   919                         if (s == -1 || e == -1)
       
   920                         {
       
   921                             return -1;
       
   922                         }
       
   923 
       
   924                         // The backref is empty size
       
   925                         if (s == e)
       
   926                         {
       
   927                             break;
       
   928                         }
       
   929 
       
   930                         // Get the length of the backref
       
   931                         int l = e - s;
       
   932 
       
   933                         // If there's not enough input left, give up.
       
   934                         if (search.isEnd(idx + l - 1))
       
   935                         {
       
   936                             return -1;
       
   937                         }
       
   938 
       
   939                         // Case fold the backref?
       
   940                         final boolean caseFold =
       
   941                             ((matchFlags & MATCH_CASEINDEPENDENT) != 0);
       
   942                         // Compare backref to input
       
   943                         for (int i = 0; i < l; i++)
       
   944                         {
       
   945                             if (compareChars(search.charAt(idx++), search.charAt(s + i), caseFold) != 0)
       
   946                             {
       
   947                                 return -1;
       
   948                             }
       
   949                         }
       
   950                     }
       
   951                     break;
       
   952 
       
   953                 case OP_BOL:
       
   954 
       
   955                     // Fail if we're not at the start of the string
       
   956                     if (idx != 0)
       
   957                     {
       
   958                         // If we're multiline matching, we could still be at the start of a line
       
   959                         if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE)
       
   960                         {
       
   961                             // If not at start of line, give up
       
   962                             if (idx <= 0 || !isNewline(idx - 1)) {
       
   963                                 return -1;
       
   964                             } else {
       
   965                                 break;
       
   966                             }
       
   967                         }
       
   968                         return -1;
       
   969                     }
       
   970                     break;
       
   971 
       
   972                 case OP_EOL:
       
   973 
       
   974                     // If we're not at the end of string
       
   975                     if (!search.isEnd(0) && !search.isEnd(idx))
       
   976                     {
       
   977                         // If we're multi-line matching
       
   978                         if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE)
       
   979                         {
       
   980                             // Give up if we're not at the end of a line
       
   981                             if (!isNewline(idx)) {
       
   982                                 return -1;
       
   983                             } else {
       
   984                                 break;
       
   985                             }
       
   986                         }
       
   987                         return -1;
       
   988                     }
       
   989                     break;
       
   990 
       
   991                 case OP_ESCAPE:
       
   992 
       
   993                     // Which escape?
       
   994                     switch (opdata)
       
   995                     {
       
   996                         // Word boundary match
       
   997                         case E_NBOUND:
       
   998                         case E_BOUND:
       
   999                             {
       
  1000                                 char cLast = ((idx == 0) ? '\n' : search.charAt(idx - 1));
       
  1001                                 char cNext = ((search.isEnd(idx)) ? '\n' : search.charAt(idx));
       
  1002                                 if ((Character.isLetterOrDigit(cLast) == Character.isLetterOrDigit(cNext)) == (opdata == E_BOUND))
       
  1003                                 {
       
  1004                                     return -1;
       
  1005                                 }
       
  1006                             }
       
  1007                             break;
       
  1008 
       
  1009                         // Alpha-numeric, digit, space, javaLetter, javaLetterOrDigit
       
  1010                         case E_ALNUM:
       
  1011                         case E_NALNUM:
       
  1012                         case E_DIGIT:
       
  1013                         case E_NDIGIT:
       
  1014                         case E_SPACE:
       
  1015                         case E_NSPACE:
       
  1016 
       
  1017                             // Give up if out of input
       
  1018                             if (search.isEnd(idx))
       
  1019                             {
       
  1020                                 return -1;
       
  1021                             }
       
  1022 
       
  1023                             char c = search.charAt(idx);
       
  1024 
       
  1025                             // Switch on escape
       
  1026                             switch (opdata)
       
  1027                             {
       
  1028                                 case E_ALNUM:
       
  1029                                 case E_NALNUM:
       
  1030                                     if (!((Character.isLetterOrDigit(c) || c == '_') == (opdata == E_ALNUM)))
       
  1031                                     {
       
  1032                                         return -1;
       
  1033                                     }
       
  1034                                     break;
       
  1035 
       
  1036                                 case E_DIGIT:
       
  1037                                 case E_NDIGIT:
       
  1038                                     if (!(Character.isDigit(c) == (opdata == E_DIGIT)))
       
  1039                                     {
       
  1040                                         return -1;
       
  1041                                     }
       
  1042                                     break;
       
  1043 
       
  1044                                 case E_SPACE:
       
  1045                                 case E_NSPACE:
       
  1046                                     if (!(Character.isWhitespace(c) == (opdata == E_SPACE)))
       
  1047                                     {
       
  1048                                         return -1;
       
  1049                                     }
       
  1050                                     break;
       
  1051                             }
       
  1052                             idx++;
       
  1053                             break;
       
  1054 
       
  1055                         default:
       
  1056                             internalError("Unrecognized escape '" + opdata + "'");
       
  1057                     }
       
  1058                     break;
       
  1059 
       
  1060                 case OP_ANY:
       
  1061 
       
  1062                     if ((matchFlags & MATCH_SINGLELINE) == MATCH_SINGLELINE) {
       
  1063                         // Match anything
       
  1064                         if (search.isEnd(idx))
       
  1065                         {
       
  1066                             return -1;
       
  1067                         }
       
  1068                     }
       
  1069                     else
       
  1070                     {
       
  1071                         // Match anything but a newline
       
  1072                         if (search.isEnd(idx) || isNewline(idx))
       
  1073                         {
       
  1074                             return -1;
       
  1075                         }
       
  1076                     }
       
  1077                     idx++;
       
  1078                     break;
       
  1079 
       
  1080                 case OP_ATOM:
       
  1081                     {
       
  1082                         // Match an atom value
       
  1083                         if (search.isEnd(idx))
       
  1084                         {
       
  1085                             return -1;
       
  1086                         }
       
  1087 
       
  1088                         // Get length of atom and starting index
       
  1089                         int lenAtom = opdata;
       
  1090                         int startAtom = node + nodeSize;
       
  1091 
       
  1092                         // Give up if not enough input remains to have a match
       
  1093                         if (search.isEnd(lenAtom + idx - 1))
       
  1094                         {
       
  1095                             return -1;
       
  1096                         }
       
  1097 
       
  1098                         // Match atom differently depending on casefolding flag
       
  1099                         final boolean caseFold =
       
  1100                             ((matchFlags & MATCH_CASEINDEPENDENT) != 0);
       
  1101 
       
  1102                         for (int i = 0; i < lenAtom; i++)
       
  1103                         {
       
  1104                             if (compareChars(search.charAt(idx++), instruction[startAtom + i], caseFold) != 0)
       
  1105                             {
       
  1106                                 return -1;
       
  1107                             }
       
  1108                         }
       
  1109                     }
       
  1110                     break;
       
  1111 
       
  1112                 case OP_POSIXCLASS:
       
  1113                     {
       
  1114                         // Out of input?
       
  1115                         if (search.isEnd(idx))
       
  1116                         {
       
  1117                             return -1;
       
  1118                         }
       
  1119 
       
  1120                         switch (opdata)
       
  1121                         {
       
  1122                             case POSIX_CLASS_ALNUM:
       
  1123                                 if (!Character.isLetterOrDigit(search.charAt(idx)))
       
  1124                                 {
       
  1125                                     return -1;
       
  1126                                 }
       
  1127                                 break;
       
  1128 
       
  1129                             case POSIX_CLASS_ALPHA:
       
  1130                                 if (!Character.isLetter(search.charAt(idx)))
       
  1131                                 {
       
  1132                                     return -1;
       
  1133                                 }
       
  1134                                 break;
       
  1135 
       
  1136                             case POSIX_CLASS_DIGIT:
       
  1137                                 if (!Character.isDigit(search.charAt(idx)))
       
  1138                                 {
       
  1139                                     return -1;
       
  1140                                 }
       
  1141                                 break;
       
  1142 
       
  1143                             case POSIX_CLASS_BLANK: // JWL - bugbug: is this right??
       
  1144                                 if (!Character.isSpaceChar(search.charAt(idx)))
       
  1145                                 {
       
  1146                                     return -1;
       
  1147                                 }
       
  1148                                 break;
       
  1149 
       
  1150                             case POSIX_CLASS_SPACE:
       
  1151                                 if (!Character.isWhitespace(search.charAt(idx)))
       
  1152                                 {
       
  1153                                     return -1;
       
  1154                                 }
       
  1155                                 break;
       
  1156 
       
  1157                             case POSIX_CLASS_CNTRL:
       
  1158                                 if (Character.getType(search.charAt(idx)) != Character.CONTROL)
       
  1159                                 {
       
  1160                                     return -1;
       
  1161                                 }
       
  1162                                 break;
       
  1163 
       
  1164                             case POSIX_CLASS_GRAPH: // JWL - bugbug???
       
  1165                                 switch (Character.getType(search.charAt(idx)))
       
  1166                                 {
       
  1167                                     case Character.MATH_SYMBOL:
       
  1168                                     case Character.CURRENCY_SYMBOL:
       
  1169                                     case Character.MODIFIER_SYMBOL:
       
  1170                                     case Character.OTHER_SYMBOL:
       
  1171                                         break;
       
  1172 
       
  1173                                     default:
       
  1174                                         return -1;
       
  1175                                 }
       
  1176                                 break;
       
  1177 
       
  1178                             case POSIX_CLASS_LOWER:
       
  1179                                 if (Character.getType(search.charAt(idx)) != Character.LOWERCASE_LETTER)
       
  1180                                 {
       
  1181                                     return -1;
       
  1182                                 }
       
  1183                                 break;
       
  1184 
       
  1185                             case POSIX_CLASS_UPPER:
       
  1186                                 if (Character.getType(search.charAt(idx)) != Character.UPPERCASE_LETTER)
       
  1187                                 {
       
  1188                                     return -1;
       
  1189                                 }
       
  1190                                 break;
       
  1191 
       
  1192                             case POSIX_CLASS_PRINT:
       
  1193                                 if (Character.getType(search.charAt(idx)) == Character.CONTROL)
       
  1194                                 {
       
  1195                                     return -1;
       
  1196                                 }
       
  1197                                 break;
       
  1198 
       
  1199                             case POSIX_CLASS_PUNCT:
       
  1200                             {
       
  1201                                 int type = Character.getType(search.charAt(idx));
       
  1202                                 switch(type)
       
  1203                                 {
       
  1204                                     case Character.DASH_PUNCTUATION:
       
  1205                                     case Character.START_PUNCTUATION:
       
  1206                                     case Character.END_PUNCTUATION:
       
  1207                                     case Character.CONNECTOR_PUNCTUATION:
       
  1208                                     case Character.OTHER_PUNCTUATION:
       
  1209                                         break;
       
  1210 
       
  1211                                     default:
       
  1212                                         return -1;
       
  1213                                 }
       
  1214                             }
       
  1215                             break;
       
  1216 
       
  1217                             case POSIX_CLASS_XDIGIT: // JWL - bugbug??
       
  1218                             {
       
  1219                                 boolean isXDigit = ((search.charAt(idx) >= '0' && search.charAt(idx) <= '9') ||
       
  1220                                                     (search.charAt(idx) >= 'a' && search.charAt(idx) <= 'f') ||
       
  1221                                                     (search.charAt(idx) >= 'A' && search.charAt(idx) <= 'F'));
       
  1222                                 if (!isXDigit)
       
  1223                                 {
       
  1224                                     return -1;
       
  1225                                 }
       
  1226                             }
       
  1227                             break;
       
  1228 
       
  1229                             case POSIX_CLASS_JSTART:
       
  1230                                 if (!Character.isJavaIdentifierStart(search.charAt(idx)))
       
  1231                                 {
       
  1232                                     return -1;
       
  1233                                 }
       
  1234                                 break;
       
  1235 
       
  1236                             case POSIX_CLASS_JPART:
       
  1237                                 if (!Character.isJavaIdentifierPart(search.charAt(idx)))
       
  1238                                 {
       
  1239                                     return -1;
       
  1240                                 }
       
  1241                                 break;
       
  1242 
       
  1243                             default:
       
  1244                                 internalError("Bad posix class");
       
  1245                                 break;
       
  1246                         }
       
  1247 
       
  1248                         // Matched.
       
  1249                         idx++;
       
  1250                     }
       
  1251                     break;
       
  1252 
       
  1253                 case OP_ANYOF:
       
  1254                     {
       
  1255                         // Out of input?
       
  1256                         if (search.isEnd(idx))
       
  1257                         {
       
  1258                             return -1;
       
  1259                         }
       
  1260 
       
  1261                         // Get character to match against character class and maybe casefold
       
  1262                         char c = search.charAt(idx);
       
  1263                         boolean caseFold = (matchFlags & MATCH_CASEINDEPENDENT) != 0;
       
  1264                         // Loop through character class checking our match character
       
  1265                         int idxRange = node + nodeSize;
       
  1266                         int idxEnd = idxRange + (opdata * 2);
       
  1267                         boolean match = false;
       
  1268                         for (int i = idxRange; !match && i < idxEnd; )
       
  1269                         {
       
  1270                             // Get start, end and match characters
       
  1271                             char s = instruction[i++];
       
  1272                             char e = instruction[i++];
       
  1273 
       
  1274                             match = ((compareChars(c, s, caseFold) >= 0)
       
  1275                                      && (compareChars(c, e, caseFold) <= 0));
       
  1276                         }
       
  1277 
       
  1278                         // Fail if we didn't match the character class
       
  1279                         if (!match)
       
  1280                         {
       
  1281                             return -1;
       
  1282                         }
       
  1283                         idx++;
       
  1284                     }
       
  1285                     break;
       
  1286 
       
  1287                 case OP_BRANCH:
       
  1288                 {
       
  1289                     // Check for choices
       
  1290                     if (instruction[next + offsetOpcode] != OP_BRANCH)
       
  1291                     {
       
  1292                         // If there aren't any other choices, just evaluate this branch.
       
  1293                         node += nodeSize;
       
  1294                         continue;
       
  1295                     }
       
  1296 
       
  1297                     // Try all available branches
       
  1298                     short nextBranch;
       
  1299                     do
       
  1300                     {
       
  1301                         // Try matching the branch against the string
       
  1302                         if ((idxNew = matchNodes(node + nodeSize, maxNode, idx)) != -1)
       
  1303                         {
       
  1304                             return idxNew;
       
  1305                         }
       
  1306 
       
  1307                         // Go to next branch (if any)
       
  1308                         nextBranch = (short)instruction[node + offsetNext];
       
  1309                         node += nextBranch;
       
  1310                     }
       
  1311                     while (nextBranch != 0 && (instruction[node + offsetOpcode] == OP_BRANCH));
       
  1312 
       
  1313                     // Failed to match any branch!
       
  1314                     return -1;
       
  1315                 }
       
  1316 
       
  1317                 case OP_NOTHING:
       
  1318                 case OP_GOTO:
       
  1319 
       
  1320                     // Just advance to the next node without doing anything
       
  1321                     break;
       
  1322 
       
  1323                 case OP_END:
       
  1324 
       
  1325                     // Match has succeeded!
       
  1326                     setParenEnd(0, idx);
       
  1327                     return idx;
       
  1328 
       
  1329                 default:
       
  1330 
       
  1331                     // Corrupt program
       
  1332                     internalError("Invalid opcode '" + opcode + "'");
       
  1333             }
       
  1334 
       
  1335             // Advance to the next node in the program
       
  1336             node = next;
       
  1337         }
       
  1338 
       
  1339         // We "should" never end up here
       
  1340         internalError("Corrupt program");
       
  1341         return -1;
       
  1342     }
       
  1343 
       
  1344     /**
       
  1345      * Match the current regular expression program against the current
       
  1346      * input string, starting at index i of the input string.  This method
       
  1347      * is only meant for internal use.
       
  1348      *
       
  1349      * @param i The input string index to start matching at
       
  1350      * @return True if the input matched the expression
       
  1351      */
       
  1352     protected boolean matchAt(int i)
       
  1353     {
       
  1354         // Initialize start pointer, paren cache and paren count
       
  1355         start0 = -1;
       
  1356         end0   = -1;
       
  1357         start1 = -1;
       
  1358         end1   = -1;
       
  1359         start2 = -1;
       
  1360         end2   = -1;
       
  1361         startn = null;
       
  1362         endn   = null;
       
  1363         parenCount = 1;
       
  1364         setParenStart(0, i);
       
  1365 
       
  1366         // Allocate backref arrays (unless optimizations indicate otherwise)
       
  1367         if ((program.flags & REProgram.OPT_HASBACKREFS) != 0)
       
  1368         {
       
  1369             startBackref = new int[maxParen];
       
  1370             endBackref = new int[maxParen];
       
  1371         }
       
  1372 
       
  1373         // Match against string
       
  1374         int idx;
       
  1375         if ((idx = matchNodes(0, maxNode, i)) != -1)
       
  1376         {
       
  1377             setParenEnd(0, idx);
       
  1378             return true;
       
  1379         }
       
  1380 
       
  1381         // Didn't match
       
  1382         parenCount = 0;
       
  1383         return false;
       
  1384     }
       
  1385 
       
  1386     /**
       
  1387      * Matches the current regular expression program against a character array,
       
  1388      * starting at a given index.
       
  1389      *
       
  1390      * @param search String to match against
       
  1391      * @param i Index to start searching at
       
  1392      * @return True if string matched
       
  1393      */
       
  1394     public boolean match(String search, int i)
       
  1395     {
       
  1396         return match(new StringCharacterIterator(search), i);
       
  1397     }
       
  1398 
       
  1399     /**
       
  1400      * Matches the current regular expression program against a character array,
       
  1401      * starting at a given index.
       
  1402      *
       
  1403      * @param search String to match against
       
  1404      * @param i Index to start searching at
       
  1405      * @return True if string matched
       
  1406      */
       
  1407     public boolean match(CharacterIterator search, int i)
       
  1408     {
       
  1409         // There is no compiled program to search with!
       
  1410         if (program == null)
       
  1411         {
       
  1412             // This should be uncommon enough to be an error case rather
       
  1413             // than an exception (which would have to be handled everywhere)
       
  1414             internalError("No RE program to run!");
       
  1415         }
       
  1416 
       
  1417         // Save string to search
       
  1418         this.search = search;
       
  1419 
       
  1420         // Can we optimize the search by looking for a prefix string?
       
  1421         if (program.prefix == null)
       
  1422         {
       
  1423             // Unprefixed matching must try for a match at each character
       
  1424             for ( ;! search.isEnd(i - 1); i++)
       
  1425             {
       
  1426                 // Try a match at index i
       
  1427                 if (matchAt(i))
       
  1428                 {
       
  1429                     return true;
       
  1430                 }
       
  1431             }
       
  1432             return false;
       
  1433         }
       
  1434         else
       
  1435         {
       
  1436             // Prefix-anchored matching is possible
       
  1437             boolean caseIndependent = (matchFlags & MATCH_CASEINDEPENDENT) != 0;
       
  1438             char[] prefix = program.prefix;
       
  1439             for ( ; !search.isEnd(i + prefix.length - 1); i++)
       
  1440             {
       
  1441                 int j = i;
       
  1442                 int k = 0;
       
  1443 
       
  1444                 boolean match;
       
  1445                 do {
       
  1446                     // If there's a mismatch of any character in the prefix, give up
       
  1447                     match = (compareChars(search.charAt(j++), prefix[k++], caseIndependent) == 0);
       
  1448                 } while (match && k < prefix.length);
       
  1449 
       
  1450                 // See if the whole prefix string matched
       
  1451                 if (k == prefix.length)
       
  1452                 {
       
  1453                     // We matched the full prefix at firstChar, so try it
       
  1454                     if (matchAt(i))
       
  1455                     {
       
  1456                         return true;
       
  1457                     }
       
  1458                 }
       
  1459             }
       
  1460             return false;
       
  1461         }
       
  1462     }
       
  1463 
       
  1464     /**
       
  1465      * Matches the current regular expression program against a String.
       
  1466      *
       
  1467      * @param search String to match against
       
  1468      * @return True if string matched
       
  1469      */
       
  1470     public boolean match(String search)
       
  1471     {
       
  1472         return match(search, 0);
       
  1473     }
       
  1474 
       
  1475     /**
       
  1476      * Splits a string into an array of strings on regular expression boundaries.
       
  1477      * This function works the same way as the Perl function of the same name.
       
  1478      * Given a regular expression of "[ab]+" and a string to split of
       
  1479      * "xyzzyababbayyzabbbab123", the result would be the array of Strings
       
  1480      * "[xyzzy, yyz, 123]".
       
  1481      *
       
  1482      * <p>Please note that the first string in the resulting array may be an empty
       
  1483      * string. This happens when the very first character of input string is
       
  1484      * matched by the pattern.
       
  1485      *
       
  1486      * @param s String to split on this regular exression
       
  1487      * @return Array of strings
       
  1488      */
       
  1489     public String[] split(String s)
       
  1490     {
       
  1491         // Create new vector
       
  1492         Vector v = new Vector();
       
  1493 
       
  1494         // Start at position 0 and search the whole string
       
  1495         int pos = 0;
       
  1496         int len = s.length();
       
  1497 
       
  1498         // Try a match at each position
       
  1499         while (pos < len && match(s, pos))
       
  1500         {
       
  1501             // Get start of match
       
  1502             int start = getParenStart(0);
       
  1503 
       
  1504             // Get end of match
       
  1505             int newpos = getParenEnd(0);
       
  1506 
       
  1507             // Check if no progress was made
       
  1508             if (newpos == pos)
       
  1509             {
       
  1510                 v.addElement(s.substring(pos, start + 1));
       
  1511                 newpos++;
       
  1512             }
       
  1513             else
       
  1514             {
       
  1515                 v.addElement(s.substring(pos, start));
       
  1516             }
       
  1517 
       
  1518             // Move to new position
       
  1519             pos = newpos;
       
  1520         }
       
  1521 
       
  1522         // Push remainder if it's not empty
       
  1523         String remainder = s.substring(pos);
       
  1524         if (remainder.length() != 0)
       
  1525         {
       
  1526             v.addElement(remainder);
       
  1527         }
       
  1528 
       
  1529         // Return vector as an array of strings
       
  1530         String[] ret = new String[v.size()];
       
  1531         v.copyInto(ret);
       
  1532         return ret;
       
  1533     }
       
  1534 
       
  1535     /**
       
  1536      * Flag bit that indicates that subst should replace all occurrences of this
       
  1537      * regular expression.
       
  1538      */
       
  1539     public static final int REPLACE_ALL            = 0x0000;
       
  1540 
       
  1541     /**
       
  1542      * Flag bit that indicates that subst should only replace the first occurrence
       
  1543      * of this regular expression.
       
  1544      */
       
  1545     public static final int REPLACE_FIRSTONLY      = 0x0001;
       
  1546 
       
  1547     /**
       
  1548      * Flag bit that indicates that subst should replace backreferences
       
  1549      */
       
  1550     public static final int REPLACE_BACKREFERENCES = 0x0002;
       
  1551 
       
  1552     /**
       
  1553      * Substitutes a string for this regular expression in another string.
       
  1554      * This method works like the Perl function of the same name.
       
  1555      * Given a regular expression of "a*b", a String to substituteIn of
       
  1556      * "aaaabfooaaabgarplyaaabwackyb" and the substitution String "-", the
       
  1557      * resulting String returned by subst would be "-foo-garply-wacky-".
       
  1558      *
       
  1559      * @param substituteIn String to substitute within
       
  1560      * @param substitution String to substitute for all matches of this regular expression.
       
  1561      * @return The string substituteIn with zero or more occurrences of the current
       
  1562      * regular expression replaced with the substitution String (if this regular
       
  1563      * expression object doesn't match at any position, the original String is returned
       
  1564      * unchanged).
       
  1565      */
       
  1566     public String subst(String substituteIn, String substitution)
       
  1567     {
       
  1568         return subst(substituteIn, substitution, REPLACE_ALL);
       
  1569     }
       
  1570 
       
  1571     /**
       
  1572      * Substitutes a string for this regular expression in another string.
       
  1573      * This method works like the Perl function of the same name.
       
  1574      * Given a regular expression of "a*b", a String to substituteIn of
       
  1575      * "aaaabfooaaabgarplyaaabwackyb" and the substitution String "-", the
       
  1576      * resulting String returned by subst would be "-foo-garply-wacky-".
       
  1577      * <p>
       
  1578      * It is also possible to reference the contents of a parenthesized expression
       
  1579      * with $0, $1, ... $9. A regular expression of "http://[\\.\\w\\-\\?/~_@&=%]+",
       
  1580      * a String to substituteIn of "visit us: http://www.apache.org!" and the
       
  1581      * substitution String "&lt;a href=\"$0\"&gt;$0&lt;/a&gt;", the resulting String
       
  1582      * returned by subst would be
       
  1583      * "visit us: &lt;a href=\"http://www.apache.org\"&gt;http://www.apache.org&lt;/a&gt;!".
       
  1584      * <p>
       
  1585      * <i>Note:</i> $0 represents the whole match.
       
  1586      *
       
  1587      * @param substituteIn String to substitute within
       
  1588      * @param substitution String to substitute for matches of this regular expression
       
  1589      * @param flags One or more bitwise flags from REPLACE_*.  If the REPLACE_FIRSTONLY
       
  1590      * flag bit is set, only the first occurrence of this regular expression is replaced.
       
  1591      * If the bit is not set (REPLACE_ALL), all occurrences of this pattern will be
       
  1592      * replaced. If the flag REPLACE_BACKREFERENCES is set, all backreferences will
       
  1593      * be processed.
       
  1594      * @return The string substituteIn with zero or more occurrences of the current
       
  1595      * regular expression replaced with the substitution String (if this regular
       
  1596      * expression object doesn't match at any position, the original String is returned
       
  1597      * unchanged).
       
  1598      */
       
  1599     public String subst(String substituteIn, String substitution, int flags)
       
  1600     {
       
  1601         // String to return
       
  1602         StringBuffer ret = new StringBuffer();
       
  1603 
       
  1604         // Start at position 0 and search the whole string
       
  1605         int pos = 0;
       
  1606         int len = substituteIn.length();
       
  1607 
       
  1608         // Try a match at each position
       
  1609         while (pos < len && match(substituteIn, pos))
       
  1610         {
       
  1611             // Append string before match
       
  1612             ret.append(substituteIn.substring(pos, getParenStart(0)));
       
  1613 
       
  1614             if ((flags & REPLACE_BACKREFERENCES) != 0)
       
  1615             {
       
  1616                 // Process backreferences
       
  1617                 int lCurrentPosition = 0;
       
  1618                 int lLastPosition = -2;
       
  1619                 int lLength = substitution.length();
       
  1620                 boolean bAddedPrefix = false;
       
  1621 
       
  1622                 while ((lCurrentPosition = substitution.indexOf("$", lCurrentPosition)) >= 0)
       
  1623                 {
       
  1624                     if ((lCurrentPosition == 0 || substitution.charAt(lCurrentPosition - 1) != '\\')
       
  1625                         && lCurrentPosition+1 < lLength)
       
  1626                     {
       
  1627                         char c = substitution.charAt(lCurrentPosition + 1);
       
  1628                         if (c >= '0' && c <= '9')
       
  1629                         {
       
  1630                             if (bAddedPrefix == false)
       
  1631                             {
       
  1632                                 // Append everything between the beginning of the
       
  1633                                 // substitution string and the current $ sign
       
  1634                                 ret.append(substitution.substring(0, lCurrentPosition));
       
  1635                                 bAddedPrefix = true;
       
  1636                             }
       
  1637                             else
       
  1638                             {
       
  1639                                 // Append everything between the last and the current $ sign
       
  1640                                 ret.append(substitution.substring(lLastPosition + 2, lCurrentPosition));
       
  1641                             }
       
  1642 
       
  1643                             // Append the parenthesized expression
       
  1644                             // Note: if a parenthesized expression of the requested
       
  1645                             // index is not available "null" is added to the string
       
  1646                             ret.append(getParen(c - '0'));
       
  1647                             lLastPosition = lCurrentPosition;
       
  1648                         }
       
  1649                     }
       
  1650 
       
  1651                     // Move forward, skipping past match
       
  1652                     lCurrentPosition++;
       
  1653                 }
       
  1654 
       
  1655                 // Append everything after the last $ sign
       
  1656                 ret.append(substitution.substring(lLastPosition + 2, lLength));
       
  1657             }
       
  1658             else
       
  1659             {
       
  1660                 // Append substitution without processing backreferences
       
  1661                 ret.append(substitution);
       
  1662             }
       
  1663 
       
  1664             // Move forward, skipping past match
       
  1665             int newpos = getParenEnd(0);
       
  1666 
       
  1667             // We always want to make progress!
       
  1668             if (newpos == pos)
       
  1669             {
       
  1670                 newpos++;
       
  1671             }
       
  1672 
       
  1673             // Try new position
       
  1674             pos = newpos;
       
  1675 
       
  1676             // Break out if we're only supposed to replace one occurrence
       
  1677             if ((flags & REPLACE_FIRSTONLY) != 0)
       
  1678             {
       
  1679                 break;
       
  1680             }
       
  1681         }
       
  1682 
       
  1683         // If there's remaining input, append it
       
  1684         if (pos < len)
       
  1685         {
       
  1686             ret.append(substituteIn.substring(pos));
       
  1687         }
       
  1688 
       
  1689         // Return string buffer as string
       
  1690         return ret.toString();
       
  1691     }
       
  1692 
       
  1693     /**
       
  1694      * Returns an array of Strings, whose toString representation matches a regular
       
  1695      * expression. This method works like the Perl function of the same name.  Given
       
  1696      * a regular expression of "a*b" and an array of String objects of [foo, aab, zzz,
       
  1697      * aaaab], the array of Strings returned by grep would be [aab, aaaab].
       
  1698      *
       
  1699      * @param search Array of Objects to search
       
  1700      * @return Array of Strings whose toString() value matches this regular expression.
       
  1701      */
       
  1702     public String[] grep(Object[] search)
       
  1703     {
       
  1704         // Create new vector to hold return items
       
  1705         Vector v = new Vector();
       
  1706 
       
  1707         // Traverse array of objects
       
  1708         for (int i = 0; i < search.length; i++)
       
  1709         {
       
  1710             // Get next object as a string
       
  1711             String s = search[i].toString();
       
  1712 
       
  1713             // If it matches this regexp, add it to the list
       
  1714             if (match(s))
       
  1715             {
       
  1716                 v.addElement(s);
       
  1717             }
       
  1718         }
       
  1719 
       
  1720         // Return vector as an array of strings
       
  1721         String[] ret = new String[v.size()];
       
  1722         v.copyInto(ret);
       
  1723         return ret;
       
  1724     }
       
  1725 
       
  1726     /**
       
  1727      * @return true if character at i-th position in the <code>search</code> string is a newline
       
  1728      */
       
  1729     private boolean isNewline(int i)
       
  1730     {
       
  1731         char nextChar = search.charAt(i);
       
  1732 
       
  1733         if (nextChar == '\n' || nextChar == '\r' || nextChar == '\u0085'
       
  1734             || nextChar == '\u2028' || nextChar == '\u2029')
       
  1735         {
       
  1736             return true;
       
  1737         }
       
  1738 
       
  1739         return false;
       
  1740     }
       
  1741 
       
  1742     /**
       
  1743      * Compares two characters.
       
  1744      *
       
  1745      * @param c1 first character to compare.
       
  1746      * @param c2 second character to compare.
       
  1747      * @param caseIndependent whether comparision is case insensitive or not.
       
  1748      * @return negative, 0, or positive integer as the first character
       
  1749      *         less than, equal to, or greater then the second.
       
  1750      */
       
  1751     private int compareChars(char c1, char c2, boolean caseIndependent)
       
  1752     {
       
  1753         if (caseIndependent)
       
  1754         {
       
  1755             c1 = Character.toLowerCase(c1);
       
  1756             c2 = Character.toLowerCase(c2);
       
  1757         }
       
  1758         return ((int)c1 - (int)c2);
       
  1759     }
       
  1760 }