6
|
1 |
/*
|
|
2 |
* reserved comment block
|
|
3 |
* DO NOT REMOVE OR ALTER!
|
|
4 |
*/
|
|
5 |
/*
|
|
6 |
* Copyright 1999-2004 The Apache Software Foundation.
|
|
7 |
*
|
|
8 |
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
9 |
* you may not use this file except in compliance with the License.
|
|
10 |
* You may obtain a copy of the License at
|
|
11 |
*
|
|
12 |
* http://www.apache.org/licenses/LICENSE-2.0
|
|
13 |
*
|
|
14 |
* Unless required by applicable law or agreed to in writing, software
|
|
15 |
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
16 |
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
17 |
* See the License for the specific language governing permissions and
|
|
18 |
* limitations under the License.
|
|
19 |
*/
|
|
20 |
|
|
21 |
package com.sun.org.apache.regexp.internal;
|
|
22 |
|
|
23 |
import java.io.Serializable;
|
|
24 |
import java.util.Vector;
|
|
25 |
|
|
26 |
/**
|
|
27 |
* RE is an efficient, lightweight regular expression evaluator/matcher
|
|
28 |
* class. Regular expressions are pattern descriptions which enable
|
|
29 |
* sophisticated matching of strings. In addition to being able to
|
|
30 |
* match a string against a pattern, you can also extract parts of the
|
|
31 |
* match. This is especially useful in text parsing! Details on the
|
|
32 |
* syntax of regular expression patterns are given below.
|
|
33 |
*
|
|
34 |
* <p>
|
|
35 |
* To compile a regular expression (RE), you can simply construct an RE
|
|
36 |
* matcher object from the string specification of the pattern, like this:
|
|
37 |
*
|
|
38 |
* <pre>
|
|
39 |
* RE r = new RE("a*b");
|
|
40 |
* </pre>
|
|
41 |
*
|
|
42 |
* <p>
|
|
43 |
* Once you have done this, you can call either of the RE.match methods to
|
|
44 |
* perform matching on a String. For example:
|
|
45 |
*
|
|
46 |
* <pre>
|
|
47 |
* boolean matched = r.match("aaaab");
|
|
48 |
* </pre>
|
|
49 |
*
|
|
50 |
* will cause the boolean matched to be set to true because the
|
|
51 |
* pattern "a*b" matches the string "aaaab".
|
|
52 |
*
|
|
53 |
* <p>
|
|
54 |
* If you were interested in the <i>number</i> of a's which matched the
|
|
55 |
* first part of our example expression, you could change the expression to
|
|
56 |
* "(a*)b". Then when you compiled the expression and matched it against
|
|
57 |
* something like "xaaaab", you would get results like this:
|
|
58 |
*
|
|
59 |
* <pre>
|
|
60 |
* RE r = new RE("(a*)b"); // Compile expression
|
|
61 |
* boolean matched = r.match("xaaaab"); // Match against "xaaaab"
|
|
62 |
*
|
|
63 |
* String wholeExpr = r.getParen(0); // wholeExpr will be 'aaaab'
|
|
64 |
* String insideParens = r.getParen(1); // insideParens will be 'aaaa'
|
|
65 |
*
|
|
66 |
* int startWholeExpr = r.getParenStart(0); // startWholeExpr will be index 1
|
|
67 |
* int endWholeExpr = r.getParenEnd(0); // endWholeExpr will be index 6
|
|
68 |
* int lenWholeExpr = r.getParenLength(0); // lenWholeExpr will be 5
|
|
69 |
*
|
|
70 |
* int startInside = r.getParenStart(1); // startInside will be index 1
|
|
71 |
* int endInside = r.getParenEnd(1); // endInside will be index 5
|
|
72 |
* int lenInside = r.getParenLength(1); // lenInside will be 4
|
|
73 |
* </pre>
|
|
74 |
*
|
|
75 |
* You can also refer to the contents of a parenthesized expression
|
|
76 |
* within a regular expression itself. This is called a
|
|
77 |
* 'backreference'. The first backreference in a regular expression is
|
|
78 |
* denoted by \1, the second by \2 and so on. So the expression:
|
|
79 |
*
|
|
80 |
* <pre>
|
|
81 |
* ([0-9]+)=\1
|
|
82 |
* </pre>
|
|
83 |
*
|
|
84 |
* will match any string of the form n=n (like 0=0 or 2=2).
|
|
85 |
*
|
|
86 |
* <p>
|
|
87 |
* The full regular expression syntax accepted by RE is described here:
|
|
88 |
*
|
|
89 |
* <pre>
|
|
90 |
*
|
|
91 |
* <b><font face=times roman>Characters</font></b>
|
|
92 |
*
|
|
93 |
* <i>unicodeChar</i> Matches any identical unicode character
|
|
94 |
* \ Used to quote a meta-character (like '*')
|
|
95 |
* \\ Matches a single '\' character
|
|
96 |
* \0nnn Matches a given octal character
|
|
97 |
* \xhh Matches a given 8-bit hexadecimal character
|
|
98 |
* \\uhhhh Matches a given 16-bit hexadecimal character
|
|
99 |
* \t Matches an ASCII tab character
|
|
100 |
* \n Matches an ASCII newline character
|
|
101 |
* \r Matches an ASCII return character
|
|
102 |
* \f Matches an ASCII form feed character
|
|
103 |
*
|
|
104 |
*
|
|
105 |
* <b><font face=times roman>Character Classes</font></b>
|
|
106 |
*
|
|
107 |
* [abc] Simple character class
|
|
108 |
* [a-zA-Z] Character class with ranges
|
|
109 |
* [^abc] Negated character class
|
|
110 |
* </pre>
|
|
111 |
*
|
|
112 |
* <b>NOTE:</b> Incomplete ranges will be interpreted as "starts
|
|
113 |
* from zero" or "ends with last character".
|
|
114 |
* <br>
|
|
115 |
* I.e. [-a] is the same as [\\u0000-a], and [a-] is the same as [a-\\uFFFF],
|
|
116 |
* [-] means "all characters".
|
|
117 |
*
|
|
118 |
* <pre>
|
|
119 |
*
|
|
120 |
* <b><font face=times roman>Standard POSIX Character Classes</font></b>
|
|
121 |
*
|
|
122 |
* [:alnum:] Alphanumeric characters.
|
|
123 |
* [:alpha:] Alphabetic characters.
|
|
124 |
* [:blank:] Space and tab characters.
|
|
125 |
* [:cntrl:] Control characters.
|
|
126 |
* [:digit:] Numeric characters.
|
|
127 |
* [:graph:] Characters that are printable and are also visible.
|
|
128 |
* (A space is printable, but not visible, while an
|
|
129 |
* `a' is both.)
|
|
130 |
* [:lower:] Lower-case alphabetic characters.
|
|
131 |
* [:print:] Printable characters (characters that are not
|
|
132 |
* control characters.)
|
|
133 |
* [:punct:] Punctuation characters (characters that are not letter,
|
|
134 |
* digits, control characters, or space characters).
|
|
135 |
* [:space:] Space characters (such as space, tab, and formfeed,
|
|
136 |
* to name a few).
|
|
137 |
* [:upper:] Upper-case alphabetic characters.
|
|
138 |
* [:xdigit:] Characters that are hexadecimal digits.
|
|
139 |
*
|
|
140 |
*
|
|
141 |
* <b><font face=times roman>Non-standard POSIX-style Character Classes</font></b>
|
|
142 |
*
|
|
143 |
* [:javastart:] Start of a Java identifier
|
|
144 |
* [:javapart:] Part of a Java identifier
|
|
145 |
*
|
|
146 |
*
|
|
147 |
* <b><font face=times roman>Predefined Classes</font></b>
|
|
148 |
*
|
|
149 |
* . Matches any character other than newline
|
|
150 |
* \w Matches a "word" character (alphanumeric plus "_")
|
|
151 |
* \W Matches a non-word character
|
|
152 |
* \s Matches a whitespace character
|
|
153 |
* \S Matches a non-whitespace character
|
|
154 |
* \d Matches a digit character
|
|
155 |
* \D Matches a non-digit character
|
|
156 |
*
|
|
157 |
*
|
|
158 |
* <b><font face=times roman>Boundary Matchers</font></b>
|
|
159 |
*
|
|
160 |
* ^ Matches only at the beginning of a line
|
|
161 |
* $ Matches only at the end of a line
|
|
162 |
* \b Matches only at a word boundary
|
|
163 |
* \B Matches only at a non-word boundary
|
|
164 |
*
|
|
165 |
*
|
|
166 |
* <b><font face=times roman>Greedy Closures</font></b>
|
|
167 |
*
|
|
168 |
* A* Matches A 0 or more times (greedy)
|
|
169 |
* A+ Matches A 1 or more times (greedy)
|
|
170 |
* A? Matches A 1 or 0 times (greedy)
|
|
171 |
* A{n} Matches A exactly n times (greedy)
|
|
172 |
* A{n,} Matches A at least n times (greedy)
|
|
173 |
* A{n,m} Matches A at least n but not more than m times (greedy)
|
|
174 |
*
|
|
175 |
*
|
|
176 |
* <b><font face=times roman>Reluctant Closures</font></b>
|
|
177 |
*
|
|
178 |
* A*? Matches A 0 or more times (reluctant)
|
|
179 |
* A+? Matches A 1 or more times (reluctant)
|
|
180 |
* A?? Matches A 0 or 1 times (reluctant)
|
|
181 |
*
|
|
182 |
*
|
|
183 |
* <b><font face=times roman>Logical Operators</font></b>
|
|
184 |
*
|
|
185 |
* AB Matches A followed by B
|
|
186 |
* A|B Matches either A or B
|
|
187 |
* (A) Used for subexpression grouping
|
|
188 |
* (?:A) Used for subexpression clustering (just like grouping but
|
|
189 |
* no backrefs)
|
|
190 |
*
|
|
191 |
*
|
|
192 |
* <b><font face=times roman>Backreferences</font></b>
|
|
193 |
*
|
|
194 |
* \1 Backreference to 1st parenthesized subexpression
|
|
195 |
* \2 Backreference to 2nd parenthesized subexpression
|
|
196 |
* \3 Backreference to 3rd parenthesized subexpression
|
|
197 |
* \4 Backreference to 4th parenthesized subexpression
|
|
198 |
* \5 Backreference to 5th parenthesized subexpression
|
|
199 |
* \6 Backreference to 6th parenthesized subexpression
|
|
200 |
* \7 Backreference to 7th parenthesized subexpression
|
|
201 |
* \8 Backreference to 8th parenthesized subexpression
|
|
202 |
* \9 Backreference to 9th parenthesized subexpression
|
|
203 |
* </pre>
|
|
204 |
*
|
|
205 |
* <p>
|
|
206 |
* All closure operators (+, *, ?, {m,n}) are greedy by default, meaning
|
|
207 |
* that they match as many elements of the string as possible without
|
|
208 |
* causing the overall match to fail. If you want a closure to be
|
|
209 |
* reluctant (non-greedy), you can simply follow it with a '?'. A
|
|
210 |
* reluctant closure will match as few elements of the string as
|
|
211 |
* possible when finding matches. {m,n} closures don't currently
|
|
212 |
* support reluctancy.
|
|
213 |
*
|
|
214 |
* <p>
|
|
215 |
* <b><font face="times roman">Line terminators</font></b>
|
|
216 |
* <br>
|
|
217 |
* A line terminator is a one- or two-character sequence that marks
|
|
218 |
* the end of a line of the input character sequence. The following
|
|
219 |
* are recognized as line terminators:
|
|
220 |
* <ul>
|
|
221 |
* <li>A newline (line feed) character ('\n'),</li>
|
|
222 |
* <li>A carriage-return character followed immediately by a newline character ("\r\n"),</li>
|
|
223 |
* <li>A standalone carriage-return character ('\r'),</li>
|
|
224 |
* <li>A next-line character ('\u0085'),</li>
|
|
225 |
* <li>A line-separator character ('\u2028'), or</li>
|
|
226 |
* <li>A paragraph-separator character ('\u2029').</li>
|
|
227 |
* </ul>
|
|
228 |
*
|
|
229 |
* <p>
|
|
230 |
* RE runs programs compiled by the RECompiler class. But the RE
|
|
231 |
* matcher class does not include the actual regular expression compiler
|
|
232 |
* for reasons of efficiency. In fact, if you want to pre-compile one
|
|
233 |
* or more regular expressions, the 'recompile' class can be invoked
|
|
234 |
* from the command line to produce compiled output like this:
|
|
235 |
*
|
|
236 |
* <pre>
|
|
237 |
* // Pre-compiled regular expression "a*b"
|
|
238 |
* char[] re1Instructions =
|
|
239 |
* {
|
|
240 |
* 0x007c, 0x0000, 0x001a, 0x007c, 0x0000, 0x000d, 0x0041,
|
|
241 |
* 0x0001, 0x0004, 0x0061, 0x007c, 0x0000, 0x0003, 0x0047,
|
|
242 |
* 0x0000, 0xfff6, 0x007c, 0x0000, 0x0003, 0x004e, 0x0000,
|
|
243 |
* 0x0003, 0x0041, 0x0001, 0x0004, 0x0062, 0x0045, 0x0000,
|
|
244 |
* 0x0000,
|
|
245 |
* };
|
|
246 |
*
|
|
247 |
*
|
|
248 |
* REProgram re1 = new REProgram(re1Instructions);
|
|
249 |
* </pre>
|
|
250 |
*
|
|
251 |
* You can then construct a regular expression matcher (RE) object from
|
|
252 |
* the pre-compiled expression re1 and thus avoid the overhead of
|
|
253 |
* compiling the expression at runtime. If you require more dynamic
|
|
254 |
* regular expressions, you can construct a single RECompiler object and
|
|
255 |
* re-use it to compile each expression. Similarly, you can change the
|
|
256 |
* program run by a given matcher object at any time. However, RE and
|
|
257 |
* RECompiler are not threadsafe (for efficiency reasons, and because
|
|
258 |
* requiring thread safety in this class is deemed to be a rare
|
|
259 |
* requirement), so you will need to construct a separate compiler or
|
|
260 |
* matcher object for each thread (unless you do thread synchronization
|
|
261 |
* yourself). Once an expression has been compiled into an REProgram object, that REProgram
|
|
262 |
* can be safely shared across multiple threads and RE objects.
|
|
263 |
*
|
|
264 |
* <br><p><br>
|
|
265 |
*
|
|
266 |
* <font color="red">
|
|
267 |
* <i>ISSUES:</i>
|
|
268 |
*
|
|
269 |
* <ul>
|
|
270 |
* <li>com.weusours.util.re is not currently compatible with all
|
|
271 |
* standard POSIX regcomp flags</li>
|
|
272 |
* <li>com.weusours.util.re does not support POSIX equivalence classes
|
|
273 |
* ([=foo=] syntax) (I18N/locale issue)</li>
|
|
274 |
* <li>com.weusours.util.re does not support nested POSIX character
|
|
275 |
* classes (definitely should, but not completely trivial)</li>
|
|
276 |
* <li>com.weusours.util.re does not support POSIX character collation
|
|
277 |
* concepts ([.foo.] syntax) (I18N/locale issue)</li>
|
|
278 |
* <li>Should there be different matching styles (simple, POSIX, Perl etc?)</li>
|
|
279 |
* <li>Should RE support character iterators (for backwards RE matching!)?</li>
|
|
280 |
* <li>Should RE support reluctant {m,n} closures (does anyone care)?</li>
|
|
281 |
* <li>Not *all* possibilities are considered for greediness when backreferences
|
|
282 |
* are involved (as POSIX suggests should be the case). The POSIX RE
|
|
283 |
* "(ac*)c*d[ac]*\1", when matched against "acdacaa" should yield a match
|
|
284 |
* of acdacaa where \1 is "a". This is not the case in this RE package,
|
|
285 |
* and actually Perl doesn't go to this extent either! Until someone
|
|
286 |
* actually complains about this, I'm not sure it's worth "fixing".
|
|
287 |
* If it ever is fixed, test #137 in RETest.txt should be updated.</li>
|
|
288 |
* </ul>
|
|
289 |
*
|
|
290 |
* </font>
|
|
291 |
*
|
|
292 |
* @see recompile
|
|
293 |
* @see RECompiler
|
|
294 |
*
|
|
295 |
* @author <a href="mailto:jonl@muppetlabs.com">Jonathan Locke</a>
|
|
296 |
* @author <a href="mailto:ts@sch-fer.de">Tobias Schäfer</a>
|
|
297 |
*/
|
|
298 |
public class RE implements Serializable
|
|
299 |
{
|
|
300 |
/**
|
|
301 |
* Specifies normal, case-sensitive matching behaviour.
|
|
302 |
*/
|
|
303 |
public static final int MATCH_NORMAL = 0x0000;
|
|
304 |
|
|
305 |
/**
|
|
306 |
* Flag to indicate that matching should be case-independent (folded)
|
|
307 |
*/
|
|
308 |
public static final int MATCH_CASEINDEPENDENT = 0x0001;
|
|
309 |
|
|
310 |
/**
|
|
311 |
* Newlines should match as BOL/EOL (^ and $)
|
|
312 |
*/
|
|
313 |
public static final int MATCH_MULTILINE = 0x0002;
|
|
314 |
|
|
315 |
/**
|
|
316 |
* Consider all input a single body of text - newlines are matched by .
|
|
317 |
*/
|
|
318 |
public static final int MATCH_SINGLELINE = 0x0004;
|
|
319 |
|
|
320 |
/************************************************
|
|
321 |
* *
|
|
322 |
* The format of a node in a program is: *
|
|
323 |
* *
|
|
324 |
* [ OPCODE ] [ OPDATA ] [ OPNEXT ] [ OPERAND ] *
|
|
325 |
* *
|
|
326 |
* char OPCODE - instruction *
|
|
327 |
* char OPDATA - modifying data *
|
|
328 |
* char OPNEXT - next node (relative offset) *
|
|
329 |
* *
|
|
330 |
************************************************/
|
|
331 |
|
|
332 |
// Opcode Char Opdata/Operand Meaning
|
|
333 |
// ---------- ---------- --------------- --------------------------------------------------
|
|
334 |
static final char OP_END = 'E'; // end of program
|
|
335 |
static final char OP_BOL = '^'; // match only if at beginning of line
|
|
336 |
static final char OP_EOL = '$'; // match only if at end of line
|
|
337 |
static final char OP_ANY = '.'; // match any single character except newline
|
|
338 |
static final char OP_ANYOF = '['; // count/ranges match any char in the list of ranges
|
|
339 |
static final char OP_BRANCH = '|'; // node match this alternative or the next one
|
|
340 |
static final char OP_ATOM = 'A'; // length/string length of string followed by string itself
|
|
341 |
static final char OP_STAR = '*'; // node kleene closure
|
|
342 |
static final char OP_PLUS = '+'; // node positive closure
|
|
343 |
static final char OP_MAYBE = '?'; // node optional closure
|
|
344 |
static final char OP_ESCAPE = '\\'; // escape special escape code char class (escape is E_* code)
|
|
345 |
static final char OP_OPEN = '('; // number nth opening paren
|
|
346 |
static final char OP_OPEN_CLUSTER = '<'; // opening cluster
|
|
347 |
static final char OP_CLOSE = ')'; // number nth closing paren
|
|
348 |
static final char OP_CLOSE_CLUSTER = '>'; // closing cluster
|
|
349 |
static final char OP_BACKREF = '#'; // number reference nth already matched parenthesized string
|
|
350 |
static final char OP_GOTO = 'G'; // nothing but a (back-)pointer
|
|
351 |
static final char OP_NOTHING = 'N'; // match null string such as in '(a|)'
|
|
352 |
static final char OP_RELUCTANTSTAR = '8'; // none/expr reluctant '*' (mnemonic for char is unshifted '*')
|
|
353 |
static final char OP_RELUCTANTPLUS = '='; // none/expr reluctant '+' (mnemonic for char is unshifted '+')
|
|
354 |
static final char OP_RELUCTANTMAYBE = '/'; // none/expr reluctant '?' (mnemonic for char is unshifted '?')
|
|
355 |
static final char OP_POSIXCLASS = 'P'; // classid one of the posix character classes
|
|
356 |
|
|
357 |
// Escape codes
|
|
358 |
static final char E_ALNUM = 'w'; // Alphanumeric
|
|
359 |
static final char E_NALNUM = 'W'; // Non-alphanumeric
|
|
360 |
static final char E_BOUND = 'b'; // Word boundary
|
|
361 |
static final char E_NBOUND = 'B'; // Non-word boundary
|
|
362 |
static final char E_SPACE = 's'; // Whitespace
|
|
363 |
static final char E_NSPACE = 'S'; // Non-whitespace
|
|
364 |
static final char E_DIGIT = 'd'; // Digit
|
|
365 |
static final char E_NDIGIT = 'D'; // Non-digit
|
|
366 |
|
|
367 |
// Posix character classes
|
|
368 |
static final char POSIX_CLASS_ALNUM = 'w'; // Alphanumerics
|
|
369 |
static final char POSIX_CLASS_ALPHA = 'a'; // Alphabetics
|
|
370 |
static final char POSIX_CLASS_BLANK = 'b'; // Blanks
|
|
371 |
static final char POSIX_CLASS_CNTRL = 'c'; // Control characters
|
|
372 |
static final char POSIX_CLASS_DIGIT = 'd'; // Digits
|
|
373 |
static final char POSIX_CLASS_GRAPH = 'g'; // Graphic characters
|
|
374 |
static final char POSIX_CLASS_LOWER = 'l'; // Lowercase characters
|
|
375 |
static final char POSIX_CLASS_PRINT = 'p'; // Printable characters
|
|
376 |
static final char POSIX_CLASS_PUNCT = '!'; // Punctuation
|
|
377 |
static final char POSIX_CLASS_SPACE = 's'; // Spaces
|
|
378 |
static final char POSIX_CLASS_UPPER = 'u'; // Uppercase characters
|
|
379 |
static final char POSIX_CLASS_XDIGIT = 'x'; // Hexadecimal digits
|
|
380 |
static final char POSIX_CLASS_JSTART = 'j'; // Java identifier start
|
|
381 |
static final char POSIX_CLASS_JPART = 'k'; // Java identifier part
|
|
382 |
|
|
383 |
// Limits
|
|
384 |
static final int maxNode = 65536; // Maximum number of nodes in a program
|
|
385 |
static final int MAX_PAREN = 16; // Number of paren pairs (only 9 can be backrefs)
|
|
386 |
|
|
387 |
// Node layout constants
|
|
388 |
static final int offsetOpcode = 0; // Opcode offset (first character)
|
|
389 |
static final int offsetOpdata = 1; // Opdata offset (second char)
|
|
390 |
static final int offsetNext = 2; // Next index offset (third char)
|
|
391 |
static final int nodeSize = 3; // Node size (in chars)
|
|
392 |
|
|
393 |
// State of current program
|
|
394 |
REProgram program; // Compiled regular expression 'program'
|
|
395 |
transient CharacterIterator search; // The string being matched against
|
|
396 |
int matchFlags; // Match behaviour flags
|
|
397 |
int maxParen = MAX_PAREN;
|
|
398 |
|
|
399 |
// Parenthesized subexpressions
|
|
400 |
transient int parenCount; // Number of subexpressions matched (num open parens + 1)
|
|
401 |
transient int start0; // Cache of start[0]
|
|
402 |
transient int end0; // Cache of start[0]
|
|
403 |
transient int start1; // Cache of start[1]
|
|
404 |
transient int end1; // Cache of start[1]
|
|
405 |
transient int start2; // Cache of start[2]
|
|
406 |
transient int end2; // Cache of start[2]
|
|
407 |
transient int[] startn; // Lazy-alloced array of sub-expression starts
|
|
408 |
transient int[] endn; // Lazy-alloced array of sub-expression ends
|
|
409 |
|
|
410 |
// Backreferences
|
|
411 |
transient int[] startBackref; // Lazy-alloced array of backref starts
|
|
412 |
transient int[] endBackref; // Lazy-alloced array of backref ends
|
|
413 |
|
|
414 |
/**
 * Constructs a regular expression matcher from a String by compiling it
 * using a new instance of RECompiler.  Matching uses MATCH_NORMAL; this
 * constructor simply delegates to the (String, int) constructor.  If you
 * will be compiling many expressions, you may prefer to use a single
 * RECompiler object instead.
 *
 * @param pattern The regular expression pattern to compile.
 * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
 * @see RECompiler
 * @see recompile
 */
public RE(String pattern) throws RESyntaxException
{
    this(pattern, MATCH_NORMAL);
}
|
|
428 |
|
|
429 |
/**
 * Constructs a regular expression matcher from a String by compiling it
 * using a new instance of RECompiler, then applies the given match flags.
 * If you will be compiling many expressions, you may prefer to use a
 * single RECompiler object instead.
 *
 * @param pattern The regular expression pattern to compile.
 * @param matchFlags The matching style (one or more of the RE.MATCH_* flags)
 * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
 * @see RECompiler
 * @see recompile
 */
public RE(String pattern, int matchFlags) throws RESyntaxException
{
    // Compile with a throw-away compiler; the program-only constructor
    // installs the result with MATCH_NORMAL ...
    this(new RECompiler().compile(pattern));

    // ... and the requested flags are applied afterwards.
    setMatchFlags(matchFlags);
}
|
|
445 |
|
|
446 |
/**
 * Construct a matcher for a pre-compiled regular expression from program
 * (bytecode) data.  Permits special flags to be passed in to modify matching
 * behaviour.
 *
 * @param program Compiled regular expression program (see RECompiler and/or recompile)
 * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*):
 *
 * <pre>
 *   MATCH_NORMAL              // Normal (case-sensitive) matching
 *   MATCH_CASEINDEPENDENT     // Case folded comparisons
 *   MATCH_MULTILINE           // Newline matches as BOL/EOL
 *   MATCH_SINGLELINE          // Newlines are matched by .
 * </pre>
 *
 * @see RECompiler
 * @see REProgram
 * @see recompile
 */
public RE(REProgram program, int matchFlags)
{
    setProgram(program);
    setMatchFlags(matchFlags);
}
|
|
469 |
|
|
470 |
/**
 * Construct a matcher for a pre-compiled regular expression from program
 * (bytecode) data, using MATCH_NORMAL matching.
 *
 * @param program Compiled regular expression program
 * @see RECompiler
 * @see recompile
 */
public RE(REProgram program)
{
    this(program, MATCH_NORMAL);
}
|
|
482 |
|
|
483 |
/**
 * Constructs a regular expression matcher with no initial program
 * (the program is set to null and the flags to MATCH_NORMAL).
 * This is likely to be an uncommon practice, but is still supported.
 */
public RE()
{
    // The cast selects the (REProgram, int) constructor for the null argument.
    this((REProgram)null, MATCH_NORMAL);
}
|
|
491 |
|
|
492 |
/**
|
|
493 |
* Converts a 'simplified' regular expression to a full regular expression
|
|
494 |
*
|
|
495 |
* @param pattern The pattern to convert
|
|
496 |
* @return The full regular expression
|
|
497 |
*/
|
|
498 |
public static String simplePatternToFullRegularExpression(String pattern)
|
|
499 |
{
|
|
500 |
StringBuffer buf = new StringBuffer();
|
|
501 |
for (int i = 0; i < pattern.length(); i++)
|
|
502 |
{
|
|
503 |
char c = pattern.charAt(i);
|
|
504 |
switch (c)
|
|
505 |
{
|
|
506 |
case '*':
|
|
507 |
buf.append(".*");
|
|
508 |
break;
|
|
509 |
|
|
510 |
case '.':
|
|
511 |
case '[':
|
|
512 |
case ']':
|
|
513 |
case '\\':
|
|
514 |
case '+':
|
|
515 |
case '?':
|
|
516 |
case '{':
|
|
517 |
case '}':
|
|
518 |
case '$':
|
|
519 |
case '^':
|
|
520 |
case '|':
|
|
521 |
case '(':
|
|
522 |
case ')':
|
|
523 |
buf.append('\\');
|
|
524 |
default:
|
|
525 |
buf.append(c);
|
|
526 |
break;
|
|
527 |
}
|
|
528 |
}
|
|
529 |
return buf.toString();
|
|
530 |
}
|
|
531 |
|
|
532 |
/**
|
|
533 |
* Sets match behaviour flags which alter the way RE does matching.
|
|
534 |
* @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*):
|
|
535 |
*
|
|
536 |
* <pre>
|
|
537 |
* MATCH_NORMAL // Normal (case-sensitive) matching
|
|
538 |
* MATCH_CASEINDEPENDENT // Case folded comparisons
|
|
539 |
* MATCH_MULTILINE // Newline matches as BOL/EOL
|
|
540 |
* </pre>
|
|
541 |
*/
|
|
542 |
public void setMatchFlags(int matchFlags)
|
|
543 |
{
|
|
544 |
this.matchFlags = matchFlags;
|
|
545 |
}
|
|
546 |
|
|
547 |
/**
|
|
548 |
* Returns the current match behaviour flags.
|
|
549 |
* @return Current match behaviour flags (RE.MATCH_*).
|
|
550 |
*
|
|
551 |
* <pre>
|
|
552 |
* MATCH_NORMAL // Normal (case-sensitive) matching
|
|
553 |
* MATCH_CASEINDEPENDENT // Case folded comparisons
|
|
554 |
* MATCH_MULTILINE // Newline matches as BOL/EOL
|
|
555 |
* </pre>
|
|
556 |
*
|
|
557 |
* @see #setMatchFlags
|
|
558 |
*/
|
|
559 |
public int getMatchFlags()
|
|
560 |
{
|
|
561 |
return matchFlags;
|
|
562 |
}
|
|
563 |
|
|
564 |
/**
|
|
565 |
* Sets the current regular expression program used by this matcher object.
|
|
566 |
*
|
|
567 |
* @param program Regular expression program compiled by RECompiler.
|
|
568 |
* @see RECompiler
|
|
569 |
* @see REProgram
|
|
570 |
* @see recompile
|
|
571 |
*/
|
|
572 |
public void setProgram(REProgram program)
|
|
573 |
{
|
|
574 |
this.program = program;
|
|
575 |
if (program != null && program.maxParens != -1) {
|
|
576 |
this.maxParen = program.maxParens;
|
|
577 |
} else {
|
|
578 |
this.maxParen = MAX_PAREN;
|
|
579 |
}
|
|
580 |
}
|
|
581 |
|
|
582 |
/**
|
|
583 |
* Returns the current regular expression program in use by this matcher object.
|
|
584 |
*
|
|
585 |
* @return Regular expression program
|
|
586 |
* @see #setProgram
|
|
587 |
*/
|
|
588 |
public REProgram getProgram()
|
|
589 |
{
|
|
590 |
return program;
|
|
591 |
}
|
|
592 |
|
|
593 |
/**
|
|
594 |
* Returns the number of parenthesized subexpressions available after a successful match.
|
|
595 |
*
|
|
596 |
* @return Number of available parenthesized subexpressions
|
|
597 |
*/
|
|
598 |
public int getParenCount()
|
|
599 |
{
|
|
600 |
return parenCount;
|
|
601 |
}
|
|
602 |
|
|
603 |
/**
|
|
604 |
* Gets the contents of a parenthesized subexpression after a successful match.
|
|
605 |
*
|
|
606 |
* @param which Nesting level of subexpression
|
|
607 |
* @return String
|
|
608 |
*/
|
|
609 |
public String getParen(int which)
|
|
610 |
{
|
|
611 |
int start;
|
|
612 |
if (which < parenCount && (start = getParenStart(which)) >= 0)
|
|
613 |
{
|
|
614 |
return search.substring(start, getParenEnd(which));
|
|
615 |
}
|
|
616 |
return null;
|
|
617 |
}
|
|
618 |
|
|
619 |
/**
|
|
620 |
* Returns the start index of a given paren level.
|
|
621 |
*
|
|
622 |
* @param which Nesting level of subexpression
|
|
623 |
* @return String index
|
|
624 |
*/
|
|
625 |
public final int getParenStart(int which)
|
|
626 |
{
|
|
627 |
if (which < parenCount)
|
|
628 |
{
|
|
629 |
switch (which)
|
|
630 |
{
|
|
631 |
case 0:
|
|
632 |
return start0;
|
|
633 |
|
|
634 |
case 1:
|
|
635 |
return start1;
|
|
636 |
|
|
637 |
case 2:
|
|
638 |
return start2;
|
|
639 |
|
|
640 |
default:
|
|
641 |
if (startn == null)
|
|
642 |
{
|
|
643 |
allocParens();
|
|
644 |
}
|
|
645 |
return startn[which];
|
|
646 |
}
|
|
647 |
}
|
|
648 |
return -1;
|
|
649 |
}
|
|
650 |
|
|
651 |
/**
|
|
652 |
* Returns the end index of a given paren level.
|
|
653 |
*
|
|
654 |
* @param which Nesting level of subexpression
|
|
655 |
* @return String index
|
|
656 |
*/
|
|
657 |
public final int getParenEnd(int which)
|
|
658 |
{
|
|
659 |
if (which < parenCount)
|
|
660 |
{
|
|
661 |
switch (which)
|
|
662 |
{
|
|
663 |
case 0:
|
|
664 |
return end0;
|
|
665 |
|
|
666 |
case 1:
|
|
667 |
return end1;
|
|
668 |
|
|
669 |
case 2:
|
|
670 |
return end2;
|
|
671 |
|
|
672 |
default:
|
|
673 |
if (endn == null)
|
|
674 |
{
|
|
675 |
allocParens();
|
|
676 |
}
|
|
677 |
return endn[which];
|
|
678 |
}
|
|
679 |
}
|
|
680 |
return -1;
|
|
681 |
}
|
|
682 |
|
|
683 |
/**
|
|
684 |
* Returns the length of a given paren level.
|
|
685 |
*
|
|
686 |
* @param which Nesting level of subexpression
|
|
687 |
* @return Number of characters in the parenthesized subexpression
|
|
688 |
*/
|
|
689 |
public final int getParenLength(int which)
|
|
690 |
{
|
|
691 |
if (which < parenCount)
|
|
692 |
{
|
|
693 |
return getParenEnd(which) - getParenStart(which);
|
|
694 |
}
|
|
695 |
return -1;
|
|
696 |
}
|
|
697 |
|
|
698 |
/**
|
|
699 |
* Sets the start of a paren level
|
|
700 |
*
|
|
701 |
* @param which Which paren level
|
|
702 |
* @param i Index in input array
|
|
703 |
*/
|
|
704 |
protected final void setParenStart(int which, int i)
|
|
705 |
{
|
|
706 |
if (which < parenCount)
|
|
707 |
{
|
|
708 |
switch (which)
|
|
709 |
{
|
|
710 |
case 0:
|
|
711 |
start0 = i;
|
|
712 |
break;
|
|
713 |
|
|
714 |
case 1:
|
|
715 |
start1 = i;
|
|
716 |
break;
|
|
717 |
|
|
718 |
case 2:
|
|
719 |
start2 = i;
|
|
720 |
break;
|
|
721 |
|
|
722 |
default:
|
|
723 |
if (startn == null)
|
|
724 |
{
|
|
725 |
allocParens();
|
|
726 |
}
|
|
727 |
startn[which] = i;
|
|
728 |
break;
|
|
729 |
}
|
|
730 |
}
|
|
731 |
}
|
|
732 |
|
|
733 |
/**
|
|
734 |
* Sets the end of a paren level
|
|
735 |
*
|
|
736 |
* @param which Which paren level
|
|
737 |
* @param i Index in input array
|
|
738 |
*/
|
|
739 |
protected final void setParenEnd(int which, int i)
|
|
740 |
{
|
|
741 |
if (which < parenCount)
|
|
742 |
{
|
|
743 |
switch (which)
|
|
744 |
{
|
|
745 |
case 0:
|
|
746 |
end0 = i;
|
|
747 |
break;
|
|
748 |
|
|
749 |
case 1:
|
|
750 |
end1 = i;
|
|
751 |
break;
|
|
752 |
|
|
753 |
case 2:
|
|
754 |
end2 = i;
|
|
755 |
break;
|
|
756 |
|
|
757 |
default:
|
|
758 |
if (endn == null)
|
|
759 |
{
|
|
760 |
allocParens();
|
|
761 |
}
|
|
762 |
endn[which] = i;
|
|
763 |
break;
|
|
764 |
}
|
|
765 |
}
|
|
766 |
}
|
|
767 |
|
|
768 |
/**
|
|
769 |
* Throws an Error representing an internal error condition probably resulting
|
|
770 |
* from a bug in the regular expression compiler (or possibly data corruption).
|
|
771 |
* In practice, this should be very rare.
|
|
772 |
*
|
|
773 |
* @param s Error description
|
|
774 |
*/
|
|
775 |
protected void internalError(String s) throws Error
|
|
776 |
{
|
|
777 |
throw new Error("RE internal error: " + s);
|
|
778 |
}
|
|
779 |
|
|
780 |
/**
|
|
781 |
* Performs lazy allocation of subexpression arrays
|
|
782 |
*/
|
|
783 |
private final void allocParens()
|
|
784 |
{
|
|
785 |
// Allocate arrays for subexpressions
|
|
786 |
startn = new int[maxParen];
|
|
787 |
endn = new int[maxParen];
|
|
788 |
|
|
789 |
// Set sub-expression pointers to invalid values
|
|
790 |
for (int i = 0; i < maxParen; i++)
|
|
791 |
{
|
|
792 |
startn[i] = -1;
|
|
793 |
endn[i] = -1;
|
|
794 |
}
|
|
795 |
}
|
|
796 |
|
|
797 |
/**
 * Try to match a string against a subset of nodes in the program.
 *
 * <p>The method walks the compiled instruction array node by node. Each node
 * carries an opcode, a relative offset to the next node and an operand
 * (opdata). Simple nodes either consume input and advance <code>idx</code>
 * or fail by returning -1. Nodes that need backtracking (reluctant closures,
 * parentheses, branches) call this method recursively to match the remainder
 * of the program and return that result directly.
 *
 * @param firstNode Node to start at in program
 * @param lastNode  Last valid node (used for matching a subexpression without
 * matching the rest of the program as well). The walk continues only while
 * the current node index is below this value.
 * @param idxStart  Starting position in character array
 * @return Final input array index if match succeeded.  -1 if not.
 */
protected int matchNodes(int firstNode, int lastNode, int idxStart)
{
    // Our current place in the string
    int idx = idxStart;

    // Loop while node is valid
    int next, opcode, opdata;
    int idxNew;
    char[] instruction = program.instruction;
    for (int node = firstNode; node < lastNode; )
    {
        // Decode the current node. The "next" field is stored as a char and
        // reinterpreted as a signed short, so it is an offset relative to node.
        opcode = instruction[node + offsetOpcode];
        next   = node + (short)instruction[node + offsetNext];
        opdata = instruction[node + offsetOpdata];

        switch (opcode)
        {
            case OP_RELUCTANTMAYBE:
                {
                    // "once" limits the loop to at most one use of the subexpression
                    int once = 0;
                    do
                    {
                        // Try to match the rest without using the reluctant subexpr
                        if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
                        {
                            return idxNew;
                        }
                    }
                    while ((once++ == 0) && (idx = matchNodes(node + nodeSize, next, idx)) != -1);
                    return -1;
                }

            case OP_RELUCTANTPLUS:
                // The subexpression must match at least once before the rest is tried
                while ((idx = matchNodes(node + nodeSize, next, idx)) != -1)
                {
                    // Try to match the rest without using the reluctant subexpr
                    if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
                    {
                        return idxNew;
                    }
                }
                return -1;

            case OP_RELUCTANTSTAR:
                // The rest is tried first; the subexpression is consumed only on failure
                do
                {
                    // Try to match the rest without using the reluctant subexpr
                    if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
                    {
                        return idxNew;
                    }
                }
                while ((idx = matchNodes(node + nodeSize, next, idx)) != -1);
                return -1;

            case OP_OPEN:

                // Match subexpression. opdata is the paren index; the start is
                // recorded for backreferences only if the program uses them.
                if ((program.flags & REProgram.OPT_HASBACKREFS) != 0)
                {
                    startBackref[opdata] = idx;
                }
                if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
                {
                    // Increase valid paren count
                    if ((opdata + 1) > parenCount)
                    {
                        parenCount = opdata + 1;
                    }

                    // Don't set paren if already set later on
                    if (getParenStart(opdata) == -1)
                    {
                        setParenStart(opdata, idx);
                    }
                }
                return idxNew;

            case OP_CLOSE:

                // Done matching subexpression; mirrors OP_OPEN for the end index
                if ((program.flags & REProgram.OPT_HASBACKREFS) != 0)
                {
                    endBackref[opdata] = idx;
                }
                if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
                {
                    // Increase valid paren count
                    if ((opdata + 1) > parenCount)
                    {
                        parenCount = opdata + 1;
                    }

                    // Don't set paren if already set later on
                    if (getParenEnd(opdata) == -1)
                    {
                        setParenEnd(opdata, idx);
                    }
                }
                return idxNew;

            case OP_OPEN_CLUSTER:
            case OP_CLOSE_CLUSTER:
                // starting or ending the matching of a subexpression which has no backref.
                return matchNodes( next, maxNode, idx );

            case OP_BACKREF:
                {
                    // Get the start and end of the backref
                    int s = startBackref[opdata];
                    int e = endBackref[opdata];

                    // We don't know the backref yet
                    // NOTE(review): matchAt() creates these arrays with
                    // new int[maxParen], i.e. zero-filled, so this -1 test may
                    // never fire for an unset group - confirm.
                    if (s == -1 || e == -1)
                    {
                        return -1;
                    }

                    // The backref is empty size
                    if (s == e)
                    {
                        break;
                    }

                    // Get the length of the backref
                    int l = e - s;

                    // If there's not enough input left, give up.
                    if (search.isEnd(idx + l - 1))
                    {
                        return -1;
                    }

                    // Case fold the backref?
                    final boolean caseFold =
                        ((matchFlags & MATCH_CASEINDEPENDENT) != 0);
                    // Compare backref to input
                    for (int i = 0; i < l; i++)
                    {
                        if (compareChars(search.charAt(idx++), search.charAt(s + i), caseFold) != 0)
                        {
                            return -1;
                        }
                    }
                }
                break;

            case OP_BOL:

                // Fail if we're not at the start of the string
                if (idx != 0)
                {
                    // If we're multiline matching, we could still be at the start of a line
                    if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE)
                    {
                        // If not at start of line, give up
                        if (idx <= 0 || !isNewline(idx - 1)) {
                            return -1;
                        } else {
                            break;
                        }
                    }
                    return -1;
                }
                break;

            case OP_EOL:

                // If we're not at the end of string
                // (isEnd(0) is true when there is no character at index 0)
                if (!search.isEnd(0) && !search.isEnd(idx))
                {
                    // If we're multi-line matching
                    if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE)
                    {
                        // Give up if we're not at the end of a line
                        if (!isNewline(idx)) {
                            return -1;
                        } else {
                            break;
                        }
                    }
                    return -1;
                }
                break;

            case OP_ESCAPE:

                // Which escape?
                switch (opdata)
                {
                    // Word boundary match
                    case E_NBOUND:
                    case E_BOUND:
                        {
                            // Outside the input, a newline stands in for the missing neighbour
                            char cLast = ((idx == 0) ? '\n' : search.charAt(idx - 1));
                            char cNext = ((search.isEnd(idx)) ? '\n' : search.charAt(idx));
                            // Both neighbours in the same class means "not a boundary".
                            // NOTE(review): this test uses isLetterOrDigit only, while
                            // E_ALNUM below also accepts '_' - confirm this is intended.
                            if ((Character.isLetterOrDigit(cLast) == Character.isLetterOrDigit(cNext)) == (opdata == E_BOUND))
                            {
                                return -1;
                            }
                        }
                        break;

                    // Alpha-numeric, digit, space, javaLetter, javaLetterOrDigit
                    case E_ALNUM:
                    case E_NALNUM:
                    case E_DIGIT:
                    case E_NDIGIT:
                    case E_SPACE:
                    case E_NSPACE:

                        // Give up if out of input
                        if (search.isEnd(idx))
                        {
                            return -1;
                        }

                        char c = search.charAt(idx);

                        // Switch on escape. Each pair shares one test: the class
                        // membership must equal "is this the positive escape?".
                        switch (opdata)
                        {
                            case E_ALNUM:
                            case E_NALNUM:
                                if (!((Character.isLetterOrDigit(c) || c == '_') == (opdata == E_ALNUM)))
                                {
                                    return -1;
                                }
                                break;

                            case E_DIGIT:
                            case E_NDIGIT:
                                if (!(Character.isDigit(c) == (opdata == E_DIGIT)))
                                {
                                    return -1;
                                }
                                break;

                            case E_SPACE:
                            case E_NSPACE:
                                if (!(Character.isWhitespace(c) == (opdata == E_SPACE)))
                                {
                                    return -1;
                                }
                                break;
                        }
                        // The escape matched one character
                        idx++;
                        break;

                    default:
                        internalError("Unrecognized escape '" + opdata + "'");
                }
                break;

            case OP_ANY:

                if ((matchFlags & MATCH_SINGLELINE) == MATCH_SINGLELINE) {
                    // Match anything
                    if (search.isEnd(idx))
                    {
                        return -1;
                    }
                }
                else
                {
                    // Match anything but a newline
                    if (search.isEnd(idx) || isNewline(idx))
                    {
                        return -1;
                    }
                }
                idx++;
                break;

            case OP_ATOM:
                {
                    // Match an atom value
                    if (search.isEnd(idx))
                    {
                        return -1;
                    }

                    // Get length of atom and starting index
                    // (the atom's characters follow the node header in the instruction array)
                    int lenAtom = opdata;
                    int startAtom = node + nodeSize;

                    // Give up if not enough input remains to have a match
                    if (search.isEnd(lenAtom + idx - 1))
                    {
                        return -1;
                    }

                    // Match atom differently depending on casefolding flag
                    final boolean caseFold =
                        ((matchFlags & MATCH_CASEINDEPENDENT) != 0);

                    for (int i = 0; i < lenAtom; i++)
                    {
                        if (compareChars(search.charAt(idx++), instruction[startAtom + i], caseFold) != 0)
                        {
                            return -1;
                        }
                    }
                }
                break;

            case OP_POSIXCLASS:
                {
                    // Out of input?
                    if (search.isEnd(idx))
                    {
                        return -1;
                    }

                    // opdata selects the class; each case fails unless the
                    // current character belongs to it
                    switch (opdata)
                    {
                        case POSIX_CLASS_ALNUM:
                            if (!Character.isLetterOrDigit(search.charAt(idx)))
                            {
                                return -1;
                            }
                            break;

                        case POSIX_CLASS_ALPHA:
                            if (!Character.isLetter(search.charAt(idx)))
                            {
                                return -1;
                            }
                            break;

                        case POSIX_CLASS_DIGIT:
                            if (!Character.isDigit(search.charAt(idx)))
                            {
                                return -1;
                            }
                            break;

                        case POSIX_CLASS_BLANK: // JWL - bugbug: is this right??
                            if (!Character.isSpaceChar(search.charAt(idx)))
                            {
                                return -1;
                            }
                            break;

                        case POSIX_CLASS_SPACE:
                            if (!Character.isWhitespace(search.charAt(idx)))
                            {
                                return -1;
                            }
                            break;

                        case POSIX_CLASS_CNTRL:
                            if (Character.getType(search.charAt(idx)) != Character.CONTROL)
                            {
                                return -1;
                            }
                            break;

                        case POSIX_CLASS_GRAPH: // JWL - bugbug???
                            // Only the four Unicode symbol categories are accepted here
                            switch (Character.getType(search.charAt(idx)))
                            {
                                case Character.MATH_SYMBOL:
                                case Character.CURRENCY_SYMBOL:
                                case Character.MODIFIER_SYMBOL:
                                case Character.OTHER_SYMBOL:
                                    break;

                                default:
                                    return -1;
                            }
                            break;

                        case POSIX_CLASS_LOWER:
                            if (Character.getType(search.charAt(idx)) != Character.LOWERCASE_LETTER)
                            {
                                return -1;
                            }
                            break;

                        case POSIX_CLASS_UPPER:
                            if (Character.getType(search.charAt(idx)) != Character.UPPERCASE_LETTER)
                            {
                                return -1;
                            }
                            break;

                        case POSIX_CLASS_PRINT:
                            // Everything except control characters is accepted
                            if (Character.getType(search.charAt(idx)) == Character.CONTROL)
                            {
                                return -1;
                            }
                            break;

                        case POSIX_CLASS_PUNCT:
                            {
                                int type = Character.getType(search.charAt(idx));
                                switch(type)
                                {
                                    case Character.DASH_PUNCTUATION:
                                    case Character.START_PUNCTUATION:
                                    case Character.END_PUNCTUATION:
                                    case Character.CONNECTOR_PUNCTUATION:
                                    case Character.OTHER_PUNCTUATION:
                                        break;

                                    default:
                                        return -1;
                                }
                            }
                            break;

                        case POSIX_CLASS_XDIGIT: // JWL - bugbug??
                            {
                                // ASCII hexadecimal digits only
                                boolean isXDigit = ((search.charAt(idx) >= '0' && search.charAt(idx) <= '9') ||
                                                    (search.charAt(idx) >= 'a' && search.charAt(idx) <= 'f') ||
                                                    (search.charAt(idx) >= 'A' && search.charAt(idx) <= 'F'));
                                if (!isXDigit)
                                {
                                    return -1;
                                }
                            }
                            break;

                        case POSIX_CLASS_JSTART:
                            if (!Character.isJavaIdentifierStart(search.charAt(idx)))
                            {
                                return -1;
                            }
                            break;

                        case POSIX_CLASS_JPART:
                            if (!Character.isJavaIdentifierPart(search.charAt(idx)))
                            {
                                return -1;
                            }
                            break;

                        default:
                            internalError("Bad posix class");
                            break;
                    }

                    // Matched.
                    idx++;
                }
                break;

            case OP_ANYOF:
                {
                    // Out of input?
                    if (search.isEnd(idx))
                    {
                        return -1;
                    }

                    // Get character to match against character class and maybe casefold
                    char c = search.charAt(idx);
                    boolean caseFold = (matchFlags & MATCH_CASEINDEPENDENT) != 0;
                    // Loop through character class checking our match character.
                    // The class is stored after the node header as opdata pairs
                    // of (range start, range end) characters.
                    int idxRange = node + nodeSize;
                    int idxEnd = idxRange + (opdata * 2);
                    boolean match = false;
                    for (int i = idxRange; !match && i < idxEnd; )
                    {
                        // Get start, end and match characters
                        char s = instruction[i++];
                        char e = instruction[i++];

                        match = ((compareChars(c, s, caseFold) >= 0)
                                 && (compareChars(c, e, caseFold) <= 0));
                    }

                    // Fail if we didn't match the character class
                    if (!match)
                    {
                        return -1;
                    }
                    idx++;
                }
                break;

            case OP_BRANCH:
            {
                // Check for choices
                if (instruction[next + offsetOpcode] != OP_BRANCH)
                {
                    // If there aren't any other choices, just evaluate this branch.
                    node += nodeSize;
                    continue;
                }

                // Try all available branches
                short nextBranch;
                do
                {
                    // Try matching the branch against the string
                    if ((idxNew = matchNodes(node + nodeSize, maxNode, idx)) != -1)
                    {
                        return idxNew;
                    }

                    // Go to next branch (if any)
                    nextBranch = (short)instruction[node + offsetNext];
                    node += nextBranch;
                }
                while (nextBranch != 0 && (instruction[node + offsetOpcode] == OP_BRANCH));

                // Failed to match any branch!
                return -1;
            }

            case OP_NOTHING:
            case OP_GOTO:

                // Just advance to the next node without doing anything
                break;

            case OP_END:

                // Match has succeeded! Record where the whole match (paren 0) ends.
                setParenEnd(0, idx);
                return idx;

            default:

                // Corrupt program
                internalError("Invalid opcode '" + opcode + "'");
        }

        // Advance to the next node in the program
        node = next;
    }

    // We "should" never end up here
    internalError("Corrupt program");
    return -1;
}
|
|
1343 |
|
|
1344 |
/**
|
|
1345 |
* Match the current regular expression program against the current
|
|
1346 |
* input string, starting at index i of the input string. This method
|
|
1347 |
* is only meant for internal use.
|
|
1348 |
*
|
|
1349 |
* @param i The input string index to start matching at
|
|
1350 |
* @return True if the input matched the expression
|
|
1351 |
*/
|
|
1352 |
protected boolean matchAt(int i)
|
|
1353 |
{
|
|
1354 |
// Initialize start pointer, paren cache and paren count
|
|
1355 |
start0 = -1;
|
|
1356 |
end0 = -1;
|
|
1357 |
start1 = -1;
|
|
1358 |
end1 = -1;
|
|
1359 |
start2 = -1;
|
|
1360 |
end2 = -1;
|
|
1361 |
startn = null;
|
|
1362 |
endn = null;
|
|
1363 |
parenCount = 1;
|
|
1364 |
setParenStart(0, i);
|
|
1365 |
|
|
1366 |
// Allocate backref arrays (unless optimizations indicate otherwise)
|
|
1367 |
if ((program.flags & REProgram.OPT_HASBACKREFS) != 0)
|
|
1368 |
{
|
|
1369 |
startBackref = new int[maxParen];
|
|
1370 |
endBackref = new int[maxParen];
|
|
1371 |
}
|
|
1372 |
|
|
1373 |
// Match against string
|
|
1374 |
int idx;
|
|
1375 |
if ((idx = matchNodes(0, maxNode, i)) != -1)
|
|
1376 |
{
|
|
1377 |
setParenEnd(0, idx);
|
|
1378 |
return true;
|
|
1379 |
}
|
|
1380 |
|
|
1381 |
// Didn't match
|
|
1382 |
parenCount = 0;
|
|
1383 |
return false;
|
|
1384 |
}
|
|
1385 |
|
|
1386 |
/**
|
|
1387 |
* Matches the current regular expression program against a character array,
|
|
1388 |
* starting at a given index.
|
|
1389 |
*
|
|
1390 |
* @param search String to match against
|
|
1391 |
* @param i Index to start searching at
|
|
1392 |
* @return True if string matched
|
|
1393 |
*/
|
|
1394 |
public boolean match(String search, int i)
|
|
1395 |
{
|
|
1396 |
return match(new StringCharacterIterator(search), i);
|
|
1397 |
}
|
|
1398 |
|
|
1399 |
/**
|
|
1400 |
* Matches the current regular expression program against a character array,
|
|
1401 |
* starting at a given index.
|
|
1402 |
*
|
|
1403 |
* @param search String to match against
|
|
1404 |
* @param i Index to start searching at
|
|
1405 |
* @return True if string matched
|
|
1406 |
*/
|
|
1407 |
public boolean match(CharacterIterator search, int i)
|
|
1408 |
{
|
|
1409 |
// There is no compiled program to search with!
|
|
1410 |
if (program == null)
|
|
1411 |
{
|
|
1412 |
// This should be uncommon enough to be an error case rather
|
|
1413 |
// than an exception (which would have to be handled everywhere)
|
|
1414 |
internalError("No RE program to run!");
|
|
1415 |
}
|
|
1416 |
|
|
1417 |
// Save string to search
|
|
1418 |
this.search = search;
|
|
1419 |
|
|
1420 |
// Can we optimize the search by looking for a prefix string?
|
|
1421 |
if (program.prefix == null)
|
|
1422 |
{
|
|
1423 |
// Unprefixed matching must try for a match at each character
|
|
1424 |
for ( ;! search.isEnd(i - 1); i++)
|
|
1425 |
{
|
|
1426 |
// Try a match at index i
|
|
1427 |
if (matchAt(i))
|
|
1428 |
{
|
|
1429 |
return true;
|
|
1430 |
}
|
|
1431 |
}
|
|
1432 |
return false;
|
|
1433 |
}
|
|
1434 |
else
|
|
1435 |
{
|
|
1436 |
// Prefix-anchored matching is possible
|
|
1437 |
boolean caseIndependent = (matchFlags & MATCH_CASEINDEPENDENT) != 0;
|
|
1438 |
char[] prefix = program.prefix;
|
|
1439 |
for ( ; !search.isEnd(i + prefix.length - 1); i++)
|
|
1440 |
{
|
|
1441 |
int j = i;
|
|
1442 |
int k = 0;
|
|
1443 |
|
|
1444 |
boolean match;
|
|
1445 |
do {
|
|
1446 |
// If there's a mismatch of any character in the prefix, give up
|
|
1447 |
match = (compareChars(search.charAt(j++), prefix[k++], caseIndependent) == 0);
|
|
1448 |
} while (match && k < prefix.length);
|
|
1449 |
|
|
1450 |
// See if the whole prefix string matched
|
|
1451 |
if (k == prefix.length)
|
|
1452 |
{
|
|
1453 |
// We matched the full prefix at firstChar, so try it
|
|
1454 |
if (matchAt(i))
|
|
1455 |
{
|
|
1456 |
return true;
|
|
1457 |
}
|
|
1458 |
}
|
|
1459 |
}
|
|
1460 |
return false;
|
|
1461 |
}
|
|
1462 |
}
|
|
1463 |
|
|
1464 |
/**
|
|
1465 |
* Matches the current regular expression program against a String.
|
|
1466 |
*
|
|
1467 |
* @param search String to match against
|
|
1468 |
* @return True if string matched
|
|
1469 |
*/
|
|
1470 |
public boolean match(String search)
|
|
1471 |
{
|
|
1472 |
return match(search, 0);
|
|
1473 |
}
|
|
1474 |
|
|
1475 |
/**
|
|
1476 |
* Splits a string into an array of strings on regular expression boundaries.
|
|
1477 |
* This function works the same way as the Perl function of the same name.
|
|
1478 |
* Given a regular expression of "[ab]+" and a string to split of
|
|
1479 |
* "xyzzyababbayyzabbbab123", the result would be the array of Strings
|
|
1480 |
* "[xyzzy, yyz, 123]".
|
|
1481 |
*
|
|
1482 |
* <p>Please note that the first string in the resulting array may be an empty
|
|
1483 |
* string. This happens when the very first character of input string is
|
|
1484 |
* matched by the pattern.
|
|
1485 |
*
|
|
1486 |
* @param s String to split on this regular exression
|
|
1487 |
* @return Array of strings
|
|
1488 |
*/
|
|
1489 |
public String[] split(String s)
|
|
1490 |
{
|
|
1491 |
// Create new vector
|
|
1492 |
Vector v = new Vector();
|
|
1493 |
|
|
1494 |
// Start at position 0 and search the whole string
|
|
1495 |
int pos = 0;
|
|
1496 |
int len = s.length();
|
|
1497 |
|
|
1498 |
// Try a match at each position
|
|
1499 |
while (pos < len && match(s, pos))
|
|
1500 |
{
|
|
1501 |
// Get start of match
|
|
1502 |
int start = getParenStart(0);
|
|
1503 |
|
|
1504 |
// Get end of match
|
|
1505 |
int newpos = getParenEnd(0);
|
|
1506 |
|
|
1507 |
// Check if no progress was made
|
|
1508 |
if (newpos == pos)
|
|
1509 |
{
|
|
1510 |
v.addElement(s.substring(pos, start + 1));
|
|
1511 |
newpos++;
|
|
1512 |
}
|
|
1513 |
else
|
|
1514 |
{
|
|
1515 |
v.addElement(s.substring(pos, start));
|
|
1516 |
}
|
|
1517 |
|
|
1518 |
// Move to new position
|
|
1519 |
pos = newpos;
|
|
1520 |
}
|
|
1521 |
|
|
1522 |
// Push remainder if it's not empty
|
|
1523 |
String remainder = s.substring(pos);
|
|
1524 |
if (remainder.length() != 0)
|
|
1525 |
{
|
|
1526 |
v.addElement(remainder);
|
|
1527 |
}
|
|
1528 |
|
|
1529 |
// Return vector as an array of strings
|
|
1530 |
String[] ret = new String[v.size()];
|
|
1531 |
v.copyInto(ret);
|
|
1532 |
return ret;
|
|
1533 |
}
|
|
1534 |
|
|
1535 |
/**
|
|
1536 |
* Flag bit that indicates that subst should replace all occurrences of this
|
|
1537 |
* regular expression.
|
|
1538 |
*/
|
|
1539 |
public static final int REPLACE_ALL = 0x0000;
|
|
1540 |
|
|
1541 |
/**
|
|
1542 |
* Flag bit that indicates that subst should only replace the first occurrence
|
|
1543 |
* of this regular expression.
|
|
1544 |
*/
|
|
1545 |
public static final int REPLACE_FIRSTONLY = 0x0001;
|
|
1546 |
|
|
1547 |
/**
|
|
1548 |
* Flag bit that indicates that subst should replace backreferences
|
|
1549 |
*/
|
|
1550 |
public static final int REPLACE_BACKREFERENCES = 0x0002;
|
|
1551 |
|
|
1552 |
/**
|
|
1553 |
* Substitutes a string for this regular expression in another string.
|
|
1554 |
* This method works like the Perl function of the same name.
|
|
1555 |
* Given a regular expression of "a*b", a String to substituteIn of
|
|
1556 |
* "aaaabfooaaabgarplyaaabwackyb" and the substitution String "-", the
|
|
1557 |
* resulting String returned by subst would be "-foo-garply-wacky-".
|
|
1558 |
*
|
|
1559 |
* @param substituteIn String to substitute within
|
|
1560 |
* @param substitution String to substitute for all matches of this regular expression.
|
|
1561 |
* @return The string substituteIn with zero or more occurrences of the current
|
|
1562 |
* regular expression replaced with the substitution String (if this regular
|
|
1563 |
* expression object doesn't match at any position, the original String is returned
|
|
1564 |
* unchanged).
|
|
1565 |
*/
|
|
1566 |
public String subst(String substituteIn, String substitution)
|
|
1567 |
{
|
|
1568 |
return subst(substituteIn, substitution, REPLACE_ALL);
|
|
1569 |
}
|
|
1570 |
|
|
1571 |
/**
|
|
1572 |
* Substitutes a string for this regular expression in another string.
|
|
1573 |
* This method works like the Perl function of the same name.
|
|
1574 |
* Given a regular expression of "a*b", a String to substituteIn of
|
|
1575 |
* "aaaabfooaaabgarplyaaabwackyb" and the substitution String "-", the
|
|
1576 |
* resulting String returned by subst would be "-foo-garply-wacky-".
|
|
1577 |
* <p>
|
|
1578 |
* It is also possible to reference the contents of a parenthesized expression
|
|
1579 |
* with $0, $1, ... $9. A regular expression of "http://[\\.\\w\\-\\?/~_@&=%]+",
|
|
1580 |
* a String to substituteIn of "visit us: http://www.apache.org!" and the
|
|
1581 |
* substitution String "<a href=\"$0\">$0</a>", the resulting String
|
|
1582 |
* returned by subst would be
|
|
1583 |
* "visit us: <a href=\"http://www.apache.org\">http://www.apache.org</a>!".
|
|
1584 |
* <p>
|
|
1585 |
* <i>Note:</i> $0 represents the whole match.
|
|
1586 |
*
|
|
1587 |
* @param substituteIn String to substitute within
|
|
1588 |
* @param substitution String to substitute for matches of this regular expression
|
|
1589 |
* @param flags One or more bitwise flags from REPLACE_*. If the REPLACE_FIRSTONLY
|
|
1590 |
* flag bit is set, only the first occurrence of this regular expression is replaced.
|
|
1591 |
* If the bit is not set (REPLACE_ALL), all occurrences of this pattern will be
|
|
1592 |
* replaced. If the flag REPLACE_BACKREFERENCES is set, all backreferences will
|
|
1593 |
* be processed.
|
|
1594 |
* @return The string substituteIn with zero or more occurrences of the current
|
|
1595 |
* regular expression replaced with the substitution String (if this regular
|
|
1596 |
* expression object doesn't match at any position, the original String is returned
|
|
1597 |
* unchanged).
|
|
1598 |
*/
|
|
1599 |
public String subst(String substituteIn, String substitution, int flags)
|
|
1600 |
{
|
|
1601 |
// String to return
|
|
1602 |
StringBuffer ret = new StringBuffer();
|
|
1603 |
|
|
1604 |
// Start at position 0 and search the whole string
|
|
1605 |
int pos = 0;
|
|
1606 |
int len = substituteIn.length();
|
|
1607 |
|
|
1608 |
// Try a match at each position
|
|
1609 |
while (pos < len && match(substituteIn, pos))
|
|
1610 |
{
|
|
1611 |
// Append string before match
|
|
1612 |
ret.append(substituteIn.substring(pos, getParenStart(0)));
|
|
1613 |
|
|
1614 |
if ((flags & REPLACE_BACKREFERENCES) != 0)
|
|
1615 |
{
|
|
1616 |
// Process backreferences
|
|
1617 |
int lCurrentPosition = 0;
|
|
1618 |
int lLastPosition = -2;
|
|
1619 |
int lLength = substitution.length();
|
|
1620 |
boolean bAddedPrefix = false;
|
|
1621 |
|
|
1622 |
while ((lCurrentPosition = substitution.indexOf("$", lCurrentPosition)) >= 0)
|
|
1623 |
{
|
|
1624 |
if ((lCurrentPosition == 0 || substitution.charAt(lCurrentPosition - 1) != '\\')
|
|
1625 |
&& lCurrentPosition+1 < lLength)
|
|
1626 |
{
|
|
1627 |
char c = substitution.charAt(lCurrentPosition + 1);
|
|
1628 |
if (c >= '0' && c <= '9')
|
|
1629 |
{
|
|
1630 |
if (bAddedPrefix == false)
|
|
1631 |
{
|
|
1632 |
// Append everything between the beginning of the
|
|
1633 |
// substitution string and the current $ sign
|
|
1634 |
ret.append(substitution.substring(0, lCurrentPosition));
|
|
1635 |
bAddedPrefix = true;
|
|
1636 |
}
|
|
1637 |
else
|
|
1638 |
{
|
|
1639 |
// Append everything between the last and the current $ sign
|
|
1640 |
ret.append(substitution.substring(lLastPosition + 2, lCurrentPosition));
|
|
1641 |
}
|
|
1642 |
|
|
1643 |
// Append the parenthesized expression
|
|
1644 |
// Note: if a parenthesized expression of the requested
|
|
1645 |
// index is not available "null" is added to the string
|
|
1646 |
ret.append(getParen(c - '0'));
|
|
1647 |
lLastPosition = lCurrentPosition;
|
|
1648 |
}
|
|
1649 |
}
|
|
1650 |
|
|
1651 |
// Move forward, skipping past match
|
|
1652 |
lCurrentPosition++;
|
|
1653 |
}
|
|
1654 |
|
|
1655 |
// Append everything after the last $ sign
|
|
1656 |
ret.append(substitution.substring(lLastPosition + 2, lLength));
|
|
1657 |
}
|
|
1658 |
else
|
|
1659 |
{
|
|
1660 |
// Append substitution without processing backreferences
|
|
1661 |
ret.append(substitution);
|
|
1662 |
}
|
|
1663 |
|
|
1664 |
// Move forward, skipping past match
|
|
1665 |
int newpos = getParenEnd(0);
|
|
1666 |
|
|
1667 |
// We always want to make progress!
|
|
1668 |
if (newpos == pos)
|
|
1669 |
{
|
|
1670 |
newpos++;
|
|
1671 |
}
|
|
1672 |
|
|
1673 |
// Try new position
|
|
1674 |
pos = newpos;
|
|
1675 |
|
|
1676 |
// Break out if we're only supposed to replace one occurrence
|
|
1677 |
if ((flags & REPLACE_FIRSTONLY) != 0)
|
|
1678 |
{
|
|
1679 |
break;
|
|
1680 |
}
|
|
1681 |
}
|
|
1682 |
|
|
1683 |
// If there's remaining input, append it
|
|
1684 |
if (pos < len)
|
|
1685 |
{
|
|
1686 |
ret.append(substituteIn.substring(pos));
|
|
1687 |
}
|
|
1688 |
|
|
1689 |
// Return string buffer as string
|
|
1690 |
return ret.toString();
|
|
1691 |
}
|
|
1692 |
|
|
1693 |
/**
|
|
1694 |
* Returns an array of Strings, whose toString representation matches a regular
|
|
1695 |
* expression. This method works like the Perl function of the same name. Given
|
|
1696 |
* a regular expression of "a*b" and an array of String objects of [foo, aab, zzz,
|
|
1697 |
* aaaab], the array of Strings returned by grep would be [aab, aaaab].
|
|
1698 |
*
|
|
1699 |
* @param search Array of Objects to search
|
|
1700 |
* @return Array of Strings whose toString() value matches this regular expression.
|
|
1701 |
*/
|
|
1702 |
public String[] grep(Object[] search)
|
|
1703 |
{
|
|
1704 |
// Create new vector to hold return items
|
|
1705 |
Vector v = new Vector();
|
|
1706 |
|
|
1707 |
// Traverse array of objects
|
|
1708 |
for (int i = 0; i < search.length; i++)
|
|
1709 |
{
|
|
1710 |
// Get next object as a string
|
|
1711 |
String s = search[i].toString();
|
|
1712 |
|
|
1713 |
// If it matches this regexp, add it to the list
|
|
1714 |
if (match(s))
|
|
1715 |
{
|
|
1716 |
v.addElement(s);
|
|
1717 |
}
|
|
1718 |
}
|
|
1719 |
|
|
1720 |
// Return vector as an array of strings
|
|
1721 |
String[] ret = new String[v.size()];
|
|
1722 |
v.copyInto(ret);
|
|
1723 |
return ret;
|
|
1724 |
}
|
|
1725 |
|
|
1726 |
/**
|
|
1727 |
* @return true if character at i-th position in the <code>search</code> string is a newline
|
|
1728 |
*/
|
|
1729 |
private boolean isNewline(int i)
|
|
1730 |
{
|
|
1731 |
char nextChar = search.charAt(i);
|
|
1732 |
|
|
1733 |
if (nextChar == '\n' || nextChar == '\r' || nextChar == '\u0085'
|
|
1734 |
|| nextChar == '\u2028' || nextChar == '\u2029')
|
|
1735 |
{
|
|
1736 |
return true;
|
|
1737 |
}
|
|
1738 |
|
|
1739 |
return false;
|
|
1740 |
}
|
|
1741 |
|
|
1742 |
/**
|
|
1743 |
* Compares two characters.
|
|
1744 |
*
|
|
1745 |
* @param c1 first character to compare.
|
|
1746 |
* @param c2 second character to compare.
|
|
1747 |
* @param caseIndependent whether comparision is case insensitive or not.
|
|
1748 |
* @return negative, 0, or positive integer as the first character
|
|
1749 |
* less than, equal to, or greater then the second.
|
|
1750 |
*/
|
|
1751 |
private int compareChars(char c1, char c2, boolean caseIndependent)
|
|
1752 |
{
|
|
1753 |
if (caseIndependent)
|
|
1754 |
{
|
|
1755 |
c1 = Character.toLowerCase(c1);
|
|
1756 |
c2 = Character.toLowerCase(c2);
|
|
1757 |
}
|
|
1758 |
return ((int)c1 - (int)c2);
|
|
1759 |
}
|
|
1760 |
}
|