6
|
1 |
/*
|
|
2 |
* reserved comment block
|
|
3 |
* DO NOT REMOVE OR ALTER!
|
|
4 |
*/
|
|
5 |
/*
|
|
6 |
* Copyright 1999-2004 The Apache Software Foundation.
|
|
7 |
*
|
|
8 |
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
9 |
* you may not use this file except in compliance with the License.
|
|
10 |
* You may obtain a copy of the License at
|
|
11 |
*
|
|
12 |
* http://www.apache.org/licenses/LICENSE-2.0
|
|
13 |
*
|
|
14 |
* Unless required by applicable law or agreed to in writing, software
|
|
15 |
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
16 |
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
17 |
* See the License for the specific language governing permissions and
|
|
18 |
* limitations under the License.
|
|
19 |
*/
|
|
20 |
|
|
21 |
package com.sun.org.apache.regexp.internal;
|
|
22 |
|
|
23 |
import com.sun.org.apache.regexp.internal.RE;
|
|
24 |
import java.util.Hashtable;
|
|
25 |
|
|
26 |
/**
|
|
27 |
* A regular expression compiler class. This class compiles a pattern string into a
|
|
28 |
* regular expression program interpretable by the RE evaluator class. The 'recompile'
|
|
29 |
* command line tool uses this compiler to pre-compile regular expressions for use
|
|
30 |
* with RE. For a description of the syntax accepted by RECompiler and what you can
|
|
31 |
* do with regular expressions, see the documentation for the RE matcher class.
|
|
32 |
*
|
|
33 |
* @see RE
|
|
34 |
* @see recompile
|
|
35 |
*
|
|
36 |
* @author <a href="mailto:jonl@muppetlabs.com">Jonathan Locke</a>
|
|
37 |
* @author <a href="mailto:gholam@xtra.co.nz">Michael McCallum</a>
|
|
38 |
*/
|
|
39 |
public class RECompiler
|
|
40 |
{
|
|
41 |
// The compiled program
|
|
42 |
char[] instruction; // The compiled RE 'program' instruction buffer
|
|
43 |
int lenInstruction; // The amount of the program buffer currently in use
|
|
44 |
|
|
45 |
// Input state for compiling regular expression
|
|
46 |
String pattern; // Input string
|
|
47 |
int len; // Length of the pattern string
|
|
48 |
int idx; // Current input index into ac
|
|
49 |
int parens; // Total number of paren pairs
|
|
50 |
|
|
51 |
// Node flags
|
|
52 |
static final int NODE_NORMAL = 0; // No flags (nothing special)
|
|
53 |
static final int NODE_NULLABLE = 1; // True if node is potentially null
|
|
54 |
static final int NODE_TOPLEVEL = 2; // True if top level expr
|
|
55 |
|
|
56 |
// Special types of 'escapes'
|
|
57 |
static final int ESC_MASK = 0xffff0; // Escape complexity mask
|
|
58 |
static final int ESC_BACKREF = 0xfffff; // Escape is really a backreference
|
|
59 |
static final int ESC_COMPLEX = 0xffffe; // Escape isn't really a true character
|
|
60 |
static final int ESC_CLASS = 0xffffd; // Escape represents a whole class of characters
|
|
61 |
|
|
62 |
// {m,n} stacks
|
|
63 |
int maxBrackets = 10; // Maximum number of bracket pairs
|
|
64 |
static final int bracketUnbounded = -1; // Unbounded value
|
|
65 |
int brackets = 0; // Number of bracket sets
|
|
66 |
int[] bracketStart = null; // Starting point
|
|
67 |
int[] bracketEnd = null; // Ending point
|
|
68 |
int[] bracketMin = null; // Minimum number of matches
|
|
69 |
int[] bracketOpt = null; // Additional optional matches
|
|
70 |
|
|
71 |
// Lookup table for POSIX character class names
|
|
72 |
static Hashtable hashPOSIX = new Hashtable();
|
|
73 |
static
|
|
74 |
{
|
|
75 |
hashPOSIX.put("alnum", new Character(RE.POSIX_CLASS_ALNUM));
|
|
76 |
hashPOSIX.put("alpha", new Character(RE.POSIX_CLASS_ALPHA));
|
|
77 |
hashPOSIX.put("blank", new Character(RE.POSIX_CLASS_BLANK));
|
|
78 |
hashPOSIX.put("cntrl", new Character(RE.POSIX_CLASS_CNTRL));
|
|
79 |
hashPOSIX.put("digit", new Character(RE.POSIX_CLASS_DIGIT));
|
|
80 |
hashPOSIX.put("graph", new Character(RE.POSIX_CLASS_GRAPH));
|
|
81 |
hashPOSIX.put("lower", new Character(RE.POSIX_CLASS_LOWER));
|
|
82 |
hashPOSIX.put("print", new Character(RE.POSIX_CLASS_PRINT));
|
|
83 |
hashPOSIX.put("punct", new Character(RE.POSIX_CLASS_PUNCT));
|
|
84 |
hashPOSIX.put("space", new Character(RE.POSIX_CLASS_SPACE));
|
|
85 |
hashPOSIX.put("upper", new Character(RE.POSIX_CLASS_UPPER));
|
|
86 |
hashPOSIX.put("xdigit", new Character(RE.POSIX_CLASS_XDIGIT));
|
|
87 |
hashPOSIX.put("javastart", new Character(RE.POSIX_CLASS_JSTART));
|
|
88 |
hashPOSIX.put("javapart", new Character(RE.POSIX_CLASS_JPART));
|
|
89 |
}
|
|
90 |
|
|
91 |
/**
|
|
92 |
* Constructor. Creates (initially empty) storage for a regular expression program.
|
|
93 |
*/
|
|
94 |
public RECompiler()
|
|
95 |
{
|
|
96 |
// Start off with a generous, yet reasonable, initial size
|
|
97 |
instruction = new char[128];
|
|
98 |
lenInstruction = 0;
|
|
99 |
}
|
|
100 |
|
|
101 |
/**
|
|
102 |
* Ensures that n more characters can fit in the program buffer.
|
|
103 |
* If n more can't fit, then the size is doubled until it can.
|
|
104 |
* @param n Number of additional characters to ensure will fit.
|
|
105 |
*/
|
|
106 |
void ensure(int n)
|
|
107 |
{
|
|
108 |
// Get current program length
|
|
109 |
int curlen = instruction.length;
|
|
110 |
|
|
111 |
// If the current length + n more is too much
|
|
112 |
if (lenInstruction + n >= curlen)
|
|
113 |
{
|
|
114 |
// Double the size of the program array until n more will fit
|
|
115 |
while (lenInstruction + n >= curlen)
|
|
116 |
{
|
|
117 |
curlen *= 2;
|
|
118 |
}
|
|
119 |
|
|
120 |
// Allocate new program array and move data into it
|
|
121 |
char[] newInstruction = new char[curlen];
|
|
122 |
System.arraycopy(instruction, 0, newInstruction, 0, lenInstruction);
|
|
123 |
instruction = newInstruction;
|
|
124 |
}
|
|
125 |
}
|
|
126 |
|
|
127 |
/**
|
|
128 |
* Emit a single character into the program stream.
|
|
129 |
* @param c Character to add
|
|
130 |
*/
|
|
131 |
void emit(char c)
|
|
132 |
{
|
|
133 |
// Make room for character
|
|
134 |
ensure(1);
|
|
135 |
|
|
136 |
// Add character
|
|
137 |
instruction[lenInstruction++] = c;
|
|
138 |
}
|
|
139 |
|
|
140 |
/**
|
|
141 |
* Inserts a node with a given opcode and opdata at insertAt. The node relative next
|
|
142 |
* pointer is initialized to 0.
|
|
143 |
* @param opcode Opcode for new node
|
|
144 |
* @param opdata Opdata for new node (only the low 16 bits are currently used)
|
|
145 |
* @param insertAt Index at which to insert the new node in the program
|
|
146 |
*/
|
|
147 |
void nodeInsert(char opcode, int opdata, int insertAt)
|
|
148 |
{
|
|
149 |
// Make room for a new node
|
|
150 |
ensure(RE.nodeSize);
|
|
151 |
|
|
152 |
// Move everything from insertAt to the end down nodeSize elements
|
|
153 |
System.arraycopy(instruction, insertAt, instruction, insertAt + RE.nodeSize, lenInstruction - insertAt);
|
|
154 |
instruction[insertAt + RE.offsetOpcode] = opcode;
|
|
155 |
instruction[insertAt + RE.offsetOpdata] = (char)opdata;
|
|
156 |
instruction[insertAt + RE.offsetNext] = 0;
|
|
157 |
lenInstruction += RE.nodeSize;
|
|
158 |
}
|
|
159 |
|
|
160 |
/**
|
|
161 |
* Appends a node to the end of a node chain
|
|
162 |
* @param node Start of node chain to traverse
|
|
163 |
* @param pointTo Node to have the tail of the chain point to
|
|
164 |
*/
|
|
165 |
void setNextOfEnd(int node, int pointTo)
|
|
166 |
{
|
|
167 |
// Traverse the chain until the next offset is 0
|
|
168 |
int next = instruction[node + RE.offsetNext];
|
|
169 |
// while the 'node' is not the last in the chain
|
|
170 |
// and the 'node' is not the last in the program.
|
|
171 |
while ( next != 0 && node < lenInstruction )
|
|
172 |
{
|
|
173 |
// if the node we are supposed to point to is in the chain then
|
|
174 |
// point to the end of the program instead.
|
|
175 |
// Michael McCallum <gholam@xtra.co.nz>
|
|
176 |
// FIXME: // This is a _hack_ to stop infinite programs.
|
|
177 |
// I believe that the implementation of the reluctant matches is wrong but
|
|
178 |
// have not worked out a better way yet.
|
|
179 |
if ( node == pointTo ) {
|
|
180 |
pointTo = lenInstruction;
|
|
181 |
}
|
|
182 |
node += next;
|
|
183 |
next = instruction[node + RE.offsetNext];
|
|
184 |
}
|
|
185 |
// if we have reached the end of the program then dont set the pointTo.
|
|
186 |
// im not sure if this will break any thing but passes all the tests.
|
|
187 |
if ( node < lenInstruction ) {
|
|
188 |
// Point the last node in the chain to pointTo.
|
|
189 |
instruction[node + RE.offsetNext] = (char)(short)(pointTo - node);
|
|
190 |
}
|
|
191 |
}
|
|
192 |
|
|
193 |
/**
|
|
194 |
* Adds a new node
|
|
195 |
* @param opcode Opcode for node
|
|
196 |
* @param opdata Opdata for node (only the low 16 bits are currently used)
|
|
197 |
* @return Index of new node in program
|
|
198 |
*/
|
|
199 |
int node(char opcode, int opdata)
|
|
200 |
{
|
|
201 |
// Make room for a new node
|
|
202 |
ensure(RE.nodeSize);
|
|
203 |
|
|
204 |
// Add new node at end
|
|
205 |
instruction[lenInstruction + RE.offsetOpcode] = opcode;
|
|
206 |
instruction[lenInstruction + RE.offsetOpdata] = (char)opdata;
|
|
207 |
instruction[lenInstruction + RE.offsetNext] = 0;
|
|
208 |
lenInstruction += RE.nodeSize;
|
|
209 |
|
|
210 |
// Return index of new node
|
|
211 |
return lenInstruction - RE.nodeSize;
|
|
212 |
}
|
|
213 |
|
|
214 |
|
|
215 |
/**
|
|
216 |
* Throws a new internal error exception
|
|
217 |
* @exception Error Thrown in the event of an internal error.
|
|
218 |
*/
|
|
219 |
void internalError() throws Error
|
|
220 |
{
|
|
221 |
throw new Error("Internal error!");
|
|
222 |
}
|
|
223 |
|
|
224 |
/**
|
|
225 |
* Throws a new syntax error exception
|
|
226 |
* @exception RESyntaxException Thrown if the regular expression has invalid syntax.
|
|
227 |
*/
|
|
228 |
void syntaxError(String s) throws RESyntaxException
|
|
229 |
{
|
|
230 |
throw new RESyntaxException(s);
|
|
231 |
}
|
|
232 |
|
|
233 |
/**
|
|
234 |
* Allocate storage for brackets only as needed
|
|
235 |
*/
|
|
236 |
void allocBrackets()
|
|
237 |
{
|
|
238 |
// Allocate bracket stacks if not already done
|
|
239 |
if (bracketStart == null)
|
|
240 |
{
|
|
241 |
// Allocate storage
|
|
242 |
bracketStart = new int[maxBrackets];
|
|
243 |
bracketEnd = new int[maxBrackets];
|
|
244 |
bracketMin = new int[maxBrackets];
|
|
245 |
bracketOpt = new int[maxBrackets];
|
|
246 |
|
|
247 |
// Initialize to invalid values
|
|
248 |
for (int i = 0; i < maxBrackets; i++)
|
|
249 |
{
|
|
250 |
bracketStart[i] = bracketEnd[i] = bracketMin[i] = bracketOpt[i] = -1;
|
|
251 |
}
|
|
252 |
}
|
|
253 |
}
|
|
254 |
|
|
255 |
/** Enlarge storage for brackets only as needed. */
|
|
256 |
synchronized void reallocBrackets() {
|
|
257 |
// trick the tricky
|
|
258 |
if (bracketStart == null) {
|
|
259 |
allocBrackets();
|
|
260 |
}
|
|
261 |
|
|
262 |
int new_size = maxBrackets * 2;
|
|
263 |
int[] new_bS = new int[new_size];
|
|
264 |
int[] new_bE = new int[new_size];
|
|
265 |
int[] new_bM = new int[new_size];
|
|
266 |
int[] new_bO = new int[new_size];
|
|
267 |
// Initialize to invalid values
|
|
268 |
for (int i=brackets; i<new_size; i++) {
|
|
269 |
new_bS[i] = new_bE[i] = new_bM[i] = new_bO[i] = -1;
|
|
270 |
}
|
|
271 |
System.arraycopy(bracketStart,0, new_bS,0, brackets);
|
|
272 |
System.arraycopy(bracketEnd,0, new_bE,0, brackets);
|
|
273 |
System.arraycopy(bracketMin,0, new_bM,0, brackets);
|
|
274 |
System.arraycopy(bracketOpt,0, new_bO,0, brackets);
|
|
275 |
bracketStart = new_bS;
|
|
276 |
bracketEnd = new_bE;
|
|
277 |
bracketMin = new_bM;
|
|
278 |
bracketOpt = new_bO;
|
|
279 |
maxBrackets = new_size;
|
|
280 |
}
|
|
281 |
|
|
282 |
/**
|
|
283 |
* Match bracket {m,n} expression put results in bracket member variables
|
|
284 |
* @exception RESyntaxException Thrown if the regular expression has invalid syntax.
|
|
285 |
*/
|
|
286 |
void bracket() throws RESyntaxException
|
|
287 |
{
|
|
288 |
// Current character must be a '{'
|
|
289 |
if (idx >= len || pattern.charAt(idx++) != '{')
|
|
290 |
{
|
|
291 |
internalError();
|
|
292 |
}
|
|
293 |
|
|
294 |
// Next char must be a digit
|
|
295 |
if (idx >= len || !Character.isDigit(pattern.charAt(idx)))
|
|
296 |
{
|
|
297 |
syntaxError("Expected digit");
|
|
298 |
}
|
|
299 |
|
|
300 |
// Get min ('m' of {m,n}) number
|
|
301 |
StringBuffer number = new StringBuffer();
|
|
302 |
while (idx < len && Character.isDigit(pattern.charAt(idx)))
|
|
303 |
{
|
|
304 |
number.append(pattern.charAt(idx++));
|
|
305 |
}
|
|
306 |
try
|
|
307 |
{
|
|
308 |
bracketMin[brackets] = Integer.parseInt(number.toString());
|
|
309 |
}
|
|
310 |
catch (NumberFormatException e)
|
|
311 |
{
|
|
312 |
syntaxError("Expected valid number");
|
|
313 |
}
|
|
314 |
|
|
315 |
// If out of input, fail
|
|
316 |
if (idx >= len)
|
|
317 |
{
|
|
318 |
syntaxError("Expected comma or right bracket");
|
|
319 |
}
|
|
320 |
|
|
321 |
// If end of expr, optional limit is 0
|
|
322 |
if (pattern.charAt(idx) == '}')
|
|
323 |
{
|
|
324 |
idx++;
|
|
325 |
bracketOpt[brackets] = 0;
|
|
326 |
return;
|
|
327 |
}
|
|
328 |
|
|
329 |
// Must have at least {m,} and maybe {m,n}.
|
|
330 |
if (idx >= len || pattern.charAt(idx++) != ',')
|
|
331 |
{
|
|
332 |
syntaxError("Expected comma");
|
|
333 |
}
|
|
334 |
|
|
335 |
// If out of input, fail
|
|
336 |
if (idx >= len)
|
|
337 |
{
|
|
338 |
syntaxError("Expected comma or right bracket");
|
|
339 |
}
|
|
340 |
|
|
341 |
// If {m,} max is unlimited
|
|
342 |
if (pattern.charAt(idx) == '}')
|
|
343 |
{
|
|
344 |
idx++;
|
|
345 |
bracketOpt[brackets] = bracketUnbounded;
|
|
346 |
return;
|
|
347 |
}
|
|
348 |
|
|
349 |
// Next char must be a digit
|
|
350 |
if (idx >= len || !Character.isDigit(pattern.charAt(idx)))
|
|
351 |
{
|
|
352 |
syntaxError("Expected digit");
|
|
353 |
}
|
|
354 |
|
|
355 |
// Get max number
|
|
356 |
number.setLength(0);
|
|
357 |
while (idx < len && Character.isDigit(pattern.charAt(idx)))
|
|
358 |
{
|
|
359 |
number.append(pattern.charAt(idx++));
|
|
360 |
}
|
|
361 |
try
|
|
362 |
{
|
|
363 |
bracketOpt[brackets] = Integer.parseInt(number.toString()) - bracketMin[brackets];
|
|
364 |
}
|
|
365 |
catch (NumberFormatException e)
|
|
366 |
{
|
|
367 |
syntaxError("Expected valid number");
|
|
368 |
}
|
|
369 |
|
|
370 |
// Optional repetitions must be >= 0
|
|
371 |
if (bracketOpt[brackets] < 0)
|
|
372 |
{
|
|
373 |
syntaxError("Bad range");
|
|
374 |
}
|
|
375 |
|
|
376 |
// Must have close brace
|
|
377 |
if (idx >= len || pattern.charAt(idx++) != '}')
|
|
378 |
{
|
|
379 |
syntaxError("Missing close brace");
|
|
380 |
}
|
|
381 |
}
|
|
382 |
|
|
383 |
/**
|
|
384 |
* Match an escape sequence. Handles quoted chars and octal escapes as well
|
|
385 |
* as normal escape characters. Always advances the input stream by the
|
|
386 |
* right amount. This code "understands" the subtle difference between an
|
|
387 |
* octal escape and a backref. You can access the type of ESC_CLASS or
|
|
388 |
* ESC_COMPLEX or ESC_BACKREF by looking at pattern[idx - 1].
|
|
389 |
* @return ESC_* code or character if simple escape
|
|
390 |
* @exception RESyntaxException Thrown if the regular expression has invalid syntax.
|
|
391 |
*/
|
|
392 |
int escape() throws RESyntaxException
|
|
393 |
{
|
|
394 |
// "Shouldn't" happen
|
|
395 |
if (pattern.charAt(idx) != '\\')
|
|
396 |
{
|
|
397 |
internalError();
|
|
398 |
}
|
|
399 |
|
|
400 |
// Escape shouldn't occur as last character in string!
|
|
401 |
if (idx + 1 == len)
|
|
402 |
{
|
|
403 |
syntaxError("Escape terminates string");
|
|
404 |
}
|
|
405 |
|
|
406 |
// Switch on character after backslash
|
|
407 |
idx += 2;
|
|
408 |
char escapeChar = pattern.charAt(idx - 1);
|
|
409 |
switch (escapeChar)
|
|
410 |
{
|
|
411 |
case RE.E_BOUND:
|
|
412 |
case RE.E_NBOUND:
|
|
413 |
return ESC_COMPLEX;
|
|
414 |
|
|
415 |
case RE.E_ALNUM:
|
|
416 |
case RE.E_NALNUM:
|
|
417 |
case RE.E_SPACE:
|
|
418 |
case RE.E_NSPACE:
|
|
419 |
case RE.E_DIGIT:
|
|
420 |
case RE.E_NDIGIT:
|
|
421 |
return ESC_CLASS;
|
|
422 |
|
|
423 |
case 'u':
|
|
424 |
case 'x':
|
|
425 |
{
|
|
426 |
// Exact required hex digits for escape type
|
|
427 |
int hexDigits = (escapeChar == 'u' ? 4 : 2);
|
|
428 |
|
|
429 |
// Parse up to hexDigits characters from input
|
|
430 |
int val = 0;
|
|
431 |
for ( ; idx < len && hexDigits-- > 0; idx++)
|
|
432 |
{
|
|
433 |
// Get char
|
|
434 |
char c = pattern.charAt(idx);
|
|
435 |
|
|
436 |
// If it's a hexadecimal digit (0-9)
|
|
437 |
if (c >= '0' && c <= '9')
|
|
438 |
{
|
|
439 |
// Compute new value
|
|
440 |
val = (val << 4) + c - '0';
|
|
441 |
}
|
|
442 |
else
|
|
443 |
{
|
|
444 |
// If it's a hexadecimal letter (a-f)
|
|
445 |
c = Character.toLowerCase(c);
|
|
446 |
if (c >= 'a' && c <= 'f')
|
|
447 |
{
|
|
448 |
// Compute new value
|
|
449 |
val = (val << 4) + (c - 'a') + 10;
|
|
450 |
}
|
|
451 |
else
|
|
452 |
{
|
|
453 |
// If it's not a valid digit or hex letter, the escape must be invalid
|
|
454 |
// because hexDigits of input have not been absorbed yet.
|
|
455 |
syntaxError("Expected " + hexDigits + " hexadecimal digits after \\" + escapeChar);
|
|
456 |
}
|
|
457 |
}
|
|
458 |
}
|
|
459 |
return val;
|
|
460 |
}
|
|
461 |
|
|
462 |
case 't':
|
|
463 |
return '\t';
|
|
464 |
|
|
465 |
case 'n':
|
|
466 |
return '\n';
|
|
467 |
|
|
468 |
case 'r':
|
|
469 |
return '\r';
|
|
470 |
|
|
471 |
case 'f':
|
|
472 |
return '\f';
|
|
473 |
|
|
474 |
case '0':
|
|
475 |
case '1':
|
|
476 |
case '2':
|
|
477 |
case '3':
|
|
478 |
case '4':
|
|
479 |
case '5':
|
|
480 |
case '6':
|
|
481 |
case '7':
|
|
482 |
case '8':
|
|
483 |
case '9':
|
|
484 |
|
|
485 |
// An octal escape starts with a 0 or has two digits in a row
|
|
486 |
if ((idx < len && Character.isDigit(pattern.charAt(idx))) || escapeChar == '0')
|
|
487 |
{
|
|
488 |
// Handle \nnn octal escapes
|
|
489 |
int val = escapeChar - '0';
|
|
490 |
if (idx < len && Character.isDigit(pattern.charAt(idx)))
|
|
491 |
{
|
|
492 |
val = ((val << 3) + (pattern.charAt(idx++) - '0'));
|
|
493 |
if (idx < len && Character.isDigit(pattern.charAt(idx)))
|
|
494 |
{
|
|
495 |
val = ((val << 3) + (pattern.charAt(idx++) - '0'));
|
|
496 |
}
|
|
497 |
}
|
|
498 |
return val;
|
|
499 |
}
|
|
500 |
|
|
501 |
// It's actually a backreference (\[1-9]), not an escape
|
|
502 |
return ESC_BACKREF;
|
|
503 |
|
|
504 |
default:
|
|
505 |
|
|
506 |
// Simple quoting of a character
|
|
507 |
return escapeChar;
|
|
508 |
}
|
|
509 |
}
|
|
510 |
|
|
511 |
/**
|
|
512 |
* Compile a character class
|
|
513 |
* @return Index of class node
|
|
514 |
* @exception RESyntaxException Thrown if the regular expression has invalid syntax.
|
|
515 |
*/
|
|
516 |
int characterClass() throws RESyntaxException
|
|
517 |
{
|
|
518 |
// Check for bad calling or empty class
|
|
519 |
if (pattern.charAt(idx) != '[')
|
|
520 |
{
|
|
521 |
internalError();
|
|
522 |
}
|
|
523 |
|
|
524 |
// Check for unterminated or empty class
|
|
525 |
if ((idx + 1) >= len || pattern.charAt(++idx) == ']')
|
|
526 |
{
|
|
527 |
syntaxError("Empty or unterminated class");
|
|
528 |
}
|
|
529 |
|
|
530 |
// Check for POSIX character class
|
|
531 |
if (idx < len && pattern.charAt(idx) == ':')
|
|
532 |
{
|
|
533 |
// Skip colon
|
|
534 |
idx++;
|
|
535 |
|
|
536 |
// POSIX character classes are denoted with lowercase ASCII strings
|
|
537 |
int idxStart = idx;
|
|
538 |
while (idx < len && pattern.charAt(idx) >= 'a' && pattern.charAt(idx) <= 'z')
|
|
539 |
{
|
|
540 |
idx++;
|
|
541 |
}
|
|
542 |
|
|
543 |
// Should be a ":]" to terminate the POSIX character class
|
|
544 |
if ((idx + 1) < len && pattern.charAt(idx) == ':' && pattern.charAt(idx + 1) == ']')
|
|
545 |
{
|
|
546 |
// Get character class
|
|
547 |
String charClass = pattern.substring(idxStart, idx);
|
|
548 |
|
|
549 |
// Select the POSIX class id
|
|
550 |
Character i = (Character)hashPOSIX.get(charClass);
|
|
551 |
if (i != null)
|
|
552 |
{
|
|
553 |
// Move past colon and right bracket
|
|
554 |
idx += 2;
|
|
555 |
|
|
556 |
// Return new POSIX character class node
|
|
557 |
return node(RE.OP_POSIXCLASS, i.charValue());
|
|
558 |
}
|
|
559 |
syntaxError("Invalid POSIX character class '" + charClass + "'");
|
|
560 |
}
|
|
561 |
syntaxError("Invalid POSIX character class syntax");
|
|
562 |
}
|
|
563 |
|
|
564 |
// Try to build a class. Create OP_ANYOF node
|
|
565 |
int ret = node(RE.OP_ANYOF, 0);
|
|
566 |
|
|
567 |
// Parse class declaration
|
|
568 |
char CHAR_INVALID = Character.MAX_VALUE;
|
|
569 |
char last = CHAR_INVALID;
|
|
570 |
char simpleChar = 0;
|
|
571 |
boolean include = true;
|
|
572 |
boolean definingRange = false;
|
|
573 |
int idxFirst = idx;
|
|
574 |
char rangeStart = Character.MIN_VALUE;
|
|
575 |
char rangeEnd;
|
|
576 |
RERange range = new RERange();
|
|
577 |
while (idx < len && pattern.charAt(idx) != ']')
|
|
578 |
{
|
|
579 |
|
|
580 |
switchOnCharacter:
|
|
581 |
|
|
582 |
// Switch on character
|
|
583 |
switch (pattern.charAt(idx))
|
|
584 |
{
|
|
585 |
case '^':
|
|
586 |
include = !include;
|
|
587 |
if (idx == idxFirst)
|
|
588 |
{
|
|
589 |
range.include(Character.MIN_VALUE, Character.MAX_VALUE, true);
|
|
590 |
}
|
|
591 |
idx++;
|
|
592 |
continue;
|
|
593 |
|
|
594 |
case '\\':
|
|
595 |
{
|
|
596 |
// Escape always advances the stream
|
|
597 |
int c;
|
|
598 |
switch (c = escape ())
|
|
599 |
{
|
|
600 |
case ESC_COMPLEX:
|
|
601 |
case ESC_BACKREF:
|
|
602 |
|
|
603 |
// Word boundaries and backrefs not allowed in a character class!
|
|
604 |
syntaxError("Bad character class");
|
|
605 |
|
|
606 |
case ESC_CLASS:
|
|
607 |
|
|
608 |
// Classes can't be an endpoint of a range
|
|
609 |
if (definingRange)
|
|
610 |
{
|
|
611 |
syntaxError("Bad character class");
|
|
612 |
}
|
|
613 |
|
|
614 |
// Handle specific type of class (some are ok)
|
|
615 |
switch (pattern.charAt(idx - 1))
|
|
616 |
{
|
|
617 |
case RE.E_NSPACE:
|
|
618 |
case RE.E_NDIGIT:
|
|
619 |
case RE.E_NALNUM:
|
|
620 |
syntaxError("Bad character class");
|
|
621 |
|
|
622 |
case RE.E_SPACE:
|
|
623 |
range.include('\t', include);
|
|
624 |
range.include('\r', include);
|
|
625 |
range.include('\f', include);
|
|
626 |
range.include('\n', include);
|
|
627 |
range.include('\b', include);
|
|
628 |
range.include(' ', include);
|
|
629 |
break;
|
|
630 |
|
|
631 |
case RE.E_ALNUM:
|
|
632 |
range.include('a', 'z', include);
|
|
633 |
range.include('A', 'Z', include);
|
|
634 |
range.include('_', include);
|
|
635 |
|
|
636 |
// Fall through!
|
|
637 |
|
|
638 |
case RE.E_DIGIT:
|
|
639 |
range.include('0', '9', include);
|
|
640 |
break;
|
|
641 |
}
|
|
642 |
|
|
643 |
// Make last char invalid (can't be a range start)
|
|
644 |
last = CHAR_INVALID;
|
|
645 |
break;
|
|
646 |
|
|
647 |
default:
|
|
648 |
|
|
649 |
// Escape is simple so treat as a simple char
|
|
650 |
simpleChar = (char) c;
|
|
651 |
break switchOnCharacter;
|
|
652 |
}
|
|
653 |
}
|
|
654 |
continue;
|
|
655 |
|
|
656 |
case '-':
|
|
657 |
|
|
658 |
// Start a range if one isn't already started
|
|
659 |
if (definingRange)
|
|
660 |
{
|
|
661 |
syntaxError("Bad class range");
|
|
662 |
}
|
|
663 |
definingRange = true;
|
|
664 |
|
|
665 |
// If no last character, start of range is 0
|
|
666 |
rangeStart = (last == CHAR_INVALID ? 0 : last);
|
|
667 |
|
|
668 |
// Premature end of range. define up to Character.MAX_VALUE
|
|
669 |
if ((idx + 1) < len && pattern.charAt(++idx) == ']')
|
|
670 |
{
|
|
671 |
simpleChar = Character.MAX_VALUE;
|
|
672 |
break;
|
|
673 |
}
|
|
674 |
continue;
|
|
675 |
|
|
676 |
default:
|
|
677 |
simpleChar = pattern.charAt(idx++);
|
|
678 |
break;
|
|
679 |
}
|
|
680 |
|
|
681 |
// Handle simple character simpleChar
|
|
682 |
if (definingRange)
|
|
683 |
{
|
|
684 |
// if we are defining a range make it now
|
|
685 |
rangeEnd = simpleChar;
|
|
686 |
|
|
687 |
// Actually create a range if the range is ok
|
|
688 |
if (rangeStart >= rangeEnd)
|
|
689 |
{
|
|
690 |
syntaxError("Bad character class");
|
|
691 |
}
|
|
692 |
range.include(rangeStart, rangeEnd, include);
|
|
693 |
|
|
694 |
// We are done defining the range
|
|
695 |
last = CHAR_INVALID;
|
|
696 |
definingRange = false;
|
|
697 |
}
|
|
698 |
else
|
|
699 |
{
|
|
700 |
// If simple character and not start of range, include it
|
|
701 |
if (idx >= len || pattern.charAt(idx) != '-')
|
|
702 |
{
|
|
703 |
range.include(simpleChar, include);
|
|
704 |
}
|
|
705 |
last = simpleChar;
|
|
706 |
}
|
|
707 |
}
|
|
708 |
|
|
709 |
// Shouldn't be out of input
|
|
710 |
if (idx == len)
|
|
711 |
{
|
|
712 |
syntaxError("Unterminated character class");
|
|
713 |
}
|
|
714 |
|
|
715 |
// Absorb the ']' end of class marker
|
|
716 |
idx++;
|
|
717 |
|
|
718 |
// Emit character class definition
|
|
719 |
instruction[ret + RE.offsetOpdata] = (char)range.num;
|
|
720 |
for (int i = 0; i < range.num; i++)
|
|
721 |
{
|
|
722 |
emit((char)range.minRange[i]);
|
|
723 |
emit((char)range.maxRange[i]);
|
|
724 |
}
|
|
725 |
return ret;
|
|
726 |
}
|
|
727 |
|
|
728 |
/**
|
|
729 |
* Absorb an atomic character string. This method is a little tricky because
|
|
730 |
* it can un-include the last character of string if a closure operator follows.
|
|
731 |
* This is correct because *+? have higher precedence than concatentation (thus
|
|
732 |
* ABC* means AB(C*) and NOT (ABC)*).
|
|
733 |
* @return Index of new atom node
|
|
734 |
* @exception RESyntaxException Thrown if the regular expression has invalid syntax.
|
|
735 |
*/
|
|
736 |
int atom() throws RESyntaxException
|
|
737 |
{
|
|
738 |
// Create a string node
|
|
739 |
int ret = node(RE.OP_ATOM, 0);
|
|
740 |
|
|
741 |
// Length of atom
|
|
742 |
int lenAtom = 0;
|
|
743 |
|
|
744 |
// Loop while we've got input
|
|
745 |
|
|
746 |
atomLoop:
|
|
747 |
|
|
748 |
while (idx < len)
|
|
749 |
{
|
|
750 |
// Is there a next char?
|
|
751 |
if ((idx + 1) < len)
|
|
752 |
{
|
|
753 |
char c = pattern.charAt(idx + 1);
|
|
754 |
|
|
755 |
// If the next 'char' is an escape, look past the whole escape
|
|
756 |
if (pattern.charAt(idx) == '\\')
|
|
757 |
{
|
|
758 |
int idxEscape = idx;
|
|
759 |
escape();
|
|
760 |
if (idx < len)
|
|
761 |
{
|
|
762 |
c = pattern.charAt(idx);
|
|
763 |
}
|
|
764 |
idx = idxEscape;
|
|
765 |
}
|
|
766 |
|
|
767 |
// Switch on next char
|
|
768 |
switch (c)
|
|
769 |
{
|
|
770 |
case '{':
|
|
771 |
case '?':
|
|
772 |
case '*':
|
|
773 |
case '+':
|
|
774 |
|
|
775 |
// If the next character is a closure operator and our atom is non-empty, the
|
|
776 |
// current character should bind to the closure operator rather than the atom
|
|
777 |
if (lenAtom != 0)
|
|
778 |
{
|
|
779 |
break atomLoop;
|
|
780 |
}
|
|
781 |
}
|
|
782 |
}
|
|
783 |
|
|
784 |
// Switch on current char
|
|
785 |
switch (pattern.charAt(idx))
|
|
786 |
{
|
|
787 |
case ']':
|
|
788 |
case '^':
|
|
789 |
case '$':
|
|
790 |
case '.':
|
|
791 |
case '[':
|
|
792 |
case '(':
|
|
793 |
case ')':
|
|
794 |
case '|':
|
|
795 |
break atomLoop;
|
|
796 |
|
|
797 |
case '{':
|
|
798 |
case '?':
|
|
799 |
case '*':
|
|
800 |
case '+':
|
|
801 |
|
|
802 |
// We should have an atom by now
|
|
803 |
if (lenAtom == 0)
|
|
804 |
{
|
|
805 |
// No atom before closure
|
|
806 |
syntaxError("Missing operand to closure");
|
|
807 |
}
|
|
808 |
break atomLoop;
|
|
809 |
|
|
810 |
case '\\':
|
|
811 |
|
|
812 |
{
|
|
813 |
// Get the escaped character (advances input automatically)
|
|
814 |
int idxBeforeEscape = idx;
|
|
815 |
int c = escape();
|
|
816 |
|
|
817 |
// Check if it's a simple escape (as opposed to, say, a backreference)
|
|
818 |
if ((c & ESC_MASK) == ESC_MASK)
|
|
819 |
{
|
|
820 |
// Not a simple escape, so backup to where we were before the escape.
|
|
821 |
idx = idxBeforeEscape;
|
|
822 |
break atomLoop;
|
|
823 |
}
|
|
824 |
|
|
825 |
// Add escaped char to atom
|
|
826 |
emit((char) c);
|
|
827 |
lenAtom++;
|
|
828 |
}
|
|
829 |
break;
|
|
830 |
|
|
831 |
default:
|
|
832 |
|
|
833 |
// Add normal character to atom
|
|
834 |
emit(pattern.charAt(idx++));
|
|
835 |
lenAtom++;
|
|
836 |
break;
|
|
837 |
}
|
|
838 |
}
|
|
839 |
|
|
840 |
// This "shouldn't" happen
|
|
841 |
if (lenAtom == 0)
|
|
842 |
{
|
|
843 |
internalError();
|
|
844 |
}
|
|
845 |
|
|
846 |
// Emit the atom length into the program
|
|
847 |
instruction[ret + RE.offsetOpdata] = (char)lenAtom;
|
|
848 |
return ret;
|
|
849 |
}
|
|
850 |
|
|
851 |
/**
|
|
852 |
* Match a terminal node.
|
|
853 |
* @param flags Flags
|
|
854 |
* @return Index of terminal node (closeable)
|
|
855 |
* @exception RESyntaxException Thrown if the regular expression has invalid syntax.
|
|
856 |
*/
|
|
857 |
int terminal(int[] flags) throws RESyntaxException
|
|
858 |
{
|
|
859 |
switch (pattern.charAt(idx))
|
|
860 |
{
|
|
861 |
case RE.OP_EOL:
|
|
862 |
case RE.OP_BOL:
|
|
863 |
case RE.OP_ANY:
|
|
864 |
return node(pattern.charAt(idx++), 0);
|
|
865 |
|
|
866 |
case '[':
|
|
867 |
return characterClass();
|
|
868 |
|
|
869 |
case '(':
|
|
870 |
return expr(flags);
|
|
871 |
|
|
872 |
case ')':
|
|
873 |
syntaxError("Unexpected close paren");
|
|
874 |
|
|
875 |
case '|':
|
|
876 |
internalError();
|
|
877 |
|
|
878 |
case ']':
|
|
879 |
syntaxError("Mismatched class");
|
|
880 |
|
|
881 |
case 0:
|
|
882 |
syntaxError("Unexpected end of input");
|
|
883 |
|
|
884 |
case '?':
|
|
885 |
case '+':
|
|
886 |
case '{':
|
|
887 |
case '*':
|
|
888 |
syntaxError("Missing operand to closure");
|
|
889 |
|
|
890 |
case '\\':
|
|
891 |
{
|
|
892 |
// Don't forget, escape() advances the input stream!
|
|
893 |
int idxBeforeEscape = idx;
|
|
894 |
|
|
895 |
// Switch on escaped character
|
|
896 |
switch (escape())
|
|
897 |
{
|
|
898 |
case ESC_CLASS:
|
|
899 |
case ESC_COMPLEX:
|
|
900 |
flags[0] &= ~NODE_NULLABLE;
|
|
901 |
return node(RE.OP_ESCAPE, pattern.charAt(idx - 1));
|
|
902 |
|
|
903 |
case ESC_BACKREF:
|
|
904 |
{
|
|
905 |
char backreference = (char)(pattern.charAt(idx - 1) - '0');
|
|
906 |
if (parens <= backreference)
|
|
907 |
{
|
|
908 |
syntaxError("Bad backreference");
|
|
909 |
}
|
|
910 |
flags[0] |= NODE_NULLABLE;
|
|
911 |
return node(RE.OP_BACKREF, backreference);
|
|
912 |
}
|
|
913 |
|
|
914 |
default:
|
|
915 |
|
|
916 |
// We had a simple escape and we want to have it end up in
|
|
917 |
// an atom, so we back up and fall though to the default handling
|
|
918 |
idx = idxBeforeEscape;
|
|
919 |
flags[0] &= ~NODE_NULLABLE;
|
|
920 |
break;
|
|
921 |
}
|
|
922 |
}
|
|
923 |
}
|
|
924 |
|
|
925 |
// Everything above either fails or returns.
|
|
926 |
// If it wasn't one of the above, it must be the start of an atom.
|
|
927 |
flags[0] &= ~NODE_NULLABLE;
|
|
928 |
return atom();
|
|
929 |
}
|
|
930 |
|
|
931 |
/**
|
|
932 |
* Compile a possibly closured terminal
|
|
933 |
* @param flags Flags passed by reference
|
|
934 |
* @return Index of closured node
|
|
935 |
* @exception RESyntaxException Thrown if the regular expression has invalid syntax.
|
|
936 |
*/
|
|
937 |
int closure(int[] flags) throws RESyntaxException
|
|
938 |
{
|
|
939 |
// Before terminal
|
|
940 |
int idxBeforeTerminal = idx;
|
|
941 |
|
|
942 |
// Values to pass by reference to terminal()
|
|
943 |
int[] terminalFlags = { NODE_NORMAL };
|
|
944 |
|
|
945 |
// Get terminal symbol
|
|
946 |
int ret = terminal(terminalFlags);
|
|
947 |
|
|
948 |
// Or in flags from terminal symbol
|
|
949 |
flags[0] |= terminalFlags[0];
|
|
950 |
|
|
951 |
// Advance input, set NODE_NULLABLE flag and do sanity checks
|
|
952 |
if (idx >= len)
|
|
953 |
{
|
|
954 |
return ret;
|
|
955 |
}
|
|
956 |
boolean greedy = true;
|
|
957 |
char closureType = pattern.charAt(idx);
|
|
958 |
switch (closureType)
|
|
959 |
{
|
|
960 |
case '?':
|
|
961 |
case '*':
|
|
962 |
|
|
963 |
// The current node can be null
|
|
964 |
flags[0] |= NODE_NULLABLE;
|
|
965 |
|
|
966 |
case '+':
|
|
967 |
|
|
968 |
// Eat closure character
|
|
969 |
idx++;
|
|
970 |
|
|
971 |
case '{':
|
|
972 |
|
|
973 |
// Don't allow blantant stupidity
|
|
974 |
int opcode = instruction[ret + RE.offsetOpcode];
|
|
975 |
if (opcode == RE.OP_BOL || opcode == RE.OP_EOL)
|
|
976 |
{
|
|
977 |
syntaxError("Bad closure operand");
|
|
978 |
}
|
|
979 |
if ((terminalFlags[0] & NODE_NULLABLE) != 0)
|
|
980 |
{
|
|
981 |
syntaxError("Closure operand can't be nullable");
|
|
982 |
}
|
|
983 |
break;
|
|
984 |
}
|
|
985 |
|
|
986 |
// If the next character is a '?', make the closure non-greedy (reluctant)
|
|
987 |
if (idx < len && pattern.charAt(idx) == '?')
|
|
988 |
{
|
|
989 |
idx++;
|
|
990 |
greedy = false;
|
|
991 |
}
|
|
992 |
|
|
993 |
if (greedy)
|
|
994 |
{
|
|
995 |
// Actually do the closure now
|
|
996 |
switch (closureType)
|
|
997 |
{
|
|
998 |
case '{':
|
|
999 |
{
|
|
1000 |
// We look for our bracket in the list
|
|
1001 |
boolean found = false;
|
|
1002 |
int i;
|
|
1003 |
allocBrackets();
|
|
1004 |
for (i = 0; i < brackets; i++)
|
|
1005 |
{
|
|
1006 |
if (bracketStart[i] == idx)
|
|
1007 |
{
|
|
1008 |
found = true;
|
|
1009 |
break;
|
|
1010 |
}
|
|
1011 |
}
|
|
1012 |
|
|
1013 |
// If its not in the list we parse the {m,n}
|
|
1014 |
if (!found)
|
|
1015 |
{
|
|
1016 |
if (brackets >= maxBrackets)
|
|
1017 |
{
|
|
1018 |
reallocBrackets();
|
|
1019 |
}
|
|
1020 |
bracketStart[brackets] = idx;
|
|
1021 |
bracket();
|
|
1022 |
bracketEnd[brackets] = idx;
|
|
1023 |
i = brackets++;
|
|
1024 |
}
|
|
1025 |
|
|
1026 |
// Process min first
|
|
1027 |
if (bracketMin[i]-- > 0)
|
|
1028 |
{
|
|
1029 |
if (bracketMin[i] > 0 || bracketOpt[i] != 0) {
|
|
1030 |
// Rewind stream and run it through again - more matchers coming
|
|
1031 |
for (int j = 0; j < brackets; j++) {
|
|
1032 |
if (j != i && bracketStart[j] < idx
|
|
1033 |
&& bracketStart[j] >= idxBeforeTerminal)
|
|
1034 |
{
|
|
1035 |
brackets--;
|
|
1036 |
bracketStart[j] = bracketStart[brackets];
|
|
1037 |
bracketEnd[j] = bracketEnd[brackets];
|
|
1038 |
bracketMin[j] = bracketMin[brackets];
|
|
1039 |
bracketOpt[j] = bracketOpt[brackets];
|
|
1040 |
}
|
|
1041 |
}
|
|
1042 |
|
|
1043 |
idx = idxBeforeTerminal;
|
|
1044 |
} else {
|
|
1045 |
// Bug #1030: No optinal matches - no need to rewind
|
|
1046 |
idx = bracketEnd[i];
|
|
1047 |
}
|
|
1048 |
break;
|
|
1049 |
}
|
|
1050 |
|
|
1051 |
// Do the right thing for maximum ({m,})
|
|
1052 |
if (bracketOpt[i] == bracketUnbounded)
|
|
1053 |
{
|
|
1054 |
// Drop through now and closure expression.
|
|
1055 |
// We are done with the {m,} expr, so skip rest
|
|
1056 |
closureType = '*';
|
|
1057 |
bracketOpt[i] = 0;
|
|
1058 |
idx = bracketEnd[i];
|
|
1059 |
}
|
|
1060 |
else
|
|
1061 |
if (bracketOpt[i]-- > 0)
|
|
1062 |
{
|
|
1063 |
if (bracketOpt[i] > 0)
|
|
1064 |
{
|
|
1065 |
// More optional matchers - 'play it again sam!'
|
|
1066 |
idx = idxBeforeTerminal;
|
|
1067 |
} else {
|
|
1068 |
// Bug #1030: We are done - this one is last and optional
|
|
1069 |
idx = bracketEnd[i];
|
|
1070 |
}
|
|
1071 |
// Drop through to optionally close
|
|
1072 |
closureType = '?';
|
|
1073 |
}
|
|
1074 |
else
|
|
1075 |
{
|
|
1076 |
// Rollback terminal - neither min nor opt matchers present
|
|
1077 |
lenInstruction = ret;
|
|
1078 |
node(RE.OP_NOTHING, 0);
|
|
1079 |
|
|
1080 |
// We are done. skip the rest of {m,n} expr
|
|
1081 |
idx = bracketEnd[i];
|
|
1082 |
break;
|
|
1083 |
}
|
|
1084 |
}
|
|
1085 |
|
|
1086 |
// Fall through!
|
|
1087 |
|
|
1088 |
case '?':
|
|
1089 |
case '*':
|
|
1090 |
|
|
1091 |
if (!greedy)
|
|
1092 |
{
|
|
1093 |
break;
|
|
1094 |
}
|
|
1095 |
|
|
1096 |
if (closureType == '?')
|
|
1097 |
{
|
|
1098 |
// X? is compiled as (X|)
|
|
1099 |
nodeInsert(RE.OP_BRANCH, 0, ret); // branch before X
|
|
1100 |
setNextOfEnd(ret, node (RE.OP_BRANCH, 0)); // inserted branch to option
|
|
1101 |
int nothing = node (RE.OP_NOTHING, 0); // which is OP_NOTHING
|
|
1102 |
setNextOfEnd(ret, nothing); // point (second) branch to OP_NOTHING
|
|
1103 |
setNextOfEnd(ret + RE.nodeSize, nothing); // point the end of X to OP_NOTHING node
|
|
1104 |
}
|
|
1105 |
|
|
1106 |
if (closureType == '*')
|
|
1107 |
{
|
|
1108 |
// X* is compiled as (X{gotoX}|)
|
|
1109 |
nodeInsert(RE.OP_BRANCH, 0, ret); // branch before X
|
|
1110 |
setNextOfEnd(ret + RE.nodeSize, node(RE.OP_BRANCH, 0)); // end of X points to an option
|
|
1111 |
setNextOfEnd(ret + RE.nodeSize, node(RE.OP_GOTO, 0)); // to goto
|
|
1112 |
setNextOfEnd(ret + RE.nodeSize, ret); // the start again
|
|
1113 |
setNextOfEnd(ret, node(RE.OP_BRANCH, 0)); // the other option is
|
|
1114 |
setNextOfEnd(ret, node(RE.OP_NOTHING, 0)); // OP_NOTHING
|
|
1115 |
}
|
|
1116 |
break;
|
|
1117 |
|
|
1118 |
case '+':
|
|
1119 |
{
|
|
1120 |
// X+ is compiled as X({gotoX}|)
|
|
1121 |
int branch;
|
|
1122 |
branch = node(RE.OP_BRANCH, 0); // a new branch
|
|
1123 |
setNextOfEnd(ret, branch); // is added to the end of X
|
|
1124 |
setNextOfEnd(node(RE.OP_GOTO, 0), ret); // one option is to go back to the start
|
|
1125 |
setNextOfEnd(branch, node(RE.OP_BRANCH, 0)); // the other option
|
|
1126 |
setNextOfEnd(ret, node(RE.OP_NOTHING, 0)); // is OP_NOTHING
|
|
1127 |
}
|
|
1128 |
break;
|
|
1129 |
}
|
|
1130 |
}
|
|
1131 |
else
|
|
1132 |
{
|
|
1133 |
// Add end after closured subexpr
|
|
1134 |
setNextOfEnd(ret, node(RE.OP_END, 0));
|
|
1135 |
|
|
1136 |
// Actually do the closure now
|
|
1137 |
switch (closureType)
|
|
1138 |
{
|
|
1139 |
case '?':
|
|
1140 |
nodeInsert(RE.OP_RELUCTANTMAYBE, 0, ret);
|
|
1141 |
break;
|
|
1142 |
|
|
1143 |
case '*':
|
|
1144 |
nodeInsert(RE.OP_RELUCTANTSTAR, 0, ret);
|
|
1145 |
break;
|
|
1146 |
|
|
1147 |
case '+':
|
|
1148 |
nodeInsert(RE.OP_RELUCTANTPLUS, 0, ret);
|
|
1149 |
break;
|
|
1150 |
}
|
|
1151 |
|
|
1152 |
// Point to the expr after the closure
|
|
1153 |
setNextOfEnd(ret, lenInstruction);
|
|
1154 |
}
|
|
1155 |
return ret;
|
|
1156 |
}
|
|
1157 |
|
|
1158 |
/**
|
|
1159 |
* Compile one branch of an or operator (implements concatenation)
|
|
1160 |
* @param flags Flags passed by reference
|
|
1161 |
* @return Pointer to branch node
|
|
1162 |
* @exception RESyntaxException Thrown if the regular expression has invalid syntax.
|
|
1163 |
*/
|
|
1164 |
int branch(int[] flags) throws RESyntaxException
|
|
1165 |
{
|
|
1166 |
// Get each possibly closured piece and concat
|
|
1167 |
int node;
|
|
1168 |
int ret = node(RE.OP_BRANCH, 0);
|
|
1169 |
int chain = -1;
|
|
1170 |
int[] closureFlags = new int[1];
|
|
1171 |
boolean nullable = true;
|
|
1172 |
while (idx < len && pattern.charAt(idx) != '|' && pattern.charAt(idx) != ')')
|
|
1173 |
{
|
|
1174 |
// Get new node
|
|
1175 |
closureFlags[0] = NODE_NORMAL;
|
|
1176 |
node = closure(closureFlags);
|
|
1177 |
if (closureFlags[0] == NODE_NORMAL)
|
|
1178 |
{
|
|
1179 |
nullable = false;
|
|
1180 |
}
|
|
1181 |
|
|
1182 |
// If there's a chain, append to the end
|
|
1183 |
if (chain != -1)
|
|
1184 |
{
|
|
1185 |
setNextOfEnd(chain, node);
|
|
1186 |
}
|
|
1187 |
|
|
1188 |
// Chain starts at current
|
|
1189 |
chain = node;
|
|
1190 |
}
|
|
1191 |
|
|
1192 |
// If we don't run loop, make a nothing node
|
|
1193 |
if (chain == -1)
|
|
1194 |
{
|
|
1195 |
node(RE.OP_NOTHING, 0);
|
|
1196 |
}
|
|
1197 |
|
|
1198 |
// Set nullable flag for this branch
|
|
1199 |
if (nullable)
|
|
1200 |
{
|
|
1201 |
flags[0] |= NODE_NULLABLE;
|
|
1202 |
}
|
|
1203 |
return ret;
|
|
1204 |
}
|
|
1205 |
|
|
1206 |
/**
|
|
1207 |
* Compile an expression with possible parens around it. Paren matching
|
|
1208 |
* is done at this level so we can tie the branch tails together.
|
|
1209 |
* @param flags Flag value passed by reference
|
|
1210 |
* @return Node index of expression in instruction array
|
|
1211 |
* @exception RESyntaxException Thrown if the regular expression has invalid syntax.
|
|
1212 |
*/
|
|
1213 |
int expr(int[] flags) throws RESyntaxException
|
|
1214 |
{
|
|
1215 |
// Create open paren node unless we were called from the top level (which has no parens)
|
|
1216 |
int paren = -1;
|
|
1217 |
int ret = -1;
|
|
1218 |
int closeParens = parens;
|
|
1219 |
if ((flags[0] & NODE_TOPLEVEL) == 0 && pattern.charAt(idx) == '(')
|
|
1220 |
{
|
|
1221 |
// if its a cluster ( rather than a proper subexpression ie with backrefs )
|
|
1222 |
if ( idx + 2 < len && pattern.charAt( idx + 1 ) == '?' && pattern.charAt( idx + 2 ) == ':' )
|
|
1223 |
{
|
|
1224 |
paren = 2;
|
|
1225 |
idx += 3;
|
|
1226 |
ret = node( RE.OP_OPEN_CLUSTER, 0 );
|
|
1227 |
}
|
|
1228 |
else
|
|
1229 |
{
|
|
1230 |
paren = 1;
|
|
1231 |
idx++;
|
|
1232 |
ret = node(RE.OP_OPEN, parens++);
|
|
1233 |
}
|
|
1234 |
}
|
|
1235 |
flags[0] &= ~NODE_TOPLEVEL;
|
|
1236 |
|
|
1237 |
// Create a branch node
|
|
1238 |
int branch = branch(flags);
|
|
1239 |
if (ret == -1)
|
|
1240 |
{
|
|
1241 |
ret = branch;
|
|
1242 |
}
|
|
1243 |
else
|
|
1244 |
{
|
|
1245 |
setNextOfEnd(ret, branch);
|
|
1246 |
}
|
|
1247 |
|
|
1248 |
// Loop through branches
|
|
1249 |
while (idx < len && pattern.charAt(idx) == '|')
|
|
1250 |
{
|
|
1251 |
idx++;
|
|
1252 |
branch = branch(flags);
|
|
1253 |
setNextOfEnd(ret, branch);
|
|
1254 |
}
|
|
1255 |
|
|
1256 |
// Create an ending node (either a close paren or an OP_END)
|
|
1257 |
int end;
|
|
1258 |
if ( paren > 0 )
|
|
1259 |
{
|
|
1260 |
if (idx < len && pattern.charAt(idx) == ')')
|
|
1261 |
{
|
|
1262 |
idx++;
|
|
1263 |
}
|
|
1264 |
else
|
|
1265 |
{
|
|
1266 |
syntaxError("Missing close paren");
|
|
1267 |
}
|
|
1268 |
if ( paren == 1 )
|
|
1269 |
{
|
|
1270 |
end = node(RE.OP_CLOSE, closeParens);
|
|
1271 |
}
|
|
1272 |
else
|
|
1273 |
{
|
|
1274 |
end = node( RE.OP_CLOSE_CLUSTER, 0 );
|
|
1275 |
}
|
|
1276 |
}
|
|
1277 |
else
|
|
1278 |
{
|
|
1279 |
end = node(RE.OP_END, 0);
|
|
1280 |
}
|
|
1281 |
|
|
1282 |
// Append the ending node to the ret nodelist
|
|
1283 |
setNextOfEnd(ret, end);
|
|
1284 |
|
|
1285 |
// Hook the ends of each branch to the end node
|
|
1286 |
int currentNode = ret;
|
|
1287 |
int nextNodeOffset = instruction[ currentNode + RE.offsetNext ];
|
|
1288 |
// while the next node o
|
|
1289 |
while ( nextNodeOffset != 0 && currentNode < lenInstruction )
|
|
1290 |
{
|
|
1291 |
// If branch, make the end of the branch's operand chain point to the end node.
|
|
1292 |
if ( instruction[ currentNode + RE.offsetOpcode ] == RE.OP_BRANCH )
|
|
1293 |
{
|
|
1294 |
setNextOfEnd( currentNode + RE.nodeSize, end );
|
|
1295 |
}
|
|
1296 |
nextNodeOffset = instruction[ currentNode + RE.offsetNext ];
|
|
1297 |
currentNode += nextNodeOffset;
|
|
1298 |
}
|
|
1299 |
|
|
1300 |
// Return the node list
|
|
1301 |
return ret;
|
|
1302 |
}
|
|
1303 |
|
|
1304 |
/**
|
|
1305 |
* Compiles a regular expression pattern into a program runnable by the pattern
|
|
1306 |
* matcher class 'RE'.
|
|
1307 |
* @param pattern Regular expression pattern to compile (see RECompiler class
|
|
1308 |
* for details).
|
|
1309 |
* @return A compiled regular expression program.
|
|
1310 |
* @exception RESyntaxException Thrown if the regular expression has invalid syntax.
|
|
1311 |
* @see RECompiler
|
|
1312 |
* @see RE
|
|
1313 |
*/
|
|
1314 |
public REProgram compile(String pattern) throws RESyntaxException
|
|
1315 |
{
|
|
1316 |
// Initialize variables for compilation
|
|
1317 |
this.pattern = pattern; // Save pattern in instance variable
|
|
1318 |
len = pattern.length(); // Precompute pattern length for speed
|
|
1319 |
idx = 0; // Set parsing index to the first character
|
|
1320 |
lenInstruction = 0; // Set emitted instruction count to zero
|
|
1321 |
parens = 1; // Set paren level to 1 (the implicit outer parens)
|
|
1322 |
brackets = 0; // No bracketed closures yet
|
|
1323 |
|
|
1324 |
// Initialize pass by reference flags value
|
|
1325 |
int[] flags = { NODE_TOPLEVEL };
|
|
1326 |
|
|
1327 |
// Parse expression
|
|
1328 |
expr(flags);
|
|
1329 |
|
|
1330 |
// Should be at end of input
|
|
1331 |
if (idx != len)
|
|
1332 |
{
|
|
1333 |
if (pattern.charAt(idx) == ')')
|
|
1334 |
{
|
|
1335 |
syntaxError("Unmatched close paren");
|
|
1336 |
}
|
|
1337 |
syntaxError("Unexpected input remains");
|
|
1338 |
}
|
|
1339 |
|
|
1340 |
// Return the result
|
|
1341 |
char[] ins = new char[lenInstruction];
|
|
1342 |
System.arraycopy(instruction, 0, ins, 0, lenInstruction);
|
|
1343 |
return new REProgram(parens, ins);
|
|
1344 |
}
|
|
1345 |
|
|
1346 |
/**
|
|
1347 |
* Local, nested class for maintaining character ranges for character classes.
|
|
1348 |
*/
|
|
1349 |
class RERange
|
|
1350 |
{
|
|
1351 |
int size = 16; // Capacity of current range arrays
|
|
1352 |
int[] minRange = new int[size]; // Range minima
|
|
1353 |
int[] maxRange = new int[size]; // Range maxima
|
|
1354 |
int num = 0; // Number of range array elements in use
|
|
1355 |
|
|
1356 |
/**
|
|
1357 |
* Deletes the range at a given index from the range lists
|
|
1358 |
* @param index Index of range to delete from minRange and maxRange arrays.
|
|
1359 |
*/
|
|
1360 |
void delete(int index)
|
|
1361 |
{
|
|
1362 |
// Return if no elements left or index is out of range
|
|
1363 |
if (num == 0 || index >= num)
|
|
1364 |
{
|
|
1365 |
return;
|
|
1366 |
}
|
|
1367 |
|
|
1368 |
// Move elements down
|
|
1369 |
while (++index < num)
|
|
1370 |
{
|
|
1371 |
if (index - 1 >= 0)
|
|
1372 |
{
|
|
1373 |
minRange[index-1] = minRange[index];
|
|
1374 |
maxRange[index-1] = maxRange[index];
|
|
1375 |
}
|
|
1376 |
}
|
|
1377 |
|
|
1378 |
// One less element now
|
|
1379 |
num--;
|
|
1380 |
}
|
|
1381 |
|
|
1382 |
/**
|
|
1383 |
* Merges a range into the range list, coalescing ranges if possible.
|
|
1384 |
* @param min Minimum end of range
|
|
1385 |
* @param max Maximum end of range
|
|
1386 |
*/
|
|
1387 |
void merge(int min, int max)
|
|
1388 |
{
|
|
1389 |
// Loop through ranges
|
|
1390 |
for (int i = 0; i < num; i++)
|
|
1391 |
{
|
|
1392 |
// Min-max is subsumed by minRange[i]-maxRange[i]
|
|
1393 |
if (min >= minRange[i] && max <= maxRange[i])
|
|
1394 |
{
|
|
1395 |
return;
|
|
1396 |
}
|
|
1397 |
|
|
1398 |
// Min-max subsumes minRange[i]-maxRange[i]
|
|
1399 |
else if (min <= minRange[i] && max >= maxRange[i])
|
|
1400 |
{
|
|
1401 |
delete(i);
|
|
1402 |
merge(min, max);
|
|
1403 |
return;
|
|
1404 |
}
|
|
1405 |
|
|
1406 |
// Min is in the range, but max is outside
|
|
1407 |
else if (min >= minRange[i] && min <= maxRange[i])
|
|
1408 |
{
|
|
1409 |
delete(i);
|
|
1410 |
min = minRange[i];
|
|
1411 |
merge(min, max);
|
|
1412 |
return;
|
|
1413 |
}
|
|
1414 |
|
|
1415 |
// Max is in the range, but min is outside
|
|
1416 |
else if (max >= minRange[i] && max <= maxRange[i])
|
|
1417 |
{
|
|
1418 |
delete(i);
|
|
1419 |
max = maxRange[i];
|
|
1420 |
merge(min, max);
|
|
1421 |
return;
|
|
1422 |
}
|
|
1423 |
}
|
|
1424 |
|
|
1425 |
// Must not overlap any other ranges
|
|
1426 |
if (num >= size)
|
|
1427 |
{
|
|
1428 |
size *= 2;
|
|
1429 |
int[] newMin = new int[size];
|
|
1430 |
int[] newMax = new int[size];
|
|
1431 |
System.arraycopy(minRange, 0, newMin, 0, num);
|
|
1432 |
System.arraycopy(maxRange, 0, newMax, 0, num);
|
|
1433 |
minRange = newMin;
|
|
1434 |
maxRange = newMax;
|
|
1435 |
}
|
|
1436 |
minRange[num] = min;
|
|
1437 |
maxRange[num] = max;
|
|
1438 |
num++;
|
|
1439 |
}
|
|
1440 |
|
|
1441 |
/**
|
|
1442 |
* Removes a range by deleting or shrinking all other ranges
|
|
1443 |
* @param min Minimum end of range
|
|
1444 |
* @param max Maximum end of range
|
|
1445 |
*/
|
|
1446 |
void remove(int min, int max)
|
|
1447 |
{
|
|
1448 |
// Loop through ranges
|
|
1449 |
for (int i = 0; i < num; i++)
|
|
1450 |
{
|
|
1451 |
// minRange[i]-maxRange[i] is subsumed by min-max
|
|
1452 |
if (minRange[i] >= min && maxRange[i] <= max)
|
|
1453 |
{
|
|
1454 |
delete(i);
|
|
1455 |
i--;
|
|
1456 |
return;
|
|
1457 |
}
|
|
1458 |
|
|
1459 |
// min-max is subsumed by minRange[i]-maxRange[i]
|
|
1460 |
else if (min >= minRange[i] && max <= maxRange[i])
|
|
1461 |
{
|
|
1462 |
int minr = minRange[i];
|
|
1463 |
int maxr = maxRange[i];
|
|
1464 |
delete(i);
|
|
1465 |
if (minr < min)
|
|
1466 |
{
|
|
1467 |
merge(minr, min - 1);
|
|
1468 |
}
|
|
1469 |
if (max < maxr)
|
|
1470 |
{
|
|
1471 |
merge(max + 1, maxr);
|
|
1472 |
}
|
|
1473 |
return;
|
|
1474 |
}
|
|
1475 |
|
|
1476 |
// minRange is in the range, but maxRange is outside
|
|
1477 |
else if (minRange[i] >= min && minRange[i] <= max)
|
|
1478 |
{
|
|
1479 |
minRange[i] = max + 1;
|
|
1480 |
return;
|
|
1481 |
}
|
|
1482 |
|
|
1483 |
// maxRange is in the range, but minRange is outside
|
|
1484 |
else if (maxRange[i] >= min && maxRange[i] <= max)
|
|
1485 |
{
|
|
1486 |
maxRange[i] = min - 1;
|
|
1487 |
return;
|
|
1488 |
}
|
|
1489 |
}
|
|
1490 |
}
|
|
1491 |
|
|
1492 |
/**
|
|
1493 |
* Includes (or excludes) the range from min to max, inclusive.
|
|
1494 |
* @param min Minimum end of range
|
|
1495 |
* @param max Maximum end of range
|
|
1496 |
* @param include True if range should be included. False otherwise.
|
|
1497 |
*/
|
|
1498 |
void include(int min, int max, boolean include)
|
|
1499 |
{
|
|
1500 |
if (include)
|
|
1501 |
{
|
|
1502 |
merge(min, max);
|
|
1503 |
}
|
|
1504 |
else
|
|
1505 |
{
|
|
1506 |
remove(min, max);
|
|
1507 |
}
|
|
1508 |
}
|
|
1509 |
|
|
1510 |
/**
|
|
1511 |
* Includes a range with the same min and max
|
|
1512 |
* @param minmax Minimum and maximum end of range (inclusive)
|
|
1513 |
* @param include True if range should be included. False otherwise.
|
|
1514 |
*/
|
|
1515 |
void include(char minmax, boolean include)
|
|
1516 |
{
|
|
1517 |
include(minmax, minmax, include);
|
|
1518 |
}
|
|
1519 |
}
|
|
1520 |
}
|