|
1 /* |
|
2 * reserved comment block |
|
3 * DO NOT REMOVE OR ALTER! |
|
4 */ |
|
5 /* |
|
6 * Copyright 1999-2004 The Apache Software Foundation. |
|
7 * |
|
8 * Licensed under the Apache License, Version 2.0 (the "License"); |
|
9 * you may not use this file except in compliance with the License. |
|
10 * You may obtain a copy of the License at |
|
11 * |
|
12 * http://www.apache.org/licenses/LICENSE-2.0 |
|
13 * |
|
14 * Unless required by applicable law or agreed to in writing, software |
|
15 * distributed under the License is distributed on an "AS IS" BASIS, |
|
16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
17 * See the License for the specific language governing permissions and |
|
18 * limitations under the License. |
|
19 */ |
|
20 |
|
21 package com.sun.org.apache.regexp.internal; |
|
22 |
|
23 import java.io.Serializable; |
|
24 |
|
25 /** |
|
26 * A class that holds compiled regular expressions. This is exposed mainly |
|
27 * for use by the recompile utility (which helps you produce precompiled |
|
28 * REProgram objects). You should not otherwise need to work directly with |
|
29 * this class. |
|
30 * |
|
31 * @see RE |
|
32 * @see RECompiler |
|
33 * |
|
34 * @author <a href="mailto:jonl@muppetlabs.com">Jonathan Locke</a> |
|
35 */ |
|
36 public class REProgram implements Serializable |
|
37 { |
|
38 static final int OPT_HASBACKREFS = 1; |
|
39 |
|
40 char[] instruction; // The compiled regular expression 'program' |
|
41 int lenInstruction; // The amount of the instruction buffer in use |
|
42 char[] prefix; // Prefix string optimization |
|
43 int flags; // Optimization flags (REProgram.OPT_*) |
|
44 int maxParens = -1; |
|
45 |
|
46 /** |
|
47 * Constructs a program object from a character array |
|
48 * @param instruction Character array with RE opcode instructions in it |
|
49 */ |
|
50 public REProgram(char[] instruction) |
|
51 { |
|
52 this(instruction, instruction.length); |
|
53 } |
|
54 |
|
55 /** |
|
56 * Constructs a program object from a character array |
|
57 * @param parens Count of parens in the program |
|
58 * @param instruction Character array with RE opcode instructions in it |
|
59 */ |
|
60 public REProgram(int parens, char[] instruction) |
|
61 { |
|
62 this(instruction, instruction.length); |
|
63 this.maxParens = parens; |
|
64 } |
|
65 |
|
66 /** |
|
67 * Constructs a program object from a character array |
|
68 * @param instruction Character array with RE opcode instructions in it |
|
69 * @param lenInstruction Amount of instruction array in use |
|
70 */ |
|
71 public REProgram(char[] instruction, int lenInstruction) |
|
72 { |
|
73 setInstructions(instruction, lenInstruction); |
|
74 } |
|
75 |
|
76 /** |
|
77 * Returns a copy of the current regular expression program in a character |
|
78 * array that is exactly the right length to hold the program. If there is |
|
79 * no program compiled yet, getInstructions() will return null. |
|
80 * @return A copy of the current compiled RE program |
|
81 */ |
|
82 public char[] getInstructions() |
|
83 { |
|
84 // Ensure program has been compiled! |
|
85 if (lenInstruction != 0) |
|
86 { |
|
87 // Return copy of program |
|
88 char[] ret = new char[lenInstruction]; |
|
89 System.arraycopy(instruction, 0, ret, 0, lenInstruction); |
|
90 return ret; |
|
91 } |
|
92 return null; |
|
93 } |
|
94 |
|
95 /** |
|
96 * Sets a new regular expression program to run. It is this method which |
|
97 * performs any special compile-time search optimizations. Currently only |
|
98 * two optimizations are in place - one which checks for backreferences |
|
99 * (so that they can be lazily allocated) and another which attempts to |
|
100 * find an prefix anchor string so that substantial amounts of input can |
|
101 * potentially be skipped without running the actual program. |
|
102 * @param instruction Program instruction buffer |
|
103 * @param lenInstruction Length of instruction buffer in use |
|
104 */ |
|
105 public void setInstructions(char[] instruction, int lenInstruction) |
|
106 { |
|
107 // Save reference to instruction array |
|
108 this.instruction = instruction; |
|
109 this.lenInstruction = lenInstruction; |
|
110 |
|
111 // Initialize other program-related variables |
|
112 flags = 0; |
|
113 prefix = null; |
|
114 |
|
115 // Try various compile-time optimizations if there's a program |
|
116 if (instruction != null && lenInstruction != 0) |
|
117 { |
|
118 // If the first node is a branch |
|
119 if (lenInstruction >= RE.nodeSize && instruction[0 + RE.offsetOpcode] == RE.OP_BRANCH) |
|
120 { |
|
121 // to the end node |
|
122 int next = instruction[0 + RE.offsetNext]; |
|
123 if (instruction[next + RE.offsetOpcode] == RE.OP_END) |
|
124 { |
|
125 // and the branch starts with an atom |
|
126 if (lenInstruction >= (RE.nodeSize * 2) && instruction[RE.nodeSize + RE.offsetOpcode] == RE.OP_ATOM) |
|
127 { |
|
128 // then get that atom as an prefix because there's no other choice |
|
129 int lenAtom = instruction[RE.nodeSize + RE.offsetOpdata]; |
|
130 prefix = new char[lenAtom]; |
|
131 System.arraycopy(instruction, RE.nodeSize * 2, prefix, 0, lenAtom); |
|
132 } |
|
133 } |
|
134 } |
|
135 |
|
136 BackrefScanLoop: |
|
137 |
|
138 // Check for backreferences |
|
139 for (int i = 0; i < lenInstruction; i += RE.nodeSize) |
|
140 { |
|
141 switch (instruction[i + RE.offsetOpcode]) |
|
142 { |
|
143 case RE.OP_ANYOF: |
|
144 i += (instruction[i + RE.offsetOpdata] * 2); |
|
145 break; |
|
146 |
|
147 case RE.OP_ATOM: |
|
148 i += instruction[i + RE.offsetOpdata]; |
|
149 break; |
|
150 |
|
151 case RE.OP_BACKREF: |
|
152 flags |= OPT_HASBACKREFS; |
|
153 break BackrefScanLoop; |
|
154 } |
|
155 } |
|
156 } |
|
157 } |
|
158 } |