author | hannesw |
Wed, 10 Apr 2013 14:05:11 +0200 | |
changeset 16939 | 9e3a9eda5775 |
parent 16938 | 1a8ffed97564 |
child 16940 | d5c597aa3d47 |
permissions | -rw-r--r-- |
16147 | 1 |
/* |
16151 | 2 |
* Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved. |
16147 | 3 |
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
4 |
* |
|
5 |
* This code is free software; you can redistribute it and/or modify it |
|
6 |
* under the terms of the GNU General Public License version 2 only, as |
|
7 |
* published by the Free Software Foundation. Oracle designates this |
|
8 |
* particular file as subject to the "Classpath" exception as provided |
|
9 |
* by Oracle in the LICENSE file that accompanied this code. |
|
10 |
* |
|
11 |
* This code is distributed in the hope that it will be useful, but WITHOUT |
|
12 |
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
|
13 |
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
|
14 |
* version 2 for more details (a copy is included in the LICENSE file that |
|
15 |
* accompanied this code). |
|
16 |
* |
|
17 |
* You should have received a copy of the GNU General Public License version |
|
18 |
* 2 along with this work; if not, write to the Free Software Foundation, |
|
19 |
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
|
20 |
* |
|
21 |
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
|
22 |
* or visit www.oracle.com if you need additional information or have any |
|
23 |
* questions. |
|
24 |
*/ |
|
25 |
||
16258 | 26 |
package jdk.nashorn.internal.runtime.regexp; |
16147 | 27 |
|
28 |
import java.util.HashMap; |
|
16781
41eadf003eff
8009230: Nashorn rejects extended RegExp syntax accepted by all major JS engines
hannesw
parents:
16525
diff
changeset
|
29 |
import java.util.Iterator; |
16147 | 30 |
import java.util.LinkedList; |
31 |
import java.util.List; |
|
32 |
import java.util.Map; |
|
33 |
import java.util.regex.PatternSyntaxException; |
|
16258 | 34 |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
35 |
import jdk.nashorn.internal.parser.Lexer; |
16226
0e4f37e6cc40
8007915: Nashorn IR, codegen, parser packages and Context instance should be inaccessible to user code
sundar
parents:
16151
diff
changeset
|
36 |
import jdk.nashorn.internal.parser.Scanner; |
16258 | 37 |
import jdk.nashorn.internal.runtime.BitVector; |
16147 | 38 |
|
39 |
/** |
|
40 |
* Scan a JavaScript regexp, converting to Java regex if necessary. |
|
41 |
* |
|
42 |
*/ |
|
16226
0e4f37e6cc40
8007915: Nashorn IR, codegen, parser packages and Context instance should be inaccessible to user code
sundar
parents:
16151
diff
changeset
|
43 |
final class RegExpScanner extends Scanner { |
16147 | 44 |
|
45 |
/** |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
46 |
* String builder used to rewrite the pattern for the currently used regexp factory. |
16147 | 47 |
*/ |
48 |
private final StringBuilder sb; |
|
49 |
||
50 |
/** Expected token table */ |
|
51 |
private final Map<Character, Integer> expected = new HashMap<>(); |
|
52 |
||
53 |
/** Capturing parenthesis that have been found so far. */ |
|
54 |
private final List<Capture> caps = new LinkedList<>(); |
|
55 |
||
56 |
/** Forward references to capturing parenthesis to be resolved later.*/ |
|
16781
41eadf003eff
8009230: Nashorn rejects extended RegExp syntax accepted by all major JS engines
hannesw
parents:
16525
diff
changeset
|
57 |
private final LinkedList<Integer> forwardReferences = new LinkedList<>(); |
16147 | 58 |
|
59 |
/** Current level of zero-width negative lookahead assertions. */ |
|
60 |
private int negativeLookaheadLevel; |
|
61 |
||
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
62 |
/** Are we currently inside a character class? */ |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
63 |
private boolean inCharClass = false; |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
64 |
|
16274
c3f35c5e0d1c
8008370: coffee script compiler doesn't work with Nashorn
hannesw
parents:
16271
diff
changeset
|
65 |
/** Are we currently inside a negated character class? */ |
c3f35c5e0d1c
8008370: coffee script compiler doesn't work with Nashorn
hannesw
parents:
16271
diff
changeset
|
66 |
private boolean inNegativeClass = false; |
c3f35c5e0d1c
8008370: coffee script compiler doesn't work with Nashorn
hannesw
parents:
16271
diff
changeset
|
67 |
|
16147 | 68 |
private static final String NON_IDENT_ESCAPES = "$^*+(){}[]|\\.?"; |
69 |
||
70 |
private static class Capture { |
|
71 |
/** |
|
72 |
* Zero-width negative lookaheads enclosing the capture. |
|
73 |
*/ |
|
74 |
private final int negativeLookaheadLevel; |
|
75 |
||
76 |
Capture(final int negativeLookaheadLevel) { |
|
77 |
this.negativeLookaheadLevel = negativeLookaheadLevel; |
|
78 |
} |
|
79 |
||
80 |
public int getNegativeLookaheadLevel() { |
|
81 |
return negativeLookaheadLevel; |
|
82 |
} |
|
83 |
||
84 |
} |
|
85 |
||
86 |
/**
 * Creates a scanner positioned at the start of the given regexp source.
 *
 * @param string the JavaScript regexp to parse
 */
private RegExpScanner(final String string) {
    super(string);
    this.sb = new StringBuilder(limit);
    reset(0);

    // Start out with no closing bracket or brace pending.
    for (final char closer : new char[] {']', '}'}) {
        expected.put(closer, 0);
    }
}
|
97 |
||
98 |
private void processForwardReferences() { |
|
99 |
||
16781
41eadf003eff
8009230: Nashorn rejects extended RegExp syntax accepted by all major JS engines
hannesw
parents:
16525
diff
changeset
|
100 |
Iterator<Integer> iterator = forwardReferences.descendingIterator(); |
41eadf003eff
8009230: Nashorn rejects extended RegExp syntax accepted by all major JS engines
hannesw
parents:
16525
diff
changeset
|
101 |
while (iterator.hasNext()) { |
41eadf003eff
8009230: Nashorn rejects extended RegExp syntax accepted by all major JS engines
hannesw
parents:
16525
diff
changeset
|
102 |
final int pos = iterator.next(); |
41eadf003eff
8009230: Nashorn rejects extended RegExp syntax accepted by all major JS engines
hannesw
parents:
16525
diff
changeset
|
103 |
final int num = iterator.next(); |
41eadf003eff
8009230: Nashorn rejects extended RegExp syntax accepted by all major JS engines
hannesw
parents:
16525
diff
changeset
|
104 |
if (num > caps.size()) { |
16938
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
105 |
// Non-existing backreference. If the number begins with a valid octal convert it to |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
106 |
// Unicode escape and append the rest to a literal character sequence. |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
107 |
final StringBuilder buffer = new StringBuilder(); |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
108 |
octalOrLiteral(Integer.toString(num), buffer); |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
109 |
sb.insert(pos, buffer); |
16147 | 110 |
} |
111 |
} |
|
112 |
||
113 |
forwardReferences.clear(); |
|
114 |
} |
|
115 |
||
116 |
/** |
|
117 |
* Scan a JavaScript regexp string returning a Java safe regex string. |
|
118 |
* |
|
119 |
* @param string |
|
120 |
* JavaScript regexp string. |
|
121 |
* @return Java safe regex string. |
|
122 |
*/ |
|
123 |
public static RegExpScanner scan(final String string) { |
|
124 |
final RegExpScanner scanner = new RegExpScanner(string); |
|
125 |
||
126 |
try { |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
127 |
scanner.disjunction(); |
16147 | 128 |
} catch (final Exception e) { |
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
129 |
throw new PatternSyntaxException(e.getMessage(), string, scanner.position); |
16147 | 130 |
} |
131 |
||
132 |
scanner.processForwardReferences(); |
|
133 |
||
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
134 |
// Throw syntax error unless we parsed the entire JavaScript regexp without syntax errors |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
135 |
if (scanner.position != string.length()) { |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
136 |
final String p = scanner.getStringBuilder().toString(); |
16147 | 137 |
throw new PatternSyntaxException(string, p, p.length() + 1); |
138 |
} |
|
139 |
||
140 |
return scanner; |
|
141 |
} |
|
142 |
||
143 |
final StringBuilder getStringBuilder() { |
|
144 |
return sb; |
|
145 |
} |
|
146 |
||
147 |
String getJavaPattern() { |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
148 |
return sb.toString(); |
16147 | 149 |
} |
150 |
||
151 |
BitVector getGroupsInNegativeLookahead() { |
|
152 |
BitVector vec = null; |
|
153 |
for (int i = 0; i < caps.size(); i++) { |
|
154 |
final Capture cap = caps.get(i); |
|
155 |
if (cap.getNegativeLookaheadLevel() > 0) { |
|
156 |
if (vec == null) { |
|
157 |
vec = new BitVector(caps.size() + 1); |
|
158 |
} |
|
159 |
vec.set(i + 1); |
|
160 |
} |
|
161 |
} |
|
162 |
return vec; |
|
163 |
} |
|
164 |
||
165 |
/** |
|
166 |
* Commit n characters to the builder and to a given token |
|
167 |
* @param n Number of characters. |
|
168 |
* @return Committed token |
|
169 |
*/ |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
170 |
private boolean commit(final int n) { |
16147 | 171 |
switch (n) { |
172 |
case 1: |
|
173 |
sb.append(ch0); |
|
174 |
skip(1); |
|
175 |
break; |
|
176 |
case 2: |
|
177 |
sb.append(ch0); |
|
178 |
sb.append(ch1); |
|
179 |
skip(2); |
|
180 |
break; |
|
181 |
case 3: |
|
182 |
sb.append(ch0); |
|
183 |
sb.append(ch1); |
|
184 |
sb.append(ch2); |
|
185 |
skip(3); |
|
186 |
break; |
|
187 |
default: |
|
188 |
assert false : "Should not reach here"; |
|
189 |
} |
|
190 |
||
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
191 |
return true; |
16147 | 192 |
} |
193 |
||
194 |
/** |
|
195 |
* Restart the buffers back at an earlier position. |
|
196 |
* |
|
197 |
* @param startIn |
|
198 |
* Position in the input stream. |
|
199 |
* @param startOut |
|
200 |
* Position in the output stream. |
|
201 |
*/ |
|
202 |
private void restart(final int startIn, final int startOut) { |
|
203 |
reset(startIn); |
|
204 |
sb.setLength(startOut); |
|
205 |
} |
|
206 |
||
207 |
private void push(final char ch) { |
|
208 |
expected.put(ch, expected.get(ch) + 1); |
|
209 |
} |
|
210 |
||
211 |
private void pop(final char ch) { |
|
212 |
expected.put(ch, Math.min(0, expected.get(ch) - 1)); |
|
213 |
} |
|
214 |
||
215 |
/* |
|
216 |
* Recursive descent tokenizer starts below. |
|
217 |
*/ |
|
218 |
||
219 |
/* |
|
220 |
* Disjunction :: |
|
221 |
* Alternative |
|
222 |
* Alternative | Disjunction |
|
223 |
*/ |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
224 |
private void disjunction() { |
16147 | 225 |
while (true) { |
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
226 |
alternative(); |
16147 | 227 |
|
228 |
if (ch0 == '|') { |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
229 |
commit(1); |
16147 | 230 |
} else { |
231 |
break; |
|
232 |
} |
|
233 |
} |
|
234 |
} |
|
235 |
||
236 |
/* |
|
237 |
* Alternative :: |
|
238 |
* [empty] |
|
239 |
* Alternative Term |
|
240 |
*/ |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
241 |
private void alternative() { |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
242 |
while (term()) { |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
243 |
// do nothing |
16147 | 244 |
} |
245 |
} |
|
246 |
||
247 |
/* |
|
248 |
* Term :: |
|
249 |
* Assertion |
|
250 |
* Atom |
|
251 |
* Atom Quantifier |
|
252 |
*/ |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
253 |
private boolean term() { |
16147 | 254 |
final int startIn = position; |
255 |
final int startOut = sb.length(); |
|
256 |
||
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
257 |
if (assertion()) { |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
258 |
return true; |
16147 | 259 |
} |
260 |
||
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
261 |
if (atom()) { |
16939
9e3a9eda5775
8011749: Bugs with empty character class handling
hannesw
parents:
16938
diff
changeset
|
262 |
// Check for character classes that never or always match |
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
263 |
if (sb.toString().endsWith("[]")) { |
16939
9e3a9eda5775
8011749: Bugs with empty character class handling
hannesw
parents:
16938
diff
changeset
|
264 |
sb.setLength(sb.length() - 1); |
9e3a9eda5775
8011749: Bugs with empty character class handling
hannesw
parents:
16938
diff
changeset
|
265 |
sb.append("^\\s\\S]"); |
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
266 |
} else if (sb.toString().endsWith("[^]")) { |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
267 |
sb.setLength(sb.length() - 2); |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
268 |
sb.append("\\s\\S]"); |
16147 | 269 |
} |
270 |
||
16939
9e3a9eda5775
8011749: Bugs with empty character class handling
hannesw
parents:
16938
diff
changeset
|
271 |
quantifier(); |
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
272 |
return true; |
16147 | 273 |
} |
274 |
||
275 |
restart(startIn, startOut); |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
276 |
return false; |
16147 | 277 |
} |
278 |
||
279 |
/* |
|
280 |
* Assertion :: |
|
281 |
* ^ |
|
282 |
* $ |
|
283 |
* \b |
|
284 |
* \B |
|
285 |
* ( ? = Disjunction ) |
|
286 |
* ( ? ! Disjunction ) |
|
287 |
*/ |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
288 |
private boolean assertion() { |
16147 | 289 |
final int startIn = position; |
290 |
final int startOut = sb.length(); |
|
291 |
||
292 |
switch (ch0) { |
|
293 |
case '^': |
|
294 |
case '$': |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
295 |
return commit(1); |
16147 | 296 |
|
297 |
case '\\': |
|
298 |
if (ch1 == 'b' || ch1 == 'B') { |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
299 |
return commit(2); |
16147 | 300 |
} |
301 |
break; |
|
302 |
||
303 |
case '(': |
|
304 |
if (ch1 != '?') { |
|
305 |
break; |
|
306 |
} |
|
307 |
if (ch2 != '=' && ch2 != '!') { |
|
308 |
break; |
|
309 |
} |
|
310 |
final boolean isNegativeLookahead = (ch2 == '!'); |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
311 |
commit(3); |
16147 | 312 |
|
313 |
if (isNegativeLookahead) { |
|
314 |
negativeLookaheadLevel++; |
|
315 |
} |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
316 |
disjunction(); |
16147 | 317 |
if (isNegativeLookahead) { |
318 |
negativeLookaheadLevel--; |
|
319 |
} |
|
320 |
||
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
321 |
if (ch0 == ')') { |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
322 |
return commit(1); |
16147 | 323 |
} |
324 |
break; |
|
325 |
||
326 |
default: |
|
327 |
break; |
|
328 |
} |
|
329 |
||
330 |
restart(startIn, startOut); |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
331 |
return false; |
16147 | 332 |
} |
333 |
||
334 |
/* |
|
335 |
* Quantifier :: |
|
336 |
* QuantifierPrefix |
|
337 |
* QuantifierPrefix ? |
|
338 |
*/ |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
339 |
private boolean quantifier() { |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
340 |
if (quantifierPrefix()) { |
16147 | 341 |
if (ch0 == '?') { |
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
342 |
commit(1); |
16147 | 343 |
} |
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
344 |
return true; |
16147 | 345 |
} |
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
346 |
return false; |
16147 | 347 |
} |
348 |
||
349 |
/* |
|
350 |
* QuantifierPrefix :: |
|
351 |
* * |
|
352 |
* + |
|
353 |
* ? |
|
354 |
* { DecimalDigits } |
|
355 |
* { DecimalDigits , } |
|
356 |
* { DecimalDigits , DecimalDigits } |
|
357 |
*/ |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
358 |
private boolean quantifierPrefix() { |
16147 | 359 |
final int startIn = position; |
360 |
final int startOut = sb.length(); |
|
361 |
||
362 |
switch (ch0) { |
|
363 |
case '*': |
|
364 |
case '+': |
|
365 |
case '?': |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
366 |
return commit(1); |
16147 | 367 |
|
368 |
case '{': |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
369 |
commit(1); |
16147 | 370 |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
371 |
if (!decimalDigits()) { |
16147 | 372 |
break; // not a quantifier - back out |
373 |
} |
|
374 |
push('}'); |
|
375 |
||
376 |
if (ch0 == ',') { |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
377 |
commit(1); |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
378 |
decimalDigits(); |
16147 | 379 |
} |
380 |
||
381 |
if (ch0 == '}') { |
|
382 |
pop('}'); |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
383 |
commit(1); |
16781
41eadf003eff
8009230: Nashorn rejects extended RegExp syntax accepted by all major JS engines
hannesw
parents:
16525
diff
changeset
|
384 |
} else { |
41eadf003eff
8009230: Nashorn rejects extended RegExp syntax accepted by all major JS engines
hannesw
parents:
16525
diff
changeset
|
385 |
// Bad quantifier should be rejected but is accepted by all major engines |
41eadf003eff
8009230: Nashorn rejects extended RegExp syntax accepted by all major JS engines
hannesw
parents:
16525
diff
changeset
|
386 |
restart(startIn, startOut); |
41eadf003eff
8009230: Nashorn rejects extended RegExp syntax accepted by all major JS engines
hannesw
parents:
16525
diff
changeset
|
387 |
return false; |
16147 | 388 |
} |
389 |
||
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
390 |
return true; |
16147 | 391 |
|
392 |
default: |
|
393 |
break; |
|
394 |
} |
|
395 |
||
396 |
restart(startIn, startOut); |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
397 |
return false; |
16147 | 398 |
} |
399 |
||
400 |
/* |
|
401 |
* Atom :: |
|
402 |
* PatternCharacter |
|
403 |
* . |
|
404 |
* \ AtomEscape |
|
405 |
* CharacterClass |
|
406 |
* ( Disjunction ) |
|
407 |
* ( ? : Disjunction ) |
|
408 |
* |
|
409 |
*/ |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
410 |
private boolean atom() { |
16147 | 411 |
final int startIn = position; |
412 |
final int startOut = sb.length(); |
|
413 |
||
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
414 |
if (patternCharacter()) { |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
415 |
return true; |
16147 | 416 |
} |
417 |
||
418 |
if (ch0 == '.') { |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
419 |
return commit(1); |
16147 | 420 |
} |
421 |
||
422 |
if (ch0 == '\\') { |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
423 |
commit(1); |
16147 | 424 |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
425 |
if (atomEscape()) { |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
426 |
return true; |
16147 | 427 |
} |
428 |
} |
|
429 |
||
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
430 |
if (characterClass()) { |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
431 |
return true; |
16147 | 432 |
} |
433 |
||
434 |
if (ch0 == '(') { |
|
435 |
boolean capturingParens = true; |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
436 |
commit(1); |
16147 | 437 |
if (ch0 == '?' && ch1 == ':') { |
438 |
capturingParens = false; |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
439 |
commit(2); |
16147 | 440 |
} |
441 |
||
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
442 |
disjunction(); |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
443 |
|
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
444 |
if (ch0 == ')') { |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
445 |
commit(1); |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
446 |
if (capturingParens) { |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
447 |
caps.add(new Capture(negativeLookaheadLevel)); |
16147 | 448 |
} |
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
449 |
return true; |
16147 | 450 |
} |
451 |
} |
|
452 |
||
453 |
restart(startIn, startOut); |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
454 |
return false; |
16147 | 455 |
} |
456 |
||
457 |
/* |
|
458 |
* PatternCharacter :: |
|
459 |
* SourceCharacter but not any of: ^$\.*+?()[]{}| |
|
460 |
*/ |
|
461 |
@SuppressWarnings("fallthrough") |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
462 |
private boolean patternCharacter() { |
16147 | 463 |
if (atEOF()) { |
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
464 |
return false; |
16147 | 465 |
} |
466 |
||
467 |
switch (ch0) { |
|
468 |
case '^': |
|
469 |
case '$': |
|
470 |
case '\\': |
|
471 |
case '.': |
|
472 |
case '*': |
|
473 |
case '+': |
|
474 |
case '?': |
|
475 |
case '(': |
|
476 |
case ')': |
|
477 |
case '[': |
|
478 |
case '|': |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
479 |
return false; |
16147 | 480 |
|
481 |
case '}': |
|
482 |
case ']': |
|
483 |
final int n = expected.get(ch0); |
|
484 |
if (n != 0) { |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
485 |
return false; |
16147 | 486 |
} |
487 |
||
488 |
case '{': |
|
489 |
// if not a valid quantifier escape curly brace to match itself |
|
490 |
// this ensures compatibility with other JS implementations |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
491 |
if (!quantifierPrefix()) { |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
492 |
sb.append('\\'); |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
493 |
return commit(1); |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
494 |
} |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
495 |
return false; |
16147 | 496 |
|
497 |
default: |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
498 |
return commit(1); // SOURCECHARACTER |
16147 | 499 |
} |
500 |
} |
|
501 |
||
502 |
/* |
|
503 |
* AtomEscape :: |
|
504 |
* DecimalEscape |
|
505 |
* CharacterEscape |
|
506 |
* CharacterClassEscape |
|
507 |
*/ |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
508 |
private boolean atomEscape() { |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
509 |
// Note that contrary to ES 5.1 spec we put identityEscape() last because it acts as a catch-all |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
510 |
return decimalEscape() || characterClassEscape() || characterEscape() || identityEscape(); |
16147 | 511 |
} |
512 |
||
513 |
/* |
|
514 |
* CharacterEscape :: |
|
515 |
* ControlEscape |
|
516 |
* c ControlLetter |
|
517 |
* HexEscapeSequence |
|
518 |
* UnicodeEscapeSequence |
|
519 |
* IdentityEscape |
|
520 |
*/ |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
521 |
private boolean characterEscape() { |
16147 | 522 |
final int startIn = position; |
523 |
final int startOut = sb.length(); |
|
524 |
||
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
525 |
if (controlEscape()) { |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
526 |
return true; |
16147 | 527 |
} |
528 |
||
529 |
if (ch0 == 'c') { |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
530 |
commit(1); |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
531 |
if (controlLetter()) { |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
532 |
return true; |
16147 | 533 |
} |
534 |
restart(startIn, startOut); |
|
535 |
} |
|
536 |
||
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
537 |
if (hexEscapeSequence() || unicodeEscapeSequence()) { |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
538 |
return true; |
16147 | 539 |
} |
540 |
||
541 |
restart(startIn, startOut); |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
542 |
return false; |
16147 | 543 |
} |
544 |
||
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
545 |
private boolean scanEscapeSequence(final char leader, final int length) { |
16147 | 546 |
final int startIn = position; |
547 |
final int startOut = sb.length(); |
|
548 |
||
549 |
if (ch0 != leader) { |
|
550 |
return false; |
|
551 |
} |
|
552 |
||
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
553 |
commit(1); |
16147 | 554 |
for (int i = 0; i < length; i++) { |
555 |
final char ch0l = Character.toLowerCase(ch0); |
|
556 |
if ((ch0l >= 'a' && ch0l <= 'f') || isDecimalDigit(ch0)) { |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
557 |
commit(1); |
16147 | 558 |
} else { |
559 |
restart(startIn, startOut); |
|
560 |
return false; |
|
561 |
} |
|
562 |
} |
|
563 |
||
564 |
return true; |
|
565 |
} |
|
566 |
||
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
567 |
private boolean hexEscapeSequence() { |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
568 |
return scanEscapeSequence('x', 2); |
16147 | 569 |
} |
570 |
||
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
571 |
private boolean unicodeEscapeSequence() { |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
572 |
return scanEscapeSequence('u', 4); |
16147 | 573 |
} |
574 |
||
575 |
/* |
|
576 |
* ControlEscape :: |
|
577 |
* one of fnrtv |
|
578 |
*/ |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
579 |
private boolean controlEscape() { |
16147 | 580 |
switch (ch0) { |
581 |
case 'f': |
|
582 |
case 'n': |
|
583 |
case 'r': |
|
584 |
case 't': |
|
585 |
case 'v': |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
586 |
return commit(1); |
16147 | 587 |
|
588 |
default: |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
589 |
return false; |
16147 | 590 |
} |
591 |
} |
|
592 |
||
593 |
/* |
|
594 |
* ControlLetter :: |
|
595 |
* one of abcdefghijklmnopqrstuvwxyz |
|
596 |
* ABCDEFGHIJKLMNOPQRSTUVWXYZ |
|
597 |
*/ |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
598 |
private boolean controlLetter() { |
16147 | 599 |
final char c = Character.toUpperCase(ch0); |
600 |
if (c >= 'A' && c <= 'Z') { |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
601 |
// for some reason java regexps don't like control characters on the |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
602 |
// form "\\ca".match([string with ascii 1 at char0]). Translating |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
603 |
// them to unicode does it though. |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
604 |
sb.setLength(sb.length() - 1); |
16938
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
605 |
unicode(c - 'A' + 1, sb); |
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
606 |
skip(1); |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
607 |
return true; |
16147 | 608 |
} |
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
609 |
return false; |
16147 | 610 |
} |
611 |
||
612 |
/* |
|
613 |
* IdentityEscape :: |
|
614 |
* SourceCharacter but not IdentifierPart |
|
615 |
* <ZWJ> (200c) |
|
616 |
* <ZWNJ> (200d) |
|
617 |
*/ |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
618 |
private boolean identityEscape() { |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
619 |
if (atEOF()) { |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
620 |
throw new RuntimeException("\\ at end of pattern"); // will be converted to PatternSyntaxException |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
621 |
} |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
622 |
// ES 5.1 A.7 requires "not IdentifierPart" here but all major engines accept any character here. |
16781
41eadf003eff
8009230: Nashorn rejects extended RegExp syntax accepted by all major JS engines
hannesw
parents:
16525
diff
changeset
|
623 |
if (ch0 == 'c') { |
41eadf003eff
8009230: Nashorn rejects extended RegExp syntax accepted by all major JS engines
hannesw
parents:
16525
diff
changeset
|
624 |
// Ignore invalid control letter escape if within a character class |
41eadf003eff
8009230: Nashorn rejects extended RegExp syntax accepted by all major JS engines
hannesw
parents:
16525
diff
changeset
|
625 |
if (inCharClass && ch1 != ']') { |
41eadf003eff
8009230: Nashorn rejects extended RegExp syntax accepted by all major JS engines
hannesw
parents:
16525
diff
changeset
|
626 |
sb.setLength(sb.length() - 1); |
41eadf003eff
8009230: Nashorn rejects extended RegExp syntax accepted by all major JS engines
hannesw
parents:
16525
diff
changeset
|
627 |
skip(2); |
41eadf003eff
8009230: Nashorn rejects extended RegExp syntax accepted by all major JS engines
hannesw
parents:
16525
diff
changeset
|
628 |
return true; |
41eadf003eff
8009230: Nashorn rejects extended RegExp syntax accepted by all major JS engines
hannesw
parents:
16525
diff
changeset
|
629 |
} else { |
41eadf003eff
8009230: Nashorn rejects extended RegExp syntax accepted by all major JS engines
hannesw
parents:
16525
diff
changeset
|
630 |
sb.append('\\'); // Treat invalid \c control sequence as \\c |
41eadf003eff
8009230: Nashorn rejects extended RegExp syntax accepted by all major JS engines
hannesw
parents:
16525
diff
changeset
|
631 |
} |
41eadf003eff
8009230: Nashorn rejects extended RegExp syntax accepted by all major JS engines
hannesw
parents:
16525
diff
changeset
|
632 |
} else if (NON_IDENT_ESCAPES.indexOf(ch0) == -1) { |
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
633 |
sb.setLength(sb.length() - 1); |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
634 |
} |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
635 |
return commit(1); |
16147 | 636 |
} |
637 |
||
638 |
/* |
|
639 |
* DecimalEscape :: |
|
640 |
* DecimalIntegerLiteral [lookahead DecimalDigit] |
|
641 |
*/ |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
642 |
private boolean decimalEscape() { |
16147 | 643 |
final int startIn = position; |
644 |
final int startOut = sb.length(); |
|
645 |
||
16938
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
646 |
if (ch0 == '0' && !isOctalDigit(ch1)) { |
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
647 |
skip(1); |
16147 | 648 |
// DecimalEscape :: 0. If i is zero, return the EscapeValue consisting of a <NUL> character (Unicodevalue0000); |
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
649 |
sb.append("\u0000"); |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
650 |
return true; |
16147 | 651 |
} |
652 |
||
653 |
if (isDecimalDigit(ch0)) { |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
654 |
|
16938
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
655 |
if (ch0 == '0') { |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
656 |
// We know this is an octal escape. |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
657 |
if (inCharClass) { |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
658 |
// Convert octal escape to unicode escape if inside character class. |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
659 |
int octalValue = 0; |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
660 |
while (isOctalDigit(ch0)) { |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
661 |
octalValue = octalValue * 8 + ch0 - '0'; |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
662 |
skip(1); |
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
663 |
} |
16938
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
664 |
|
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
665 |
unicode(octalValue, sb); |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
666 |
|
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
667 |
} else { |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
668 |
// Copy decimal escape as-is |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
669 |
decimalDigits(); |
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
670 |
} |
16938
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
671 |
} else { |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
672 |
// This should be a backreference, but could also be an octal escape or even a literal string. |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
673 |
int decimalValue = 0; |
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
674 |
while (isDecimalDigit(ch0)) { |
16938
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
675 |
decimalValue = decimalValue * 10 + ch0 - '0'; |
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
676 |
skip(1); |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
677 |
} |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
678 |
|
16938
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
679 |
if (inCharClass) { |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
680 |
// No backreferences in character classes. Encode as unicode escape or literal char sequence |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
681 |
sb.setLength(sb.length() - 1); |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
682 |
octalOrLiteral(Integer.toString(decimalValue), sb); |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
683 |
|
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
684 |
} else if (decimalValue <= caps.size() && caps.get(decimalValue - 1).getNegativeLookaheadLevel() > 0) { |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
685 |
// Captures that live inside a negative lookahead are dead after the |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
686 |
// lookahead and will be undefined if referenced from outside. |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
687 |
if (caps.get(decimalValue - 1).getNegativeLookaheadLevel() > negativeLookaheadLevel) { |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
688 |
sb.setLength(sb.length() - 1); |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
689 |
} else { |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
690 |
sb.append(decimalValue); |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
691 |
} |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
692 |
} else if (decimalValue > caps.size()) { |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
693 |
// Forward reference to a capture group. Forward references are always undefined so we can omit |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
694 |
// it from the output buffer. However, if the target capture does not exist, we need to rewrite |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
695 |
// the reference as hex escape or literal string, so register the reference for later processing. |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
696 |
sb.setLength(sb.length() - 1); |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
697 |
forwardReferences.add(decimalValue); |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
698 |
forwardReferences.add(sb.length()); |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
699 |
} else { |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
700 |
// Append as backreference |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
701 |
sb.append(decimalValue); |
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
702 |
} |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
703 |
|
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
704 |
} |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
705 |
return true; |
16147 | 706 |
} |
707 |
||
708 |
restart(startIn, startOut); |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
709 |
return false; |
16147 | 710 |
} |
711 |
||
712 |
/* |
|
713 |
* CharacterClassEscape :: |
|
714 |
* one of dDsSwW |
|
715 |
*/ |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
716 |
private boolean characterClassEscape() { |
16147 | 717 |
switch (ch0) { |
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
718 |
// java.util.regex requires translation of \s and \S to explicit character list |
16147 | 719 |
case 's': |
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
720 |
if (RegExpFactory.usesJavaUtilRegex()) { |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
721 |
sb.setLength(sb.length() - 1); |
16274
c3f35c5e0d1c
8008370: coffee script compiler doesn't work with Nashorn
hannesw
parents:
16271
diff
changeset
|
722 |
// No nested class required if we already are inside a character class |
c3f35c5e0d1c
8008370: coffee script compiler doesn't work with Nashorn
hannesw
parents:
16271
diff
changeset
|
723 |
if (inCharClass) { |
c3f35c5e0d1c
8008370: coffee script compiler doesn't work with Nashorn
hannesw
parents:
16271
diff
changeset
|
724 |
sb.append(Lexer.getWhitespaceRegExp()); |
c3f35c5e0d1c
8008370: coffee script compiler doesn't work with Nashorn
hannesw
parents:
16271
diff
changeset
|
725 |
} else { |
c3f35c5e0d1c
8008370: coffee script compiler doesn't work with Nashorn
hannesw
parents:
16271
diff
changeset
|
726 |
sb.append('[').append(Lexer.getWhitespaceRegExp()).append(']'); |
c3f35c5e0d1c
8008370: coffee script compiler doesn't work with Nashorn
hannesw
parents:
16271
diff
changeset
|
727 |
} |
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
728 |
skip(1); |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
729 |
return true; |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
730 |
} |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
731 |
return commit(1); |
16147 | 732 |
case 'S': |
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
733 |
if (RegExpFactory.usesJavaUtilRegex()) { |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
734 |
sb.setLength(sb.length() - 1); |
16274
c3f35c5e0d1c
8008370: coffee script compiler doesn't work with Nashorn
hannesw
parents:
16271
diff
changeset
|
735 |
// In negative class we must use intersection to get double negation ("not anything else than space") |
c3f35c5e0d1c
8008370: coffee script compiler doesn't work with Nashorn
hannesw
parents:
16271
diff
changeset
|
736 |
sb.append(inNegativeClass ? "&&[" : "[^").append(Lexer.getWhitespaceRegExp()).append(']'); |
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
737 |
skip(1); |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
738 |
return true; |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
739 |
} |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
740 |
return commit(1); |
16147 | 741 |
case 'd': |
742 |
case 'D': |
|
743 |
case 'w': |
|
744 |
case 'W': |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
745 |
return commit(1); |
16147 | 746 |
|
747 |
default: |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
748 |
return false; |
16147 | 749 |
} |
750 |
} |
|
751 |
||
752 |
/* |
|
753 |
* CharacterClass :: |
|
754 |
* [ [lookahead {^}] ClassRanges ] |
|
755 |
* [ ^ ClassRanges ] |
|
756 |
*/ |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
757 |
private boolean characterClass() { |
16147 | 758 |
final int startIn = position; |
759 |
final int startOut = sb.length(); |
|
760 |
||
761 |
if (ch0 == '[') { |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
762 |
try { |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
763 |
inCharClass = true; |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
764 |
push(']'); |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
765 |
commit(1); |
16147 | 766 |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
767 |
if (ch0 == '^') { |
16274
c3f35c5e0d1c
8008370: coffee script compiler doesn't work with Nashorn
hannesw
parents:
16271
diff
changeset
|
768 |
inNegativeClass = true; |
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
769 |
commit(1); |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
770 |
} |
16147 | 771 |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
772 |
if (classRanges() && ch0 == ']') { |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
773 |
pop(']'); |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
774 |
return commit(1); |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
775 |
} |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
776 |
} finally { |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
777 |
inCharClass = false; // no nested character classes in JavaScript |
16274
c3f35c5e0d1c
8008370: coffee script compiler doesn't work with Nashorn
hannesw
parents:
16271
diff
changeset
|
778 |
inNegativeClass = false; |
16147 | 779 |
} |
780 |
} |
|
781 |
||
782 |
restart(startIn, startOut); |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
783 |
return false; |
16147 | 784 |
} |
785 |
||
786 |
/* |
|
787 |
* ClassRanges :: |
|
788 |
* [empty] |
|
789 |
* NonemptyClassRanges |
|
790 |
*/ |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
791 |
private boolean classRanges() { |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
792 |
nonemptyClassRanges(); |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
793 |
return true; |
16147 | 794 |
} |
795 |
||
796 |
/* |
|
797 |
* NonemptyClassRanges :: |
|
798 |
* ClassAtom |
|
799 |
* ClassAtom NonemptyClassRangesNoDash |
|
800 |
* ClassAtom - ClassAtom ClassRanges |
|
801 |
*/ |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
802 |
private boolean nonemptyClassRanges() { |
16147 | 803 |
final int startIn = position; |
804 |
final int startOut = sb.length(); |
|
805 |
||
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
806 |
if (classAtom()) { |
16147 | 807 |
|
808 |
if (ch0 == '-') { |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
809 |
commit(1); |
16147 | 810 |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
811 |
if (classAtom() && classRanges()) { |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
812 |
return true; |
16147 | 813 |
} |
814 |
} |
|
815 |
||
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
816 |
nonemptyClassRangesNoDash(); |
16147 | 817 |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
818 |
return true; |
16147 | 819 |
} |
820 |
||
821 |
restart(startIn, startOut); |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
822 |
return false; |
16147 | 823 |
} |
824 |
||
825 |
/* |
|
826 |
* NonemptyClassRangesNoDash :: |
|
827 |
* ClassAtom |
|
828 |
* ClassAtomNoDash NonemptyClassRangesNoDash |
|
829 |
* ClassAtomNoDash - ClassAtom ClassRanges |
|
830 |
*/ |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
831 |
private boolean nonemptyClassRangesNoDash() { |
16147 | 832 |
final int startIn = position; |
833 |
final int startOut = sb.length(); |
|
834 |
||
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
835 |
if (classAtomNoDash()) { |
16147 | 836 |
|
837 |
// need to check dash first, as for e.g. [a-b|c-d] will otherwise parse - as an atom |
|
838 |
if (ch0 == '-') { |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
839 |
commit(1); |
16147 | 840 |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
841 |
if (classAtom() && classRanges()) { |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
842 |
return true; |
16147 | 843 |
} |
844 |
//fallthru |
|
845 |
} |
|
846 |
||
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
847 |
nonemptyClassRangesNoDash(); |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
848 |
return true; // still a class atom |
16147 | 849 |
} |
850 |
||
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
851 |
if (classAtom()) { |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
852 |
return true; |
16147 | 853 |
} |
854 |
||
855 |
restart(startIn, startOut); |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
856 |
return false; |
16147 | 857 |
} |
858 |
||
859 |
/* |
|
860 |
* ClassAtom : - ClassAtomNoDash |
|
861 |
*/ |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
862 |
private boolean classAtom() { |
16147 | 863 |
|
864 |
if (ch0 == '-') { |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
865 |
return commit(1); |
16147 | 866 |
} |
867 |
||
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
868 |
return classAtomNoDash(); |
16147 | 869 |
} |
870 |
||
871 |
/* |
|
872 |
* ClassAtomNoDash :: |
|
873 |
* SourceCharacter but not one of \ or ] or - |
|
874 |
* \ ClassEscape |
|
875 |
*/ |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
876 |
private boolean classAtomNoDash() { |
16147 | 877 |
final int startIn = position; |
878 |
final int startOut = sb.length(); |
|
879 |
||
880 |
switch (ch0) { |
|
881 |
case ']': |
|
882 |
case '-': |
|
883 |
case '\0': |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
884 |
return false; |
16147 | 885 |
|
886 |
case '[': |
|
887 |
// unescaped left square bracket - add escape |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
888 |
sb.append('\\'); |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
889 |
return commit(1); |
16147 | 890 |
|
891 |
case '\\': |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
892 |
commit(1); |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
893 |
if (classEscape()) { |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
894 |
return true; |
16147 | 895 |
} |
896 |
||
897 |
restart(startIn, startOut); |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
898 |
return false; |
16147 | 899 |
|
900 |
default: |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
901 |
return commit(1); |
16147 | 902 |
} |
903 |
} |
|
904 |
||
905 |
/* |
|
906 |
* ClassEscape :: |
|
907 |
* DecimalEscape |
|
908 |
* b |
|
909 |
* CharacterEscape |
|
910 |
* CharacterClassEscape |
|
911 |
*/ |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
912 |
private boolean classEscape() { |
16147 | 913 |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
914 |
if (decimalEscape()) { |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
915 |
return true; |
16147 | 916 |
} |
917 |
||
918 |
if (ch0 == 'b') { |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
919 |
sb.setLength(sb.length() - 1); |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
920 |
sb.append('\b'); |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
921 |
skip(1); |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
922 |
return true; |
16147 | 923 |
} |
924 |
||
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
925 |
// Note that contrary to ES 5.1 spec we put identityEscape() last because it acts as a catch-all |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
926 |
return characterEscape() || characterClassEscape() || identityEscape(); |
16147 | 927 |
} |
928 |
||
929 |
/* |
|
930 |
* DecimalDigits |
|
931 |
*/ |
|
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
932 |
private boolean decimalDigits() { |
16147 | 933 |
if (!isDecimalDigit(ch0)) { |
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
934 |
return false; |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
935 |
} |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
936 |
|
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
937 |
while (isDecimalDigit(ch0)) { |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
938 |
commit(1); |
16147 | 939 |
} |
940 |
||
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
941 |
return true; |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
942 |
} |
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
943 |
|
16938
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
944 |
private void unicode(final int value, final StringBuilder buffer) { |
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
945 |
final String hex = Integer.toHexString(value); |
16938
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
946 |
buffer.append('u'); |
16271
4817d7bb7434
8009240: RegExpScanner code is inefficient and too complex
hannesw
parents:
16258
diff
changeset
|
947 |
for (int i = 0; i < 4 - hex.length(); i++) { |
16938
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
948 |
buffer.append('0'); |
16147 | 949 |
} |
16938
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
950 |
buffer.append(hex); |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
951 |
} |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
952 |
|
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
953 |
// Convert what would have been a backreference into a unicode escape, or a number literal, or both. |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
954 |
private void octalOrLiteral(final String numberLiteral, final StringBuilder buffer) { |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
955 |
final int length = numberLiteral.length(); |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
956 |
int octalValue = 0; |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
957 |
int pos = 0; |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
958 |
// Maximum value for octal escape is 0377 (255) so we stop the loop at 32 |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
959 |
while (pos < length && octalValue < 0x20) { |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
960 |
final char ch = numberLiteral.charAt(pos); |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
961 |
if (isOctalDigit(ch)) { |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
962 |
octalValue = octalValue * 8 + ch - '0'; |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
963 |
} else { |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
964 |
break; |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
965 |
} |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
966 |
pos++; |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
967 |
} |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
968 |
if (octalValue > 0) { |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
969 |
buffer.append('\\'); |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
970 |
unicode(octalValue, buffer); |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
971 |
buffer.append(numberLiteral.substring(pos)); |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
972 |
} else { |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
973 |
buffer.append(numberLiteral); |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
974 |
} |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
975 |
} |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
976 |
|
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
977 |
private static boolean isOctalDigit(final char ch) { |
1a8ffed97564
8011714: Regexp decimal escape handling still not correct
hannesw
parents:
16781
diff
changeset
|
978 |
return ch >= '0' && ch <= '7'; |
16147 | 979 |
} |
980 |
||
981 |
private static boolean isDecimalDigit(final char ch) { |
|
982 |
return ch >= '0' && ch <= '9'; |
|
983 |
} |
|
984 |
} |