--- a/nashorn/src/jdk/nashorn/internal/runtime/regexp/RegExpScanner.java Thu Feb 28 20:31:30 2013 +0530
+++ b/nashorn/src/jdk/nashorn/internal/runtime/regexp/RegExpScanner.java Thu Feb 28 22:59:31 2013 +0100
@@ -25,15 +25,15 @@
package jdk.nashorn.internal.runtime.regexp;
-import java.util.ArrayList;
import java.util.HashMap;
-import java.util.Iterator;
-import java.util.LinkedHashMap;
+import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
+import java.util.Set;
import java.util.regex.PatternSyntaxException;
+import jdk.nashorn.internal.parser.Lexer;
import jdk.nashorn.internal.parser.Scanner;
import jdk.nashorn.internal.runtime.BitVector;
@@ -44,17 +44,13 @@
final class RegExpScanner extends Scanner {
/**
- * String builder to accumulate the result - this contains verbatim parsed JavaScript.
- * to get the java equivalent we need to create a Pattern token and return its toString()
+ * String builder used to rewrite the pattern for the currently used regexp factory.
*/
private final StringBuilder sb;
/** Is this the special case of a regexp that never matches anything */
private boolean neverMatches;
- /** The resulting java.util.regex pattern string. */
- private String javaPattern;
-
/** Expected token table */
private final Map<Character, Integer> expected = new HashMap<>();
@@ -62,11 +58,14 @@
private final List<Capture> caps = new LinkedList<>();
/** Forward references to capturing parenthesis to be resolved later.*/
- private final Map<Integer, Token> forwardReferences = new LinkedHashMap<>();
+ private final Set<Integer> forwardReferences = new LinkedHashSet<>();
/** Current level of zero-width negative lookahead assertions. */
private int negativeLookaheadLevel;
+ /** Are we currently inside a character class? */
+ private boolean inCharClass = false;
+
private static final String NON_IDENT_ESCAPES = "$^*+(){}[]|\\.?";
private static class Capture {
@@ -74,11 +73,6 @@
* Zero-width negative lookaheads enclosing the capture.
*/
private final int negativeLookaheadLevel;
- /**
- * Captures that live inside a negative lookahead are dead after the
- * lookahead and will be undefined if referenced from outside.
- */
- private boolean isDead;
Capture(final int negativeLookaheadLevel) {
this.negativeLookaheadLevel = negativeLookaheadLevel;
@@ -88,336 +82,6 @@
return negativeLookaheadLevel;
}
- public boolean isDead() {
- return isDead;
- }
-
- public void setDead() {
- this.isDead = true;
- }
- }
-
- /**
- * This is a token - the JavaScript regexp is scanned into a token tree
- * A token has other tokens as children as well as "atoms", i.e. Strings.
- */
- private static class Token {
-
- private enum Type {
- PATTERN,
- DISJUNCTION,
- ALTERNATIVE,
- TERM,
- ASSERTION,
- QUANTIFIER,
- QUANTIFIER_PREFIX,
- ATOM,
- PATTERN_CHARACTER,
- ATOM_ESCAPE,
- CHARACTER_ESCAPE,
- CONTROL_ESCAPE,
- CONTROL_LETTER,
- IDENTITY_ESCAPE,
- DECIMAL_ESCAPE,
- CHARACTERCLASS_ESCAPE,
- CHARACTERCLASS,
- CLASSRANGES,
- NON_EMPTY_CLASSRANGES,
- NON_EMPTY_CLASSRANGES_NODASH,
- CLASSATOM,
- CLASSATOM_NODASH,
- CLASS_ESCAPE,
- DECIMALDIGITS,
- HEX_ESCAPESEQUENCE,
- UNICODE_ESCAPESEQUENCE,
- }
-
- /**
- * Token tyoe
- */
- private final Token.Type type;
-
- /**
- * Child nodes
- */
- private final List<Object> children;
-
- /**
- * Parent node
- */
- private Token parent;
-
- /**
- * Dead code flag
- */
- private boolean isDead;
-
- private static final Map<Type, ToString> toStringMap = new HashMap<>();
- private static final ToString DEFAULT_TOSTRING = new ToString();
-
- private static String unicode(final int value) {
- final StringBuilder sb = new StringBuilder();
- final String hex = Integer.toHexString(value);
- sb.append('u');
- for (int i = 0; i < 4 - hex.length(); i++) {
- sb.append('0');
- }
- sb.append(hex);
-
- return sb.toString();
- }
-
- static {
- toStringMap.put(Type.CHARACTERCLASS, new ToString() {
- @Override
- public String toString(final Token token) {
- return super.toString(token).replace("\\b", "\b");
- }
- });
-
- // for some reason java regexps don't like control characters on the
- // form "\\ca".match([string with ascii 1 at char0]). Translating
- // them to unicode does it though.
- toStringMap.put(Type.CHARACTER_ESCAPE, new ToString() {
- @Override
- public String toString(final Token token) {
- final String str = super.toString(token);
- if (str.length() == 2) {
- return Token.unicode(Character.toLowerCase(str.charAt(1)) - 'a' + 1);
- }
- return str;
- }
- });
-
- toStringMap.put(Type.DECIMAL_ESCAPE, new ToString() {
- @Override
- public String toString(final Token token) {
- final String str = super.toString(token);
-
- if ("\0".equals(str)) {
- return str;
- }
-
- int value;
-
- if (!token.hasParentOfType(Type.CLASSRANGES)) {
- return str;
- }
-
- value = Integer.parseInt(str, 8); //throws exception that leads to SyntaxError if not octal
- if (value > 0xff) {
- throw new NumberFormatException(str);
- }
-
- return Token.unicode(value);
- }
- });
-
- }
-
- /**
- * JavaScript Token to Java regex substring framework.
- */
- private static class ToString {
- String toString(final Token token) {
- final Object[] children = token.getChildren();
-
- // Allow the installed regexp factory to perform global substitutions.
- switch (children.length) {
- case 0:
- return "";
- case 1:
- return RegExpFactory.replace(children[0].toString());
- default:
- final StringBuilder sb = new StringBuilder();
- for (final Object child : children) {
- sb.append(child);
- }
- return RegExpFactory.replace(sb.toString());
- }
- }
- }
-
- /**
- * Token iterator. Doesn't return "atom" children. i.e. string representations,
- * just tokens
- *
- */
- private static class TokenIterator implements Iterator<Token> {
- private final List<Token> preorder;
-
- private void init(final Token root) {
- preorder.add(root);
- for (final Object child : root.getChildren()) {
- if (child instanceof Token) {
- init((Token)child);
- }
- }
- }
-
- TokenIterator(final Token root) {
- preorder = new ArrayList<>();
- init(root);
- }
-
- @Override
- public boolean hasNext() {
- return !preorder.isEmpty();
- }
-
- @Override
- public Token next() {
- return preorder.remove(0);
- }
-
- @Override
- public void remove() {
- next();
- }
- }
-
- /**
- * Constructor
- * @param type the token type
- */
- Token(final Token.Type type) {
- this.type = type;
- children = new ArrayList<>();
- }
-
- /**
- * Add a an "atom" child to a token
- * @param child the child to add
- * @return the token (for chaining)
- */
- public Token add(final String child) {
- children.add(child);
- return this;
- }
-
- /**
- * Add a child to a token
- * @param child the child
- * @return the token (for chaining)
- */
- public Token add(final Token child) {
- if (child != null) {
- children.add(child);
- child.setParent(this);
- }
- return this;
- }
-
- /**
- * Remove a child from a token
- * @param child the child to remove
- * @return true if successful
- */
- public boolean remove(final Token child) {
- return children.remove(child);
- }
-
- /**
- * Remove the last child from a token
- * @return the removed child
- */
- public Object removeLast() {
- return children.remove(children.size() - 1);
- }
-
- /**
- * Flag this token as dead code
- * @param isDead is it dead or not
- */
- private void setIsDead(final boolean isDead) {
- this.isDead = isDead;
- }
-
- /**
- * Is this token dead code
- * @return boolean
- */
- private boolean getIsDead() {
- return isDead;
- }
-
- /**
- * Get the parent of this token
- * @return parent token
- */
- public Token getParent() {
- return parent;
- }
-
- public boolean hasParentOfType(final Token.Type parentType) {
- for (Token p = getParent(); p != null; p = p.getParent()) {
- if (p.getType() == parentType) {
- return true;
- }
- }
- return false;
- }
-
- public boolean hasChildOfType(final Token.Type childType) {
- for (final Iterator<Token> iter = iterator() ; iter.hasNext() ; ) {
- if (iter.next().getType() == childType) {
- return true;
- }
- }
- return false;
- }
-
- /**
- * Set the parent of this token
- * @param parent
- */
- private void setParent(final Token parent) {
- this.parent = parent;
- }
-
- /**
- * Get the children of this token
- * @return an array of children, never null
- */
- public Object[] getChildren() {
- return children.toArray();
- }
-
- /**
- * Reset this token, remove all children
- */
- public void reset() {
- children.clear();
- }
-
- /**
- * Get a preorder token iterator with this token as root
- * @return an iterator
- */
- public Iterator<Token> iterator() {
- return new TokenIterator(this);
- }
-
- /**
- * Get the type of this token
- * @return type
- */
- public Type getType() {
- return type;
- }
-
- /**
- * Turn this token into Java regexp compatible text
- * @return part of a java regexp
- */
- @Override
- public String toString() {
- ToString t = toStringMap.get(getType());
- if (t == null) {
- t = DEFAULT_TOSTRING;
- }
- return t.toString(this);
- }
}
/**
@@ -437,13 +101,11 @@
return;
}
- for (final Map.Entry<Integer, Token> fwdRef : forwardReferences.entrySet()) {
- if (fwdRef.getKey().intValue() > caps.size()) {
+ for (final Integer ref : forwardReferences) {
+ if (ref.intValue() > caps.size()) {
neverMatches = true;
break;
}
-
- fwdRef.getValue().setIsDead(true);
}
forwardReferences.clear();
@@ -459,12 +121,10 @@
public static RegExpScanner scan(final String string) {
final RegExpScanner scanner = new RegExpScanner(string);
- Token pattern;
-
try {
- pattern = scanner.pattern();
+ scanner.disjunction();
} catch (final Exception e) {
- throw new PatternSyntaxException(e.getMessage(), string, scanner.sb.length());
+ throw new PatternSyntaxException(e.getMessage(), string, scanner.position);
}
scanner.processForwardReferences();
@@ -472,24 +132,12 @@
return null; // never matches
}
- // go over the code and remove dead code
- final Iterator<Token> iter = pattern.iterator();
- while (iter.hasNext()) {
- final Token next = iter.next();
- if (next.getIsDead()) {
- next.getParent().remove(next);
- }
- }
-
- // turn the pattern into a string, p, the java equivalent string for our js regexp
- final String p = pattern.toString();
- // if builder contains all tokens that were sent in, we know
- // we correctly parsed the entire JavaScript regexp without syntax errors
- if (!string.equals(scanner.getStringBuilder().toString())) {
+ // Throw syntax error unless we parsed the entire JavaScript regexp without syntax errors
+ if (scanner.position != string.length()) {
+ final String p = scanner.getStringBuilder().toString();
throw new PatternSyntaxException(string, p, p.length() + 1);
}
- scanner.javaPattern = p;
return scanner;
}
@@ -508,7 +156,7 @@
}
String getJavaPattern() {
- return javaPattern;
+ return sb.toString();
}
BitVector getGroupsInNegativeLookahead() {
@@ -527,11 +175,10 @@
/**
* Commit n characters to the builder and to a given token
- * @param token Uncommitted token.
* @param n Number of characters.
* @return Committed token
*/
- private Token commit(final Token token, final int n) {
+ private boolean commit(final int n) {
final int startIn = position;
switch (n) {
@@ -554,11 +201,7 @@
assert false : "Should not reach here";
}
- if (token == null) {
- return null;
- }
-
- return token.add(sb.substring(startIn, sb.length()));
+ return true;
}
/**
@@ -587,35 +230,20 @@
*/
/*
- * Pattern ::
- * Disjunction
- */
- private Token pattern() {
- final Token token = new Token(Token.Type.PATTERN);
-
- final Token child = disjunction();
- return token.add(child);
- }
-
- /*
* Disjunction ::
* Alternative
* Alternative | Disjunction
*/
- private Token disjunction() {
- final Token token = new Token(Token.Type.DISJUNCTION);
-
+ private void disjunction() {
while (true) {
- token.add(alternative());
+ alternative();
if (ch0 == '|') {
- commit(token, 1);
+ commit(1);
} else {
break;
}
}
-
- return token;
}
/*
@@ -623,15 +251,10 @@
* [empty]
* Alternative Term
*/
- private Token alternative() {
- final Token token = new Token(Token.Type.ALTERNATIVE);
-
- Token child;
- while ((child = term()) != null) {
- token.add(child);
+ private void alternative() {
+ while (term()) {
+ // do nothing
}
-
- return token;
}
/*
@@ -640,48 +263,37 @@
* Atom
* Atom Quantifier
*/
- private Token term() {
+ private boolean term() {
final int startIn = position;
final int startOut = sb.length();
- final Token token = new Token(Token.Type.TERM);
- Token child;
- child = assertion();
- if (child != null) {
- return token.add(child);
+ if (assertion()) {
+ return true;
}
- child = atom();
- if (child != null) {
+ if (atom()) {
boolean emptyCharacterClass = false;
- if ("[]".equals(child.toString())) {
+ if (sb.toString().endsWith("[]")) {
emptyCharacterClass = true;
+ } else if (sb.toString().endsWith("[^]")) {
+ sb.setLength(sb.length() - 2);
+ sb.append("\\s\\S]");
}
- token.add(child);
-
- final Token quantifier = quantifier();
- if (quantifier != null) {
- token.add(quantifier);
- }
+ boolean quantifier = quantifier();
if (emptyCharacterClass) {
- if (quantifier == null) {
+ if (!quantifier) {
neverMatches = true; //never matches ever.
- } else {
- //if we can get away with max zero, remove this entire token
- final String qs = quantifier.toString();
- if ("+".equals(qs) || "*".equals(qs) || qs.startsWith("{0,")) {
- token.setIsDead(true);
- }
}
+ // Note: we could check if quantifier has min zero to mark empty character class as dead.
}
- return token;
+ return true;
}
restart(startIn, startOut);
- return null;
+ return false;
}
/*
@@ -693,19 +305,18 @@
* ( ? = Disjunction )
* ( ? ! Disjunction )
*/
- private Token assertion() {
+ private boolean assertion() {
final int startIn = position;
final int startOut = sb.length();
- final Token token = new Token(Token.Type.ASSERTION);
switch (ch0) {
case '^':
case '$':
- return commit(token, 1);
+ return commit(1);
case '\\':
if (ch1 == 'b' || ch1 == 'B') {
- return commit(token, 2);
+ return commit(2);
}
break;
@@ -717,24 +328,18 @@
break;
}
final boolean isNegativeLookahead = (ch2 == '!');
- commit(token, 3);
+ commit(3);
if (isNegativeLookahead) {
negativeLookaheadLevel++;
}
- final Token disjunction = disjunction();
+ disjunction();
if (isNegativeLookahead) {
- for (final Capture cap : caps) {
- if (cap.getNegativeLookaheadLevel() >= negativeLookaheadLevel) {
- cap.setDead();
- }
- }
negativeLookaheadLevel--;
}
- if (disjunction != null && ch0 == ')') {
- token.add(disjunction);
- return commit(token, 1);
+ if (ch0 == ')') {
+ return commit(1);
}
break;
@@ -743,8 +348,7 @@
}
restart(startIn, startOut);
-
- return null;
+ return false;
}
/*
@@ -752,17 +356,14 @@
* QuantifierPrefix
* QuantifierPrefix ?
*/
- private Token quantifier() {
- final Token token = new Token(Token.Type.QUANTIFIER);
- final Token child = quantifierPrefix();
- if (child != null) {
- token.add(child);
+ private boolean quantifier() {
+ if (quantifierPrefix()) {
if (ch0 == '?') {
- commit(token, 1);
+ commit(1);
}
- return token;
+ return true;
}
- return null;
+ return false;
}
/*
@@ -774,45 +375,42 @@
* { DecimalDigits , }
* { DecimalDigits , DecimalDigits }
*/
- private Token quantifierPrefix() {
+ private boolean quantifierPrefix() {
final int startIn = position;
final int startOut = sb.length();
- final Token token = new Token(Token.Type.QUANTIFIER_PREFIX);
switch (ch0) {
case '*':
case '+':
case '?':
- return commit(token, 1);
+ return commit(1);
case '{':
- commit(token, 1);
+ commit(1);
- final Token child = decimalDigits();
- if (child == null) {
+ if (!decimalDigits()) {
break; // not a quantifier - back out
}
push('}');
- token.add(child);
if (ch0 == ',') {
- commit(token, 1);
- token.add(decimalDigits());
+ commit(1);
+ decimalDigits();
}
if (ch0 == '}') {
pop('}');
- commit(token, 1);
+ commit(1);
}
- return token;
+ return true;
default:
break;
}
restart(startIn, startOut);
- return null;
+ return false;
}
/*
@@ -825,81 +423,51 @@
* ( ? : Disjunction )
*
*/
- private Token atom() {
+ private boolean atom() {
final int startIn = position;
final int startOut = sb.length();
- final Token token = new Token(Token.Type.ATOM);
- Token child;
- child = patternCharacter();
- if (child != null) {
- return token.add(child);
+ if (patternCharacter()) {
+ return true;
}
if (ch0 == '.') {
- return commit(token, 1);
+ return commit(1);
}
if (ch0 == '\\') {
- commit(token, 1);
- child = atomEscape();
-
- if (child != null) {
- if (child.hasChildOfType(Token.Type.IDENTITY_ESCAPE)) {
- final char idEscape = child.toString().charAt(0);
- if (NON_IDENT_ESCAPES.indexOf(idEscape) == -1) {
- token.reset();
- }
- }
-
- token.add(child);
+ commit(1);
- // forward backreferences always match empty. JavaScript != Java
- if (child.hasChildOfType(Token.Type.DECIMAL_ESCAPE) && !"\u0000".equals(child.toString())) {
- final int refNum = Integer.parseInt(child.toString());
-
- if (refNum - 1 < caps.size() && caps.get(refNum - 1).isDead()) {
- // reference to dead in-negative-lookahead capture
- token.setIsDead(true);
- } else if (caps.size() < refNum) {
- // forward reference: always matches against empty string (dead token).
- // invalid reference (non-existant capture): pattern never matches.
- forwardReferences.put(refNum, token);
- }
- }
-
- return token;
+ if (atomEscape()) {
+ return true;
}
}
- child = characterClass();
- if (child != null) {
- return token.add(child);
+ if (characterClass()) {
+ return true;
}
if (ch0 == '(') {
boolean capturingParens = true;
- commit(token, 1);
+ commit(1);
if (ch0 == '?' && ch1 == ':') {
capturingParens = false;
- commit(token, 2);
+ commit(2);
}
- child = disjunction();
- if (child != null) {
- token.add(child);
- if (ch0 == ')') {
- final Token atom = commit(token, 1);
- if (capturingParens) {
- caps.add(new Capture(negativeLookaheadLevel));
- }
- return atom;
+ disjunction();
+
+ if (ch0 == ')') {
+ commit(1);
+ if (capturingParens) {
+ caps.add(new Capture(negativeLookaheadLevel));
}
+ return true;
}
}
restart(startIn, startOut);
- return null;
+ return false;
}
/*
@@ -907,9 +475,9 @@
* SourceCharacter but not any of: ^$\.*+?()[]{}|
*/
@SuppressWarnings("fallthrough")
- private Token patternCharacter() {
+ private boolean patternCharacter() {
if (atEOF()) {
- return null;
+ return false;
}
switch (ch0) {
@@ -924,23 +492,26 @@
case ')':
case '[':
case '|':
- return null;
+ return false;
case '}':
case ']':
final int n = expected.get(ch0);
if (n != 0) {
- return null;
+ return false;
}
case '{':
// if not a valid quantifier escape curly brace to match itself
// this ensures compatibility with other JS implementations
- final Token quant = quantifierPrefix();
- return (quant == null) ? commit(new Token(Token.Type.PATTERN_CHARACTER).add("\\"), 1) : null;
+ if (!quantifierPrefix()) {
+ sb.append('\\');
+ return commit(1);
+ }
+ return false;
default:
- return commit(new Token(Token.Type.PATTERN_CHARACTER), 1); // SOURCECHARACTER
+ return commit(1); // SOURCECHARACTER
}
}
@@ -950,27 +521,9 @@
* CharacterEscape
* CharacterClassEscape
*/
- private Token atomEscape() {
- final Token token = new Token(Token.Type.ATOM_ESCAPE);
- Token child;
-
- child = decimalEscape();
- if (child != null) {
- return token.add(child);
- }
-
- child = characterClassEscape();
- if (child != null) {
- return token.add(child);
- }
-
- child = characterEscape();
- if (child != null) {
- return token.add(child);
- }
-
-
- return null;
+ private boolean atomEscape() {
+ // Note that contrary to ES 5.1 spec we put identityEscape() last because it acts as a catch-all
+ return decimalEscape() || characterClassEscape() || characterEscape() || identityEscape();
}
/*
@@ -981,48 +534,31 @@
* UnicodeEscapeSequence
* IdentityEscape
*/
- private Token characterEscape() {
+ private boolean characterEscape() {
final int startIn = position;
final int startOut = sb.length();
- final Token token = new Token(Token.Type.CHARACTER_ESCAPE);
- Token child;
-
- child = controlEscape();
- if (child != null) {
- return token.add(child);
+ if (controlEscape()) {
+ return true;
}
if (ch0 == 'c') {
- commit(token, 1);
- child = controlLetter();
- if (child != null) {
- return token.add(child);
+ commit(1);
+ if (controlLetter()) {
+ return true;
}
restart(startIn, startOut);
}
- child = hexEscapeSequence();
- if (child != null) {
- return token.add(child);
- }
-
- child = unicodeEscapeSequence();
- if (child != null) {
- return token.add(child);
- }
-
- child = identityEscape();
- if (child != null) {
- return token.add(child);
+ if (hexEscapeSequence() || unicodeEscapeSequence()) {
+ return true;
}
restart(startIn, startOut);
-
- return null;
+ return false;
}
- private boolean scanEscapeSequence(final char leader, final int length, final Token token) {
+ private boolean scanEscapeSequence(final char leader, final int length) {
final int startIn = position;
final int startOut = sb.length();
@@ -1030,11 +566,11 @@
return false;
}
- commit(token, 1);
+ commit(1);
for (int i = 0; i < length; i++) {
final char ch0l = Character.toLowerCase(ch0);
if ((ch0l >= 'a' && ch0l <= 'f') || isDecimalDigit(ch0)) {
- commit(token, 1);
+ commit(1);
} else {
restart(startIn, startOut);
return false;
@@ -1044,37 +580,29 @@
return true;
}
- private Token hexEscapeSequence() {
- final Token token = new Token(Token.Type.HEX_ESCAPESEQUENCE);
- if (scanEscapeSequence('x', 2, token)) {
- return token;
- }
- return null;
+ private boolean hexEscapeSequence() {
+ return scanEscapeSequence('x', 2);
}
- private Token unicodeEscapeSequence() {
- final Token token = new Token(Token.Type.UNICODE_ESCAPESEQUENCE);
- if (scanEscapeSequence('u', 4, token)) {
- return token;
- }
- return null;
+ private boolean unicodeEscapeSequence() {
+ return scanEscapeSequence('u', 4);
}
/*
* ControlEscape ::
* one of fnrtv
*/
- private Token controlEscape() {
+ private boolean controlEscape() {
switch (ch0) {
case 'f':
case 'n':
case 'r':
case 't':
case 'v':
- return commit(new Token(Token.Type.CONTROL_ESCAPE), 1);
+ return commit(1);
default:
- return null;
+ return false;
}
}
@@ -1083,19 +611,18 @@
* one of abcdefghijklmnopqrstuvwxyz
* ABCDEFGHIJKLMNOPQRSTUVWXYZ
*/
- private Token controlLetter() {
+ private boolean controlLetter() {
final char c = Character.toUpperCase(ch0);
if (c >= 'A' && c <= 'Z') {
- final Token token = new Token(Token.Type.CONTROL_LETTER);
- commit(token, 1);
- return token;
+ // for some reason java regexps don't like control characters on the
+ // form "\\ca".match([string with ascii 1 at char0]). Translating
+ // them to unicode does it though.
+ sb.setLength(sb.length() - 1);
+ unicode(c - 'A' + 1);
+ skip(1);
+ return true;
}
- return null;
- /*
- Token token = new Token(Token.Type.CONTROL_LETTER);
- commit(null, 1);//add original char to builder not to token
- this.neverMatches = c < 'A' || c > 'Z';
- return token.add(""+c);*/
+ return false;
}
/*
@@ -1104,56 +631,115 @@
* <ZWJ> (200c)
* <ZWNJ> (200d)
*/
- private Token identityEscape() {
- final Token token = new Token(Token.Type.IDENTITY_ESCAPE);
- commit(token, 1);
- return token;
+ private boolean identityEscape() {
+ if (atEOF()) {
+ throw new RuntimeException("\\ at end of pattern"); // will be converted to PatternSyntaxException
+ }
+ // ES 5.1 A.7 requires "not IdentifierPart" here but all major engines accept any character here.
+ if (NON_IDENT_ESCAPES.indexOf(ch0) == -1) {
+ sb.setLength(sb.length() - 1);
+ }
+ return commit(1);
}
/*
* DecimalEscape ::
* DecimalIntegerLiteral [lookahead DecimalDigit]
*/
- private Token decimalEscape() {
- final Token token = new Token(Token.Type.DECIMAL_ESCAPE);
+ private boolean decimalEscape() {
final int startIn = position;
final int startOut = sb.length();
if (ch0 == '0' && !isDecimalDigit(ch1)) {
- commit(token, 1);
- token.removeLast();
+ skip(1);
// DecimalEscape :: 0. If i is zero, return the EscapeValue consisting of a <NUL> character (Unicodevalue0000);
- return token.add("\u0000");
+ sb.append("\u0000");
+ return true;
}
if (isDecimalDigit(ch0)) {
- while (isDecimalDigit(ch0)) {
- commit(token, 1);
+ final int num = ch0 - '0';
+
+ // Single digit escape, treat as backreference.
+ if (!isDecimalDigit(ch1)) {
+ if (num <= caps.size() && caps.get(num - 1).getNegativeLookaheadLevel() > 0) {
+ // Captures that live inside a negative lookahead are dead after the
+ // lookahead and will be undefined if referenced from outside.
+ if (caps.get(num - 1).getNegativeLookaheadLevel() > negativeLookaheadLevel) {
+ sb.setLength(sb.length() - 1);
+ } else {
+ sb.append(ch0);
+ }
+ skip(1);
+ return true;
+ } else if (num > caps.size()) {
+ // Forward reference to a capture group. Forward references are always undefined so we
+ // can omit it from the output buffer. Additionally, if the capture group does not exist
+ // the whole regexp becomes invalid, so register the reference for later processing.
+ forwardReferences.add(num);
+ sb.setLength(sb.length() - 1);
+ skip(1);
+ return true;
+ }
}
- return token;
+
+ if (inCharClass) {
+ // Convert octal escape to unicode escape if inside character class.
+ StringBuilder digit = new StringBuilder(4);
+ while (isDecimalDigit(ch0)) {
+ digit.append(ch0);
+ skip(1);
+ }
+
+ int value = Integer.parseInt(digit.toString(), 8); //throws exception that leads to SyntaxError if not octal
+ if (value > 0xff) {
+ throw new NumberFormatException(digit.toString());
+ }
+
+ unicode(value);
+
+ } else {
+ // Copy decimal escape as-is
+ decimalDigits();
+ }
+ return true;
}
restart(startIn, startOut);
-
- return null;
+ return false;
}
/*
* CharacterClassEscape ::
* one of dDsSwW
*/
- private Token characterClassEscape() {
+ private boolean characterClassEscape() {
switch (ch0) {
+ // java.util.regex requires translation of \s and \S to explicit character list
case 's':
+ if (RegExpFactory.usesJavaUtilRegex()) {
+ sb.setLength(sb.length() - 1);
+ sb.append('[').append(Lexer.getWhitespaceRegExp()).append(']');
+ skip(1);
+ return true;
+ }
+ return commit(1);
case 'S':
+ if (RegExpFactory.usesJavaUtilRegex()) {
+ sb.setLength(sb.length() - 1);
+ sb.append("[^").append(Lexer.getWhitespaceRegExp()).append(']');
+ skip(1);
+ return true;
+ }
+ return commit(1);
case 'd':
case 'D':
case 'w':
case 'W':
- return commit(new Token(Token.Type.CHARACTERCLASS_ESCAPE), 1);
+ return commit(1);
default:
- return null;
+ return false;
}
}
@@ -1162,29 +748,31 @@
* [ [lookahead {^}] ClassRanges ]
* [ ^ ClassRanges ]
*/
- private Token characterClass() {
+ private boolean characterClass() {
final int startIn = position;
final int startOut = sb.length();
- final Token token = new Token(Token.Type.CHARACTERCLASS);
if (ch0 == '[') {
- push(']');
- commit(token, 1);
+ try {
+ inCharClass = true;
+ push(']');
+ commit(1);
- if (ch0 == '^') {
- commit(token, 1);
- }
+ if (ch0 == '^') {
+ commit(1);
+ }
- final Token child = classRanges();
- if (child != null && ch0 == ']') {
- pop(']');
- token.add(child);
- return commit(token, 1);
+ if (classRanges() && ch0 == ']') {
+ pop(']');
+ return commit(1);
+ }
+ } finally {
+ inCharClass = false; // no nested character classes in JavaScript
}
}
restart(startIn, startOut);
- return null;
+ return false;
}
/*
@@ -1192,8 +780,9 @@
* [empty]
* NonemptyClassRanges
*/
- private Token classRanges() {
- return new Token(Token.Type.CLASSRANGES).add(nonemptyClassRanges());
+ private boolean classRanges() {
+ nonemptyClassRanges();
+ return true;
}
/*
@@ -1202,40 +791,27 @@
* ClassAtom NonemptyClassRangesNoDash
* ClassAtom - ClassAtom ClassRanges
*/
- private Token nonemptyClassRanges() {
+ private boolean nonemptyClassRanges() {
final int startIn = position;
final int startOut = sb.length();
- final Token token = new Token(Token.Type.NON_EMPTY_CLASSRANGES);
- Token child;
- child = classAtom();
- if (child != null) {
- token.add(child);
+ if (classAtom()) {
if (ch0 == '-') {
- commit(token, 1);
+ commit(1);
- final Token child1 = classAtom();
- final Token child2 = classRanges();
- if (child1 != null && child2 != null) {
- token.add(child1);
- token.add(child2);
-
- return token;
+ if (classAtom() && classRanges()) {
+ return true;
}
}
- child = nonemptyClassRangesNoDash();
- if (child != null) {
- token.add(child);
- return token;
- }
+ nonemptyClassRangesNoDash();
- return token;
+ return true;
}
restart(startIn, startOut);
- return null;
+ return false;
}
/*
@@ -1244,61 +820,44 @@
* ClassAtomNoDash NonemptyClassRangesNoDash
* ClassAtomNoDash - ClassAtom ClassRanges
*/
- private Token nonemptyClassRangesNoDash() {
+ private boolean nonemptyClassRangesNoDash() {
final int startIn = position;
final int startOut = sb.length();
- final Token token = new Token(Token.Type.NON_EMPTY_CLASSRANGES_NODASH);
- Token child;
- child = classAtomNoDash();
- if (child != null) {
- token.add(child);
+ if (classAtomNoDash()) {
// need to check dash first, as for e.g. [a-b|c-d] will otherwise parse - as an atom
if (ch0 == '-') {
- commit(token, 1);
+ commit(1);
- final Token child1 = classAtom();
- final Token child2 = classRanges();
- if (child1 != null && child2 != null) {
- token.add(child1);
- return token.add(child2);
+ if (classAtom() && classRanges()) {
+ return true;
}
//fallthru
}
- child = nonemptyClassRangesNoDash();
- if (child != null) {
- token.add(child);
- }
- return token; // still a class atom
+ nonemptyClassRangesNoDash();
+ return true; // still a class atom
}
- child = classAtom();
- if (child != null) {
- return token.add(child);
+ if (classAtom()) {
+ return true;
}
restart(startIn, startOut);
- return null;
+ return false;
}
/*
* ClassAtom : - ClassAtomNoDash
*/
- private Token classAtom() {
- final Token token = new Token(Token.Type.CLASSATOM);
+ private boolean classAtom() {
if (ch0 == '-') {
- return commit(token, 1);
+ return commit(1);
}
- final Token child = classAtomNoDash();
- if (child != null) {
- return token.add(child);
- }
-
- return null;
+ return classAtomNoDash();
}
/*
@@ -1306,33 +865,32 @@
* SourceCharacter but not one of \ or ] or -
* \ ClassEscape
*/
- private Token classAtomNoDash() {
+ private boolean classAtomNoDash() {
final int startIn = position;
final int startOut = sb.length();
- final Token token = new Token(Token.Type.CLASSATOM_NODASH);
switch (ch0) {
case ']':
case '-':
case '\0':
- return null;
+ return false;
case '[':
// unescaped left square bracket - add escape
- return commit(token.add("\\"), 1);
+ sb.append('\\');
+ return commit(1);
case '\\':
- commit(token, 1);
- final Token child = classEscape();
- if (child != null) {
- return token.add(child);
+ commit(1);
+ if (classEscape()) {
+ return true;
}
restart(startIn, startOut);
- return null;
+ return false;
default:
- return commit(token, 1);
+ return commit(1);
}
}
@@ -1343,46 +901,45 @@
* CharacterEscape
* CharacterClassEscape
*/
- private Token classEscape() {
- final Token token = new Token(Token.Type.CLASS_ESCAPE);
- Token child;
+ private boolean classEscape() {
- child = decimalEscape();
- if (child != null) {
- return token.add(child);
+ if (decimalEscape()) {
+ return true;
}
if (ch0 == 'b') {
- return commit(token, 1);
+ sb.setLength(sb.length() - 1);
+ sb.append('\b');
+ skip(1);
+ return true;
}
- child = characterEscape();
- if (child != null) {
- return token.add(child);
- }
-
- child = characterClassEscape();
- if (child != null) {
- return token.add(child);
- }
-
- return null;
+ // Note that contrary to ES 5.1 spec we put identityEscape() last because it acts as a catch-all
+ return characterEscape() || characterClassEscape() || identityEscape();
}
/*
* DecimalDigits
*/
- private Token decimalDigits() {
+ private boolean decimalDigits() {
if (!isDecimalDigit(ch0)) {
- return null;
+ return false;
+ }
+
+ while (isDecimalDigit(ch0)) {
+ commit(1);
}
- final Token token = new Token(Token.Type.DECIMALDIGITS);
- while (isDecimalDigit(ch0)) {
- commit(token, 1);
+ return true;
+ }
+
+ private void unicode(final int value) {
+ final String hex = Integer.toHexString(value);
+ sb.append('u');
+ for (int i = 0; i < 4 - hex.length(); i++) {
+ sb.append('0');
}
-
- return token;
+ sb.append(hex);
}
private static boolean isDecimalDigit(final char ch) {