# HG changeset patch # User hannesw # Date 1362088771 -3600 # Node ID 4817d7bb743474b8ef1b1b357dac88482feb09e2 # Parent 17f612656e6ba167e15a275c2ad5324e86d91752 8009240: RegExpScanner code is inefficient and too complex Reviewed-by: jlaskey, lagergren diff -r 17f612656e6b -r 4817d7bb7434 nashorn/src/jdk/nashorn/internal/runtime/regexp/JoniRegExp.java --- a/nashorn/src/jdk/nashorn/internal/runtime/regexp/JoniRegExp.java Thu Feb 28 20:31:30 2013 +0530 +++ b/nashorn/src/jdk/nashorn/internal/runtime/regexp/JoniRegExp.java Thu Feb 28 22:59:31 2013 +0100 @@ -117,10 +117,6 @@ return new JoniRegExp(pattern, flags); } - @Override - protected String replaceToken(final String str) { - return str.equals("[^]") ? "[\\s\\S]" : str; - } } class JoniMatcher implements RegExpMatcher { diff -r 17f612656e6b -r 4817d7bb7434 nashorn/src/jdk/nashorn/internal/runtime/regexp/RegExpFactory.java --- a/nashorn/src/jdk/nashorn/internal/runtime/regexp/RegExpFactory.java Thu Feb 28 20:31:30 2013 +0530 +++ b/nashorn/src/jdk/nashorn/internal/runtime/regexp/RegExpFactory.java Thu Feb 28 22:59:31 2013 +0100 @@ -68,25 +68,6 @@ } /** - * Replace a regexp token as suitable for regexp instances created by this factory. - * - * @param str a regular expression token - * @return the replacement token - */ - protected String replaceToken(final String str) { - switch (str) { - case "\\s": - return "[" + Lexer.getWhitespaceRegExp() + "]"; - case "\\S": - return "[^" + Lexer.getWhitespaceRegExp() + "]"; - case "[^]": - return "[\\s\\S]"; - default: - return str; - } - } - - /** * Compile a regexp with the given {@code source} and {@code flags}. * * @param pattern RegExp pattern string @@ -99,16 +80,6 @@ } /** - * Replace a regexp token as needed by the currently installed factory instance. - * - * @param token a regexp token - * @return the replacement token - */ - public static String replace(final String token) { - return instance.replaceToken(token); - } - - /** * Validate a regexp with the given {@code source} and {@code flags}. * * @param pattern RegExp pattern string @@ -120,4 +91,13 @@ public static void validate(final String pattern, final String flags) throws ParserException { instance.compile(pattern, flags); } + + /** + * Returns true if the instance uses the JDK's {@code java.util.regex} package. + * + * @return true if instance uses JDK regex package + */ + public static boolean usesJavaUtilRegex() { + return instance != null && instance.getClass() == RegExpFactory.class; + } } diff -r 17f612656e6b -r 4817d7bb7434 nashorn/src/jdk/nashorn/internal/runtime/regexp/RegExpScanner.java --- a/nashorn/src/jdk/nashorn/internal/runtime/regexp/RegExpScanner.java Thu Feb 28 20:31:30 2013 +0530 +++ b/nashorn/src/jdk/nashorn/internal/runtime/regexp/RegExpScanner.java Thu Feb 28 22:59:31 2013 +0100 @@ -25,15 +25,15 @@ package jdk.nashorn.internal.runtime.regexp; -import java.util.ArrayList; import java.util.HashMap; -import java.util.Iterator; -import java.util.LinkedHashMap; +import java.util.LinkedHashSet; import java.util.LinkedList; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.regex.PatternSyntaxException; +import jdk.nashorn.internal.parser.Lexer; import jdk.nashorn.internal.parser.Scanner; import jdk.nashorn.internal.runtime.BitVector; @@ -44,17 +44,13 @@ final class RegExpScanner extends Scanner { /** - * String builder to accumulate the result - this contains verbatim parsed JavaScript. - * to get the java equivalent we need to create a Pattern token and return its toString() + * String builder used to rewrite the pattern for the currently used regexp factory. */ private final StringBuilder sb; /** Is this the special case of a regexp that never matches anything */ private boolean neverMatches; - /** The resulting java.util.regex pattern string. */ - private String javaPattern; - /** Expected token table */ private final Map expected = new HashMap<>(); @@ -62,11 +58,14 @@ private final List caps = new LinkedList<>(); /** Forward references to capturing parenthesis to be resolved later.*/ - private final Map forwardReferences = new LinkedHashMap<>(); + private final Set forwardReferences = new LinkedHashSet<>(); /** Current level of zero-width negative lookahead assertions. */ private int negativeLookaheadLevel; + /** Are we currently inside a character class? */ + private boolean inCharClass = false; + private static final String NON_IDENT_ESCAPES = "$^*+(){}[]|\\.?"; private static class Capture { @@ -74,11 +73,6 @@ * Zero-width negative lookaheads enclosing the capture. */ private final int negativeLookaheadLevel; - /** - * Captures that live inside a negative lookahead are dead after the - * lookahead and will be undefined if referenced from outside. - */ - private boolean isDead; Capture(final int negativeLookaheadLevel) { this.negativeLookaheadLevel = negativeLookaheadLevel; @@ -88,336 +82,6 @@ return negativeLookaheadLevel; } - public boolean isDead() { - return isDead; - } - - public void setDead() { - this.isDead = true; - } - } - - /** - * This is a token - the JavaScript regexp is scanned into a token tree - * A token has other tokens as children as well as "atoms", i.e. Strings. - */ - private static class Token { - - private enum Type { - PATTERN, - DISJUNCTION, - ALTERNATIVE, - TERM, - ASSERTION, - QUANTIFIER, - QUANTIFIER_PREFIX, - ATOM, - PATTERN_CHARACTER, - ATOM_ESCAPE, - CHARACTER_ESCAPE, - CONTROL_ESCAPE, - CONTROL_LETTER, - IDENTITY_ESCAPE, - DECIMAL_ESCAPE, - CHARACTERCLASS_ESCAPE, - CHARACTERCLASS, - CLASSRANGES, - NON_EMPTY_CLASSRANGES, - NON_EMPTY_CLASSRANGES_NODASH, - CLASSATOM, - CLASSATOM_NODASH, - CLASS_ESCAPE, - DECIMALDIGITS, - HEX_ESCAPESEQUENCE, - UNICODE_ESCAPESEQUENCE, - } - - /** - * Token tyoe - */ - private final Token.Type type; - - /** - * Child nodes - */ - private final List children; - - /** - * Parent node - */ - private Token parent; - - /** - * Dead code flag - */ - private boolean isDead; - - private static final Map toStringMap = new HashMap<>(); - private static final ToString DEFAULT_TOSTRING = new ToString(); - - private static String unicode(final int value) { - final StringBuilder sb = new StringBuilder(); - final String hex = Integer.toHexString(value); - sb.append('u'); - for (int i = 0; i < 4 - hex.length(); i++) { - sb.append('0'); - } - sb.append(hex); - - return sb.toString(); - } - - static { - toStringMap.put(Type.CHARACTERCLASS, new ToString() { - @Override - public String toString(final Token token) { - return super.toString(token).replace("\\b", "\b"); - } - }); - - // for some reason java regexps don't like control characters on the - // form "\\ca".match([string with ascii 1 at char0]). Translating - // them to unicode does it though. - toStringMap.put(Type.CHARACTER_ESCAPE, new ToString() { - @Override - public String toString(final Token token) { - final String str = super.toString(token); - if (str.length() == 2) { - return Token.unicode(Character.toLowerCase(str.charAt(1)) - 'a' + 1); - } - return str; - } - }); - - toStringMap.put(Type.DECIMAL_ESCAPE, new ToString() { - @Override - public String toString(final Token token) { - final String str = super.toString(token); - - if ("\0".equals(str)) { - return str; - } - - int value; - - if (!token.hasParentOfType(Type.CLASSRANGES)) { - return str; - } - - value = Integer.parseInt(str, 8); //throws exception that leads to SyntaxError if not octal - if (value > 0xff) { - throw new NumberFormatException(str); - } - - return Token.unicode(value); - } - }); - - } - - /** - * JavaScript Token to Java regex substring framework. - */ - private static class ToString { - String toString(final Token token) { - final Object[] children = token.getChildren(); - - // Allow the installed regexp factory to perform global substitutions. - switch (children.length) { - case 0: - return ""; - case 1: - return RegExpFactory.replace(children[0].toString()); - default: - final StringBuilder sb = new StringBuilder(); - for (final Object child : children) { - sb.append(child); - } - return RegExpFactory.replace(sb.toString()); - } - } - } - - /** - * Token iterator. Doesn't return "atom" children. i.e. string representations, - * just tokens - * - */ - private static class TokenIterator implements Iterator { - private final List preorder; - - private void init(final Token root) { - preorder.add(root); - for (final Object child : root.getChildren()) { - if (child instanceof Token) { - init((Token)child); - } - } - } - - TokenIterator(final Token root) { - preorder = new ArrayList<>(); - init(root); - } - - @Override - public boolean hasNext() { - return !preorder.isEmpty(); - } - - @Override - public Token next() { - return preorder.remove(0); - } - - @Override - public void remove() { - next(); - } - } - - /** - * Constructor - * @param type the token type - */ - Token(final Token.Type type) { - this.type = type; - children = new ArrayList<>(); - } - - /** - * Add a an "atom" child to a token - * @param child the child to add - * @return the token (for chaining) - */ - public Token add(final String child) { - children.add(child); - return this; - } - - /** - * Add a child to a token - * @param child the child - * @return the token (for chaining) - */ - public Token add(final Token child) { - if (child != null) { - children.add(child); - child.setParent(this); - } - return this; - } - - /** - * Remove a child from a token - * @param child the child to remove - * @return true if successful - */ - public boolean remove(final Token child) { - return children.remove(child); - } - - /** - * Remove the last child from a token - * @return the removed child - */ - public Object removeLast() { - return children.remove(children.size() - 1); - } - - /** - * Flag this token as dead code - * @param isDead is it dead or not - */ - private void setIsDead(final boolean isDead) { - this.isDead = isDead; - } - - /** - * Is this token dead code - * @return boolean - */ - private boolean getIsDead() { - return isDead; - } - - /** - * Get the parent of this token - * @return parent token - */ - public Token getParent() { - return parent; - } - - public boolean hasParentOfType(final Token.Type parentType) { - for (Token p = getParent(); p != null; p = p.getParent()) { - if (p.getType() == parentType) { - return true; - } - } - return false; - } - - public boolean hasChildOfType(final Token.Type childType) { - for (final Iterator iter = iterator() ; iter.hasNext() ; ) { - if (iter.next().getType() == childType) { - return true; - } - } - return false; - } - - /** - * Set the parent of this token - * @param parent - */ - private void setParent(final Token parent) { - this.parent = parent; - } - - /** - * Get the children of this token - * @return an array of children, never null - */ - public Object[] getChildren() { - return children.toArray(); - } - - /** - * Reset this token, remove all children - */ - public void reset() { - children.clear(); - } - - /** - * Get a preorder token iterator with this token as root - * @return an iterator - */ - public Iterator iterator() { - return new TokenIterator(this); - } - - /** - * Get the type of this token - * @return type - */ - public Type getType() { - return type; - } - - /** - * Turn this token into Java regexp compatible text - * @return part of a java regexp - */ - @Override - public String toString() { - ToString t = toStringMap.get(getType()); - if (t == null) { - t = DEFAULT_TOSTRING; - } - return t.toString(this); - } } /** @@ -437,13 +101,11 @@ return; } - for (final Map.Entry fwdRef : forwardReferences.entrySet()) { - if (fwdRef.getKey().intValue() > caps.size()) { + for (final Integer ref : forwardReferences) { + if (ref.intValue() > caps.size()) { neverMatches = true; break; } - - fwdRef.getValue().setIsDead(true); } forwardReferences.clear(); @@ -459,12 +121,10 @@ public static RegExpScanner scan(final String string) { final RegExpScanner scanner = new RegExpScanner(string); - Token pattern; - try { - pattern = scanner.pattern(); + scanner.disjunction(); } catch (final Exception e) { - throw new PatternSyntaxException(e.getMessage(), string, scanner.sb.length()); + throw new PatternSyntaxException(e.getMessage(), string, scanner.position); } scanner.processForwardReferences(); @@ -472,24 +132,12 @@ return null; // never matches } - // go over the code and remove dead code - final Iterator iter = pattern.iterator(); - while (iter.hasNext()) { - final Token next = iter.next(); - if (next.getIsDead()) { - next.getParent().remove(next); - } - } - - // turn the pattern into a string, p, the java equivalent string for our js regexp - final String p = pattern.toString(); - // if builder contains all tokens that were sent in, we know - // we correctly parsed the entire JavaScript regexp without syntax errors - if (!string.equals(scanner.getStringBuilder().toString())) { + // Throw syntax error unless we parsed the entire JavaScript regexp without syntax errors + if (scanner.position != string.length()) { + final String p = scanner.getStringBuilder().toString(); throw new PatternSyntaxException(string, p, p.length() + 1); } - scanner.javaPattern = p; return scanner; } @@ -508,7 +156,7 @@ } String getJavaPattern() { - return javaPattern; + return sb.toString(); } BitVector getGroupsInNegativeLookahead() { @@ -527,11 +175,10 @@ /** * Commit n characters to the builder and to a given token - * @param token Uncommitted token. * @param n Number of characters. * @return Committed token */ - private Token commit(final Token token, final int n) { + private boolean commit(final int n) { final int startIn = position; switch (n) { @@ -554,11 +201,7 @@ assert false : "Should not reach here"; } - if (token == null) { - return null; - } - - return token.add(sb.substring(startIn, sb.length())); + return true; } /** @@ -587,35 +230,20 @@ */ /* - * Pattern :: - * Disjunction - */ - private Token pattern() { - final Token token = new Token(Token.Type.PATTERN); - - final Token child = disjunction(); - return token.add(child); - } - - /* * Disjunction :: * Alternative * Alternative | Disjunction */ - private Token disjunction() { - final Token token = new Token(Token.Type.DISJUNCTION); - + private void disjunction() { while (true) { - token.add(alternative()); + alternative(); if (ch0 == '|') { - commit(token, 1); + commit(1); } else { break; } } - - return token; } /* @@ -623,15 +251,10 @@ * [empty] * Alternative Term */ - private Token alternative() { - final Token token = new Token(Token.Type.ALTERNATIVE); - - Token child; - while ((child = term()) != null) { - token.add(child); + private void alternative() { + while (term()) { + // do nothing } - - return token; } /* @@ -640,48 +263,37 @@ * Atom * Atom Quantifier */ - private Token term() { + private boolean term() { final int startIn = position; final int startOut = sb.length(); - final Token token = new Token(Token.Type.TERM); - Token child; - child = assertion(); - if (child != null) { - return token.add(child); + if (assertion()) { + return true; } - child = atom(); - if (child != null) { + if (atom()) { boolean emptyCharacterClass = false; - if ("[]".equals(child.toString())) { + if (sb.toString().endsWith("[]")) { emptyCharacterClass = true; + } else if (sb.toString().endsWith("[^]")) { + sb.setLength(sb.length() - 2); + sb.append("\\s\\S]"); } - token.add(child); - - final Token quantifier = quantifier(); - if (quantifier != null) { - token.add(quantifier); - } + boolean quantifier = quantifier(); if (emptyCharacterClass) { - if (quantifier == null) { + if (!quantifier) { neverMatches = true; //never matches ever. - } else { - //if we can get away with max zero, remove this entire token - final String qs = quantifier.toString(); - if ("+".equals(qs) || "*".equals(qs) || qs.startsWith("{0,")) { - token.setIsDead(true); - } } + // Note: we could check if quantifier has min zero to mark empty character class as dead. } - return token; + return true; } restart(startIn, startOut); - return null; + return false; } /* @@ -693,19 +305,18 @@ * ( ? = Disjunction ) * ( ? ! Disjunction ) */ - private Token assertion() { + private boolean assertion() { final int startIn = position; final int startOut = sb.length(); - final Token token = new Token(Token.Type.ASSERTION); switch (ch0) { case '^': case '$': - return commit(token, 1); + return commit(1); case '\\': if (ch1 == 'b' || ch1 == 'B') { - return commit(token, 2); + return commit(2); } break; @@ -717,24 +328,18 @@ break; } final boolean isNegativeLookahead = (ch2 == '!'); - commit(token, 3); + commit(3); if (isNegativeLookahead) { negativeLookaheadLevel++; } - final Token disjunction = disjunction(); + disjunction(); if (isNegativeLookahead) { - for (final Capture cap : caps) { - if (cap.getNegativeLookaheadLevel() >= negativeLookaheadLevel) { - cap.setDead(); - } - } negativeLookaheadLevel--; } - if (disjunction != null && ch0 == ')') { - token.add(disjunction); - return commit(token, 1); + if (ch0 == ')') { + return commit(1); } break; @@ -743,8 +348,7 @@ } restart(startIn, startOut); - - return null; + return false; } /* @@ -752,17 +356,14 @@ * QuantifierPrefix * QuantifierPrefix ? */ - private Token quantifier() { - final Token token = new Token(Token.Type.QUANTIFIER); - final Token child = quantifierPrefix(); - if (child != null) { - token.add(child); + private boolean quantifier() { + if (quantifierPrefix()) { if (ch0 == '?') { - commit(token, 1); + commit(1); } - return token; + return true; } - return null; + return false; } /* @@ -774,45 +375,42 @@ * { DecimalDigits , } * { DecimalDigits , DecimalDigits } */ - private Token quantifierPrefix() { + private boolean quantifierPrefix() { final int startIn = position; final int startOut = sb.length(); - final Token token = new Token(Token.Type.QUANTIFIER_PREFIX); switch (ch0) { case '*': case '+': case '?': - return commit(token, 1); + return commit(1); case '{': - commit(token, 1); + commit(1); - final Token child = decimalDigits(); - if (child == null) { + if (!decimalDigits()) { break; // not a quantifier - back out } push('}'); - token.add(child); if (ch0 == ',') { - commit(token, 1); - token.add(decimalDigits()); + commit(1); + decimalDigits(); } if (ch0 == '}') { pop('}'); - commit(token, 1); + commit(1); } - return token; + return true; default: break; } restart(startIn, startOut); - return null; + return false; } /* @@ -825,81 +423,51 @@ * ( ? : Disjunction ) * */ - private Token atom() { + private boolean atom() { final int startIn = position; final int startOut = sb.length(); - final Token token = new Token(Token.Type.ATOM); - Token child; - child = patternCharacter(); - if (child != null) { - return token.add(child); + if (patternCharacter()) { + return true; } if (ch0 == '.') { - return commit(token, 1); + return commit(1); } if (ch0 == '\\') { - commit(token, 1); - child = atomEscape(); - - if (child != null) { - if (child.hasChildOfType(Token.Type.IDENTITY_ESCAPE)) { - final char idEscape = child.toString().charAt(0); - if (NON_IDENT_ESCAPES.indexOf(idEscape) == -1) { - token.reset(); - } - } - - token.add(child); + commit(1); - // forward backreferences always match empty. JavaScript != Java - if (child.hasChildOfType(Token.Type.DECIMAL_ESCAPE) && !"\u0000".equals(child.toString())) { - final int refNum = Integer.parseInt(child.toString()); - - if (refNum - 1 < caps.size() && caps.get(refNum - 1).isDead()) { - // reference to dead in-negative-lookahead capture - token.setIsDead(true); - } else if (caps.size() < refNum) { - // forward reference: always matches against empty string (dead token). - // invalid reference (non-existant capture): pattern never matches. - forwardReferences.put(refNum, token); - } - } - - return token; + if (atomEscape()) { + return true; } } - child = characterClass(); - if (child != null) { - return token.add(child); + if (characterClass()) { + return true; } if (ch0 == '(') { boolean capturingParens = true; - commit(token, 1); + commit(1); if (ch0 == '?' && ch1 == ':') { capturingParens = false; - commit(token, 2); + commit(2); } - child = disjunction(); - if (child != null) { - token.add(child); - if (ch0 == ')') { - final Token atom = commit(token, 1); - if (capturingParens) { - caps.add(new Capture(negativeLookaheadLevel)); - } - return atom; + disjunction(); + + if (ch0 == ')') { + commit(1); + if (capturingParens) { + caps.add(new Capture(negativeLookaheadLevel)); } + return true; } } restart(startIn, startOut); - return null; + return false; } /* @@ -907,9 +475,9 @@ * SourceCharacter but not any of: ^$\.*+?()[]{}| */ @SuppressWarnings("fallthrough") - private Token patternCharacter() { + private boolean patternCharacter() { if (atEOF()) { - return null; + return false; } switch (ch0) { @@ -924,23 +492,26 @@ case ')': case '[': case '|': - return null; + return false; case '}': case ']': final int n = expected.get(ch0); if (n != 0) { - return null; + return false; } case '{': // if not a valid quantifier escape curly brace to match itself // this ensures compatibility with other JS implementations - final Token quant = quantifierPrefix(); - return (quant == null) ? commit(new Token(Token.Type.PATTERN_CHARACTER).add("\\"), 1) : null; + if (!quantifierPrefix()) { + sb.append('\\'); + return commit(1); + } + return false; default: - return commit(new Token(Token.Type.PATTERN_CHARACTER), 1); // SOURCECHARACTER + return commit(1); // SOURCECHARACTER } } @@ -950,27 +521,9 @@ * CharacterEscape * CharacterClassEscape */ - private Token atomEscape() { - final Token token = new Token(Token.Type.ATOM_ESCAPE); - Token child; - - child = decimalEscape(); - if (child != null) { - return token.add(child); - } - - child = characterClassEscape(); - if (child != null) { - return token.add(child); - } - - child = characterEscape(); - if (child != null) { - return token.add(child); - } - - - return null; + private boolean atomEscape() { + // Note that contrary to ES 5.1 spec we put identityEscape() last because it acts as a catch-all + return decimalEscape() || characterClassEscape() || characterEscape() || identityEscape(); } /* @@ -981,48 +534,31 @@ * UnicodeEscapeSequence * IdentityEscape */ - private Token characterEscape() { + private boolean characterEscape() { final int startIn = position; final int startOut = sb.length(); - final Token token = new Token(Token.Type.CHARACTER_ESCAPE); - Token child; - - child = controlEscape(); - if (child != null) { - return token.add(child); + if (controlEscape()) { + return true; } if (ch0 == 'c') { - commit(token, 1); - child = controlLetter(); - if (child != null) { - return token.add(child); + commit(1); + if (controlLetter()) { + return true; } restart(startIn, startOut); } - child = hexEscapeSequence(); - if (child != null) { - return token.add(child); - } - - child = unicodeEscapeSequence(); - if (child != null) { - return token.add(child); - } - - child = identityEscape(); - if (child != null) { - return token.add(child); + if (hexEscapeSequence() || unicodeEscapeSequence()) { + return true; } restart(startIn, startOut); - - return null; + return false; } - private boolean scanEscapeSequence(final char leader, final int length, final Token token) { + private boolean scanEscapeSequence(final char leader, final int length) { final int startIn = position; final int startOut = sb.length(); @@ -1030,11 +566,11 @@ return false; } - commit(token, 1); + commit(1); for (int i = 0; i < length; i++) { final char ch0l = Character.toLowerCase(ch0); if ((ch0l >= 'a' && ch0l <= 'f') || isDecimalDigit(ch0)) { - commit(token, 1); + commit(1); } else { restart(startIn, startOut); return false; @@ -1044,37 +580,29 @@ return true; } - private Token hexEscapeSequence() { - final Token token = new Token(Token.Type.HEX_ESCAPESEQUENCE); - if (scanEscapeSequence('x', 2, token)) { - return token; - } - return null; + private boolean hexEscapeSequence() { + return scanEscapeSequence('x', 2); } - private Token unicodeEscapeSequence() { - final Token token = new Token(Token.Type.UNICODE_ESCAPESEQUENCE); - if (scanEscapeSequence('u', 4, token)) { - return token; - } - return null; + private boolean unicodeEscapeSequence() { + return scanEscapeSequence('u', 4); } /* * ControlEscape :: * one of fnrtv */ - private Token controlEscape() { + private boolean controlEscape() { switch (ch0) { case 'f': case 'n': case 'r': case 't': case 'v': - return commit(new Token(Token.Type.CONTROL_ESCAPE), 1); + return commit(1); default: - return null; + return false; } } @@ -1083,19 +611,18 @@ * one of abcdefghijklmnopqrstuvwxyz * ABCDEFGHIJKLMNOPQRSTUVWXYZ */ - private Token controlLetter() { + private boolean controlLetter() { final char c = Character.toUpperCase(ch0); if (c >= 'A' && c <= 'Z') { - final Token token = new Token(Token.Type.CONTROL_LETTER); - commit(token, 1); - return token; + // for some reason java regexps don't like control characters on the + // form "\\ca".match([string with ascii 1 at char0]). Translating + // them to unicode does it though. + sb.setLength(sb.length() - 1); + unicode(c - 'A' + 1); + skip(1); + return true; } - return null; - /* - Token token = new Token(Token.Type.CONTROL_LETTER); - commit(null, 1);//add original char to builder not to token - this.neverMatches = c < 'A' || c > 'Z'; - return token.add(""+c);*/ + return false; } /* @@ -1104,56 +631,115 @@ * (200c) * (200d) */ - private Token identityEscape() { - final Token token = new Token(Token.Type.IDENTITY_ESCAPE); - commit(token, 1); - return token; + private boolean identityEscape() { + if (atEOF()) { + throw new RuntimeException("\\ at end of pattern"); // will be converted to PatternSyntaxException + } + // ES 5.1 A.7 requires "not IdentifierPart" here but all major engines accept any character here. + if (NON_IDENT_ESCAPES.indexOf(ch0) == -1) { + sb.setLength(sb.length() - 1); + } + return commit(1); } /* * DecimalEscape :: * DecimalIntegerLiteral [lookahead DecimalDigit] */ - private Token decimalEscape() { - final Token token = new Token(Token.Type.DECIMAL_ESCAPE); + private boolean decimalEscape() { final int startIn = position; final int startOut = sb.length(); if (ch0 == '0' && !isDecimalDigit(ch1)) { - commit(token, 1); - token.removeLast(); + skip(1); // DecimalEscape :: 0. If i is zero, return the EscapeValue consisting of a character (Unicodevalue0000); - return token.add("\u0000"); + sb.append("\u0000"); + return true; } if (isDecimalDigit(ch0)) { - while (isDecimalDigit(ch0)) { - commit(token, 1); + final int num = ch0 - '0'; + + // Single digit escape, treat as backreference. + if (!isDecimalDigit(ch1)) { + if (num <= caps.size() && caps.get(num - 1).getNegativeLookaheadLevel() > 0) { + // Captures that live inside a negative lookahead are dead after the + // lookahead and will be undefined if referenced from outside. + if (caps.get(num - 1).getNegativeLookaheadLevel() > negativeLookaheadLevel) { + sb.setLength(sb.length() - 1); + } else { + sb.append(ch0); + } + skip(1); + return true; + } else if (num > caps.size()) { + // Forward reference to a capture group. Forward references are always undefined so we + // can omit it from the output buffer. Additionally, if the capture group does not exist + // the whole regexp becomes invalid, so register the reference for later processing. + forwardReferences.add(num); + sb.setLength(sb.length() - 1); + skip(1); + return true; + } } - return token; + + if (inCharClass) { + // Convert octal escape to unicode escape if inside character class. + StringBuilder digit = new StringBuilder(4); + while (isDecimalDigit(ch0)) { + digit.append(ch0); + skip(1); + } + + int value = Integer.parseInt(digit.toString(), 8); //throws exception that leads to SyntaxError if not octal + if (value > 0xff) { + throw new NumberFormatException(digit.toString()); + } + + unicode(value); + + } else { + // Copy decimal escape as-is + decimalDigits(); + } + return true; } restart(startIn, startOut); - - return null; + return false; } /* * CharacterClassEscape :: * one of dDsSwW */ - private Token characterClassEscape() { + private boolean characterClassEscape() { switch (ch0) { + // java.util.regex requires translation of \s and \S to explicit character list case 's': + if (RegExpFactory.usesJavaUtilRegex()) { + sb.setLength(sb.length() - 1); + sb.append('[').append(Lexer.getWhitespaceRegExp()).append(']'); + skip(1); + return true; + } + return commit(1); case 'S': + if (RegExpFactory.usesJavaUtilRegex()) { + sb.setLength(sb.length() - 1); + sb.append("[^").append(Lexer.getWhitespaceRegExp()).append(']'); + skip(1); + return true; + } + return commit(1); case 'd': case 'D': case 'w': case 'W': - return commit(new Token(Token.Type.CHARACTERCLASS_ESCAPE), 1); + return commit(1); default: - return null; + return false; } } @@ -1162,29 +748,31 @@ * [ [lookahead {^}] ClassRanges ] * [ ^ ClassRanges ] */ - private Token characterClass() { + private boolean characterClass() { final int startIn = position; final int startOut = sb.length(); - final Token token = new Token(Token.Type.CHARACTERCLASS); if (ch0 == '[') { - push(']'); - commit(token, 1); + try { + inCharClass = true; + push(']'); + commit(1); - if (ch0 == '^') { - commit(token, 1); - } + if (ch0 == '^') { + commit(1); + } - final Token child = classRanges(); - if (child != null && ch0 == ']') { - pop(']'); - token.add(child); - return commit(token, 1); + if (classRanges() && ch0 == ']') { + pop(']'); + return commit(1); + } + } finally { + inCharClass = false; // no nested character classes in JavaScript } } restart(startIn, startOut); - return null; + return false; } /* @@ -1192,8 +780,9 @@ * [empty] * NonemptyClassRanges */ - private Token classRanges() { - return new Token(Token.Type.CLASSRANGES).add(nonemptyClassRanges()); + private boolean classRanges() { + nonemptyClassRanges(); + return true; } /* @@ -1202,40 +791,27 @@ * ClassAtom NonemptyClassRangesNoDash * ClassAtom - ClassAtom ClassRanges */ - private Token nonemptyClassRanges() { + private boolean nonemptyClassRanges() { final int startIn = position; final int startOut = sb.length(); - final Token token = new Token(Token.Type.NON_EMPTY_CLASSRANGES); - Token child; - child = classAtom(); - if (child != null) { - token.add(child); + if (classAtom()) { if (ch0 == '-') { - commit(token, 1); + commit(1); - final Token child1 = classAtom(); - final Token child2 = classRanges(); - if (child1 != null && child2 != null) { - token.add(child1); - token.add(child2); - - return token; + if (classAtom() && classRanges()) { + return true; } } - child = nonemptyClassRangesNoDash(); - if (child != null) { - token.add(child); - return token; - } + nonemptyClassRangesNoDash(); - return token; + return true; } restart(startIn, startOut); - return null; + return false; } /* @@ -1244,61 +820,44 @@ * ClassAtomNoDash NonemptyClassRangesNoDash * ClassAtomNoDash - ClassAtom ClassRanges */ - private Token nonemptyClassRangesNoDash() { + private boolean nonemptyClassRangesNoDash() { final int startIn = position; final int startOut = sb.length(); - final Token token = new Token(Token.Type.NON_EMPTY_CLASSRANGES_NODASH); - Token child; - child = classAtomNoDash(); - if (child != null) { - token.add(child); + if (classAtomNoDash()) { // need to check dash first, as for e.g. [a-b|c-d] will otherwise parse - as an atom if (ch0 == '-') { - commit(token, 1); + commit(1); - final Token child1 = classAtom(); - final Token child2 = classRanges(); - if (child1 != null && child2 != null) { - token.add(child1); - return token.add(child2); + if (classAtom() && classRanges()) { + return true; } //fallthru } - child = nonemptyClassRangesNoDash(); - if (child != null) { - token.add(child); - } - return token; // still a class atom + nonemptyClassRangesNoDash(); + return true; // still a class atom } - child = classAtom(); - if (child != null) { - return token.add(child); + if (classAtom()) { + return true; } restart(startIn, startOut); - return null; + return false; } /* * ClassAtom : - ClassAtomNoDash */ - private Token classAtom() { - final Token token = new Token(Token.Type.CLASSATOM); + private boolean classAtom() { if (ch0 == '-') { - return commit(token, 1); + return commit(1); } - final Token child = classAtomNoDash(); - if (child != null) { - return token.add(child); - } - - return null; + return classAtomNoDash(); } /* @@ -1306,33 +865,32 @@ * SourceCharacter but not one of \ or ] or - * \ ClassEscape */ - private Token classAtomNoDash() { + private boolean classAtomNoDash() { final int startIn = position; final int startOut = sb.length(); - final Token token = new Token(Token.Type.CLASSATOM_NODASH); switch (ch0) { case ']': case '-': case '\0': - return null; + return false; case '[': // unescaped left square bracket - add escape - return commit(token.add("\\"), 1); + sb.append('\\'); + return commit(1); case '\\': - commit(token, 1); - final Token child = classEscape(); - if (child != null) { - return token.add(child); + commit(1); + if (classEscape()) { + return true; } restart(startIn, startOut); - return null; + return false; default: - return commit(token, 1); + return commit(1); } } @@ -1343,46 +901,45 @@ * CharacterEscape * CharacterClassEscape */ - private Token classEscape() { - final Token token = new Token(Token.Type.CLASS_ESCAPE); - Token child; + private boolean classEscape() { - child = decimalEscape(); - if (child != null) { - return token.add(child); + if (decimalEscape()) { + return true; } if (ch0 == 'b') { - return commit(token, 1); + sb.setLength(sb.length() - 1); + sb.append('\b'); + skip(1); + return true; } - child = characterEscape(); - if (child != null) { - return token.add(child); - } - - child = characterClassEscape(); - if (child != null) { - return token.add(child); - } - - return null; + // Note that contrary to ES 5.1 spec we put identityEscape() last because it acts as a catch-all + return characterEscape() || characterClassEscape() || identityEscape(); } /* * DecimalDigits */ - private Token decimalDigits() { + private boolean decimalDigits() { if (!isDecimalDigit(ch0)) { - return null; + return false; + } + + while (isDecimalDigit(ch0)) { + commit(1); } - final Token token = new Token(Token.Type.DECIMALDIGITS); - while (isDecimalDigit(ch0)) { - commit(token, 1); + return true; + } + + private void unicode(final int value) { + final String hex = Integer.toHexString(value); + sb.append('u'); + for (int i = 0; i < 4 - hex.length(); i++) { + sb.append('0'); } - - return token; + sb.append(hex); } private static boolean isDecimalDigit(final char ch) {