8009240: RegExpScanner code is inefficient and too complex
authorhannesw
Thu, 28 Feb 2013 22:59:31 +0100
changeset 16271 4817d7bb7434
parent 16270 17f612656e6b
child 16272 675a0caf75bc
8009240: RegExpScanner code is inefficient and too complex Reviewed-by: jlaskey, lagergren
nashorn/src/jdk/nashorn/internal/runtime/regexp/JoniRegExp.java
nashorn/src/jdk/nashorn/internal/runtime/regexp/RegExpFactory.java
nashorn/src/jdk/nashorn/internal/runtime/regexp/RegExpScanner.java
--- a/nashorn/src/jdk/nashorn/internal/runtime/regexp/JoniRegExp.java	Thu Feb 28 20:31:30 2013 +0530
+++ b/nashorn/src/jdk/nashorn/internal/runtime/regexp/JoniRegExp.java	Thu Feb 28 22:59:31 2013 +0100
@@ -117,10 +117,6 @@
             return new JoniRegExp(pattern, flags);
         }
 
-        @Override
-        protected String replaceToken(final String str) {
-            return str.equals("[^]") ? "[\\s\\S]" : str;
-        }
     }
 
     class JoniMatcher implements RegExpMatcher {
--- a/nashorn/src/jdk/nashorn/internal/runtime/regexp/RegExpFactory.java	Thu Feb 28 20:31:30 2013 +0530
+++ b/nashorn/src/jdk/nashorn/internal/runtime/regexp/RegExpFactory.java	Thu Feb 28 22:59:31 2013 +0100
@@ -68,25 +68,6 @@
     }
 
     /**
-     * Replace a regexp token as suitable for regexp instances created by this factory.
-     *
-     * @param str a regular expression token
-     * @return the replacement token
-     */
-    protected String replaceToken(final String str) {
-        switch (str) {
-            case "\\s":
-                return "[" + Lexer.getWhitespaceRegExp() + "]";
-            case "\\S":
-                return "[^" + Lexer.getWhitespaceRegExp() + "]";
-            case "[^]":
-                return "[\\s\\S]";
-            default:
-                return str;
-        }
-    }
-
-    /**
      * Compile a regexp with the given {@code source} and {@code flags}.
      *
      * @param pattern RegExp pattern string
@@ -99,16 +80,6 @@
     }
 
     /**
-     * Replace a regexp token as needed by the currently installed factory instance.
-     *
-     * @param token a regexp token
-     * @return the replacement token
-     */
-    public static String replace(final String token) {
-        return instance.replaceToken(token);
-    }
-
-    /**
      * Validate a regexp with the given {@code source} and {@code flags}.
      *
      * @param pattern RegExp pattern string
@@ -120,4 +91,13 @@
     public static void validate(final String pattern, final String flags) throws ParserException {
         instance.compile(pattern, flags);
     }
+
+    /**
+     * Returns true if the instance uses the JDK's {@code java.util.regex} package.
+     *
+     * @return true if instance uses JDK regex package
+     */
+    public static boolean usesJavaUtilRegex() {
+        return instance != null && instance.getClass() == RegExpFactory.class;
+    }
 }
--- a/nashorn/src/jdk/nashorn/internal/runtime/regexp/RegExpScanner.java	Thu Feb 28 20:31:30 2013 +0530
+++ b/nashorn/src/jdk/nashorn/internal/runtime/regexp/RegExpScanner.java	Thu Feb 28 22:59:31 2013 +0100
@@ -25,15 +25,15 @@
 
 package jdk.nashorn.internal.runtime.regexp;
 
-import java.util.ArrayList;
 import java.util.HashMap;
-import java.util.Iterator;
-import java.util.LinkedHashMap;
+import java.util.LinkedHashSet;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 import java.util.regex.PatternSyntaxException;
 
+import jdk.nashorn.internal.parser.Lexer;
 import jdk.nashorn.internal.parser.Scanner;
 import jdk.nashorn.internal.runtime.BitVector;
 
@@ -44,17 +44,13 @@
 final class RegExpScanner extends Scanner {
 
     /**
-     * String builder to accumulate the result - this contains verbatim parsed JavaScript.
-     * to get the java equivalent we need to create a Pattern token and return its toString()
+     * String builder used to rewrite the pattern for the currently used regexp factory.
      */
     private final StringBuilder sb;
 
     /** Is this the special case of a regexp that never matches anything */
     private boolean neverMatches;
 
-    /** The resulting java.util.regex pattern string. */
-    private String javaPattern;
-
     /** Expected token table */
     private final Map<Character, Integer> expected = new HashMap<>();
 
@@ -62,11 +58,14 @@
     private final List<Capture> caps = new LinkedList<>();
 
     /** Forward references to capturing parenthesis to be resolved later.*/
-    private final Map<Integer, Token> forwardReferences = new LinkedHashMap<>();
+    private final Set<Integer> forwardReferences = new LinkedHashSet<>();
 
     /** Current level of zero-width negative lookahead assertions. */
     private int negativeLookaheadLevel;
 
+    /** Are we currently inside a character class? */
+    private boolean inCharClass = false;
+
     private static final String NON_IDENT_ESCAPES = "$^*+(){}[]|\\.?";
 
     private static class Capture {
@@ -74,11 +73,6 @@
          * Zero-width negative lookaheads enclosing the capture.
          */
         private final int negativeLookaheadLevel;
-        /**
-         * Captures that live inside a negative lookahead are dead after the
-         * lookahead and will be undefined if referenced from outside.
-         */
-        private boolean isDead;
 
         Capture(final int negativeLookaheadLevel) {
             this.negativeLookaheadLevel = negativeLookaheadLevel;
@@ -88,336 +82,6 @@
             return negativeLookaheadLevel;
         }
 
-        public boolean isDead() {
-            return isDead;
-        }
-
-        public void setDead() {
-            this.isDead = true;
-        }
-    }
-
-    /**
-     * This is a token - the JavaScript regexp is scanned into a token tree
-     * A token has other tokens as children as well as "atoms", i.e. Strings.
-     */
-    private static class Token {
-
-        private enum Type {
-            PATTERN,
-            DISJUNCTION,
-            ALTERNATIVE,
-            TERM,
-            ASSERTION,
-            QUANTIFIER,
-            QUANTIFIER_PREFIX,
-            ATOM,
-            PATTERN_CHARACTER,
-            ATOM_ESCAPE,
-            CHARACTER_ESCAPE,
-            CONTROL_ESCAPE,
-            CONTROL_LETTER,
-            IDENTITY_ESCAPE,
-            DECIMAL_ESCAPE,
-            CHARACTERCLASS_ESCAPE,
-            CHARACTERCLASS,
-            CLASSRANGES,
-            NON_EMPTY_CLASSRANGES,
-            NON_EMPTY_CLASSRANGES_NODASH,
-            CLASSATOM,
-            CLASSATOM_NODASH,
-            CLASS_ESCAPE,
-            DECIMALDIGITS,
-            HEX_ESCAPESEQUENCE,
-            UNICODE_ESCAPESEQUENCE,
-        }
-
-        /**
-         * Token tyoe
-         */
-        private final Token.Type type;
-
-        /**
-         * Child nodes
-         */
-        private final List<Object> children;
-
-        /**
-         * Parent node
-         */
-        private Token parent;
-
-        /**
-         * Dead code flag
-         */
-        private boolean isDead;
-
-        private static final Map<Type, ToString> toStringMap = new HashMap<>();
-        private static final ToString DEFAULT_TOSTRING = new ToString();
-
-        private static String unicode(final int value) {
-            final StringBuilder sb = new StringBuilder();
-            final String hex = Integer.toHexString(value);
-            sb.append('u');
-            for (int i = 0; i < 4 - hex.length(); i++) {
-                sb.append('0');
-            }
-            sb.append(hex);
-
-            return sb.toString();
-        }
-
-        static {
-            toStringMap.put(Type.CHARACTERCLASS, new ToString() {
-                @Override
-                public String toString(final Token token) {
-                    return super.toString(token).replace("\\b", "\b");
-                }
-            });
-
-            // for some reason java regexps don't like control characters on the
-            // form "\\ca".match([string with ascii 1 at char0]). Translating
-            // them to unicode does it though.
-            toStringMap.put(Type.CHARACTER_ESCAPE, new ToString() {
-                @Override
-                public String toString(final Token token) {
-                    final String str = super.toString(token);
-                    if (str.length() == 2) {
-                        return Token.unicode(Character.toLowerCase(str.charAt(1)) - 'a' + 1);
-                    }
-                    return str;
-                }
-            });
-
-            toStringMap.put(Type.DECIMAL_ESCAPE, new ToString() {
-                @Override
-                public String toString(final Token token) {
-                    final String str = super.toString(token);
-
-                    if ("\0".equals(str)) {
-                        return str;
-                    }
-
-                    int value;
-
-                    if (!token.hasParentOfType(Type.CLASSRANGES)) {
-                        return str;
-                    }
-
-                    value = Integer.parseInt(str, 8); //throws exception that leads to SyntaxError if not octal
-                    if (value > 0xff) {
-                        throw new NumberFormatException(str);
-                    }
-
-                    return Token.unicode(value);
-                }
-            });
-
-        }
-
-        /**
-         * JavaScript Token to Java regex substring framework.
-         */
-        private static class ToString {
-            String toString(final Token token) {
-                final Object[] children = token.getChildren();
-
-                // Allow the installed regexp factory to perform global substitutions.
-                switch (children.length) {
-                    case 0:
-                        return "";
-                    case 1:
-                        return RegExpFactory.replace(children[0].toString());
-                    default:
-                        final StringBuilder sb = new StringBuilder();
-                        for (final Object child : children) {
-                            sb.append(child);
-                        }
-                        return RegExpFactory.replace(sb.toString());
-                }
-            }
-        }
-
-        /**
-         * Token iterator. Doesn't return "atom" children. i.e. string representations,
-         * just tokens
-         *
-         */
-        private static class TokenIterator implements Iterator<Token> {
-            private final List<Token> preorder;
-
-            private void init(final Token root) {
-                preorder.add(root);
-                for (final Object child : root.getChildren()) {
-                    if (child instanceof Token) {
-                        init((Token)child);
-                    }
-                }
-            }
-
-            TokenIterator(final Token root) {
-                preorder = new ArrayList<>();
-                init(root);
-            }
-
-            @Override
-            public boolean hasNext() {
-                return !preorder.isEmpty();
-            }
-
-            @Override
-            public Token next() {
-                return preorder.remove(0);
-            }
-
-            @Override
-            public void remove() {
-                next();
-            }
-        }
-
-        /**
-         * Constructor
-         * @param type the token type
-         */
-        Token(final Token.Type type) {
-            this.type = type;
-            children = new ArrayList<>();
-        }
-
-        /**
-         * Add a an "atom" child to a token
-         * @param child the child to add
-         * @return the token (for chaining)
-         */
-        public Token add(final String child) {
-            children.add(child);
-            return this;
-        }
-
-        /**
-         * Add a child to a token
-         * @param child the child
-         * @return the token (for chaining)
-         */
-        public Token add(final Token child) {
-            if (child != null) {
-                children.add(child);
-                child.setParent(this);
-            }
-            return this;
-        }
-
-        /**
-         * Remove a child from a token
-         * @param child the child to remove
-         * @return true if successful
-         */
-        public boolean remove(final Token child) {
-            return children.remove(child);
-        }
-
-        /**
-         * Remove the last child from a token
-         * @return the removed child
-         */
-        public Object removeLast() {
-            return children.remove(children.size() - 1);
-        }
-
-        /**
-         * Flag this token as dead code
-         * @param isDead is it dead or not
-         */
-        private void setIsDead(final boolean isDead) {
-            this.isDead = isDead;
-        }
-
-        /**
-         * Is this token dead code
-         * @return boolean
-         */
-        private boolean getIsDead() {
-            return isDead;
-        }
-
-        /**
-         * Get the parent of this token
-         * @return parent token
-         */
-        public Token getParent() {
-            return parent;
-        }
-
-        public boolean hasParentOfType(final Token.Type parentType) {
-            for (Token p = getParent(); p != null; p = p.getParent()) {
-                if (p.getType() == parentType) {
-                    return true;
-                }
-            }
-            return false;
-        }
-
-        public boolean hasChildOfType(final Token.Type childType) {
-            for (final Iterator<Token> iter = iterator() ; iter.hasNext() ; ) {
-                if (iter.next().getType() == childType) {
-                    return true;
-                }
-            }
-            return false;
-        }
-
-        /**
-         * Set the parent of this token
-         * @param parent
-         */
-        private void setParent(final Token parent) {
-            this.parent = parent;
-        }
-
-        /**
-         * Get the children of this token
-         * @return an array of children, never null
-         */
-        public Object[] getChildren() {
-            return children.toArray();
-        }
-
-        /**
-         * Reset this token, remove all children
-         */
-        public void reset() {
-            children.clear();
-        }
-
-        /**
-         * Get a preorder token iterator with this token as root
-         * @return an iterator
-         */
-        public Iterator<Token> iterator() {
-            return new TokenIterator(this);
-        }
-
-        /**
-         * Get the type of this token
-         * @return type
-         */
-        public Type getType() {
-            return type;
-        }
-
-        /**
-         * Turn this token into Java regexp compatible text
-         * @return part of a java regexp
-         */
-        @Override
-        public String toString() {
-            ToString t = toStringMap.get(getType());
-            if (t == null) {
-                t = DEFAULT_TOSTRING;
-            }
-            return t.toString(this);
-        }
     }
 
     /**
@@ -437,13 +101,11 @@
             return;
         }
 
-        for (final Map.Entry<Integer, Token> fwdRef : forwardReferences.entrySet()) {
-            if (fwdRef.getKey().intValue() > caps.size()) {
+        for (final Integer ref : forwardReferences) {
+            if (ref.intValue() > caps.size()) {
                 neverMatches = true;
                 break;
             }
-
-            fwdRef.getValue().setIsDead(true);
         }
 
         forwardReferences.clear();
@@ -459,12 +121,10 @@
     public static RegExpScanner scan(final String string) {
         final RegExpScanner scanner = new RegExpScanner(string);
 
-        Token pattern;
-
         try {
-            pattern = scanner.pattern();
+            scanner.disjunction();
         } catch (final Exception e) {
-            throw new PatternSyntaxException(e.getMessage(), string, scanner.sb.length());
+            throw new PatternSyntaxException(e.getMessage(), string, scanner.position);
         }
 
         scanner.processForwardReferences();
@@ -472,24 +132,12 @@
             return null; // never matches
         }
 
-        // go over the code and remove dead code
-        final Iterator<Token> iter = pattern.iterator();
-        while (iter.hasNext()) {
-            final Token next = iter.next();
-            if (next.getIsDead()) {
-                next.getParent().remove(next);
-            }
-        }
-
-        // turn the pattern into a string, p, the java equivalent string for our js regexp
-        final String p = pattern.toString();
-        // if builder contains all tokens that were sent in, we know
-        // we correctly parsed the entire JavaScript regexp without syntax errors
-        if (!string.equals(scanner.getStringBuilder().toString())) {
+        // Throw syntax error unless we parsed the entire JavaScript regexp without syntax errors
+        if (scanner.position != string.length()) {
+            final String p = scanner.getStringBuilder().toString();
             throw new PatternSyntaxException(string, p, p.length() + 1);
         }
 
-        scanner.javaPattern = p;
         return scanner;
      }
 
@@ -508,7 +156,7 @@
     }
 
     String getJavaPattern() {
-        return javaPattern;
+        return sb.toString();
     }
 
     BitVector getGroupsInNegativeLookahead() {
@@ -527,11 +175,10 @@
 
     /**
      * Commit n characters to the builder and to a given token
-     * @param token Uncommitted token.
      * @param n     Number of characters.
      * @return Committed token
      */
-    private Token commit(final Token token, final int n) {
+    private boolean commit(final int n) {
         final int startIn = position;
 
         switch (n) {
@@ -554,11 +201,7 @@
             assert false : "Should not reach here";
         }
 
-        if (token == null) {
-            return null;
-        }
-
-        return token.add(sb.substring(startIn, sb.length()));
+        return true;
     }
 
     /**
@@ -587,35 +230,20 @@
      */
 
     /*
-     * Pattern ::
-     *      Disjunction
-     */
-    private Token pattern() {
-        final Token token = new Token(Token.Type.PATTERN);
-
-        final Token child = disjunction();
-        return token.add(child);
-    }
-
-    /*
      * Disjunction ::
      *      Alternative
      *      Alternative | Disjunction
      */
-    private Token disjunction() {
-        final Token token = new Token(Token.Type.DISJUNCTION);
-
+    private void disjunction() {
         while (true) {
-            token.add(alternative());
+            alternative();
 
             if (ch0 == '|') {
-                commit(token, 1);
+                commit(1);
             } else {
                 break;
             }
         }
-
-        return token;
     }
 
     /*
@@ -623,15 +251,10 @@
      *      [empty]
      *      Alternative Term
      */
-    private Token alternative() {
-        final Token token = new Token(Token.Type.ALTERNATIVE);
-
-        Token child;
-        while ((child = term()) != null) {
-            token.add(child);
+    private void alternative() {
+        while (term()) {
+            // do nothing
         }
-
-        return token;
     }
 
     /*
@@ -640,48 +263,37 @@
      *      Atom
      *      Atom Quantifier
      */
-    private Token term() {
+    private boolean term() {
         final int startIn  = position;
         final int startOut = sb.length();
-        final Token token  = new Token(Token.Type.TERM);
-        Token child;
 
-        child = assertion();
-        if (child != null) {
-            return token.add(child);
+        if (assertion()) {
+            return true;
         }
 
-        child = atom();
-        if (child != null) {
+        if (atom()) {
             boolean emptyCharacterClass = false;
-            if ("[]".equals(child.toString())) {
+            if (sb.toString().endsWith("[]")) {
                 emptyCharacterClass = true;
+            } else if (sb.toString().endsWith("[^]")) {
+                sb.setLength(sb.length() - 2);
+                sb.append("\\s\\S]");
             }
 
-            token.add(child);
-
-            final Token quantifier = quantifier();
-            if (quantifier != null) {
-                token.add(quantifier);
-            }
+            boolean quantifier = quantifier();
 
             if (emptyCharacterClass) {
-                if (quantifier == null) {
+                if (!quantifier) {
                     neverMatches = true; //never matches ever.
-                } else {
-                    //if we can get away with max zero, remove this entire token
-                    final String qs = quantifier.toString();
-                    if ("+".equals(qs) || "*".equals(qs) || qs.startsWith("{0,")) {
-                        token.setIsDead(true);
-                    }
                 }
+                // Note: we could check if quantifier has min zero to mark empty character class as dead.
             }
 
-            return token;
+            return true;
         }
 
         restart(startIn, startOut);
-        return null;
+        return false;
     }
 
     /*
@@ -693,19 +305,18 @@
      *      ( ? = Disjunction )
      *      ( ? ! Disjunction )
      */
-    private Token assertion() {
+    private boolean assertion() {
         final int startIn  = position;
         final int startOut = sb.length();
-        final Token token  = new Token(Token.Type.ASSERTION);
 
         switch (ch0) {
         case '^':
         case '$':
-            return commit(token, 1);
+            return commit(1);
 
         case '\\':
             if (ch1 == 'b' || ch1 == 'B') {
-                return commit(token, 2);
+                return commit(2);
             }
             break;
 
@@ -717,24 +328,18 @@
                 break;
             }
             final boolean isNegativeLookahead = (ch2 == '!');
-            commit(token, 3);
+            commit(3);
 
             if (isNegativeLookahead) {
                 negativeLookaheadLevel++;
             }
-            final Token disjunction = disjunction();
+            disjunction();
             if (isNegativeLookahead) {
-                for (final Capture cap : caps) {
-                    if (cap.getNegativeLookaheadLevel() >= negativeLookaheadLevel) {
-                        cap.setDead();
-                    }
-                }
                 negativeLookaheadLevel--;
             }
 
-            if (disjunction != null && ch0 == ')') {
-                token.add(disjunction);
-                return commit(token, 1);
+            if (ch0 == ')') {
+                return commit(1);
             }
             break;
 
@@ -743,8 +348,7 @@
         }
 
         restart(startIn, startOut);
-
-        return null;
+        return false;
     }
 
     /*
@@ -752,17 +356,14 @@
      *      QuantifierPrefix
      *      QuantifierPrefix ?
      */
-    private Token quantifier() {
-        final Token token = new Token(Token.Type.QUANTIFIER);
-        final Token child = quantifierPrefix();
-        if (child != null) {
-            token.add(child);
+    private boolean quantifier() {
+        if (quantifierPrefix()) {
             if (ch0 == '?') {
-                commit(token, 1);
+                commit(1);
             }
-            return token;
+            return true;
         }
-        return null;
+        return false;
     }
 
     /*
@@ -774,45 +375,42 @@
      *      { DecimalDigits , }
      *      { DecimalDigits , DecimalDigits }
      */
-    private Token quantifierPrefix() {
+    private boolean quantifierPrefix() {
         final int startIn  = position;
         final int startOut = sb.length();
-        final Token token  = new Token(Token.Type.QUANTIFIER_PREFIX);
 
         switch (ch0) {
         case '*':
         case '+':
         case '?':
-            return commit(token, 1);
+            return commit(1);
 
         case '{':
-            commit(token, 1);
+            commit(1);
 
-            final Token child = decimalDigits();
-            if (child == null) {
+            if (!decimalDigits()) {
                 break; // not a quantifier - back out
             }
             push('}');
-            token.add(child);
 
             if (ch0 == ',') {
-                commit(token, 1);
-                token.add(decimalDigits());
+                commit(1);
+                decimalDigits();
             }
 
             if (ch0 == '}') {
                 pop('}');
-                commit(token, 1);
+                commit(1);
             }
 
-            return token;
+            return true;
 
         default:
             break;
         }
 
         restart(startIn, startOut);
-        return null;
+        return false;
     }
 
     /*
@@ -825,81 +423,51 @@
      *      ( ? : Disjunction )
      *
      */
-    private Token atom() {
+    private boolean atom() {
         final int startIn  = position;
         final int startOut = sb.length();
-        final Token token  = new Token(Token.Type.ATOM);
-        Token child;
 
-        child = patternCharacter();
-        if (child != null) {
-            return token.add(child);
+        if (patternCharacter()) {
+            return true;
         }
 
         if (ch0 == '.') {
-            return commit(token, 1);
+            return commit(1);
         }
 
         if (ch0 == '\\') {
-            commit(token, 1);
-            child = atomEscape();
-
-            if (child != null) {
-                if (child.hasChildOfType(Token.Type.IDENTITY_ESCAPE)) {
-                    final char idEscape = child.toString().charAt(0);
-                    if (NON_IDENT_ESCAPES.indexOf(idEscape) == -1) {
-                        token.reset();
-                    }
-                }
-
-                token.add(child);
+            commit(1);
 
-                // forward backreferences always match empty. JavaScript != Java
-                if (child.hasChildOfType(Token.Type.DECIMAL_ESCAPE) && !"\u0000".equals(child.toString())) {
-                    final int refNum = Integer.parseInt(child.toString());
-
-                    if (refNum - 1 < caps.size() && caps.get(refNum - 1).isDead()) {
-                        // reference to dead in-negative-lookahead capture
-                        token.setIsDead(true);
-                    } else if (caps.size() < refNum) {
-                        // forward reference: always matches against empty string (dead token).
-                        // invalid reference (non-existant capture): pattern never matches.
-                        forwardReferences.put(refNum, token);
-                    }
-                }
-
-                return token;
+            if (atomEscape()) {
+                return true;
             }
         }
 
-        child = characterClass();
-        if (child != null) {
-            return token.add(child);
+        if (characterClass()) {
+            return true;
         }
 
         if (ch0 == '(') {
             boolean capturingParens = true;
-            commit(token, 1);
+            commit(1);
             if (ch0 == '?' && ch1 == ':') {
                 capturingParens = false;
-                commit(token, 2);
+                commit(2);
             }
 
-            child = disjunction();
-            if (child != null) {
-                token.add(child);
-                if (ch0 == ')') {
-                    final Token atom = commit(token, 1);
-                    if (capturingParens) {
-                        caps.add(new Capture(negativeLookaheadLevel));
-                    }
-                    return atom;
+            disjunction();
+
+            if (ch0 == ')') {
+                commit(1);
+                if (capturingParens) {
+                    caps.add(new Capture(negativeLookaheadLevel));
                 }
+                return true;
             }
         }
 
         restart(startIn, startOut);
-        return null;
+        return false;
     }
 
     /*
@@ -907,9 +475,9 @@
      *      SourceCharacter but not any of: ^$\.*+?()[]{}|
      */
     @SuppressWarnings("fallthrough")
-    private Token patternCharacter() {
+    private boolean patternCharacter() {
         if (atEOF()) {
-            return null;
+            return false;
         }
 
         switch (ch0) {
@@ -924,23 +492,26 @@
         case ')':
         case '[':
         case '|':
-            return null;
+            return false;
 
         case '}':
         case ']':
             final int n = expected.get(ch0);
             if (n != 0) {
-                return null;
+                return false;
             }
 
        case '{':
            // if not a valid quantifier escape curly brace to match itself
            // this ensures compatibility with other JS implementations
-           final Token quant = quantifierPrefix();
-           return (quant == null) ? commit(new Token(Token.Type.PATTERN_CHARACTER).add("\\"), 1) : null;
+           if (!quantifierPrefix()) {
+               sb.append('\\');
+               return commit(1);
+           }
+           return false;
 
         default:
-            return commit(new Token(Token.Type.PATTERN_CHARACTER), 1); // SOURCECHARACTER
+            return commit(1); // SOURCECHARACTER
         }
     }
 
@@ -950,27 +521,9 @@
      *      CharacterEscape
      *      CharacterClassEscape
      */
-    private Token atomEscape() {
-        final Token token = new Token(Token.Type.ATOM_ESCAPE);
-        Token child;
-
-        child = decimalEscape();
-        if (child != null) {
-            return token.add(child);
-        }
-
-        child = characterClassEscape();
-        if (child != null) {
-            return token.add(child);
-        }
-
-        child = characterEscape();
-        if (child != null) {
-            return token.add(child);
-        }
-
-
-        return null;
+    private boolean atomEscape() {
+        // Note that contrary to ES 5.1 spec we put identityEscape() last because it acts as a catch-all
+        return decimalEscape() || characterClassEscape() || characterEscape() || identityEscape();
     }
 
     /*
@@ -981,48 +534,31 @@
      *      UnicodeEscapeSequence
      *      IdentityEscape
      */
-    private Token characterEscape() {
+    private boolean characterEscape() {
         final int startIn  = position;
         final int startOut = sb.length();
 
-        final Token token = new Token(Token.Type.CHARACTER_ESCAPE);
-        Token child;
-
-        child = controlEscape();
-        if (child != null) {
-            return token.add(child);
+        if (controlEscape()) {
+            return true;
         }
 
         if (ch0 == 'c') {
-            commit(token, 1);
-            child = controlLetter();
-            if (child != null) {
-                return token.add(child);
+            commit(1);
+            if (controlLetter()) {
+                return true;
             }
             restart(startIn, startOut);
         }
 
-        child = hexEscapeSequence();
-        if (child != null) {
-            return token.add(child);
-        }
-
-        child = unicodeEscapeSequence();
-        if (child != null) {
-            return token.add(child);
-        }
-
-        child = identityEscape();
-        if (child != null) {
-            return token.add(child);
+        if (hexEscapeSequence() || unicodeEscapeSequence()) {
+            return true;
         }
 
         restart(startIn, startOut);
-
-        return null;
+        return false;
     }
 
-    private boolean scanEscapeSequence(final char leader, final int length, final Token token) {
+    private boolean scanEscapeSequence(final char leader, final int length) {
         final int startIn  = position;
         final int startOut = sb.length();
 
@@ -1030,11 +566,11 @@
             return false;
         }
 
-        commit(token, 1);
+        commit(1);
         for (int i = 0; i < length; i++) {
             final char ch0l = Character.toLowerCase(ch0);
             if ((ch0l >= 'a' && ch0l <= 'f') || isDecimalDigit(ch0)) {
-                commit(token, 1);
+                commit(1);
             } else {
                 restart(startIn, startOut);
                 return false;
@@ -1044,37 +580,29 @@
         return true;
     }
 
-    private Token hexEscapeSequence() {
-        final Token token = new Token(Token.Type.HEX_ESCAPESEQUENCE);
-        if (scanEscapeSequence('x', 2, token)) {
-            return token;
-        }
-        return null;
+    private boolean hexEscapeSequence() {
+        return scanEscapeSequence('x', 2);
     }
 
-    private Token unicodeEscapeSequence() {
-        final Token token = new Token(Token.Type.UNICODE_ESCAPESEQUENCE);
-        if (scanEscapeSequence('u', 4, token)) {
-            return token;
-        }
-        return null;
+    private boolean unicodeEscapeSequence() {
+        return scanEscapeSequence('u', 4);
     }
 
     /*
      * ControlEscape ::
      *      one of fnrtv
      */
-    private Token controlEscape() {
+    private boolean controlEscape() {
         switch (ch0) {
         case 'f':
         case 'n':
         case 'r':
         case 't':
         case 'v':
-            return commit(new Token(Token.Type.CONTROL_ESCAPE), 1);
+            return commit(1);
 
         default:
-            return null;
+            return false;
         }
     }
 
@@ -1083,19 +611,18 @@
      *      one of abcdefghijklmnopqrstuvwxyz
      *      ABCDEFGHIJKLMNOPQRSTUVWXYZ
      */
-    private Token controlLetter() {
+    private boolean controlLetter() {
         final char c = Character.toUpperCase(ch0);
         if (c >= 'A' && c <= 'Z') {
-            final Token token = new Token(Token.Type.CONTROL_LETTER);
-            commit(token, 1);
-            return token;
+            // for some reason java regexps don't like control characters on the
+            // form "\\ca".match([string with ascii 1 at char0]). Translating
+            // them to unicode does it though.
+            sb.setLength(sb.length() - 1);
+            unicode(c - 'A' + 1);
+            skip(1);
+            return true;
         }
-        return null;
-        /*
-        Token token = new Token(Token.Type.CONTROL_LETTER);
-        commit(null, 1);//add original char to builder not to token
-        this.neverMatches = c < 'A' || c > 'Z';
-        return token.add(""+c);*/
+        return false;
     }
 
     /*
@@ -1104,56 +631,115 @@
      *      <ZWJ>  (200c)
      *      <ZWNJ> (200d)
      */
-    private Token identityEscape() {
-        final Token token = new Token(Token.Type.IDENTITY_ESCAPE);
-        commit(token, 1);
-        return token;
+    private boolean identityEscape() {
+        if (atEOF()) {
+            throw new RuntimeException("\\ at end of pattern"); // will be converted to PatternSyntaxException
+        }
+        // ES 5.1 A.7 requires "not IdentifierPart" here but all major engines accept any character here.
+        if (NON_IDENT_ESCAPES.indexOf(ch0) == -1) {
+            sb.setLength(sb.length() - 1);
+        }
+        return commit(1);
     }
 
     /*
      * DecimalEscape ::
      *      DecimalIntegerLiteral [lookahead DecimalDigit]
      */
-    private Token decimalEscape() {
-        final Token token = new Token(Token.Type.DECIMAL_ESCAPE);
+    private boolean decimalEscape() {
         final int startIn  = position;
         final int startOut = sb.length();
 
         if (ch0 == '0' && !isDecimalDigit(ch1)) {
-            commit(token, 1);
-            token.removeLast();
+            skip(1);
             //  DecimalEscape :: 0. If i is zero, return the EscapeValue consisting of a <NUL> character (Unicodevalue0000);
-            return token.add("\u0000");
+            sb.append("\u0000");
+            return true;
         }
 
         if (isDecimalDigit(ch0)) {
-            while (isDecimalDigit(ch0)) {
-                commit(token, 1);
+            final int num = ch0 - '0';
+
+            // Single digit escape, treat as backreference.
+            if (!isDecimalDigit(ch1)) {
+                if (num <= caps.size() && caps.get(num - 1).getNegativeLookaheadLevel() > 0) {
+                    //  Captures that live inside a negative lookahead are dead after the
+                    //  lookahead and will be undefined if referenced from outside.
+                    if (caps.get(num - 1).getNegativeLookaheadLevel() > negativeLookaheadLevel) {
+                        sb.setLength(sb.length() - 1);
+                    } else {
+                        sb.append(ch0);
+                    }
+                    skip(1);
+                    return true;
+                } else if (num > caps.size()) {
+                    // Forward reference to a capture group. Forward references are always undefined so we
+                    // can omit it from the output buffer. Additionally, if the capture group does not exist
+                    // the whole regexp becomes invalid, so register the reference for later processing.
+                    forwardReferences.add(num);
+                    sb.setLength(sb.length() - 1);
+                    skip(1);
+                    return true;
+                }
             }
-            return token;
+
+            if (inCharClass) {
+                // Convert octal escape to unicode escape if inside character class.
+                StringBuilder digit = new StringBuilder(4);
+                while (isDecimalDigit(ch0)) {
+                    digit.append(ch0);
+                    skip(1);
+                }
+
+                int value = Integer.parseInt(digit.toString(), 8); //throws exception that leads to SyntaxError if not octal
+                if (value > 0xff) {
+                    throw new NumberFormatException(digit.toString());
+                }
+
+                unicode(value);
+
+            } else {
+                // Copy decimal escape as-is
+                decimalDigits();
+            }
+            return true;
         }
 
         restart(startIn, startOut);
-
-        return null;
+        return false;
     }
 
     /*
      * CharacterClassEscape ::
      *  one of dDsSwW
      */
-    private Token characterClassEscape() {
+    private boolean characterClassEscape() {
         switch (ch0) {
+        // java.util.regex requires translation of \s and \S to explicit character list
         case 's':
+            if (RegExpFactory.usesJavaUtilRegex()) {
+                sb.setLength(sb.length() - 1);
+                sb.append('[').append(Lexer.getWhitespaceRegExp()).append(']');
+                skip(1);
+                return true;
+            }
+            return commit(1);
         case 'S':
+            if (RegExpFactory.usesJavaUtilRegex()) {
+                sb.setLength(sb.length() - 1);
+                sb.append("[^").append(Lexer.getWhitespaceRegExp()).append(']');
+                skip(1);
+                return true;
+            }
+            return commit(1);
         case 'd':
         case 'D':
         case 'w':
         case 'W':
-            return commit(new Token(Token.Type.CHARACTERCLASS_ESCAPE), 1);
+            return commit(1);
 
         default:
-            return null;
+            return false;
         }
     }
 
@@ -1162,29 +748,31 @@
      *      [ [lookahead {^}] ClassRanges ]
      *      [ ^ ClassRanges ]
      */
-    private Token characterClass() {
+    private boolean characterClass() {
         final int startIn  = position;
         final int startOut = sb.length();
-        final Token token  = new Token(Token.Type.CHARACTERCLASS);
 
         if (ch0 == '[') {
-            push(']');
-            commit(token, 1);
+            try {
+                inCharClass = true;
+                push(']');
+                commit(1);
 
-            if (ch0 == '^') {
-                commit(token, 1);
-            }
+                if (ch0 == '^') {
+                    commit(1);
+                }
 
-            final Token child = classRanges();
-            if (child != null && ch0 == ']') {
-                pop(']');
-                token.add(child);
-                return commit(token, 1);
+                if (classRanges() && ch0 == ']') {
+                    pop(']');
+                    return commit(1);
+                }
+            } finally {
+                inCharClass = false;  // no nested character classes in JavaScript
             }
         }
 
         restart(startIn, startOut);
-        return null;
+        return false;
     }
 
     /*
@@ -1192,8 +780,9 @@
      *      [empty]
      *      NonemptyClassRanges
      */
-    private Token classRanges() {
-        return new Token(Token.Type.CLASSRANGES).add(nonemptyClassRanges());
+    private boolean classRanges() {
+        nonemptyClassRanges();
+        return true;
     }
 
     /*
@@ -1202,40 +791,27 @@
      *      ClassAtom NonemptyClassRangesNoDash
      *      ClassAtom - ClassAtom ClassRanges
      */
-    private Token nonemptyClassRanges() {
+    private boolean nonemptyClassRanges() {
         final int startIn  = position;
         final int startOut = sb.length();
-        final Token token  = new Token(Token.Type.NON_EMPTY_CLASSRANGES);
-        Token child;
 
-        child = classAtom();
-        if (child != null) {
-            token.add(child);
+        if (classAtom()) {
 
             if (ch0 == '-') {
-                commit(token, 1);
+                commit(1);
 
-                final Token child1 = classAtom();
-                final Token child2 = classRanges();
-                if (child1 != null && child2 != null) {
-                    token.add(child1);
-                    token.add(child2);
-
-                    return token;
+                if (classAtom() && classRanges()) {
+                    return true;
                 }
             }
 
-            child = nonemptyClassRangesNoDash();
-            if (child != null) {
-                token.add(child);
-                return token;
-            }
+            nonemptyClassRangesNoDash();
 
-            return token;
+            return true;
         }
 
         restart(startIn, startOut);
-        return null;
+        return false;
     }
 
     /*
@@ -1244,61 +820,44 @@
      *      ClassAtomNoDash NonemptyClassRangesNoDash
      *      ClassAtomNoDash - ClassAtom ClassRanges
      */
-    private Token nonemptyClassRangesNoDash() {
+    private boolean nonemptyClassRangesNoDash() {
         final int startIn  = position;
         final int startOut = sb.length();
-        final Token token  = new Token(Token.Type.NON_EMPTY_CLASSRANGES_NODASH);
-        Token child;
 
-        child = classAtomNoDash();
-        if (child != null) {
-            token.add(child);
+        if (classAtomNoDash()) {
 
             // need to check dash first, as for e.g. [a-b|c-d] will otherwise parse - as an atom
             if (ch0 == '-') {
-               commit(token, 1);
+               commit(1);
 
-               final Token child1 = classAtom();
-               final Token child2 = classRanges();
-               if (child1 != null && child2 != null) {
-                   token.add(child1);
-                   return token.add(child2);
+               if (classAtom() && classRanges()) {
+                   return true;
                }
                //fallthru
            }
 
-            child = nonemptyClassRangesNoDash();
-            if (child != null) {
-                token.add(child);
-            }
-            return token; // still a class atom
+            nonemptyClassRangesNoDash();
+            return true; // still a class atom
         }
 
-        child = classAtom();
-        if (child != null) {
-            return token.add(child);
+        if (classAtom()) {
+            return true;
         }
 
         restart(startIn, startOut);
-        return null;
+        return false;
     }
 
     /*
      * ClassAtom : - ClassAtomNoDash
      */
-    private Token classAtom() {
-        final Token token = new Token(Token.Type.CLASSATOM);
+    private boolean classAtom() {
 
         if (ch0 == '-') {
-            return commit(token, 1);
+            return commit(1);
         }
 
-        final Token child = classAtomNoDash();
-        if (child != null) {
-            return token.add(child);
-        }
-
-        return null;
+        return classAtomNoDash();
     }
 
     /*
@@ -1306,33 +865,32 @@
      *      SourceCharacter but not one of \ or ] or -
      *      \ ClassEscape
      */
-    private Token classAtomNoDash() {
+    private boolean classAtomNoDash() {
         final int startIn  = position;
         final int startOut = sb.length();
-        final Token token  = new Token(Token.Type.CLASSATOM_NODASH);
 
         switch (ch0) {
         case ']':
         case '-':
         case '\0':
-            return null;
+            return false;
 
         case '[':
             // unescaped left square bracket - add escape
-            return commit(token.add("\\"), 1);
+            sb.append('\\');
+            return commit(1);
 
         case '\\':
-            commit(token, 1);
-            final Token child = classEscape();
-            if (child != null) {
-                return token.add(child);
+            commit(1);
+            if (classEscape()) {
+                return true;
             }
 
             restart(startIn, startOut);
-            return null;
+            return false;
 
         default:
-            return commit(token, 1);
+            return commit(1);
         }
     }
 
@@ -1343,46 +901,45 @@
      *      CharacterEscape
      *      CharacterClassEscape
      */
-    private Token classEscape() {
-        final Token token = new Token(Token.Type.CLASS_ESCAPE);
-        Token child;
+    private boolean classEscape() {
 
-        child = decimalEscape();
-        if (child != null) {
-            return token.add(child);
+        if (decimalEscape()) {
+            return true;
         }
 
         if (ch0 == 'b') {
-            return commit(token, 1);
+            sb.setLength(sb.length() - 1);
+            sb.append('\b');
+            skip(1);
+            return true;
         }
 
-        child = characterEscape();
-        if (child != null) {
-            return token.add(child);
-        }
-
-        child = characterClassEscape();
-        if (child != null) {
-            return token.add(child);
-        }
-
-        return null;
+        // Note that contrary to ES 5.1 spec we put identityEscape() last because it acts as a catch-all
+        return characterEscape() || characterClassEscape() || identityEscape();
     }
 
     /*
      * DecimalDigits
      */
-    private Token decimalDigits() {
+    private boolean decimalDigits() {
         if (!isDecimalDigit(ch0)) {
-            return null;
+            return false;
+        }
+
+        while (isDecimalDigit(ch0)) {
+            commit(1);
         }
 
-        final Token token = new Token(Token.Type.DECIMALDIGITS);
-        while (isDecimalDigit(ch0)) {
-            commit(token, 1);
+        return true;
+    }
+
+    private void unicode(final int value) {
+        final String hex = Integer.toHexString(value);
+        sb.append('u');
+        for (int i = 0; i < 4 - hex.length(); i++) {
+            sb.append('0');
         }
-
-        return token;
+        sb.append(hex);
     }
 
     private static boolean isDecimalDigit(final char ch) {