nashorn/src/jdk/nashorn/internal/parser/RegExpScanner.java
changeset 16226 0e4f37e6cc40
parent 16225 81d58c2b9fcf
child 16227 1bafb74d17b2
equal deleted inserted replaced
16225:81d58c2b9fcf 16226:0e4f37e6cc40
     1 /*
       
     2  * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved.
       
     3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
       
     4  *
       
     5  * This code is free software; you can redistribute it and/or modify it
       
     6  * under the terms of the GNU General Public License version 2 only, as
       
     7  * published by the Free Software Foundation.  Oracle designates this
       
     8  * particular file as subject to the "Classpath" exception as provided
       
     9  * by Oracle in the LICENSE file that accompanied this code.
       
    10  *
       
    11  * This code is distributed in the hope that it will be useful, but WITHOUT
       
    12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
       
    13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
       
    14  * version 2 for more details (a copy is included in the LICENSE file that
       
    15  * accompanied this code).
       
    16  *
       
    17  * You should have received a copy of the GNU General Public License version
       
    18  * 2 along with this work; if not, write to the Free Software Foundation,
       
    19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
       
    20  *
       
    21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
       
    22  * or visit www.oracle.com if you need additional information or have any
       
    23  * questions.
       
    24  */
       
    25 
       
    26 package jdk.nashorn.internal.parser;
       
    27 
       
    28 import java.util.ArrayList;
       
    29 import java.util.HashMap;
       
    30 import java.util.Iterator;
       
    31 import java.util.LinkedHashMap;
       
    32 import java.util.LinkedList;
       
    33 import java.util.List;
       
    34 import java.util.Map;
       
    35 import java.util.regex.PatternSyntaxException;
       
    36 import jdk.nashorn.internal.runtime.BitVector;
       
    37 
       
    38 /**
       
    39  * Scan a JavaScript regexp, converting to Java regex if necessary.
       
    40  *
       
    41  */
       
    42 public class RegExpScanner extends Scanner {
       
    43 
       
    44     /**
       
    45      * String builder to accumulate the result - this contains verbatim parsed JavaScript.
       
    46      * to get the java equivalent we need to create a Pattern token and return its toString()
       
    47      */
       
    48     private final StringBuilder sb;
       
    49 
       
    50     /** An optional error message if one occurred during parse. */
       
    51     private String errorMessage;
       
    52 
       
    53     /** Is this the special case of a regexp that never matches anything */
       
    54     private boolean neverMatches;
       
    55 
       
    56     /** The resulting java.util.regex pattern string. */
       
    57     private String javaPattern;
       
    58 
       
    59     /** Expected token table */
       
    60     private final Map<Character, Integer> expected = new HashMap<>();
       
    61 
       
    62     /** Capturing parenthesis that have been found so far. */
       
    63     private final List<Capture> caps = new LinkedList<>();
       
    64 
       
    65     /** Forward references to capturing parenthesis to be resolved later.*/
       
    66     private final Map<Integer, Token> forwardReferences = new LinkedHashMap<>();
       
    67 
       
    68     /** Current level of zero-width negative lookahead assertions. */
       
    69     private int negativeLookaheadLevel;
       
    70 
       
    71     private static final String NON_IDENT_ESCAPES = "$^*+(){}[]|\\.?";
       
    72 
       
    73     private static class Capture {
       
    74         /**
       
    75          * Zero-width negative lookaheads enclosing the capture.
       
    76          */
       
    77         private final int negativeLookaheadLevel;
       
    78         /**
       
    79          * Captures that live inside a negative lookahead are dead after the
       
    80          * lookahead and will be undefined if referenced from outside.
       
    81          */
       
    82         private boolean isDead;
       
    83 
       
    84         Capture(final int negativeLookaheadLevel) {
       
    85             this.negativeLookaheadLevel = negativeLookaheadLevel;
       
    86         }
       
    87 
       
    88         public int getNegativeLookaheadLevel() {
       
    89             return negativeLookaheadLevel;
       
    90         }
       
    91 
       
    92         public boolean isDead() {
       
    93             return isDead;
       
    94         }
       
    95 
       
    96         public void setDead() {
       
    97             this.isDead = true;
       
    98         }
       
    99     }
       
   100 
       
   101     /**
       
   102      * This is a token - the JavaScript regexp is scanned into a token tree
       
   103      * A token has other tokens as children as well as "atoms", i.e. Strings.
       
   104      *
       
   105      */
       
   106     private static class Token {
       
   107 
       
   108         private enum Type {
       
   109             PATTERN,
       
   110             DISJUNCTION,
       
   111             ALTERNATIVE,
       
   112             TERM,
       
   113             ASSERTION,
       
   114             QUANTIFIER,
       
   115             QUANTIFIER_PREFIX,
       
   116             ATOM,
       
   117             PATTERN_CHARACTER,
       
   118             ATOM_ESCAPE,
       
   119             CHARACTER_ESCAPE,
       
   120             CONTROL_ESCAPE,
       
   121             CONTROL_LETTER,
       
   122             IDENTITY_ESCAPE,
       
   123             DECIMAL_ESCAPE,
       
   124             CHARACTERCLASS_ESCAPE,
       
   125             CHARACTERCLASS,
       
   126             CLASSRANGES,
       
   127             NON_EMPTY_CLASSRANGES,
       
   128             NON_EMPTY_CLASSRANGES_NODASH,
       
   129             CLASSATOM,
       
   130             CLASSATOM_NODASH,
       
   131             CLASS_ESCAPE,
       
   132             DECIMALDIGITS,
       
   133             HEX_ESCAPESEQUENCE,
       
   134             UNICODE_ESCAPESEQUENCE,
       
   135         }
       
   136 
       
   137         /**
       
   138          * Token tyoe
       
   139          */
       
   140         private final Token.Type type;
       
   141 
       
   142         /**
       
   143          * Child nodes
       
   144          */
       
   145         private final List<Object> children;
       
   146 
       
   147         /**
       
   148          * Parent node
       
   149          */
       
   150         private Token parent;
       
   151 
       
   152         /**
       
   153          * Dead code flag
       
   154          */
       
   155         private boolean isDead;
       
   156 
       
   157         private static final Map<Type, ToString> toStringMap = new HashMap<>();
       
   158         private static final ToString DEFAULT_TOSTRING = new ToString();
       
   159 
       
   160         private static String unicode(final int value) {
       
   161             final StringBuilder sb = new StringBuilder();
       
   162             final String hex = Integer.toHexString(value);
       
   163             sb.append('u');
       
   164             for (int i = 0; i < 4 - hex.length(); i++) {
       
   165                 sb.append('0');
       
   166             }
       
   167             sb.append(hex);
       
   168 
       
   169             return sb.toString();
       
   170         }
       
   171 
       
   172         static {
       
   173             toStringMap.put(Type.CHARACTERCLASS, new ToString() {
       
   174                 @Override
       
   175                 public String toString(final Token token) {
       
   176                     return super.toString(token).replace("\\b", "\b");
       
   177                 }
       
   178             });
       
   179 
       
   180             // for some reason java regexps don't like control characters on the
       
   181             // form "\\ca".match([string with ascii 1 at char0]). Translating
       
   182             // them to unicode does it though.
       
   183             toStringMap.put(Type.CHARACTER_ESCAPE, new ToString() {
       
   184                 @Override
       
   185                 public String toString(final Token token) {
       
   186                     final String str = super.toString(token);
       
   187                     if (str.length() == 2) {
       
   188                         return Token.unicode(Character.toLowerCase(str.charAt(1)) - 'a' + 1);
       
   189                     }
       
   190                     return str;
       
   191                 }
       
   192             });
       
   193 
       
   194             toStringMap.put(Type.DECIMAL_ESCAPE, new ToString() {
       
   195                 @Override
       
   196                 public String toString(final Token token) {
       
   197                     final String str = super.toString(token);
       
   198 
       
   199                     if ("\0".equals(str)) {
       
   200                         return str;
       
   201                     }
       
   202 
       
   203                     int value;
       
   204 
       
   205                     if (!token.hasParentOfType(Type.CLASSRANGES)) {
       
   206                         return str;
       
   207                     }
       
   208 
       
   209                     value = Integer.parseInt(str, 8); //throws exception that leads to SyntaxError if not octal
       
   210                     if (value > 0xff) {
       
   211                         throw new NumberFormatException(str);
       
   212                     }
       
   213 
       
   214                     return Token.unicode(value);
       
   215                 }
       
   216             });
       
   217 
       
   218         }
       
   219 
       
   220         /**
       
   221          * JavaScript Token to Java regex substring framework.
       
   222          *
       
   223          */
       
   224         private static class ToString {
       
   225             String toString(final Token token) {
       
   226                 final StringBuilder sb = new StringBuilder();
       
   227                 for (final Object child : token.getChildren()) {
       
   228                     sb.append(child);
       
   229                 }
       
   230 
       
   231                 //perform global substitutions that hold true for any evaluated form
       
   232                 String str = sb.toString();
       
   233                 switch (str) {
       
   234                 case "\\s":
       
   235                     str = "[" + Lexer.getWhitespaceRegExp() + "]";
       
   236                     break;
       
   237                 case "\\S":
       
   238                     str = "[^" + Lexer.getWhitespaceRegExp() + "]";
       
   239                     break;
       
   240                 case "[^]":
       
   241                     str = "[\\s\\S]";
       
   242                     break;
       
   243                 default:
       
   244                     break;
       
   245                 }
       
   246                 return str;
       
   247             }
       
   248         }
       
   249 
       
   250         /**
       
   251          * Token iterator. Doesn't return "atom" children. i.e. string representations,
       
   252          * just tokens
       
   253          *
       
   254          */
       
   255         private static class TokenIterator implements Iterator<Token> {
       
   256             private final List<Token> preorder;
       
   257 
       
   258             private void init(final Token root) {
       
   259                 preorder.add(root);
       
   260                 for (final Object child : root.getChildren()) {
       
   261                     if (child instanceof Token) {
       
   262                         init((Token)child);
       
   263                     }
       
   264                 }
       
   265             }
       
   266 
       
   267             TokenIterator(final Token root) {
       
   268                 preorder = new ArrayList<>();
       
   269                 init(root);
       
   270             }
       
   271 
       
   272             @Override
       
   273             public boolean hasNext() {
       
   274                 return !preorder.isEmpty();
       
   275             }
       
   276 
       
   277             @Override
       
   278             public Token next() {
       
   279                 return preorder.remove(0);
       
   280             }
       
   281 
       
   282             @Override
       
   283             public void remove() {
       
   284                 next();
       
   285             }
       
   286         }
       
   287 
       
   288         /**
       
   289          * Constructor
       
   290          * @param type the token type
       
   291          */
       
   292         Token(final Token.Type type) {
       
   293             this.type = type;
       
   294             children = new ArrayList<>();
       
   295         }
       
   296 
       
   297         /**
       
   298          * Add a an "atom" child to a token
       
   299          * @param child the child to add
       
   300          * @return the token (for chaining)
       
   301          */
       
   302         public Token add(final String child) {
       
   303             children.add(child);
       
   304             return this;
       
   305         }
       
   306 
       
   307         /**
       
   308          * Add a child to a token
       
   309          * @param child the child
       
   310          * @return the token (for chaining)
       
   311          */
       
   312         public Token add(final Token child) {
       
   313             if (child != null) {
       
   314                 children.add(child);
       
   315                 child.setParent(this);
       
   316             }
       
   317             return this;
       
   318         }
       
   319 
       
   320         /**
       
   321          * Remove a child from a token
       
   322          * @param child the child to remove
       
   323          * @return true if successful
       
   324          */
       
   325         public boolean remove(final Token child) {
       
   326             return children.remove(child);
       
   327         }
       
   328 
       
   329         /**
       
   330          * Remove the last child from a token
       
   331          * @return the removed child
       
   332          */
       
   333         public Object removeLast() {
       
   334             return children.remove(children.size() - 1);
       
   335         }
       
   336 
       
   337         /**
       
   338          * Flag this token as dead code
       
   339          * @param isDead is it dead or not
       
   340          */
       
   341         private void setIsDead(final boolean isDead) {
       
   342             this.isDead = isDead;
       
   343         }
       
   344 
       
   345         /**
       
   346          * Is this token dead code
       
   347          * @return boolean
       
   348          */
       
   349         private boolean getIsDead() {
       
   350             return isDead;
       
   351         }
       
   352 
       
   353         /**
       
   354          * Get the parent of this token
       
   355          * @return parent token
       
   356          */
       
   357         public Token getParent() {
       
   358             return parent;
       
   359         }
       
   360 
       
   361         public boolean hasParentOfType(final Token.Type parentType) {
       
   362             for (Token p = getParent(); p != null; p = p.getParent()) {
       
   363                 if (p.getType() == parentType) {
       
   364                     return true;
       
   365                 }
       
   366             }
       
   367             return false;
       
   368         }
       
   369 
       
   370         public boolean hasChildOfType(final Token.Type childType) {
       
   371             for (final Iterator<Token> iter = iterator() ; iter.hasNext() ; ) {
       
   372                 if (iter.next().getType() == childType) {
       
   373                     return true;
       
   374                 }
       
   375             }
       
   376             return false;
       
   377         }
       
   378 
       
   379         /**
       
   380          * Set the parent of this token
       
   381          * @param parent
       
   382          */
       
   383         private void setParent(final Token parent) {
       
   384             this.parent = parent;
       
   385         }
       
   386 
       
   387         /**
       
   388          * Get the children of this token
       
   389          * @return an array of children, never null
       
   390          */
       
   391         public Object[] getChildren() {
       
   392             return children.toArray();
       
   393         }
       
   394 
       
   395         /**
       
   396          * Reset this token, remove all children
       
   397          */
       
   398         public void reset() {
       
   399             children.clear();
       
   400         }
       
   401 
       
   402         /**
       
   403          * Get a preorder token iterator with this token as root
       
   404          * @return an iterator
       
   405          */
       
   406         public Iterator<Token> iterator() {
       
   407             return new TokenIterator(this);
       
   408         }
       
   409 
       
   410         /**
       
   411          * Get the type of this token
       
   412          * @return type
       
   413          */
       
   414         public Type getType() {
       
   415             return type;
       
   416         }
       
   417 
       
   418         /**
       
   419          * Turn this token into Java regexp compatible text
       
   420          * @return part of a java regexp
       
   421          */
       
   422         @Override
       
   423         public String toString() {
       
   424             ToString t = toStringMap.get(getType());
       
   425             if (t == null) {
       
   426                 t = DEFAULT_TOSTRING;
       
   427             }
       
   428             return t.toString(this);
       
   429         }
       
   430     }
       
   431 
       
   432     /**
       
   433      * Constructor
       
   434      * @param string the JavaScript regexp to parse
       
   435      */
       
   436     private RegExpScanner(final String string) {
       
   437         super(string);
       
   438         sb = new StringBuilder(limit);
       
   439         reset(0);
       
   440         expected.put(']', 0);
       
   441         expected.put('}', 0);
       
   442     }
       
   443 
       
   444     private void processForwardReferences() {
       
   445         if (neverMatches()) {
       
   446             return;
       
   447         }
       
   448 
       
   449         for (final Map.Entry<Integer, Token> fwdRef : forwardReferences.entrySet()) {
       
   450             if (fwdRef.getKey().intValue() > caps.size()) {
       
   451                 neverMatches = true;
       
   452                 break;
       
   453             }
       
   454 
       
   455             fwdRef.getValue().setIsDead(true);
       
   456         }
       
   457 
       
   458         forwardReferences.clear();
       
   459     }
       
   460 
       
   461     /**
       
   462      * Scan a JavaScript regexp string returning a Java safe regex string.
       
   463      *
       
   464      * @param string
       
   465      *            JavaScript regexp string.
       
   466      * @return Java safe regex string.
       
   467      */
       
   468     public static RegExpScanner scan(final String string) {
       
   469         final RegExpScanner scanner = new RegExpScanner(string);
       
   470 
       
   471         Token pattern;
       
   472 
       
   473         try {
       
   474             pattern = scanner.pattern();
       
   475         } catch (final Exception e) {
       
   476             throw new PatternSyntaxException(e.getMessage(), string, scanner.sb.length());
       
   477         }
       
   478 
       
   479         scanner.processForwardReferences();
       
   480         if (scanner.neverMatches()) {
       
   481             return null; // never matches
       
   482         }
       
   483 
       
   484         // go over the code and remove dead code
       
   485         final Iterator<Token> iter = pattern.iterator();
       
   486         while (iter.hasNext()) {
       
   487             final Token next = iter.next();
       
   488             if (next.getIsDead()) {
       
   489                 next.getParent().remove(next);
       
   490             }
       
   491         }
       
   492 
       
   493         // turn the pattern into a string, p, the java equivalent string for our js regexp
       
   494         final String p = pattern.toString();
       
   495         // if builder contains all tokens that were sent in, we know
       
   496         // we correctly parsed the entire JavaScript regexp without syntax errors
       
   497         if (!string.equals(scanner.getStringBuilder().toString())) {
       
   498             throw new PatternSyntaxException(string, p, p.length() + 1);
       
   499         }
       
   500 
       
   501         scanner.javaPattern = p;
       
   502         return scanner;
       
   503      }
       
   504 
       
   505     /**
       
   506      * Does this regexp ever match anything? Use of e.g. [], which is legal in JavaScript,
       
   507      * is an example where we never match
       
   508      *
       
   509      * @return boolean
       
   510      */
       
   511     private boolean neverMatches() {
       
   512         return neverMatches;
       
   513     }
       
   514 
       
   515     /**
       
   516      * This is used to set better error messages that can be reused
       
   517      * in NativeRegExp for augmenting e.g. SyntaxErrors.
       
   518      *
       
   519      * @return an error message or null if no extra info
       
   520      */
       
   521     public String getErrorMessage() {
       
   522         return errorMessage;
       
   523     }
       
   524 
       
   525     final StringBuilder getStringBuilder() {
       
   526         return sb;
       
   527     }
       
   528 
       
   529     String getJavaPattern() {
       
   530         return javaPattern;
       
   531     }
       
   532 
       
   533     BitVector getGroupsInNegativeLookahead() {
       
   534         BitVector vec = null;
       
   535         for (int i = 0; i < caps.size(); i++) {
       
   536             final Capture cap = caps.get(i);
       
   537             if (cap.getNegativeLookaheadLevel() > 0) {
       
   538                 if (vec == null) {
       
   539                     vec = new BitVector(caps.size() + 1);
       
   540                 }
       
   541                 vec.set(i + 1);
       
   542             }
       
   543         }
       
   544         return vec;
       
   545     }
       
   546 
       
   547     /**
       
   548      * Commit n characters to the builder and to a given token
       
   549      * @param token Uncommitted token.
       
   550      * @param n     Number of characters.
       
   551      * @return Committed token
       
   552      */
       
   553     private Token commit(final Token token, final int n) {
       
   554         final int startIn = position;
       
   555 
       
   556         switch (n) {
       
   557         case 1:
       
   558             sb.append(ch0);
       
   559             skip(1);
       
   560             break;
       
   561         case 2:
       
   562             sb.append(ch0);
       
   563             sb.append(ch1);
       
   564             skip(2);
       
   565             break;
       
   566         case 3:
       
   567             sb.append(ch0);
       
   568             sb.append(ch1);
       
   569             sb.append(ch2);
       
   570             skip(3);
       
   571             break;
       
   572         default:
       
   573             assert false : "Should not reach here";
       
   574         }
       
   575 
       
   576         if (token == null) {
       
   577             return null;
       
   578         }
       
   579 
       
   580         return token.add(sb.substring(startIn, sb.length()));
       
   581     }
       
   582 
       
   583     /**
       
   584      * Restart the buffers back at an earlier position.
       
   585      *
       
   586      * @param startIn
       
   587      *            Position in the input stream.
       
   588      * @param startOut
       
   589      *            Position in the output stream.
       
   590      */
       
   591     private void restart(final int startIn, final int startOut) {
       
   592         reset(startIn);
       
   593         sb.setLength(startOut);
       
   594     }
       
   595 
       
   596     private void push(final char ch) {
       
   597         expected.put(ch, expected.get(ch) + 1);
       
   598     }
       
   599 
       
   600     private void pop(final char ch) {
       
   601         expected.put(ch, Math.min(0, expected.get(ch) - 1));
       
   602     }
       
   603 
       
   604     /*
       
   605      * Recursive descent tokenizer starts below.
       
   606      */
       
   607 
       
   608     /*
       
   609      * Pattern ::
       
   610      *      Disjunction
       
   611      */
       
   612     private Token pattern() {
       
   613         final Token token = new Token(Token.Type.PATTERN);
       
   614 
       
   615         final Token child = disjunction();
       
   616         if (child != null) {
       
   617             return token.add(child);
       
   618         }
       
   619 
       
   620         return null;
       
   621     }
       
   622 
       
   623     /*
       
   624      * Disjunction ::
       
   625      *      Alternative
       
   626      *      Alternative | Disjunction
       
   627      */
       
   628     private Token disjunction() {
       
   629         final Token token = new Token(Token.Type.DISJUNCTION);
       
   630 
       
   631         while (true) {
       
   632             token.add(alternative());
       
   633 
       
   634             if (ch0 == '|') {
       
   635                 commit(token, 1);
       
   636             } else {
       
   637                 break;
       
   638             }
       
   639         }
       
   640 
       
   641         return token;
       
   642     }
       
   643 
       
   644     /*
       
   645      * Alternative ::
       
   646      *      [empty]
       
   647      *      Alternative Term
       
   648      */
       
   649     private Token alternative() {
       
   650         final Token token = new Token(Token.Type.ALTERNATIVE);
       
   651 
       
   652         Token child;
       
   653         while ((child = term()) != null) {
       
   654             token.add(child);
       
   655         }
       
   656 
       
   657         return token;
       
   658     }
       
   659 
       
   660     /*
       
   661      * Term ::
       
   662      *      Assertion
       
   663      *      Atom
       
   664      *      Atom Quantifier
       
   665      */
       
   666     private Token term() {
       
   667         final int startIn  = position;
       
   668         final int startOut = sb.length();
       
   669         final Token token  = new Token(Token.Type.TERM);
       
   670         Token child;
       
   671 
       
   672         child = assertion();
       
   673         if (child != null) {
       
   674             return token.add(child);
       
   675         }
       
   676 
       
   677         child = atom();
       
   678         if (child != null) {
       
   679             boolean emptyCharacterClass = false;
       
   680             if ("[]".equals(child.toString())) {
       
   681                 emptyCharacterClass = true;
       
   682             }
       
   683 
       
   684             token.add(child);
       
   685 
       
   686             final Token quantifier = quantifier();
       
   687             if (quantifier != null) {
       
   688                 token.add(quantifier);
       
   689             }
       
   690 
       
   691             if (emptyCharacterClass) {
       
   692                 if (quantifier == null) {
       
   693                     neverMatches = true; //never matches ever.
       
   694                 } else {
       
   695                     //if we can get away with max zero, remove this entire token
       
   696                     final String qs = quantifier.toString();
       
   697                     if ("+".equals(qs) || "*".equals(qs) || qs.startsWith("{0,")) {
       
   698                         token.setIsDead(true);
       
   699                     }
       
   700                 }
       
   701             }
       
   702 
       
   703             return token;
       
   704         }
       
   705 
       
   706         restart(startIn, startOut);
       
   707         return null;
       
   708     }
       
   709 
       
   710     /*
       
   711      * Assertion ::
       
   712      *      ^
       
   713      *      $
       
   714      *      \b
       
   715      *      \B
       
   716      *      ( ? = Disjunction )
       
   717      *      ( ? ! Disjunction )
       
   718      */
       
   719     private Token assertion() {
       
   720         final int startIn  = position;
       
   721         final int startOut = sb.length();
       
   722         final Token token  = new Token(Token.Type.ASSERTION);
       
   723 
       
   724         switch (ch0) {
       
   725         case '^':
       
   726         case '$':
       
   727             return commit(token, 1);
       
   728 
       
   729         case '\\':
       
   730             if (ch1 == 'b' || ch1 == 'B') {
       
   731                 return commit(token, 2);
       
   732             }
       
   733             break;
       
   734 
       
   735         case '(':
       
   736             if (ch1 != '?') {
       
   737                 break;
       
   738             }
       
   739             if (ch2 != '=' && ch2 != '!') {
       
   740                 break;
       
   741             }
       
   742             final boolean isNegativeLookahead = (ch2 == '!');
       
   743             commit(token, 3);
       
   744 
       
   745             if (isNegativeLookahead) {
       
   746                 negativeLookaheadLevel++;
       
   747             }
       
   748             final Token disjunction = disjunction();
       
   749             if (isNegativeLookahead) {
       
   750                 for (final Capture cap : caps) {
       
   751                     if (cap.getNegativeLookaheadLevel() >= negativeLookaheadLevel) {
       
   752                         cap.setDead();
       
   753                     }
       
   754                 }
       
   755                 negativeLookaheadLevel--;
       
   756             }
       
   757 
       
   758             if (disjunction != null && ch0 == ')') {
       
   759                 token.add(disjunction);
       
   760                 return commit(token, 1);
       
   761             }
       
   762             break;
       
   763 
       
   764         default:
       
   765             break;
       
   766         }
       
   767 
       
   768         restart(startIn, startOut);
       
   769 
       
   770         return null;
       
   771     }
       
   772 
       
   773     /*
       
   774      * Quantifier ::
       
   775      *      QuantifierPrefix
       
   776      *      QuantifierPrefix ?
       
   777      */
       
   778     private Token quantifier() {
       
   779         final Token token = new Token(Token.Type.QUANTIFIER);
       
   780         final Token child = quantifierPrefix();
       
   781         if (child != null) {
       
   782             token.add(child);
       
   783             if (ch0 == '?') {
       
   784                 commit(token, 1);
       
   785             }
       
   786             return token;
       
   787         }
       
   788         return null;
       
   789     }
       
   790 
       
   791     /*
       
   792      * QuantifierPrefix ::
       
   793      *      *
       
   794      *      +
       
   795      *      ?
       
   796      *      { DecimalDigits }
       
   797      *      { DecimalDigits , }
       
   798      *      { DecimalDigits , DecimalDigits }
       
   799      */
       
   800     private Token quantifierPrefix() {
       
   801         final int startIn  = position;
       
   802         final int startOut = sb.length();
       
   803         final Token token  = new Token(Token.Type.QUANTIFIER_PREFIX);
       
   804 
       
   805         switch (ch0) {
       
   806         case '*':
       
   807         case '+':
       
   808         case '?':
       
   809             return commit(token, 1);
       
   810 
       
   811         case '{':
       
   812             commit(token, 1);
       
   813 
       
   814             final Token child = decimalDigits();
       
   815             if (child == null) {
       
   816                 break; // not a quantifier - back out
       
   817             }
       
   818             push('}');
       
   819             token.add(child);
       
   820 
       
   821             if (ch0 == ',') {
       
   822                 commit(token, 1);
       
   823                 token.add(decimalDigits());
       
   824             }
       
   825 
       
   826             if (ch0 == '}') {
       
   827                 pop('}');
       
   828                 commit(token, 1);
       
   829             }
       
   830 
       
   831             return token;
       
   832 
       
   833         default:
       
   834             break;
       
   835         }
       
   836 
       
   837         restart(startIn, startOut);
       
   838         return null;
       
   839     }
       
   840 
       
   841     /*
       
   842      * Atom ::
       
   843      *      PatternCharacter
       
   844      *      .
       
   845      *      \ AtomEscape
       
   846      *      CharacterClass
       
   847      *      ( Disjunction )
       
   848      *      ( ? : Disjunction )
       
   849      *
       
   850      */
       
   851     private Token atom() {
       
   852         final int startIn  = position;
       
   853         final int startOut = sb.length();
       
   854         final Token token  = new Token(Token.Type.ATOM);
       
   855         Token child;
       
   856 
       
   857         child = patternCharacter();
       
   858         if (child != null) {
       
   859             return token.add(child);
       
   860         }
       
   861 
       
   862         if (ch0 == '.') {
       
   863             return commit(token, 1);
       
   864         }
       
   865 
       
   866         if (ch0 == '\\') {
       
   867             commit(token, 1);
       
   868             child = atomEscape();
       
   869 
       
   870             if (child != null) {
       
   871                 if (child.hasChildOfType(Token.Type.IDENTITY_ESCAPE)) {
       
   872                     final char idEscape = child.toString().charAt(0);
       
   873                     if (NON_IDENT_ESCAPES.indexOf(idEscape) == -1) {
       
   874                         token.reset();
       
   875                     }
       
   876                 }
       
   877 
       
   878                 token.add(child);
       
   879 
       
   880                 // forward backreferences always match empty. JavaScript != Java
       
   881                 if (child.hasChildOfType(Token.Type.DECIMAL_ESCAPE) && !"\u0000".equals(child.toString())) {
       
   882                     final int refNum = Integer.parseInt(child.toString());
       
   883 
       
   884                     if (refNum - 1 < caps.size() && caps.get(refNum - 1).isDead()) {
       
   885                         // reference to dead in-negative-lookahead capture
       
   886                         token.setIsDead(true);
       
   887                     } else if (caps.size() < refNum) {
       
   888                         // forward reference: always matches against empty string (dead token).
       
   889                         // invalid reference (non-existant capture): pattern never matches.
       
   890                         forwardReferences.put(refNum, token);
       
   891                     }
       
   892                 }
       
   893 
       
   894                 return token;
       
   895             }
       
   896         }
       
   897 
       
   898         child = characterClass();
       
   899         if (child != null) {
       
   900             return token.add(child);
       
   901         }
       
   902 
       
   903         if (ch0 == '(') {
       
   904             boolean capturingParens = true;
       
   905             commit(token, 1);
       
   906             if (ch0 == '?' && ch1 == ':') {
       
   907                 capturingParens = false;
       
   908                 commit(token, 2);
       
   909             }
       
   910 
       
   911             child = disjunction();
       
   912             if (child != null) {
       
   913                 token.add(child);
       
   914                 if (ch0 == ')') {
       
   915                     final Token atom = commit(token, 1);
       
   916                     if (capturingParens) {
       
   917                         caps.add(new Capture(negativeLookaheadLevel));
       
   918                     }
       
   919                     return atom;
       
   920                 }
       
   921             }
       
   922         }
       
   923 
       
   924         restart(startIn, startOut);
       
   925         return null;
       
   926     }
       
   927 
       
   928     /*
       
   929      * PatternCharacter ::
       
   930      *      SourceCharacter but not any of: ^$\.*+?()[]{}|
       
   931      */
       
   932     @SuppressWarnings("fallthrough")
       
   933     private Token patternCharacter() {
       
   934         if (atEOF()) {
       
   935             return null;
       
   936         }
       
   937 
       
   938         switch (ch0) {
       
   939         case '^':
       
   940         case '$':
       
   941         case '\\':
       
   942         case '.':
       
   943         case '*':
       
   944         case '+':
       
   945         case '?':
       
   946         case '(':
       
   947         case ')':
       
   948         case '[':
       
   949         case '|':
       
   950             return null;
       
   951 
       
   952         case '}':
       
   953         case ']':
       
   954             final int n = expected.get(ch0);
       
   955             if (n != 0) {
       
   956                 return null;
       
   957             }
       
   958 
       
   959        case '{':
       
   960            // if not a valid quantifier escape curly brace to match itself
       
   961            // this ensures compatibility with other JS implementations
       
   962            final Token quant = quantifierPrefix();
       
   963            return (quant == null) ? commit(new Token(Token.Type.PATTERN_CHARACTER).add("\\"), 1) : null;
       
   964 
       
   965         default:
       
   966             return commit(new Token(Token.Type.PATTERN_CHARACTER), 1); // SOURCECHARACTER
       
   967         }
       
   968     }
       
   969 
       
   970     /*
       
   971      * AtomEscape ::
       
   972      *      DecimalEscape
       
   973      *      CharacterEscape
       
   974      *      CharacterClassEscape
       
   975      */
       
   976     private Token atomEscape() {
       
   977         final Token token = new Token(Token.Type.ATOM_ESCAPE);
       
   978         Token child;
       
   979 
       
   980         child = decimalEscape();
       
   981         if (child != null) {
       
   982             return token.add(child);
       
   983         }
       
   984 
       
   985         child = characterClassEscape();
       
   986         if (child != null) {
       
   987             return token.add(child);
       
   988         }
       
   989 
       
   990         child = characterEscape();
       
   991         if (child != null) {
       
   992             return token.add(child);
       
   993         }
       
   994 
       
   995 
       
   996         return null;
       
   997     }
       
   998 
       
   999     /*
       
  1000      * CharacterEscape ::
       
  1001      *      ControlEscape
       
  1002      *      c ControlLetter
       
  1003      *      HexEscapeSequence
       
  1004      *      UnicodeEscapeSequence
       
  1005      *      IdentityEscape
       
  1006      */
       
  1007     private Token characterEscape() {
       
  1008         final int startIn  = position;
       
  1009         final int startOut = sb.length();
       
  1010 
       
  1011         final Token token = new Token(Token.Type.CHARACTER_ESCAPE);
       
  1012         Token child;
       
  1013 
       
  1014         child = controlEscape();
       
  1015         if (child != null) {
       
  1016             return token.add(child);
       
  1017         }
       
  1018 
       
  1019         if (ch0 == 'c') {
       
  1020             commit(token, 1);
       
  1021             child = controlLetter();
       
  1022             if (child != null) {
       
  1023                 return token.add(child);
       
  1024             }
       
  1025             restart(startIn, startOut);
       
  1026         }
       
  1027 
       
  1028         child = hexEscapeSequence();
       
  1029         if (child != null) {
       
  1030             return token.add(child);
       
  1031         }
       
  1032 
       
  1033         child = unicodeEscapeSequence();
       
  1034         if (child != null) {
       
  1035             return token.add(child);
       
  1036         }
       
  1037 
       
  1038         child = identityEscape();
       
  1039         if (child != null) {
       
  1040             return token.add(child);
       
  1041         }
       
  1042 
       
  1043         restart(startIn, startOut);
       
  1044 
       
  1045         return null;
       
  1046     }
       
  1047 
       
  1048     private boolean scanEscapeSequence(final char leader, final int length, final Token token) {
       
  1049         final int startIn  = position;
       
  1050         final int startOut = sb.length();
       
  1051 
       
  1052         if (ch0 != leader) {
       
  1053             return false;
       
  1054         }
       
  1055 
       
  1056         commit(token, 1);
       
  1057         for (int i = 0; i < length; i++) {
       
  1058             final char ch0l = Character.toLowerCase(ch0);
       
  1059             if ((ch0l >= 'a' && ch0l <= 'f') || isDecimalDigit(ch0)) {
       
  1060                 commit(token, 1);
       
  1061             } else {
       
  1062                 restart(startIn, startOut);
       
  1063                 return false;
       
  1064             }
       
  1065         }
       
  1066 
       
  1067         return true;
       
  1068     }
       
  1069 
       
  1070     private Token hexEscapeSequence() {
       
  1071         final Token token = new Token(Token.Type.HEX_ESCAPESEQUENCE);
       
  1072         if (scanEscapeSequence('x', 2, token)) {
       
  1073             return token;
       
  1074         }
       
  1075         return null;
       
  1076     }
       
  1077 
       
  1078     private Token unicodeEscapeSequence() {
       
  1079         final Token token = new Token(Token.Type.UNICODE_ESCAPESEQUENCE);
       
  1080         if (scanEscapeSequence('u', 4, token)) {
       
  1081             return token;
       
  1082         }
       
  1083         return null;
       
  1084     }
       
  1085 
       
  1086     /*
       
  1087      * ControlEscape ::
       
  1088      *      one of fnrtv
       
  1089      */
       
  1090     private Token controlEscape() {
       
  1091         switch (ch0) {
       
  1092         case 'f':
       
  1093         case 'n':
       
  1094         case 'r':
       
  1095         case 't':
       
  1096         case 'v':
       
  1097             return commit(new Token(Token.Type.CONTROL_ESCAPE), 1);
       
  1098 
       
  1099         default:
       
  1100             return null;
       
  1101         }
       
  1102     }
       
  1103 
       
  1104     /*
       
  1105      * ControlLetter ::
       
  1106      *      one of abcdefghijklmnopqrstuvwxyz
       
  1107      *      ABCDEFGHIJKLMNOPQRSTUVWXYZ
       
  1108      */
       
  1109     private Token controlLetter() {
       
  1110         final char c = Character.toUpperCase(ch0);
       
  1111         if (c >= 'A' && c <= 'Z') {
       
  1112             final Token token = new Token(Token.Type.CONTROL_LETTER);
       
  1113             commit(token, 1);
       
  1114             return token;
       
  1115         }
       
  1116         return null;
       
  1117         /*
       
  1118         Token token = new Token(Token.Type.CONTROL_LETTER);
       
  1119         commit(null, 1);//add original char to builder not to token
       
  1120         this.neverMatches = c < 'A' || c > 'Z';
       
  1121         return token.add(""+c);*/
       
  1122     }
       
  1123 
       
  1124     /*
       
  1125      * IdentityEscape ::
       
  1126      *      SourceCharacter but not IdentifierPart
       
  1127      *      <ZWJ>  (200c)
       
  1128      *      <ZWNJ> (200d)
       
  1129      */
       
  1130     private Token identityEscape() {
       
  1131         final Token token = new Token(Token.Type.IDENTITY_ESCAPE);
       
  1132         commit(token, 1);
       
  1133         return token;
       
  1134     }
       
  1135 
       
  1136     /*
       
  1137      * DecimalEscape ::
       
  1138      *      DecimalIntegerLiteral [lookahead DecimalDigit]
       
  1139      */
       
  1140     private Token decimalEscape() {
       
  1141         final Token token = new Token(Token.Type.DECIMAL_ESCAPE);
       
  1142         final int startIn  = position;
       
  1143         final int startOut = sb.length();
       
  1144 
       
  1145         if (ch0 == '0' && !isDecimalDigit(ch1)) {
       
  1146             commit(token, 1);
       
  1147             token.removeLast();
       
  1148             //  DecimalEscape :: 0. If i is zero, return the EscapeValue consisting of a <NUL> character (Unicodevalue0000);
       
  1149             return token.add("\u0000");
       
  1150         }
       
  1151 
       
  1152         if (isDecimalDigit(ch0)) {
       
  1153             while (isDecimalDigit(ch0)) {
       
  1154                 commit(token, 1);
       
  1155             }
       
  1156             return token;
       
  1157         }
       
  1158 
       
  1159         restart(startIn, startOut);
       
  1160 
       
  1161         return null;
       
  1162     }
       
  1163 
       
  1164     /*
       
  1165      * CharacterClassEscape ::
       
  1166      *  one of dDsSwW
       
  1167      */
       
  1168     private Token characterClassEscape() {
       
  1169         switch (ch0) {
       
  1170         case 's':
       
  1171         case 'S':
       
  1172         case 'd':
       
  1173         case 'D':
       
  1174         case 'w':
       
  1175         case 'W':
       
  1176             return commit(new Token(Token.Type.CHARACTERCLASS_ESCAPE), 1);
       
  1177 
       
  1178         default:
       
  1179             return null;
       
  1180         }
       
  1181     }
       
  1182 
       
  1183     /*
       
  1184      * CharacterClass ::
       
  1185      *      [ [lookahead {^}] ClassRanges ]
       
  1186      *      [ ^ ClassRanges ]
       
  1187      */
       
  1188     private Token characterClass() {
       
  1189         final int startIn  = position;
       
  1190         final int startOut = sb.length();
       
  1191         final Token token  = new Token(Token.Type.CHARACTERCLASS);
       
  1192 
       
  1193         if (ch0 == '[') {
       
  1194             push(']');
       
  1195             commit(token, 1);
       
  1196 
       
  1197             if (ch0 == '^') {
       
  1198                 commit(token, 1);
       
  1199             }
       
  1200 
       
  1201             final Token child = classRanges();
       
  1202             if (child != null && ch0 == ']') {
       
  1203                 pop(']');
       
  1204                 token.add(child);
       
  1205                 return commit(token, 1);
       
  1206             }
       
  1207         }
       
  1208 
       
  1209         restart(startIn, startOut);
       
  1210         return null;
       
  1211     }
       
  1212 
       
  1213     /*
       
  1214      * ClassRanges ::
       
  1215      *      [empty]
       
  1216      *      NonemptyClassRanges
       
  1217      */
       
  1218     private Token classRanges() {
       
  1219         return new Token(Token.Type.CLASSRANGES).add(nonemptyClassRanges());
       
  1220     }
       
  1221 
       
  1222     /*
       
  1223      * NonemptyClassRanges ::
       
  1224      *      ClassAtom
       
  1225      *      ClassAtom NonemptyClassRangesNoDash
       
  1226      *      ClassAtom - ClassAtom ClassRanges
       
  1227      */
       
  1228     private Token nonemptyClassRanges() {
       
  1229         final int startIn  = position;
       
  1230         final int startOut = sb.length();
       
  1231         final Token token  = new Token(Token.Type.NON_EMPTY_CLASSRANGES);
       
  1232         Token child;
       
  1233 
       
  1234         child = classAtom();
       
  1235         if (child != null) {
       
  1236             token.add(child);
       
  1237 
       
  1238             if (ch0 == '-') {
       
  1239                 commit(token, 1);
       
  1240 
       
  1241                 final Token child1 = classAtom();
       
  1242                 final Token child2 = classRanges();
       
  1243                 if (child1 != null && child2 != null) {
       
  1244                     token.add(child1);
       
  1245                     token.add(child2);
       
  1246 
       
  1247                     return token;
       
  1248                 }
       
  1249             }
       
  1250 
       
  1251             child = nonemptyClassRangesNoDash();
       
  1252             if (child != null) {
       
  1253                 token.add(child);
       
  1254                 return token;
       
  1255             }
       
  1256 
       
  1257             return token;
       
  1258         }
       
  1259 
       
  1260         restart(startIn, startOut);
       
  1261         return null;
       
  1262     }
       
  1263 
       
  1264     /*
       
  1265      * NonemptyClassRangesNoDash ::
       
  1266      *      ClassAtom
       
  1267      *      ClassAtomNoDash NonemptyClassRangesNoDash
       
  1268      *      ClassAtomNoDash - ClassAtom ClassRanges
       
  1269      */
       
  1270     private Token nonemptyClassRangesNoDash() {
       
  1271         final int startIn  = position;
       
  1272         final int startOut = sb.length();
       
  1273         final Token token  = new Token(Token.Type.NON_EMPTY_CLASSRANGES_NODASH);
       
  1274         Token child;
       
  1275 
       
  1276         child = classAtomNoDash();
       
  1277         if (child != null) {
       
  1278             token.add(child);
       
  1279 
       
  1280             // need to check dash first, as for e.g. [a-b|c-d] will otherwise parse - as an atom
       
  1281             if (ch0 == '-') {
       
  1282                commit(token, 1);
       
  1283 
       
  1284                final Token child1 = classAtom();
       
  1285                final Token child2 = classRanges();
       
  1286                if (child1 != null && child2 != null) {
       
  1287                    token.add(child1);
       
  1288                    return token.add(child2);
       
  1289                }
       
  1290                //fallthru
       
  1291            }
       
  1292 
       
  1293             child = nonemptyClassRangesNoDash();
       
  1294             if (child != null) {
       
  1295                 token.add(child);
       
  1296             }
       
  1297             return token; // still a class atom
       
  1298         }
       
  1299 
       
  1300         child = classAtom();
       
  1301         if (child != null) {
       
  1302             return token.add(child);
       
  1303         }
       
  1304 
       
  1305         restart(startIn, startOut);
       
  1306         return null;
       
  1307     }
       
  1308 
       
  1309     /*
       
  1310      * ClassAtom : - ClassAtomNoDash
       
  1311      */
       
  1312     private Token classAtom() {
       
  1313         final Token token = new Token(Token.Type.CLASSATOM);
       
  1314 
       
  1315         if (ch0 == '-') {
       
  1316             return commit(token, 1);
       
  1317         }
       
  1318 
       
  1319         final Token child = classAtomNoDash();
       
  1320         if (child != null) {
       
  1321             return token.add(child);
       
  1322         }
       
  1323 
       
  1324         return null;
       
  1325     }
       
  1326 
       
  1327     /*
       
  1328      * ClassAtomNoDash ::
       
  1329      *      SourceCharacter but not one of \ or ] or -
       
  1330      *      \ ClassEscape
       
  1331      */
       
  1332     private Token classAtomNoDash() {
       
  1333         final int startIn  = position;
       
  1334         final int startOut = sb.length();
       
  1335         final Token token  = new Token(Token.Type.CLASSATOM_NODASH);
       
  1336 
       
  1337         switch (ch0) {
       
  1338         case ']':
       
  1339         case '-':
       
  1340         case '\0':
       
  1341             return null;
       
  1342 
       
  1343         case '[':
       
  1344             // unescaped left square bracket - add escape
       
  1345             return commit(token.add("\\"), 1);
       
  1346 
       
  1347         case '\\':
       
  1348             commit(token, 1);
       
  1349             final Token child = classEscape();
       
  1350             if (child != null) {
       
  1351                 return token.add(child);
       
  1352             }
       
  1353 
       
  1354             restart(startIn, startOut);
       
  1355             return null;
       
  1356 
       
  1357         default:
       
  1358             return commit(token, 1);
       
  1359         }
       
  1360     }
       
  1361 
       
  1362     /*
       
  1363      * ClassEscape ::
       
  1364      *      DecimalEscape
       
  1365      *      b
       
  1366      *      CharacterEscape
       
  1367      *      CharacterClassEscape
       
  1368      */
       
  1369     private Token classEscape() {
       
  1370         final Token token = new Token(Token.Type.CLASS_ESCAPE);
       
  1371         Token child;
       
  1372 
       
  1373         child = decimalEscape();
       
  1374         if (child != null) {
       
  1375             return token.add(child);
       
  1376         }
       
  1377 
       
  1378         if (ch0 == 'b') {
       
  1379             return commit(token, 1);
       
  1380         }
       
  1381 
       
  1382         child = characterEscape();
       
  1383         if (child != null) {
       
  1384             return token.add(child);
       
  1385         }
       
  1386 
       
  1387         child = characterClassEscape();
       
  1388         if (child != null) {
       
  1389             return token.add(child);
       
  1390         }
       
  1391 
       
  1392         return null;
       
  1393     }
       
  1394 
       
  1395     /*
       
  1396      * DecimalDigits
       
  1397      */
       
  1398     private Token decimalDigits() {
       
  1399         if (!isDecimalDigit(ch0)) {
       
  1400             return null;
       
  1401         }
       
  1402 
       
  1403         final Token token = new Token(Token.Type.DECIMALDIGITS);
       
  1404         while (isDecimalDigit(ch0)) {
       
  1405             commit(token, 1);
       
  1406         }
       
  1407 
       
  1408         return token;
       
  1409     }
       
  1410 
       
  1411     private static boolean isDecimalDigit(final char ch) {
       
  1412         return ch >= '0' && ch <= '9';
       
  1413     }
       
  1414 }