--- a/nashorn/src/jdk/nashorn/internal/objects/Global.java Fri Feb 22 10:39:00 2013 -0400
+++ b/nashorn/src/jdk/nashorn/internal/objects/Global.java Fri Feb 22 16:31:10 2013 +0100
@@ -50,7 +50,7 @@
import jdk.nashorn.internal.runtime.NativeJavaPackage;
import jdk.nashorn.internal.runtime.OptionsObject;
import jdk.nashorn.internal.runtime.PropertyDescriptor;
-import jdk.nashorn.internal.runtime.RegExpMatch;
+import jdk.nashorn.internal.runtime.regexp.RegExpResult;
import jdk.nashorn.internal.runtime.Scope;
import jdk.nashorn.internal.runtime.ScriptFunction;
import jdk.nashorn.internal.runtime.ScriptObject;
@@ -339,7 +339,7 @@
private ClassCache classCache;
// Used to store the last RegExp result to support deprecated RegExp constructor properties
- private RegExpMatch lastRegExpMatch;
+ private RegExpResult lastRegExpResult;
private static final MethodHandle EVAL = findOwnMH("eval", Object.class, Object.class, Object.class);
private static final MethodHandle PRINT = findOwnMH("print", Object.class, Object.class, Object[].class);
@@ -1709,12 +1709,12 @@
return MH.findStatic(MethodHandles.publicLookup(), Global.class, name, MH.type(rtype, types));
}
- RegExpMatch getLastRegExpMatch() {
- return lastRegExpMatch;
+ RegExpResult getLastRegExpResult() {
+ return lastRegExpResult;
}
- void setLastRegExpMatch(RegExpMatch regExpMatch) {
- this.lastRegExpMatch = regExpMatch;
+ void setLastRegExpResult(final RegExpResult regExpResult) {
+ this.lastRegExpResult = regExpResult;
}
}
--- a/nashorn/src/jdk/nashorn/internal/objects/NativeRegExp.java Fri Feb 22 10:39:00 2013 -0400
+++ b/nashorn/src/jdk/nashorn/internal/objects/NativeRegExp.java Fri Feb 22 16:31:10 2013 +0100
@@ -31,8 +31,7 @@
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
+
import jdk.nashorn.internal.objects.annotations.Attribute;
import jdk.nashorn.internal.objects.annotations.Constructor;
import jdk.nashorn.internal.objects.annotations.Function;
@@ -44,8 +43,10 @@
import jdk.nashorn.internal.runtime.BitVector;
import jdk.nashorn.internal.runtime.JSType;
import jdk.nashorn.internal.runtime.ParserException;
-import jdk.nashorn.internal.runtime.RegExp;
-import jdk.nashorn.internal.runtime.RegExpMatch;
+import jdk.nashorn.internal.runtime.regexp.RegExp;
+import jdk.nashorn.internal.runtime.regexp.RegExpFactory;
+import jdk.nashorn.internal.runtime.regexp.RegExpResult;
+import jdk.nashorn.internal.runtime.regexp.RegExpMatcher;
import jdk.nashorn.internal.runtime.ScriptFunction;
import jdk.nashorn.internal.runtime.ScriptObject;
import jdk.nashorn.internal.runtime.ScriptRuntime;
@@ -59,35 +60,15 @@
@Property(attributes = Attribute.NOT_ENUMERABLE | Attribute.NOT_CONFIGURABLE)
public Object lastIndex;
- /** Pattern string. */
- private String input;
-
- /** Global search flag for this regexp. */
- private boolean global;
-
- /** Case insensitive flag for this regexp */
- private boolean ignoreCase;
-
- /** Multi-line flag for this regexp */
- private boolean multiline;
-
- /** Java regex pattern to use for match. We compile to one of these */
- private Pattern pattern;
-
- private BitVector groupsInNegativeLookahead;
+ /** Compiled regexp */
+ private RegExp regexp;
// Reference to global object needed to support static RegExp properties
private Global globalObject;
- /*
- public NativeRegExp() {
- init();
- }*/
-
NativeRegExp(final String input, final String flagString) {
- RegExp regExp = null;
try {
- regExp = new RegExp(input, flagString);
+ this.regexp = RegExpFactory.create(input, flagString);
} catch (final ParserException e) {
// translate it as SyntaxError object and throw it
e.throwAsEcmaException();
@@ -95,13 +76,6 @@
}
this.setLastIndex(0);
- this.input = regExp.getInput();
- this.global = regExp.isGlobal();
- this.ignoreCase = regExp.isIgnoreCase();
- this.multiline = regExp.isMultiline();
- this.pattern = regExp.getPattern();
- this.groupsInNegativeLookahead = regExp.getGroupsInNegativeLookahead();
-
init();
}
@@ -110,24 +84,8 @@
}
NativeRegExp(final NativeRegExp regExp) {
- this.input = regExp.getInput();
- this.global = regExp.getGlobal();
- this.multiline = regExp.getMultiline();
- this.ignoreCase = regExp.getIgnoreCase();
this.lastIndex = regExp.getLastIndexObject();
- this.pattern = regExp.getPattern();
- this.groupsInNegativeLookahead = regExp.getGroupsInNegativeLookahead();
-
- init();
- }
-
- NativeRegExp(final Pattern pattern) {
- this.input = pattern.pattern();
- this.multiline = (pattern.flags() & Pattern.MULTILINE) != 0;
- this.ignoreCase = (pattern.flags() & Pattern.CASE_INSENSITIVE) != 0;
- this.lastIndex = 0;
- this.pattern = pattern;
-
+ this.regexp = regExp.getRegExp();
init();
}
@@ -232,16 +190,59 @@
return new NativeRegExp(patternString, flagString);
}
+ /**
+ * Build a regexp that matches {@code string} as-is. All meta-characters will be escaped.
+ *
+ * @param string pattern string
+ * @return flat regexp
+ */
+ static NativeRegExp flatRegExp(String string) {
+ // escape special characters
+ StringBuilder sb = null;
+ final int length = string.length();
+
+ for (int i = 0; i < length; i++) {
+ final char c = string.charAt(i);
+ switch (c) {
+ case '^':
+ case '$':
+ case '\\':
+ case '.':
+ case '*':
+ case '+':
+ case '?':
+ case '(':
+ case ')':
+ case '[':
+ case '{':
+ case '|':
+ if (sb == null) {
+ sb = new StringBuilder(length * 2);
+ sb.append(string, 0, i);
+ }
+ sb.append('\\');
+ sb.append(c);
+ break;
+ default:
+ if (sb != null) {
+ sb.append(c);
+ }
+ break;
+ }
+ }
+ return new NativeRegExp(sb == null ? string : sb.toString(), "");
+ }
+
private String getFlagString() {
- final StringBuilder sb = new StringBuilder();
+ final StringBuilder sb = new StringBuilder(3);
- if (global) {
+ if (regexp.isGlobal()) {
sb.append('g');
}
- if (ignoreCase) {
+ if (regexp.isIgnoreCase()) {
sb.append('i');
}
- if (multiline) {
+ if (regexp.isMultiline()) {
sb.append('m');
}
@@ -255,7 +256,7 @@
@Override
public String toString() {
- return "/" + input + "/" + getFlagString();
+ return "/" + regexp.getSource() + "/" + getFlagString();
}
/**
@@ -270,13 +271,8 @@
public static Object compile(final Object self, final Object pattern, final Object flags) {
final NativeRegExp regExp = checkRegExp(self);
final NativeRegExp compiled = newRegExp(pattern, flags);
- // copy over fields to 'self'
- regExp.setInput(compiled.getInput());
- regExp.setGlobal(compiled.getGlobal());
- regExp.setIgnoreCase(compiled.getIgnoreCase());
- regExp.setMultiline(compiled.getMultiline());
- regExp.setPattern(compiled.getPattern());
- regExp.setGroupsInNegativeLookahead(compiled.getGroupsInNegativeLookahead());
+ // copy over regexp to 'self'
+ regExp.setRegExp(compiled.getRegExp());
// Some implementations return undefined. Some return 'self'. Since return
// value is most likely be ignored, we can play safe and return 'self'.
@@ -326,7 +322,7 @@
*/
@Getter(attributes = Attribute.NON_ENUMERABLE_CONSTANT)
public static Object source(final Object self) {
- return checkRegExp(self).input;
+ return checkRegExp(self).getRegExp().getSource();
}
/**
@@ -337,7 +333,7 @@
*/
@Getter(attributes = Attribute.NON_ENUMERABLE_CONSTANT)
public static Object global(final Object self) {
- return checkRegExp(self).global;
+ return checkRegExp(self).getRegExp().isGlobal();
}
/**
@@ -348,7 +344,7 @@
*/
@Getter(attributes = Attribute.NON_ENUMERABLE_CONSTANT)
public static Object ignoreCase(final Object self) {
- return checkRegExp(self).ignoreCase;
+ return checkRegExp(self).getRegExp().isIgnoreCase();
}
/**
@@ -359,7 +355,7 @@
*/
@Getter(attributes = Attribute.NON_ENUMERABLE_CONSTANT)
public static Object multiline(final Object self) {
- return checkRegExp(self).multiline;
+ return checkRegExp(self).getRegExp().isMultiline();
}
/**
@@ -369,7 +365,7 @@
*/
@Getter(where = Where.CONSTRUCTOR, attributes = Attribute.CONSTANT, name = "input")
public static Object getLastInput(Object self) {
- final RegExpMatch match = Global.instance().getLastRegExpMatch();
+ final RegExpResult match = Global.instance().getLastRegExpResult();
return match == null ? "" : match.getInput();
}
@@ -390,7 +386,7 @@
*/
@Getter(where = Where.CONSTRUCTOR, attributes = Attribute.CONSTANT, name = "lastMatch")
public static Object getLastMatch(Object self) {
- final RegExpMatch match = Global.instance().getLastRegExpMatch();
+ final RegExpResult match = Global.instance().getLastRegExpResult();
return match == null ? "" : match.getGroup(0);
}
@@ -401,7 +397,7 @@
*/
@Getter(where = Where.CONSTRUCTOR, attributes = Attribute.CONSTANT, name = "lastParen")
public static Object getLastParen(Object self) {
- final RegExpMatch match = Global.instance().getLastRegExpMatch();
+ final RegExpResult match = Global.instance().getLastRegExpResult();
return match == null ? "" : match.getLastParen();
}
@@ -412,7 +408,7 @@
*/
@Getter(where = Where.CONSTRUCTOR, attributes = Attribute.CONSTANT, name = "leftContext")
public static Object getLeftContext(Object self) {
- final RegExpMatch match = Global.instance().getLastRegExpMatch();
+ final RegExpResult match = Global.instance().getLastRegExpResult();
return match == null ? "" : match.getInput().substring(0, match.getIndex());
}
@@ -423,7 +419,7 @@
*/
@Getter(where = Where.CONSTRUCTOR, attributes = Attribute.CONSTANT, name = "rightContext")
public static Object getRightContext(Object self) {
- final RegExpMatch match = Global.instance().getLastRegExpMatch();
+ final RegExpResult match = Global.instance().getLastRegExpResult();
return match == null ? "" : match.getInput().substring(match.getIndex() + match.length());
}
@@ -434,7 +430,7 @@
*/
@Getter(where = Where.CONSTRUCTOR, attributes = Attribute.CONSTANT, name = "$1")
public static Object getGroup1(Object self) {
- final RegExpMatch match = Global.instance().getLastRegExpMatch();
+ final RegExpResult match = Global.instance().getLastRegExpResult();
return match == null ? "" : match.getGroup(1);
}
@@ -445,7 +441,7 @@
*/
@Getter(where = Where.CONSTRUCTOR, attributes = Attribute.CONSTANT, name = "$2")
public static Object getGroup2(Object self) {
- final RegExpMatch match = Global.instance().getLastRegExpMatch();
+ final RegExpResult match = Global.instance().getLastRegExpResult();
return match == null ? "" : match.getGroup(2);
}
@@ -456,7 +452,7 @@
*/
@Getter(where = Where.CONSTRUCTOR, attributes = Attribute.CONSTANT, name = "$3")
public static Object getGroup3(Object self) {
- final RegExpMatch match = Global.instance().getLastRegExpMatch();
+ final RegExpResult match = Global.instance().getLastRegExpResult();
return match == null ? "" : match.getGroup(3);
}
@@ -467,7 +463,7 @@
*/
@Getter(where = Where.CONSTRUCTOR, attributes = Attribute.CONSTANT, name = "$4")
public static Object getGroup4(Object self) {
- final RegExpMatch match = Global.instance().getLastRegExpMatch();
+ final RegExpResult match = Global.instance().getLastRegExpResult();
return match == null ? "" : match.getGroup(4);
}
@@ -478,7 +474,7 @@
*/
@Getter(where = Where.CONSTRUCTOR, attributes = Attribute.CONSTANT, name = "$5")
public static Object getGroup5(Object self) {
- final RegExpMatch match = Global.instance().getLastRegExpMatch();
+ final RegExpResult match = Global.instance().getLastRegExpResult();
return match == null ? "" : match.getGroup(5);
}
@@ -489,7 +485,7 @@
*/
@Getter(where = Where.CONSTRUCTOR, attributes = Attribute.CONSTANT, name = "$6")
public static Object getGroup6(Object self) {
- final RegExpMatch match = Global.instance().getLastRegExpMatch();
+ final RegExpResult match = Global.instance().getLastRegExpResult();
return match == null ? "" : match.getGroup(6);
}
@@ -500,7 +496,7 @@
*/
@Getter(where = Where.CONSTRUCTOR, attributes = Attribute.CONSTANT, name = "$7")
public static Object getGroup7(Object self) {
- final RegExpMatch match = Global.instance().getLastRegExpMatch();
+ final RegExpResult match = Global.instance().getLastRegExpResult();
return match == null ? "" : match.getGroup(7);
}
@@ -511,7 +507,7 @@
*/
@Getter(where = Where.CONSTRUCTOR, attributes = Attribute.CONSTANT, name = "$8")
public static Object getGroup8(Object self) {
- final RegExpMatch match = Global.instance().getLastRegExpMatch();
+ final RegExpResult match = Global.instance().getLastRegExpResult();
return match == null ? "" : match.getGroup(8);
}
@@ -522,34 +518,30 @@
*/
@Getter(where = Where.CONSTRUCTOR, attributes = Attribute.CONSTANT, name = "$9")
public static Object getGroup9(Object self) {
- final RegExpMatch match = Global.instance().getLastRegExpMatch();
+ final RegExpResult match = Global.instance().getLastRegExpResult();
return match == null ? "" : match.getGroup(9);
}
- private RegExpMatch execInner(final String string) {
- if (this.pattern == null) {
- return null; // never matches or similar, e.g. a[]
- }
+ private RegExpResult execInner(final String string) {
- final Matcher matcher = pattern.matcher(string);
- final int start = this.global ? getLastIndex() : 0;
-
+ final int start = regexp.isGlobal() ? getLastIndex() : 0;
if (start < 0 || start > string.length()) {
setLastIndex(0);
return null;
}
- if (!matcher.find(start)) {
+ final RegExpMatcher matcher = regexp.match(string);
+ if (matcher == null || !matcher.search(start)) {
setLastIndex(0);
return null;
}
- if (global) {
+ if (regexp.isGlobal()) {
setLastIndex(matcher.end());
}
- final RegExpMatch match = new RegExpMatch(string, matcher.start(), groups(matcher));
- globalObject.setLastRegExpMatch(match);
+ final RegExpResult match = new RegExpResult(string, matcher.start(), groups(matcher));
+ globalObject.setLastRegExpResult(match);
return match;
}
@@ -557,9 +549,11 @@
* Convert java.util.regex.Matcher groups to JavaScript groups.
* That is, replace null and groups that didn't match with undefined.
*/
- private Object[] groups(final Matcher matcher) {
+ private Object[] groups(final RegExpMatcher matcher) {
final int groupCount = matcher.groupCount();
final Object[] groups = new Object[groupCount + 1];
+ final BitVector groupsInNegativeLookahead = regexp.getGroupsInNegativeLookahead();
+
for (int i = 0, lastGroupStart = matcher.start(); i <= groupCount; i++) {
final int groupStart = matcher.start(i);
if (lastGroupStart > groupStart
@@ -586,7 +580,7 @@
* @return NativeArray of matches, string or null.
*/
public Object exec(final String string) {
- final RegExpMatch match = execInner(string);
+ final RegExpResult match = execInner(string);
if (match == null) {
return null;
@@ -617,7 +611,12 @@
* @return String with substitutions.
*/
Object replace(final String string, final String replacement, final ScriptFunction function) {
- final Matcher matcher = pattern.matcher(string);
+ final RegExpMatcher matcher = regexp.match(string);
+
+ if (matcher == null) {
+ return string;
+ }
+
/*
* $$ -> $
* $& -> the matched substring
@@ -628,8 +627,8 @@
*/
String replace = replacement;
- if (!global) {
- if (!matcher.find()) {
+ if (!regexp.isGlobal()) {
+ if (!matcher.search(0)) {
return string;
}
@@ -642,45 +641,39 @@
return sb.toString();
}
- int end = 0; // a.k.a. lastAppendPosition
setLastIndex(0);
- boolean found;
- try {
- found = matcher.find(end);
- } catch (final IndexOutOfBoundsException e) {
- found = false;
- }
-
- if (!found) {
+ if (!matcher.search(0)) {
return string;
}
+ int thisIndex = 0;
int previousLastIndex = 0;
final StringBuilder sb = new StringBuilder();
+
do {
if (function != null) {
replace = callReplaceValue(function, matcher, string);
}
- appendReplacement(matcher, string, replace, sb, end);
- end = matcher.end();
+
+ appendReplacement(matcher, string, replace, sb, thisIndex);
// ECMA 15.5.4.10 String.prototype.match(regexp)
- final int thisIndex = end;
+ thisIndex = matcher.end();
if (thisIndex == previousLastIndex) {
setLastIndex(thisIndex + 1);
previousLastIndex = thisIndex + 1;
} else {
previousLastIndex = thisIndex;
}
- } while (matcher.find());
+ } while (previousLastIndex <= string.length() && matcher.search(previousLastIndex));
- sb.append(string, end, string.length());
+ sb.append(string, thisIndex, string.length());
return sb.toString();
}
- private void appendReplacement(final Matcher matcher, final String text, final String replacement, final StringBuilder sb, final int lastAppendPosition) {
+ private void appendReplacement(final RegExpMatcher matcher, final String text, final String replacement, final StringBuilder sb, final int lastAppendPosition) {
// Process substitution string to replace group references with groups
int cursor = 0;
final StringBuilder result = new StringBuilder();
@@ -748,7 +741,7 @@
sb.append(result);
}
- private String callReplaceValue(final ScriptFunction function, final Matcher matcher, final String string) {
+ private String callReplaceValue(final ScriptFunction function, final RegExpMatcher matcher, final String string) {
final Object[] groups = groups(matcher);
final Object[] args = Arrays.copyOf(groups, groups.length + 2);
@@ -782,7 +775,7 @@
return new NativeArray();
}
- RegExpMatch match;
+ RegExpResult match;
final int inputLength = input.length();
int lastLength = -1;
int lastLastIndex = 0;
@@ -834,7 +827,7 @@
* @return Index of match.
*/
Object search(final String string) {
- final RegExpMatch match = execInner(string);
+ final RegExpResult match = execInner(string);
if (match == null) {
return -1;
@@ -884,52 +877,20 @@
}
}
- private String getInput() {
- return input;
- }
-
- private void setInput(final String input) {
- this.input = input;
+ private void setGlobal(final boolean global) {
+ regexp.setGlobal(global);
}
boolean getGlobal() {
- return global;
- }
-
- private void setGlobal(final boolean global) {
- this.global = global;
- }
-
- private boolean getIgnoreCase() {
- return ignoreCase;
- }
-
- private void setIgnoreCase(final boolean ignoreCase) {
- this.ignoreCase = ignoreCase;
- }
-
- private boolean getMultiline() {
- return multiline;
+ return regexp.isGlobal();
}
- private void setMultiline(final boolean multiline) {
- this.multiline = multiline;
- }
-
- private Pattern getPattern() {
- return pattern;
+ private RegExp getRegExp() {
+ return regexp;
}
- private void setPattern(final Pattern pattern) {
- this.pattern = pattern;
- }
-
- private BitVector getGroupsInNegativeLookahead() {
- return groupsInNegativeLookahead;
- }
-
- private void setGroupsInNegativeLookahead(final BitVector groupsInNegativeLookahead) {
- this.groupsInNegativeLookahead = groupsInNegativeLookahead;
+ private void setRegExp(final RegExp regexp) {
+ this.regexp = regexp;
}
}
--- a/nashorn/src/jdk/nashorn/internal/objects/NativeRegExpExecResult.java Fri Feb 22 10:39:00 2013 -0400
+++ b/nashorn/src/jdk/nashorn/internal/objects/NativeRegExpExecResult.java Fri Feb 22 16:31:10 2013 +0100
@@ -31,7 +31,7 @@
import jdk.nashorn.internal.objects.annotations.ScriptClass;
import jdk.nashorn.internal.objects.annotations.Setter;
import jdk.nashorn.internal.runtime.JSType;
-import jdk.nashorn.internal.runtime.RegExpMatch;
+import jdk.nashorn.internal.runtime.regexp.RegExpResult;
import jdk.nashorn.internal.runtime.ScriptObject;
import jdk.nashorn.internal.runtime.arrays.ArrayData;
@@ -49,11 +49,11 @@
@Property
public Object input;
- NativeRegExpExecResult(final RegExpMatch match) {
+ NativeRegExpExecResult(final RegExpResult result) {
setProto(Global.instance().getArrayPrototype());
- this.setArray(ArrayData.allocate(match.getGroups().clone()));
- this.index = match.getIndex();
- this.input = match.getInput();
+ this.setArray(ArrayData.allocate(result.getGroups().clone()));
+ this.index = result.getIndex();
+ this.input = result.getInput();
}
/**
--- a/nashorn/src/jdk/nashorn/internal/objects/NativeString.java Fri Feb 22 10:39:00 2013 -0400
+++ b/nashorn/src/jdk/nashorn/internal/objects/NativeString.java Fri Feb 22 16:31:10 2013 +0100
@@ -38,7 +38,6 @@
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
-import java.util.regex.Pattern;
import jdk.internal.dynalink.CallSiteDescriptor;
import jdk.internal.dynalink.linker.GuardedInvocation;
import jdk.internal.dynalink.linker.LinkRequest;
@@ -712,7 +711,7 @@
if (string instanceof NativeRegExp) {
nativeRegExp = (NativeRegExp) string;
} else {
- nativeRegExp = new NativeRegExp(Pattern.compile(JSType.toString(string), Pattern.LITERAL));
+ nativeRegExp = NativeRegExp.flatRegExp(JSType.toString(string));
}
if (replacement instanceof ScriptFunction) {
--- a/nashorn/src/jdk/nashorn/internal/parser/AbstractParser.java Fri Feb 22 10:39:00 2013 -0400
+++ b/nashorn/src/jdk/nashorn/internal/parser/AbstractParser.java Fri Feb 22 16:31:10 2013 +0100
@@ -37,7 +37,7 @@
import jdk.nashorn.internal.runtime.ErrorManager;
import jdk.nashorn.internal.runtime.JSErrorType;
import jdk.nashorn.internal.runtime.ParserException;
-import jdk.nashorn.internal.runtime.RegExp;
+import jdk.nashorn.internal.runtime.regexp.RegExpFactory;
import jdk.nashorn.internal.runtime.Source;
/**
@@ -427,7 +427,7 @@
if (value instanceof RegexToken) {
final RegexToken regex = (RegexToken)value;
try {
- RegExp.validate(regex.getExpression(), regex.getOptions());
+ RegExpFactory.validate(regex.getExpression(), regex.getOptions());
} catch (final ParserException e) {
error(e.getMessage());
}
--- a/nashorn/src/jdk/nashorn/internal/runtime/RegExp.java Fri Feb 22 10:39:00 2013 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,177 +0,0 @@
-/*
- * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved.
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This code is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 only, as
- * published by the Free Software Foundation. Oracle designates this
- * particular file as subject to the "Classpath" exception as provided
- * by Oracle in the LICENSE file that accompanied this code.
- *
- * This code is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
- * version 2 for more details (a copy is included in the LICENSE file that
- * accompanied this code).
- *
- * You should have received a copy of the GNU General Public License version
- * 2 along with this work; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
- * or visit www.oracle.com if you need additional information or have any
- * questions.
- */
-
-package jdk.nashorn.internal.runtime;
-
-import static java.util.regex.Pattern.CASE_INSENSITIVE;
-import static java.util.regex.Pattern.MULTILINE;
-import static java.util.regex.Pattern.UNICODE_CASE;
-
-import java.util.HashSet;
-import java.util.regex.Pattern;
-import java.util.regex.PatternSyntaxException;
-
-/**
- * This class is used to represent a parsed regular expression. Accepts input
- * pattern string and flagString. This is used by AbstractParser to validate
- * RegExp literals as well as by NativeRegExp to parse RegExp constructor arguments.
- */
-public final class RegExp {
- /** Pattern string. */
- private final String input;
-
- /** Global search flag for this regexp.*/
- private boolean global;
-
- /** Case insensitive flag for this regexp */
- private boolean ignoreCase;
-
- /** Multi-line flag for this regexp */
- private boolean multiline;
-
- /** Java regexp pattern to use for match. We compile to one of these */
- private Pattern pattern;
-
- /** BitVector that keeps track of groups in negative lookahead */
- private BitVector groupsInNegativeLookahead;
-
- /**
- * Creates RegExpLiteral object from given input and flagString.
- *
- * @param input RegExp pattern string
- * @param flagString RegExp flags
- * @throws ParserException if flagString is invalid or input string has syntax error.
- */
- public RegExp(final String input, final String flagString) throws ParserException {
- this.input = input;
- final HashSet<Character> usedFlags = new HashSet<>();
- int flags = 0;
-
- for (final char ch : flagString.toCharArray()) {
- if (usedFlags.contains(ch)) {
- throwParserException("repeated.flag", Character.toString(ch));
- }
-
- switch (ch) {
- case 'g':
- this.global = true;
- usedFlags.add(ch);
- break;
- case 'i':
- this.ignoreCase = true;
- flags |= CASE_INSENSITIVE | UNICODE_CASE;
- usedFlags.add(ch);
- break;
- case 'm':
- this.multiline = true;
- flags |= MULTILINE;
- usedFlags.add(ch);
- break;
- default:
- throwParserException("unsupported.flag", Character.toString(ch));
- }
- }
-
- try {
- RegExpScanner parsed;
-
- try {
- parsed = RegExpScanner.scan(input);
- } catch (final PatternSyntaxException e) {
- // refine the exception with a better syntax error, if this
- // passes, just rethrow what we have
- Pattern.compile(input, flags);
- throw e;
- }
-
- if (parsed != null) {
- this.pattern = Pattern.compile(parsed.getJavaPattern(), flags);
- this.groupsInNegativeLookahead = parsed.getGroupsInNegativeLookahead();
- }
- } catch (final PatternSyntaxException e2) {
- throwParserException("syntax", e2.getMessage());
- }
-
- }
-
- /**
- * @return the input
- */
- public String getInput() {
- return input;
- }
-
- /**
- * @return the global
- */
- public boolean isGlobal() {
- return global;
- }
-
- /**
- * @return the ignoreCase
- */
- public boolean isIgnoreCase() {
- return ignoreCase;
- }
-
- /**
- * @return the multiline
- */
- public boolean isMultiline() {
- return multiline;
- }
-
- /**
- * @return the pattern
- */
- public Pattern getPattern() {
- return pattern;
- }
-
- /**
- * @return the groupsInNegativeLookahead
- */
- public BitVector getGroupsInNegativeLookahead() {
- return groupsInNegativeLookahead;
- }
-
- /**
- * Validation method for RegExp input and flagString - we don't care about the RegExp object
- *
- * @param input regexp input
- * @param flagString flag string
- *
- * @throws ParserException if invalid regexp and flags
- */
- @SuppressWarnings({"unused", "ResultOfObjectAllocationIgnored"})
- public static void validate(final String input, final String flagString) throws ParserException {
- new RegExp(input, flagString);
- }
-
- private static void throwParserException(final String key, final String str) throws ParserException {
- throw new ParserException(ECMAErrors.getMessage("parser.error.regex." + key, str));
- }
-}
--- a/nashorn/src/jdk/nashorn/internal/runtime/RegExpMatch.java Fri Feb 22 10:39:00 2013 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,98 +0,0 @@
-/*
- * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved.
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This code is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 only, as
- * published by the Free Software Foundation. Oracle designates this
- * particular file as subject to the "Classpath" exception as provided
- * by Oracle in the LICENSE file that accompanied this code.
- *
- * This code is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
- * version 2 for more details (a copy is included in the LICENSE file that
- * accompanied this code).
- *
- * You should have received a copy of the GNU General Public License version
- * 2 along with this work; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
- * or visit www.oracle.com if you need additional information or have any
- * questions.
- */
-
-package jdk.nashorn.internal.runtime;
-
-/**
- * Match tuple to keep track of ongoing regexp match.
- */
-public final class RegExpMatch {
- final Object[] groups;
- final int index;
- final String input;
-
- /**
- * Constructor
- *
- * @param input regexp input
- * @param index index of match
- * @param groups groups vector
- */
- public RegExpMatch(final String input, final int index, final Object[] groups) {
- this.input = input;
- this.index = index;
- this.groups = groups;
- }
-
- /**
- * Get the groups for the match
- * @return group vector
- */
- public Object[] getGroups() {
- return groups;
- }
-
- /**
- * Get the input for the map
- * @return input
- */
- public String getInput() {
- return input;
- }
-
- /**
- * Get the index for the match
- * @return index
- */
- public int getIndex() {
- return index;
- }
-
- /**
- * Get the length of the match
- * @return length
- */
- public int length() {
- return ((String)groups[0]).length();
- }
-
- /**
- * Get the group with the given index or the empty string if group index is not valid.
- * @param index the group index
- * @return the group or ""
- */
- public Object getGroup(int index) {
- return index >= 0 && index < groups.length ? groups[index] : "";
- }
-
- /**
- * Get the last parenthesis group, or the empty string if none exists.
- * @return the last group or ""
- */
- public Object getLastParen() {
- return groups.length > 1 ? groups[groups.length - 1] : "";
- }
-
-}
--- a/nashorn/src/jdk/nashorn/internal/runtime/RegExpScanner.java Fri Feb 22 10:39:00 2013 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,1411 +0,0 @@
-/*
- * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved.
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This code is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 only, as
- * published by the Free Software Foundation. Oracle designates this
- * particular file as subject to the "Classpath" exception as provided
- * by Oracle in the LICENSE file that accompanied this code.
- *
- * This code is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
- * version 2 for more details (a copy is included in the LICENSE file that
- * accompanied this code).
- *
- * You should have received a copy of the GNU General Public License version
- * 2 along with this work; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
- * or visit www.oracle.com if you need additional information or have any
- * questions.
- */
-
-package jdk.nashorn.internal.runtime;
-
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.LinkedHashMap;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Map;
-import java.util.regex.PatternSyntaxException;
-import jdk.nashorn.internal.parser.Lexer;
-import jdk.nashorn.internal.parser.Scanner;
-
-/**
- * Scan a JavaScript regexp, converting to Java regex if necessary.
- *
- */
-final class RegExpScanner extends Scanner {
-
- /**
- * String builder to accumulate the result - this contains verbatim parsed JavaScript.
- * to get the java equivalent we need to create a Pattern token and return its toString()
- */
- private final StringBuilder sb;
-
- /** An optional error message if one occurred during parse. */
- private String errorMessage;
-
- /** Is this the special case of a regexp that never matches anything */
- private boolean neverMatches;
-
- /** The resulting java.util.regex pattern string. */
- private String javaPattern;
-
- /** Expected token table */
- private final Map<Character, Integer> expected = new HashMap<>();
-
- /** Capturing parenthesis that have been found so far. */
- private final List<Capture> caps = new LinkedList<>();
-
- /** Forward references to capturing parenthesis to be resolved later.*/
- private final Map<Integer, Token> forwardReferences = new LinkedHashMap<>();
-
- /** Current level of zero-width negative lookahead assertions. */
- private int negativeLookaheadLevel;
-
- private static final String NON_IDENT_ESCAPES = "$^*+(){}[]|\\.?";
-
- private static class Capture {
- /**
- * Zero-width negative lookaheads enclosing the capture.
- */
- private final int negativeLookaheadLevel;
- /**
- * Captures that live inside a negative lookahead are dead after the
- * lookahead and will be undefined if referenced from outside.
- */
- private boolean isDead;
-
- Capture(final int negativeLookaheadLevel) {
- this.negativeLookaheadLevel = negativeLookaheadLevel;
- }
-
- public int getNegativeLookaheadLevel() {
- return negativeLookaheadLevel;
- }
-
- public boolean isDead() {
- return isDead;
- }
-
- public void setDead() {
- this.isDead = true;
- }
- }
-
- /**
- * This is a token - the JavaScript regexp is scanned into a token tree
- * A token has other tokens as children as well as "atoms", i.e. Strings.
- *
- */
- private static class Token {
-
- private enum Type {
- PATTERN,
- DISJUNCTION,
- ALTERNATIVE,
- TERM,
- ASSERTION,
- QUANTIFIER,
- QUANTIFIER_PREFIX,
- ATOM,
- PATTERN_CHARACTER,
- ATOM_ESCAPE,
- CHARACTER_ESCAPE,
- CONTROL_ESCAPE,
- CONTROL_LETTER,
- IDENTITY_ESCAPE,
- DECIMAL_ESCAPE,
- CHARACTERCLASS_ESCAPE,
- CHARACTERCLASS,
- CLASSRANGES,
- NON_EMPTY_CLASSRANGES,
- NON_EMPTY_CLASSRANGES_NODASH,
- CLASSATOM,
- CLASSATOM_NODASH,
- CLASS_ESCAPE,
- DECIMALDIGITS,
- HEX_ESCAPESEQUENCE,
- UNICODE_ESCAPESEQUENCE,
- }
-
- /**
- * Token tyoe
- */
- private final Token.Type type;
-
- /**
- * Child nodes
- */
- private final List<Object> children;
-
- /**
- * Parent node
- */
- private Token parent;
-
- /**
- * Dead code flag
- */
- private boolean isDead;
-
- private static final Map<Type, ToString> toStringMap = new HashMap<>();
- private static final ToString DEFAULT_TOSTRING = new ToString();
-
- private static String unicode(final int value) {
- final StringBuilder sb = new StringBuilder();
- final String hex = Integer.toHexString(value);
- sb.append('u');
- for (int i = 0; i < 4 - hex.length(); i++) {
- sb.append('0');
- }
- sb.append(hex);
-
- return sb.toString();
- }
-
- static {
- toStringMap.put(Type.CHARACTERCLASS, new ToString() {
- @Override
- public String toString(final Token token) {
- return super.toString(token).replace("\\b", "\b");
- }
- });
-
- // for some reason java regexps don't like control characters on the
- // form "\\ca".match([string with ascii 1 at char0]). Translating
- // them to unicode does it though.
- toStringMap.put(Type.CHARACTER_ESCAPE, new ToString() {
- @Override
- public String toString(final Token token) {
- final String str = super.toString(token);
- if (str.length() == 2) {
- return Token.unicode(Character.toLowerCase(str.charAt(1)) - 'a' + 1);
- }
- return str;
- }
- });
-
- toStringMap.put(Type.DECIMAL_ESCAPE, new ToString() {
- @Override
- public String toString(final Token token) {
- final String str = super.toString(token);
-
- if ("\0".equals(str)) {
- return str;
- }
-
- int value;
-
- if (!token.hasParentOfType(Type.CLASSRANGES)) {
- return str;
- }
-
- value = Integer.parseInt(str, 8); //throws exception that leads to SyntaxError if not octal
- if (value > 0xff) {
- throw new NumberFormatException(str);
- }
-
- return Token.unicode(value);
- }
- });
-
- }
-
- /**
- * JavaScript Token to Java regex substring framework.
- *
- */
- private static class ToString {
- String toString(final Token token) {
- final StringBuilder sb = new StringBuilder();
- for (final Object child : token.getChildren()) {
- sb.append(child);
- }
-
- //perform global substitutions that hold true for any evaluated form
- String str = sb.toString();
- switch (str) {
- case "\\s":
- str = "[" + Lexer.getWhitespaceRegExp() + "]";
- break;
- case "\\S":
- str = "[^" + Lexer.getWhitespaceRegExp() + "]";
- break;
- case "[^]":
- str = "[\\s\\S]";
- break;
- default:
- break;
- }
- return str;
- }
- }
-
- /**
- * Token iterator. Doesn't return "atom" children. i.e. string representations,
- * just tokens
- *
- */
- private static class TokenIterator implements Iterator<Token> {
- private final List<Token> preorder;
-
- private void init(final Token root) {
- preorder.add(root);
- for (final Object child : root.getChildren()) {
- if (child instanceof Token) {
- init((Token)child);
- }
- }
- }
-
- TokenIterator(final Token root) {
- preorder = new ArrayList<>();
- init(root);
- }
-
- @Override
- public boolean hasNext() {
- return !preorder.isEmpty();
- }
-
- @Override
- public Token next() {
- return preorder.remove(0);
- }
-
- @Override
- public void remove() {
- next();
- }
- }
-
- /**
- * Constructor
- * @param type the token type
- */
- Token(final Token.Type type) {
- this.type = type;
- children = new ArrayList<>();
- }
-
- /**
- * Add a an "atom" child to a token
- * @param child the child to add
- * @return the token (for chaining)
- */
- public Token add(final String child) {
- children.add(child);
- return this;
- }
-
- /**
- * Add a child to a token
- * @param child the child
- * @return the token (for chaining)
- */
- public Token add(final Token child) {
- if (child != null) {
- children.add(child);
- child.setParent(this);
- }
- return this;
- }
-
- /**
- * Remove a child from a token
- * @param child the child to remove
- * @return true if successful
- */
- public boolean remove(final Token child) {
- return children.remove(child);
- }
-
- /**
- * Remove the last child from a token
- * @return the removed child
- */
- public Object removeLast() {
- return children.remove(children.size() - 1);
- }
-
- /**
- * Flag this token as dead code
- * @param isDead is it dead or not
- */
- private void setIsDead(final boolean isDead) {
- this.isDead = isDead;
- }
-
- /**
- * Is this token dead code
- * @return boolean
- */
- private boolean getIsDead() {
- return isDead;
- }
-
- /**
- * Get the parent of this token
- * @return parent token
- */
- public Token getParent() {
- return parent;
- }
-
- public boolean hasParentOfType(final Token.Type parentType) {
- for (Token p = getParent(); p != null; p = p.getParent()) {
- if (p.getType() == parentType) {
- return true;
- }
- }
- return false;
- }
-
- public boolean hasChildOfType(final Token.Type childType) {
- for (final Iterator<Token> iter = iterator() ; iter.hasNext() ; ) {
- if (iter.next().getType() == childType) {
- return true;
- }
- }
- return false;
- }
-
- /**
- * Set the parent of this token
- * @param parent
- */
- private void setParent(final Token parent) {
- this.parent = parent;
- }
-
- /**
- * Get the children of this token
- * @return an array of children, never null
- */
- public Object[] getChildren() {
- return children.toArray();
- }
-
- /**
- * Reset this token, remove all children
- */
- public void reset() {
- children.clear();
- }
-
- /**
- * Get a preorder token iterator with this token as root
- * @return an iterator
- */
- public Iterator<Token> iterator() {
- return new TokenIterator(this);
- }
-
- /**
- * Get the type of this token
- * @return type
- */
- public Type getType() {
- return type;
- }
-
- /**
- * Turn this token into Java regexp compatible text
- * @return part of a java regexp
- */
- @Override
- public String toString() {
- ToString t = toStringMap.get(getType());
- if (t == null) {
- t = DEFAULT_TOSTRING;
- }
- return t.toString(this);
- }
- }
-
- /**
- * Constructor
- * @param string the JavaScript regexp to parse
- */
- private RegExpScanner(final String string) {
- super(string);
- sb = new StringBuilder(limit);
- reset(0);
- expected.put(']', 0);
- expected.put('}', 0);
- }
-
- private void processForwardReferences() {
- if (neverMatches()) {
- return;
- }
-
- for (final Map.Entry<Integer, Token> fwdRef : forwardReferences.entrySet()) {
- if (fwdRef.getKey().intValue() > caps.size()) {
- neverMatches = true;
- break;
- }
-
- fwdRef.getValue().setIsDead(true);
- }
-
- forwardReferences.clear();
- }
-
- /**
- * Scan a JavaScript regexp string returning a Java safe regex string.
- *
- * @param string
- * JavaScript regexp string.
- * @return Java safe regex string.
- */
- public static RegExpScanner scan(final String string) {
- final RegExpScanner scanner = new RegExpScanner(string);
-
- Token pattern;
-
- try {
- pattern = scanner.pattern();
- } catch (final Exception e) {
- throw new PatternSyntaxException(e.getMessage(), string, scanner.sb.length());
- }
-
- scanner.processForwardReferences();
- if (scanner.neverMatches()) {
- return null; // never matches
- }
-
- // go over the code and remove dead code
- final Iterator<Token> iter = pattern.iterator();
- while (iter.hasNext()) {
- final Token next = iter.next();
- if (next.getIsDead()) {
- next.getParent().remove(next);
- }
- }
-
- // turn the pattern into a string, p, the java equivalent string for our js regexp
- final String p = pattern.toString();
- // if builder contains all tokens that were sent in, we know
- // we correctly parsed the entire JavaScript regexp without syntax errors
- if (!string.equals(scanner.getStringBuilder().toString())) {
- throw new PatternSyntaxException(string, p, p.length() + 1);
- }
-
- scanner.javaPattern = p;
- return scanner;
- }
-
- /**
- * Does this regexp ever match anything? Use of e.g. [], which is legal in JavaScript,
- * is an example where we never match
- *
- * @return boolean
- */
- private boolean neverMatches() {
- return neverMatches;
- }
-
- /**
- * This is used to set better error messages that can be reused
- * in NativeRegExp for augmenting e.g. SyntaxErrors.
- *
- * @return an error message or null if no extra info
- */
- public String getErrorMessage() {
- return errorMessage;
- }
-
- final StringBuilder getStringBuilder() {
- return sb;
- }
-
- String getJavaPattern() {
- return javaPattern;
- }
-
- BitVector getGroupsInNegativeLookahead() {
- BitVector vec = null;
- for (int i = 0; i < caps.size(); i++) {
- final Capture cap = caps.get(i);
- if (cap.getNegativeLookaheadLevel() > 0) {
- if (vec == null) {
- vec = new BitVector(caps.size() + 1);
- }
- vec.set(i + 1);
- }
- }
- return vec;
- }
-
- /**
- * Commit n characters to the builder and to a given token
- * @param token Uncommitted token.
- * @param n Number of characters.
- * @return Committed token
- */
- private Token commit(final Token token, final int n) {
- final int startIn = position;
-
- switch (n) {
- case 1:
- sb.append(ch0);
- skip(1);
- break;
- case 2:
- sb.append(ch0);
- sb.append(ch1);
- skip(2);
- break;
- case 3:
- sb.append(ch0);
- sb.append(ch1);
- sb.append(ch2);
- skip(3);
- break;
- default:
- assert false : "Should not reach here";
- }
-
- if (token == null) {
- return null;
- }
-
- return token.add(sb.substring(startIn, sb.length()));
- }
-
- /**
- * Restart the buffers back at an earlier position.
- *
- * @param startIn
- * Position in the input stream.
- * @param startOut
- * Position in the output stream.
- */
- private void restart(final int startIn, final int startOut) {
- reset(startIn);
- sb.setLength(startOut);
- }
-
- private void push(final char ch) {
- expected.put(ch, expected.get(ch) + 1);
- }
-
- private void pop(final char ch) {
- expected.put(ch, Math.min(0, expected.get(ch) - 1));
- }
-
- /*
- * Recursive descent tokenizer starts below.
- */
-
- /*
- * Pattern ::
- * Disjunction
- */
- private Token pattern() {
- final Token token = new Token(Token.Type.PATTERN);
-
- final Token child = disjunction();
- return token.add(child);
- }
-
- /*
- * Disjunction ::
- * Alternative
- * Alternative | Disjunction
- */
- private Token disjunction() {
- final Token token = new Token(Token.Type.DISJUNCTION);
-
- while (true) {
- token.add(alternative());
-
- if (ch0 == '|') {
- commit(token, 1);
- } else {
- break;
- }
- }
-
- return token;
- }
-
- /*
- * Alternative ::
- * [empty]
- * Alternative Term
- */
- private Token alternative() {
- final Token token = new Token(Token.Type.ALTERNATIVE);
-
- Token child;
- while ((child = term()) != null) {
- token.add(child);
- }
-
- return token;
- }
-
- /*
- * Term ::
- * Assertion
- * Atom
- * Atom Quantifier
- */
- private Token term() {
- final int startIn = position;
- final int startOut = sb.length();
- final Token token = new Token(Token.Type.TERM);
- Token child;
-
- child = assertion();
- if (child != null) {
- return token.add(child);
- }
-
- child = atom();
- if (child != null) {
- boolean emptyCharacterClass = false;
- if ("[]".equals(child.toString())) {
- emptyCharacterClass = true;
- }
-
- token.add(child);
-
- final Token quantifier = quantifier();
- if (quantifier != null) {
- token.add(quantifier);
- }
-
- if (emptyCharacterClass) {
- if (quantifier == null) {
- neverMatches = true; //never matches ever.
- } else {
- //if we can get away with max zero, remove this entire token
- final String qs = quantifier.toString();
- if ("+".equals(qs) || "*".equals(qs) || qs.startsWith("{0,")) {
- token.setIsDead(true);
- }
- }
- }
-
- return token;
- }
-
- restart(startIn, startOut);
- return null;
- }
-
- /*
- * Assertion ::
- * ^
- * $
- * \b
- * \B
- * ( ? = Disjunction )
- * ( ? ! Disjunction )
- */
- private Token assertion() {
- final int startIn = position;
- final int startOut = sb.length();
- final Token token = new Token(Token.Type.ASSERTION);
-
- switch (ch0) {
- case '^':
- case '$':
- return commit(token, 1);
-
- case '\\':
- if (ch1 == 'b' || ch1 == 'B') {
- return commit(token, 2);
- }
- break;
-
- case '(':
- if (ch1 != '?') {
- break;
- }
- if (ch2 != '=' && ch2 != '!') {
- break;
- }
- final boolean isNegativeLookahead = (ch2 == '!');
- commit(token, 3);
-
- if (isNegativeLookahead) {
- negativeLookaheadLevel++;
- }
- final Token disjunction = disjunction();
- if (isNegativeLookahead) {
- for (final Capture cap : caps) {
- if (cap.getNegativeLookaheadLevel() >= negativeLookaheadLevel) {
- cap.setDead();
- }
- }
- negativeLookaheadLevel--;
- }
-
- if (disjunction != null && ch0 == ')') {
- token.add(disjunction);
- return commit(token, 1);
- }
- break;
-
- default:
- break;
- }
-
- restart(startIn, startOut);
-
- return null;
- }
-
- /*
- * Quantifier ::
- * QuantifierPrefix
- * QuantifierPrefix ?
- */
- private Token quantifier() {
- final Token token = new Token(Token.Type.QUANTIFIER);
- final Token child = quantifierPrefix();
- if (child != null) {
- token.add(child);
- if (ch0 == '?') {
- commit(token, 1);
- }
- return token;
- }
- return null;
- }
-
- /*
- * QuantifierPrefix ::
- * *
- * +
- * ?
- * { DecimalDigits }
- * { DecimalDigits , }
- * { DecimalDigits , DecimalDigits }
- */
- private Token quantifierPrefix() {
- final int startIn = position;
- final int startOut = sb.length();
- final Token token = new Token(Token.Type.QUANTIFIER_PREFIX);
-
- switch (ch0) {
- case '*':
- case '+':
- case '?':
- return commit(token, 1);
-
- case '{':
- commit(token, 1);
-
- final Token child = decimalDigits();
- if (child == null) {
- break; // not a quantifier - back out
- }
- push('}');
- token.add(child);
-
- if (ch0 == ',') {
- commit(token, 1);
- token.add(decimalDigits());
- }
-
- if (ch0 == '}') {
- pop('}');
- commit(token, 1);
- }
-
- return token;
-
- default:
- break;
- }
-
- restart(startIn, startOut);
- return null;
- }
-
- /*
- * Atom ::
- * PatternCharacter
- * .
- * \ AtomEscape
- * CharacterClass
- * ( Disjunction )
- * ( ? : Disjunction )
- *
- */
- private Token atom() {
- final int startIn = position;
- final int startOut = sb.length();
- final Token token = new Token(Token.Type.ATOM);
- Token child;
-
- child = patternCharacter();
- if (child != null) {
- return token.add(child);
- }
-
- if (ch0 == '.') {
- return commit(token, 1);
- }
-
- if (ch0 == '\\') {
- commit(token, 1);
- child = atomEscape();
-
- if (child != null) {
- if (child.hasChildOfType(Token.Type.IDENTITY_ESCAPE)) {
- final char idEscape = child.toString().charAt(0);
- if (NON_IDENT_ESCAPES.indexOf(idEscape) == -1) {
- token.reset();
- }
- }
-
- token.add(child);
-
- // forward backreferences always match empty. JavaScript != Java
- if (child.hasChildOfType(Token.Type.DECIMAL_ESCAPE) && !"\u0000".equals(child.toString())) {
- final int refNum = Integer.parseInt(child.toString());
-
- if (refNum - 1 < caps.size() && caps.get(refNum - 1).isDead()) {
- // reference to dead in-negative-lookahead capture
- token.setIsDead(true);
- } else if (caps.size() < refNum) {
- // forward reference: always matches against empty string (dead token).
- // invalid reference (non-existant capture): pattern never matches.
- forwardReferences.put(refNum, token);
- }
- }
-
- return token;
- }
- }
-
- child = characterClass();
- if (child != null) {
- return token.add(child);
- }
-
- if (ch0 == '(') {
- boolean capturingParens = true;
- commit(token, 1);
- if (ch0 == '?' && ch1 == ':') {
- capturingParens = false;
- commit(token, 2);
- }
-
- child = disjunction();
- if (child != null) {
- token.add(child);
- if (ch0 == ')') {
- final Token atom = commit(token, 1);
- if (capturingParens) {
- caps.add(new Capture(negativeLookaheadLevel));
- }
- return atom;
- }
- }
- }
-
- restart(startIn, startOut);
- return null;
- }
-
- /*
- * PatternCharacter ::
- * SourceCharacter but not any of: ^$\.*+?()[]{}|
- */
- @SuppressWarnings("fallthrough")
- private Token patternCharacter() {
- if (atEOF()) {
- return null;
- }
-
- switch (ch0) {
- case '^':
- case '$':
- case '\\':
- case '.':
- case '*':
- case '+':
- case '?':
- case '(':
- case ')':
- case '[':
- case '|':
- return null;
-
- case '}':
- case ']':
- final int n = expected.get(ch0);
- if (n != 0) {
- return null;
- }
-
- case '{':
- // if not a valid quantifier escape curly brace to match itself
- // this ensures compatibility with other JS implementations
- final Token quant = quantifierPrefix();
- return (quant == null) ? commit(new Token(Token.Type.PATTERN_CHARACTER).add("\\"), 1) : null;
-
- default:
- return commit(new Token(Token.Type.PATTERN_CHARACTER), 1); // SOURCECHARACTER
- }
- }
-
- /*
- * AtomEscape ::
- * DecimalEscape
- * CharacterEscape
- * CharacterClassEscape
- */
- private Token atomEscape() {
- final Token token = new Token(Token.Type.ATOM_ESCAPE);
- Token child;
-
- child = decimalEscape();
- if (child != null) {
- return token.add(child);
- }
-
- child = characterClassEscape();
- if (child != null) {
- return token.add(child);
- }
-
- child = characterEscape();
- if (child != null) {
- return token.add(child);
- }
-
-
- return null;
- }
-
- /*
- * CharacterEscape ::
- * ControlEscape
- * c ControlLetter
- * HexEscapeSequence
- * UnicodeEscapeSequence
- * IdentityEscape
- */
- private Token characterEscape() {
- final int startIn = position;
- final int startOut = sb.length();
-
- final Token token = new Token(Token.Type.CHARACTER_ESCAPE);
- Token child;
-
- child = controlEscape();
- if (child != null) {
- return token.add(child);
- }
-
- if (ch0 == 'c') {
- commit(token, 1);
- child = controlLetter();
- if (child != null) {
- return token.add(child);
- }
- restart(startIn, startOut);
- }
-
- child = hexEscapeSequence();
- if (child != null) {
- return token.add(child);
- }
-
- child = unicodeEscapeSequence();
- if (child != null) {
- return token.add(child);
- }
-
- child = identityEscape();
- if (child != null) {
- return token.add(child);
- }
-
- restart(startIn, startOut);
-
- return null;
- }
-
- private boolean scanEscapeSequence(final char leader, final int length, final Token token) {
- final int startIn = position;
- final int startOut = sb.length();
-
- if (ch0 != leader) {
- return false;
- }
-
- commit(token, 1);
- for (int i = 0; i < length; i++) {
- final char ch0l = Character.toLowerCase(ch0);
- if ((ch0l >= 'a' && ch0l <= 'f') || isDecimalDigit(ch0)) {
- commit(token, 1);
- } else {
- restart(startIn, startOut);
- return false;
- }
- }
-
- return true;
- }
-
- private Token hexEscapeSequence() {
- final Token token = new Token(Token.Type.HEX_ESCAPESEQUENCE);
- if (scanEscapeSequence('x', 2, token)) {
- return token;
- }
- return null;
- }
-
- private Token unicodeEscapeSequence() {
- final Token token = new Token(Token.Type.UNICODE_ESCAPESEQUENCE);
- if (scanEscapeSequence('u', 4, token)) {
- return token;
- }
- return null;
- }
-
- /*
- * ControlEscape ::
- * one of fnrtv
- */
- private Token controlEscape() {
- switch (ch0) {
- case 'f':
- case 'n':
- case 'r':
- case 't':
- case 'v':
- return commit(new Token(Token.Type.CONTROL_ESCAPE), 1);
-
- default:
- return null;
- }
- }
-
- /*
- * ControlLetter ::
- * one of abcdefghijklmnopqrstuvwxyz
- * ABCDEFGHIJKLMNOPQRSTUVWXYZ
- */
- private Token controlLetter() {
- final char c = Character.toUpperCase(ch0);
- if (c >= 'A' && c <= 'Z') {
- final Token token = new Token(Token.Type.CONTROL_LETTER);
- commit(token, 1);
- return token;
- }
- return null;
- /*
- Token token = new Token(Token.Type.CONTROL_LETTER);
- commit(null, 1);//add original char to builder not to token
- this.neverMatches = c < 'A' || c > 'Z';
- return token.add(""+c);*/
- }
-
- /*
- * IdentityEscape ::
- * SourceCharacter but not IdentifierPart
- * <ZWJ> (200c)
- * <ZWNJ> (200d)
- */
- private Token identityEscape() {
- final Token token = new Token(Token.Type.IDENTITY_ESCAPE);
- commit(token, 1);
- return token;
- }
-
- /*
- * DecimalEscape ::
- * DecimalIntegerLiteral [lookahead DecimalDigit]
- */
- private Token decimalEscape() {
- final Token token = new Token(Token.Type.DECIMAL_ESCAPE);
- final int startIn = position;
- final int startOut = sb.length();
-
- if (ch0 == '0' && !isDecimalDigit(ch1)) {
- commit(token, 1);
- token.removeLast();
- // DecimalEscape :: 0. If i is zero, return the EscapeValue consisting of a <NUL> character (Unicodevalue0000);
- return token.add("\u0000");
- }
-
- if (isDecimalDigit(ch0)) {
- while (isDecimalDigit(ch0)) {
- commit(token, 1);
- }
- return token;
- }
-
- restart(startIn, startOut);
-
- return null;
- }
-
- /*
- * CharacterClassEscape ::
- * one of dDsSwW
- */
- private Token characterClassEscape() {
- switch (ch0) {
- case 's':
- case 'S':
- case 'd':
- case 'D':
- case 'w':
- case 'W':
- return commit(new Token(Token.Type.CHARACTERCLASS_ESCAPE), 1);
-
- default:
- return null;
- }
- }
-
- /*
- * CharacterClass ::
- * [ [lookahead {^}] ClassRanges ]
- * [ ^ ClassRanges ]
- */
- private Token characterClass() {
- final int startIn = position;
- final int startOut = sb.length();
- final Token token = new Token(Token.Type.CHARACTERCLASS);
-
- if (ch0 == '[') {
- push(']');
- commit(token, 1);
-
- if (ch0 == '^') {
- commit(token, 1);
- }
-
- final Token child = classRanges();
- if (child != null && ch0 == ']') {
- pop(']');
- token.add(child);
- return commit(token, 1);
- }
- }
-
- restart(startIn, startOut);
- return null;
- }
-
- /*
- * ClassRanges ::
- * [empty]
- * NonemptyClassRanges
- */
- private Token classRanges() {
- return new Token(Token.Type.CLASSRANGES).add(nonemptyClassRanges());
- }
-
- /*
- * NonemptyClassRanges ::
- * ClassAtom
- * ClassAtom NonemptyClassRangesNoDash
- * ClassAtom - ClassAtom ClassRanges
- */
- private Token nonemptyClassRanges() {
- final int startIn = position;
- final int startOut = sb.length();
- final Token token = new Token(Token.Type.NON_EMPTY_CLASSRANGES);
- Token child;
-
- child = classAtom();
- if (child != null) {
- token.add(child);
-
- if (ch0 == '-') {
- commit(token, 1);
-
- final Token child1 = classAtom();
- final Token child2 = classRanges();
- if (child1 != null && child2 != null) {
- token.add(child1);
- token.add(child2);
-
- return token;
- }
- }
-
- child = nonemptyClassRangesNoDash();
- if (child != null) {
- token.add(child);
- return token;
- }
-
- return token;
- }
-
- restart(startIn, startOut);
- return null;
- }
-
- /*
- * NonemptyClassRangesNoDash ::
- * ClassAtom
- * ClassAtomNoDash NonemptyClassRangesNoDash
- * ClassAtomNoDash - ClassAtom ClassRanges
- */
- private Token nonemptyClassRangesNoDash() {
- final int startIn = position;
- final int startOut = sb.length();
- final Token token = new Token(Token.Type.NON_EMPTY_CLASSRANGES_NODASH);
- Token child;
-
- child = classAtomNoDash();
- if (child != null) {
- token.add(child);
-
- // need to check dash first, as for e.g. [a-b|c-d] will otherwise parse - as an atom
- if (ch0 == '-') {
- commit(token, 1);
-
- final Token child1 = classAtom();
- final Token child2 = classRanges();
- if (child1 != null && child2 != null) {
- token.add(child1);
- return token.add(child2);
- }
- //fallthru
- }
-
- child = nonemptyClassRangesNoDash();
- if (child != null) {
- token.add(child);
- }
- return token; // still a class atom
- }
-
- child = classAtom();
- if (child != null) {
- return token.add(child);
- }
-
- restart(startIn, startOut);
- return null;
- }
-
- /*
- * ClassAtom : - ClassAtomNoDash
- */
- private Token classAtom() {
- final Token token = new Token(Token.Type.CLASSATOM);
-
- if (ch0 == '-') {
- return commit(token, 1);
- }
-
- final Token child = classAtomNoDash();
- if (child != null) {
- return token.add(child);
- }
-
- return null;
- }
-
- /*
- * ClassAtomNoDash ::
- * SourceCharacter but not one of \ or ] or -
- * \ ClassEscape
- */
- private Token classAtomNoDash() {
- final int startIn = position;
- final int startOut = sb.length();
- final Token token = new Token(Token.Type.CLASSATOM_NODASH);
-
- switch (ch0) {
- case ']':
- case '-':
- case '\0':
- return null;
-
- case '[':
- // unescaped left square bracket - add escape
- return commit(token.add("\\"), 1);
-
- case '\\':
- commit(token, 1);
- final Token child = classEscape();
- if (child != null) {
- return token.add(child);
- }
-
- restart(startIn, startOut);
- return null;
-
- default:
- return commit(token, 1);
- }
- }
-
- /*
- * ClassEscape ::
- * DecimalEscape
- * b
- * CharacterEscape
- * CharacterClassEscape
- */
- private Token classEscape() {
- final Token token = new Token(Token.Type.CLASS_ESCAPE);
- Token child;
-
- child = decimalEscape();
- if (child != null) {
- return token.add(child);
- }
-
- if (ch0 == 'b') {
- return commit(token, 1);
- }
-
- child = characterEscape();
- if (child != null) {
- return token.add(child);
- }
-
- child = characterClassEscape();
- if (child != null) {
- return token.add(child);
- }
-
- return null;
- }
-
- /*
- * DecimalDigits
- */
- private Token decimalDigits() {
- if (!isDecimalDigit(ch0)) {
- return null;
- }
-
- final Token token = new Token(Token.Type.DECIMALDIGITS);
- while (isDecimalDigit(ch0)) {
- commit(token, 1);
- }
-
- return token;
- }
-
- private static boolean isDecimalDigit(final char ch) {
- return ch >= '0' && ch <= '9';
- }
-}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/nashorn/src/jdk/nashorn/internal/runtime/regexp/DefaultRegExp.java Fri Feb 22 16:31:10 2013 +0100
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation. Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+package jdk.nashorn.internal.runtime.regexp;
+
+import jdk.nashorn.internal.runtime.ParserException;
+
+import static java.util.regex.Pattern.CASE_INSENSITIVE;
+import static java.util.regex.Pattern.MULTILINE;
+import static java.util.regex.Pattern.UNICODE_CASE;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+/**
+ * Default regular expression implementation based on java.util.regex package.
+ *
+ * Note that this class is not thread-safe as it stores the current match result
+ * and the string being matched in instance fields.
+ */
+public class DefaultRegExp extends RegExp {
+
+ /** Java regexp pattern to use for match. We compile to one of these */
+ private Pattern pattern;
+
+ /** The matcher */
+ private RegExpMatcher matcher;
+
+ /**
+ * Construct a Regular expression from the given {@code source} and {@code flags} strings.
+ *
+ * @param source RegExp source string
+ * @param flags RegExp flag string
+ * @throws ParserException if flags is invalid or source string has syntax error.
+ */
+ public DefaultRegExp(final String source, final String flags) throws ParserException {
+ super(source, flags);
+
+ int intFlags = 0;
+
+ if (isIgnoreCase()) {
+ intFlags |= CASE_INSENSITIVE | UNICODE_CASE;
+ }
+ if (isMultiline()) {
+ intFlags |= MULTILINE;
+ }
+
+ try {
+ RegExpScanner parsed;
+
+ try {
+ parsed = RegExpScanner.scan(source);
+ } catch (final PatternSyntaxException e) {
+ // refine the exception with a better syntax error, if this
+ // passes, just rethrow what we have
+ Pattern.compile(source, intFlags);
+ throw e;
+ }
+
+ if (parsed != null) {
+ this.pattern = Pattern.compile(parsed.getJavaPattern(), intFlags);
+ this.groupsInNegativeLookahead = parsed.getGroupsInNegativeLookahead();
+ }
+ } catch (final PatternSyntaxException e2) {
+ throwParserException("syntax", e2.getMessage());
+ }
+ }
+
+ @Override
+ public RegExpMatcher match(final String str) {
+ if (pattern == null) {
+ return null; // never matches or similar, e.g. a[]
+ }
+
+ RegExpMatcher matcher = this.matcher;
+
+ if (matcher == null || matcher.getInput() != str) {
+ matcher = new DefaultMatcher(str);
+ this.matcher = matcher;
+ }
+
+ return matcher;
+ }
+
+ class DefaultMatcher implements RegExpMatcher {
+ final String input;
+ final Matcher matcher;
+
+ DefaultMatcher(final String input) {
+ this.input = input;
+ this.matcher = pattern.matcher(input);
+ }
+
+ @Override
+ public boolean search(final int start) {
+ return matcher.find(start);
+ }
+
+ @Override
+ public String getInput() {
+ return input;
+ }
+
+ @Override
+ public int start() {
+ return matcher.start();
+ }
+
+ @Override
+ public int start(final int group) {
+ return matcher.start(group);
+ }
+
+ @Override
+ public int end() {
+ return matcher.end();
+ }
+
+ @Override
+ public int end(final int group) {
+ return matcher.end(group);
+ }
+
+ @Override
+ public String group() {
+ return matcher.group();
+ }
+
+ @Override
+ public String group(final int group) {
+ return matcher.group(group);
+ }
+
+ @Override
+ public int groupCount() {
+ return matcher.groupCount();
+ }
+ }
+
+}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/nashorn/src/jdk/nashorn/internal/runtime/regexp/RegExp.java Fri Feb 22 16:31:10 2013 +0100
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation. Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+package jdk.nashorn.internal.runtime.regexp;
+
+import jdk.nashorn.internal.runtime.BitVector;
+import jdk.nashorn.internal.runtime.ECMAErrors;
+import jdk.nashorn.internal.runtime.ParserException;
+
+import java.util.regex.MatchResult;
+
+/**
+ * This is the base class for representing a parsed regular expression.
+ *
+ * Instances of this class are created by a {@link RegExpFactory}.
+ */
+public abstract class RegExp {
+
+ /** Pattern string. */
+ private final String source;
+
+ /** Global search flag for this regexp.*/
+ private boolean global;
+
+ /** Case insensitive flag for this regexp */
+ private boolean ignoreCase;
+
+ /** Multi-line flag for this regexp */
+ private boolean multiline;
+
+ /** BitVector that keeps track of groups in negative lookahead */
+ protected BitVector groupsInNegativeLookahead;
+
+ /**
+ * Constructor.
+ *
+ * @param source the source string
+ * @param flags the flags string
+ */
+ protected RegExp(final String source, final String flags) {
+ this.source = source;
+ for (int i = 0; i < flags.length(); i++) {
+ final char ch = flags.charAt(i);
+ switch (ch) {
+ case 'g':
+ if (this.global) {
+ throwParserException("repeated.flag", "g");
+ }
+ this.global = true;
+ break;
+ case 'i':
+ if (this.ignoreCase) {
+ throwParserException("repeated.flag", "i");
+ }
+ this.ignoreCase = true;
+ break;
+ case 'm':
+ if (this.multiline) {
+ throwParserException("repeated.flag", "m");
+ }
+ this.multiline = true;
+ break;
+ default:
+ throwParserException("unsupported.flag", Character.toString(ch));
+ }
+ }
+ }
+
+ /**
+ * Get the source pattern of this regular expression.
+ *
+ * @return the source string
+ */
+ public String getSource() {
+ return source;
+ }
+
+ /**
+ * Set the global flag of this regular expression to {@code global}.
+ *
+ * @param global the new global flag
+ */
+ public void setGlobal(final boolean global) {
+ this.global = global;
+ }
+
+ /**
+ * Get the global flag of this regular expression.
+ *
+ * @return the global flag
+ */
+ public boolean isGlobal() {
+ return global;
+ }
+
+ /**
+ * Get the ignore-case flag of this regular expression.
+ *
+ * @return the ignore-case flag
+ */
+ public boolean isIgnoreCase() {
+ return ignoreCase;
+ }
+
+ /**
+ * Get the multiline flag of this regular expression.
+ *
+ * @return the multiline flag
+ */
+ public boolean isMultiline() {
+ return multiline;
+ }
+
+ /**
+ * Get a bitset indicating which of the groups in this regular expression are inside a negative lookahead.
+ *
+ * @return the groups-in-negative-lookahead bitset
+ */
+ public BitVector getGroupsInNegativeLookahead() {
+ return groupsInNegativeLookahead;
+ }
+
+ /**
+ * Match this regular expression against {@code str}, starting at index {@code start}
+ * and return a {@link MatchResult} with the result.
+ *
+ * @param str the string
+ * @return the matcher
+ */
+ public abstract RegExpMatcher match(String str);
+
+ /**
+ * Throw a regexp parser exception.
+ *
+ * @param key the message key
+ * @param str string argument
+ * @throws jdk.nashorn.internal.runtime.ParserException
+ */
+ protected static void throwParserException(final String key, final String str) throws ParserException {
+ throw new ParserException(ECMAErrors.getMessage("parser.error.regex." + key, str));
+ }
+}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/nashorn/src/jdk/nashorn/internal/runtime/regexp/RegExpFactory.java Fri Feb 22 16:31:10 2013 +0100
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation. Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+package jdk.nashorn.internal.runtime.regexp;
+
+import jdk.nashorn.internal.parser.Lexer;
+import jdk.nashorn.internal.runtime.ParserException;
+
+/**
+ * Factory class for regular expressions. This class creates instances of {@link DefaultRegExp}.
+ */
+public class RegExpFactory {
+
+
+ private final static RegExpFactory instance = new RegExpFactory();
+
+ /**
+ * Creates a Regular expression from the given {@code pattern} and {@code flags} strings.
+ *
+ * @param pattern RegExp pattern string
+ * @param flags RegExp flags string
+ * @throws ParserException if flags is invalid or pattern string has syntax error.
+ */
+ protected RegExp compile(final String pattern, final String flags) throws ParserException {
+ return new DefaultRegExp(pattern, flags);
+ }
+
+ /**
+ * Replace a regexp token as suitable for regexp instances created by this factory.
+ *
+ * @param str a regular expression token
+ * @return the replacement token
+ */
+ protected String replaceToken(final String str) {
+ switch (str) {
+ case "\\s":
+ return "[" + Lexer.getWhitespaceRegExp() + "]";
+ case "\\S":
+ return "[^" + Lexer.getWhitespaceRegExp() + "]";
+ case "[^]":
+ return "[\\s\\S]";
+ default:
+ return str;
+ }
+ }
+
+ /**
+ * Compile a regexp with the given {@code source} and {@code flags}.
+ *
+ * @param pattern RegExp pattern string
+ * @param flags flag string
+ *
+ * @throws ParserException if invalid source or flags
+ */
+ public static RegExp create(final String pattern, final String flags) {
+ return instance.compile(pattern, flags);
+ }
+
+ /**
+ * Replace a regexp token as needed by the currently installed factory instance.
+ *
+ * @param token a regexp token
+ * @return the replacement token
+ */
+ public static String replace(final String token) {
+ return instance.replaceToken(token);
+ }
+
+ /**
+ * Validate a regexp with the given {@code source} and {@code flags}.
+ *
+ * @param pattern RegExp pattern string
+ * @param flags flag string
+ *
+ * @throws ParserException if invalid source or flags
+ */
+ // @SuppressWarnings({"unused"})
+ public static void validate(final String pattern, final String flags) throws ParserException {
+ instance.compile(pattern, flags);
+ }
+}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/nashorn/src/jdk/nashorn/internal/runtime/regexp/RegExpMatcher.java Fri Feb 22 16:31:10 2013 +0100
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation. Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+package jdk.nashorn.internal.runtime.regexp;
+
+import java.util.regex.MatchResult;
+
+/**
+ * Interface for matching a regular expression against a string and retrieving the
+ * match result. Extends {@link MatchResult}.
+ */
+public interface RegExpMatcher extends MatchResult {
+
+ /**
+ * Searches for pattern starting at {@code start}. Returns {@code true} if a match was found.
+ *
+ * @param start the start index in the input string
+ * @return {@code true} if a match was found
+ */
+ boolean search(int start);
+
+ /**
+ * Get the input string.
+ *
+ * @return the input string
+ */
+ String getInput();
+
+}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/nashorn/src/jdk/nashorn/internal/runtime/regexp/RegExpResult.java Fri Feb 22 16:31:10 2013 +0100
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation. Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+package jdk.nashorn.internal.runtime.regexp;
+
+/**
+ * Match tuple to keep track of ongoing regexp match.
+ */
+public final class RegExpResult {
+ final Object[] groups;
+ final int index;
+ final String input;
+
+ /**
+ * Constructor
+ *
+ * @param input regexp input
+ * @param index index of match
+ * @param groups groups vector
+ */
+ public RegExpResult(final String input, final int index, final Object[] groups) {
+ this.input = input;
+ this.index = index;
+ this.groups = groups;
+ }
+
+ /**
+ * Get the groups for the match
+ * @return group vector
+ */
+ public Object[] getGroups() {
+ return groups;
+ }
+
+ /**
+ * Get the input for the map
+ * @return input
+ */
+ public String getInput() {
+ return input;
+ }
+
+ /**
+ * Get the index for the match
+ * @return index
+ */
+ public int getIndex() {
+ return index;
+ }
+
+ /**
+ * Get the length of the match
+ * @return length
+ */
+ public int length() {
+ return ((String)groups[0]).length();
+ }
+
+ /**
+ * Get the group with the given index or the empty string if group index is not valid.
+ * @param index the group index
+ * @return the group or ""
+ */
+ public Object getGroup(int index) {
+ return index >= 0 && index < groups.length ? groups[index] : "";
+ }
+
+ /**
+ * Get the last parenthesis group, or the empty string if none exists.
+ * @return the last group or ""
+ */
+ public Object getLastParen() {
+ return groups.length > 1 ? groups[groups.length - 1] : "";
+ }
+
+}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/nashorn/src/jdk/nashorn/internal/runtime/regexp/RegExpScanner.java Fri Feb 22 16:31:10 2013 +0100
@@ -0,0 +1,1391 @@
+/*
+ * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation. Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+package jdk.nashorn.internal.runtime.regexp;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.LinkedHashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.PatternSyntaxException;
+
+import jdk.nashorn.internal.parser.Scanner;
+import jdk.nashorn.internal.runtime.BitVector;
+
+/**
+ * Scan a JavaScript regexp, converting to Java regex if necessary.
+ *
+ */
+final class RegExpScanner extends Scanner {
+
+ /**
+ * String builder to accumulate the result - this contains verbatim parsed JavaScript.
+ * to get the java equivalent we need to create a Pattern token and return its toString()
+ */
+ private final StringBuilder sb;
+
+ /** Is this the special case of a regexp that never matches anything */
+ private boolean neverMatches;
+
+ /** The resulting java.util.regex pattern string. */
+ private String javaPattern;
+
+ /** Expected token table */
+ private final Map<Character, Integer> expected = new HashMap<>();
+
+ /** Capturing parenthesis that have been found so far. */
+ private final List<Capture> caps = new LinkedList<>();
+
+ /** Forward references to capturing parenthesis to be resolved later.*/
+ private final Map<Integer, Token> forwardReferences = new LinkedHashMap<>();
+
+ /** Current level of zero-width negative lookahead assertions. */
+ private int negativeLookaheadLevel;
+
+ private static final String NON_IDENT_ESCAPES = "$^*+(){}[]|\\.?";
+
+ private static class Capture {
+ /**
+ * Zero-width negative lookaheads enclosing the capture.
+ */
+ private final int negativeLookaheadLevel;
+ /**
+ * Captures that live inside a negative lookahead are dead after the
+ * lookahead and will be undefined if referenced from outside.
+ */
+ private boolean isDead;
+
+ Capture(final int negativeLookaheadLevel) {
+ this.negativeLookaheadLevel = negativeLookaheadLevel;
+ }
+
+ public int getNegativeLookaheadLevel() {
+ return negativeLookaheadLevel;
+ }
+
+ public boolean isDead() {
+ return isDead;
+ }
+
+ public void setDead() {
+ this.isDead = true;
+ }
+ }
+
+ /**
+ * This is a token - the JavaScript regexp is scanned into a token tree
+ * A token has other tokens as children as well as "atoms", i.e. Strings.
+ */
+ private static class Token {
+
+ private enum Type {
+ PATTERN,
+ DISJUNCTION,
+ ALTERNATIVE,
+ TERM,
+ ASSERTION,
+ QUANTIFIER,
+ QUANTIFIER_PREFIX,
+ ATOM,
+ PATTERN_CHARACTER,
+ ATOM_ESCAPE,
+ CHARACTER_ESCAPE,
+ CONTROL_ESCAPE,
+ CONTROL_LETTER,
+ IDENTITY_ESCAPE,
+ DECIMAL_ESCAPE,
+ CHARACTERCLASS_ESCAPE,
+ CHARACTERCLASS,
+ CLASSRANGES,
+ NON_EMPTY_CLASSRANGES,
+ NON_EMPTY_CLASSRANGES_NODASH,
+ CLASSATOM,
+ CLASSATOM_NODASH,
+ CLASS_ESCAPE,
+ DECIMALDIGITS,
+ HEX_ESCAPESEQUENCE,
+ UNICODE_ESCAPESEQUENCE,
+ }
+
+ /**
+ * Token tyoe
+ */
+ private final Token.Type type;
+
+ /**
+ * Child nodes
+ */
+ private final List<Object> children;
+
+ /**
+ * Parent node
+ */
+ private Token parent;
+
+ /**
+ * Dead code flag
+ */
+ private boolean isDead;
+
+ private static final Map<Type, ToString> toStringMap = new HashMap<>();
+ private static final ToString DEFAULT_TOSTRING = new ToString();
+
+ private static String unicode(final int value) {
+ final StringBuilder sb = new StringBuilder();
+ final String hex = Integer.toHexString(value);
+ sb.append('u');
+ for (int i = 0; i < 4 - hex.length(); i++) {
+ sb.append('0');
+ }
+ sb.append(hex);
+
+ return sb.toString();
+ }
+
+ static {
+ toStringMap.put(Type.CHARACTERCLASS, new ToString() {
+ @Override
+ public String toString(final Token token) {
+ return super.toString(token).replace("\\b", "\b");
+ }
+ });
+
+ // for some reason java regexps don't like control characters on the
+ // form "\\ca".match([string with ascii 1 at char0]). Translating
+ // them to unicode does it though.
+ toStringMap.put(Type.CHARACTER_ESCAPE, new ToString() {
+ @Override
+ public String toString(final Token token) {
+ final String str = super.toString(token);
+ if (str.length() == 2) {
+ return Token.unicode(Character.toLowerCase(str.charAt(1)) - 'a' + 1);
+ }
+ return str;
+ }
+ });
+
+ toStringMap.put(Type.DECIMAL_ESCAPE, new ToString() {
+ @Override
+ public String toString(final Token token) {
+ final String str = super.toString(token);
+
+ if ("\0".equals(str)) {
+ return str;
+ }
+
+ int value;
+
+ if (!token.hasParentOfType(Type.CLASSRANGES)) {
+ return str;
+ }
+
+ value = Integer.parseInt(str, 8); //throws exception that leads to SyntaxError if not octal
+ if (value > 0xff) {
+ throw new NumberFormatException(str);
+ }
+
+ return Token.unicode(value);
+ }
+ });
+
+ }
+
+ /**
+ * JavaScript Token to Java regex substring framework.
+ */
+ private static class ToString {
+ String toString(final Token token) {
+ final Object[] children = token.getChildren();
+
+ // Allow the installed regexp factory to perform global substitutions.
+ switch (children.length) {
+ case 0:
+ return "";
+ case 1:
+ return RegExpFactory.replace(children[0].toString());
+ default:
+ final StringBuilder sb = new StringBuilder();
+ for (final Object child : children) {
+ sb.append(child);
+ }
+ return RegExpFactory.replace(sb.toString());
+ }
+ }
+ }
+
+ /**
+ * Token iterator. Doesn't return "atom" children. i.e. string representations,
+ * just tokens
+ *
+ */
+ private static class TokenIterator implements Iterator<Token> {
+ private final List<Token> preorder;
+
+ private void init(final Token root) {
+ preorder.add(root);
+ for (final Object child : root.getChildren()) {
+ if (child instanceof Token) {
+ init((Token)child);
+ }
+ }
+ }
+
+ TokenIterator(final Token root) {
+ preorder = new ArrayList<>();
+ init(root);
+ }
+
+ @Override
+ public boolean hasNext() {
+ return !preorder.isEmpty();
+ }
+
+ @Override
+ public Token next() {
+ return preorder.remove(0);
+ }
+
+ @Override
+ public void remove() {
+ next();
+ }
+ }
+
+ /**
+ * Constructor
+ * @param type the token type
+ */
+ Token(final Token.Type type) {
+ this.type = type;
+ children = new ArrayList<>();
+ }
+
+ /**
+ * Add a an "atom" child to a token
+ * @param child the child to add
+ * @return the token (for chaining)
+ */
+ public Token add(final String child) {
+ children.add(child);
+ return this;
+ }
+
+ /**
+ * Add a child to a token
+ * @param child the child
+ * @return the token (for chaining)
+ */
+ public Token add(final Token child) {
+ if (child != null) {
+ children.add(child);
+ child.setParent(this);
+ }
+ return this;
+ }
+
+ /**
+ * Remove a child from a token
+ * @param child the child to remove
+ * @return true if successful
+ */
+ public boolean remove(final Token child) {
+ return children.remove(child);
+ }
+
+ /**
+ * Remove the last child from a token
+ * @return the removed child
+ */
+ public Object removeLast() {
+ return children.remove(children.size() - 1);
+ }
+
+ /**
+ * Flag this token as dead code
+ * @param isDead is it dead or not
+ */
+ private void setIsDead(final boolean isDead) {
+ this.isDead = isDead;
+ }
+
+ /**
+ * Is this token dead code
+ * @return boolean
+ */
+ private boolean getIsDead() {
+ return isDead;
+ }
+
+ /**
+ * Get the parent of this token
+ * @return parent token
+ */
+ public Token getParent() {
+ return parent;
+ }
+
+ public boolean hasParentOfType(final Token.Type parentType) {
+ for (Token p = getParent(); p != null; p = p.getParent()) {
+ if (p.getType() == parentType) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ public boolean hasChildOfType(final Token.Type childType) {
+ for (final Iterator<Token> iter = iterator() ; iter.hasNext() ; ) {
+ if (iter.next().getType() == childType) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * Set the parent of this token
+ * @param parent
+ */
+ private void setParent(final Token parent) {
+ this.parent = parent;
+ }
+
+ /**
+ * Get the children of this token
+ * @return an array of children, never null
+ */
+ public Object[] getChildren() {
+ return children.toArray();
+ }
+
+ /**
+ * Reset this token, remove all children
+ */
+ public void reset() {
+ children.clear();
+ }
+
+ /**
+ * Get a preorder token iterator with this token as root
+ * @return an iterator
+ */
+ public Iterator<Token> iterator() {
+ return new TokenIterator(this);
+ }
+
+ /**
+ * Get the type of this token
+ * @return type
+ */
+ public Type getType() {
+ return type;
+ }
+
+ /**
+ * Turn this token into Java regexp compatible text
+ * @return part of a java regexp
+ */
+ @Override
+ public String toString() {
+ ToString t = toStringMap.get(getType());
+ if (t == null) {
+ t = DEFAULT_TOSTRING;
+ }
+ return t.toString(this);
+ }
+ }
+
+ /**
+ * Constructor
+ * @param string the JavaScript regexp to parse
+ */
+ private RegExpScanner(final String string) {
+ super(string);
+ sb = new StringBuilder(limit);
+ reset(0);
+ expected.put(']', 0);
+ expected.put('}', 0);
+ }
+
+ private void processForwardReferences() {
+ if (neverMatches()) {
+ return;
+ }
+
+ for (final Map.Entry<Integer, Token> fwdRef : forwardReferences.entrySet()) {
+ if (fwdRef.getKey().intValue() > caps.size()) {
+ neverMatches = true;
+ break;
+ }
+
+ fwdRef.getValue().setIsDead(true);
+ }
+
+ forwardReferences.clear();
+ }
+
+ /**
+ * Scan a JavaScript regexp string returning a Java safe regex string.
+ *
+ * @param string
+ * JavaScript regexp string.
+ * @return Java safe regex string.
+ */
+ public static RegExpScanner scan(final String string) {
+ final RegExpScanner scanner = new RegExpScanner(string);
+
+ Token pattern;
+
+ try {
+ pattern = scanner.pattern();
+ } catch (final Exception e) {
+ throw new PatternSyntaxException(e.getMessage(), string, scanner.sb.length());
+ }
+
+ scanner.processForwardReferences();
+ if (scanner.neverMatches()) {
+ return null; // never matches
+ }
+
+ // go over the code and remove dead code
+ final Iterator<Token> iter = pattern.iterator();
+ while (iter.hasNext()) {
+ final Token next = iter.next();
+ if (next.getIsDead()) {
+ next.getParent().remove(next);
+ }
+ }
+
+ // turn the pattern into a string, p, the java equivalent string for our js regexp
+ final String p = pattern.toString();
+ // if builder contains all tokens that were sent in, we know
+ // we correctly parsed the entire JavaScript regexp without syntax errors
+ if (!string.equals(scanner.getStringBuilder().toString())) {
+ throw new PatternSyntaxException(string, p, p.length() + 1);
+ }
+
+ scanner.javaPattern = p;
+ return scanner;
+ }
+
+ /**
+ * Does this regexp ever match anything? Use of e.g. [], which is legal in JavaScript,
+ * is an example where we never match
+ *
+ * @return boolean
+ */
+ private boolean neverMatches() {
+ return neverMatches;
+ }
+
+ final StringBuilder getStringBuilder() {
+ return sb;
+ }
+
+ String getJavaPattern() {
+ return javaPattern;
+ }
+
+ BitVector getGroupsInNegativeLookahead() {
+ BitVector vec = null;
+ for (int i = 0; i < caps.size(); i++) {
+ final Capture cap = caps.get(i);
+ if (cap.getNegativeLookaheadLevel() > 0) {
+ if (vec == null) {
+ vec = new BitVector(caps.size() + 1);
+ }
+ vec.set(i + 1);
+ }
+ }
+ return vec;
+ }
+
+ /**
+ * Commit n characters to the builder and to a given token
+ * @param token Uncommitted token.
+ * @param n Number of characters.
+ * @return Committed token
+ */
+ private Token commit(final Token token, final int n) {
+ final int startIn = position;
+
+ switch (n) {
+ case 1:
+ sb.append(ch0);
+ skip(1);
+ break;
+ case 2:
+ sb.append(ch0);
+ sb.append(ch1);
+ skip(2);
+ break;
+ case 3:
+ sb.append(ch0);
+ sb.append(ch1);
+ sb.append(ch2);
+ skip(3);
+ break;
+ default:
+ assert false : "Should not reach here";
+ }
+
+ if (token == null) {
+ return null;
+ }
+
+ return token.add(sb.substring(startIn, sb.length()));
+ }
+
+ /**
+ * Restart the buffers back at an earlier position.
+ *
+ * @param startIn
+ * Position in the input stream.
+ * @param startOut
+ * Position in the output stream.
+ */
+ private void restart(final int startIn, final int startOut) {
+ reset(startIn);
+ sb.setLength(startOut);
+ }
+
+ private void push(final char ch) {
+ expected.put(ch, expected.get(ch) + 1);
+ }
+
+ private void pop(final char ch) {
+ expected.put(ch, Math.min(0, expected.get(ch) - 1));
+ }
+
+ /*
+ * Recursive descent tokenizer starts below.
+ */
+
+ /*
+ * Pattern ::
+ * Disjunction
+ */
+ private Token pattern() {
+ final Token token = new Token(Token.Type.PATTERN);
+
+ final Token child = disjunction();
+ return token.add(child);
+ }
+
+ /*
+ * Disjunction ::
+ * Alternative
+ * Alternative | Disjunction
+ */
+ private Token disjunction() {
+ final Token token = new Token(Token.Type.DISJUNCTION);
+
+ while (true) {
+ token.add(alternative());
+
+ if (ch0 == '|') {
+ commit(token, 1);
+ } else {
+ break;
+ }
+ }
+
+ return token;
+ }
+
+ /*
+ * Alternative ::
+ * [empty]
+ * Alternative Term
+ */
+ private Token alternative() {
+ final Token token = new Token(Token.Type.ALTERNATIVE);
+
+ Token child;
+ while ((child = term()) != null) {
+ token.add(child);
+ }
+
+ return token;
+ }
+
+ /*
+ * Term ::
+ * Assertion
+ * Atom
+ * Atom Quantifier
+ */
+ private Token term() {
+ final int startIn = position;
+ final int startOut = sb.length();
+ final Token token = new Token(Token.Type.TERM);
+ Token child;
+
+ child = assertion();
+ if (child != null) {
+ return token.add(child);
+ }
+
+ child = atom();
+ if (child != null) {
+ boolean emptyCharacterClass = false;
+ if ("[]".equals(child.toString())) {
+ emptyCharacterClass = true;
+ }
+
+ token.add(child);
+
+ final Token quantifier = quantifier();
+ if (quantifier != null) {
+ token.add(quantifier);
+ }
+
+ if (emptyCharacterClass) {
+ if (quantifier == null) {
+ neverMatches = true; //never matches ever.
+ } else {
+ //if we can get away with max zero, remove this entire token
+ final String qs = quantifier.toString();
+ if ("+".equals(qs) || "*".equals(qs) || qs.startsWith("{0,")) {
+ token.setIsDead(true);
+ }
+ }
+ }
+
+ return token;
+ }
+
+ restart(startIn, startOut);
+ return null;
+ }
+
+ /*
+ * Assertion ::
+ * ^
+ * $
+ * \b
+ * \B
+ * ( ? = Disjunction )
+ * ( ? ! Disjunction )
+ */
+ private Token assertion() {
+ final int startIn = position;
+ final int startOut = sb.length();
+ final Token token = new Token(Token.Type.ASSERTION);
+
+ switch (ch0) {
+ case '^':
+ case '$':
+ return commit(token, 1);
+
+ case '\\':
+ if (ch1 == 'b' || ch1 == 'B') {
+ return commit(token, 2);
+ }
+ break;
+
+ case '(':
+ if (ch1 != '?') {
+ break;
+ }
+ if (ch2 != '=' && ch2 != '!') {
+ break;
+ }
+ final boolean isNegativeLookahead = (ch2 == '!');
+ commit(token, 3);
+
+ if (isNegativeLookahead) {
+ negativeLookaheadLevel++;
+ }
+ final Token disjunction = disjunction();
+ if (isNegativeLookahead) {
+ for (final Capture cap : caps) {
+ if (cap.getNegativeLookaheadLevel() >= negativeLookaheadLevel) {
+ cap.setDead();
+ }
+ }
+ negativeLookaheadLevel--;
+ }
+
+ if (disjunction != null && ch0 == ')') {
+ token.add(disjunction);
+ return commit(token, 1);
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ restart(startIn, startOut);
+
+ return null;
+ }
+
+ /*
+ * Quantifier ::
+ * QuantifierPrefix
+ * QuantifierPrefix ?
+ */
+ private Token quantifier() {
+ final Token token = new Token(Token.Type.QUANTIFIER);
+ final Token child = quantifierPrefix();
+ if (child != null) {
+ token.add(child);
+ if (ch0 == '?') {
+ commit(token, 1);
+ }
+ return token;
+ }
+ return null;
+ }
+
+ /*
+ * QuantifierPrefix ::
+ * *
+ * +
+ * ?
+ * { DecimalDigits }
+ * { DecimalDigits , }
+ * { DecimalDigits , DecimalDigits }
+ */
+ private Token quantifierPrefix() {
+ final int startIn = position;
+ final int startOut = sb.length();
+ final Token token = new Token(Token.Type.QUANTIFIER_PREFIX);
+
+ switch (ch0) {
+ case '*':
+ case '+':
+ case '?':
+ return commit(token, 1);
+
+ case '{':
+ commit(token, 1);
+
+ final Token child = decimalDigits();
+ if (child == null) {
+ break; // not a quantifier - back out
+ }
+ push('}');
+ token.add(child);
+
+ if (ch0 == ',') {
+ commit(token, 1);
+ token.add(decimalDigits());
+ }
+
+ if (ch0 == '}') {
+ pop('}');
+ commit(token, 1);
+ }
+
+ return token;
+
+ default:
+ break;
+ }
+
+ restart(startIn, startOut);
+ return null;
+ }
+
+ /*
+ * Atom ::
+ * PatternCharacter
+ * .
+ * \ AtomEscape
+ * CharacterClass
+ * ( Disjunction )
+ * ( ? : Disjunction )
+ *
+ */
+ private Token atom() {
+ final int startIn = position;
+ final int startOut = sb.length();
+ final Token token = new Token(Token.Type.ATOM);
+ Token child;
+
+ child = patternCharacter();
+ if (child != null) {
+ return token.add(child);
+ }
+
+ if (ch0 == '.') {
+ return commit(token, 1);
+ }
+
+ if (ch0 == '\\') {
+ commit(token, 1);
+ child = atomEscape();
+
+ if (child != null) {
+ if (child.hasChildOfType(Token.Type.IDENTITY_ESCAPE)) {
+ final char idEscape = child.toString().charAt(0);
+ if (NON_IDENT_ESCAPES.indexOf(idEscape) == -1) {
+ token.reset();
+ }
+ }
+
+ token.add(child);
+
+ // forward backreferences always match empty. JavaScript != Java
+ if (child.hasChildOfType(Token.Type.DECIMAL_ESCAPE) && !"\u0000".equals(child.toString())) {
+ final int refNum = Integer.parseInt(child.toString());
+
+ if (refNum - 1 < caps.size() && caps.get(refNum - 1).isDead()) {
+ // reference to dead in-negative-lookahead capture
+ token.setIsDead(true);
+ } else if (caps.size() < refNum) {
+ // forward reference: always matches against empty string (dead token).
+ // invalid reference (non-existant capture): pattern never matches.
+ forwardReferences.put(refNum, token);
+ }
+ }
+
+ return token;
+ }
+ }
+
+ child = characterClass();
+ if (child != null) {
+ return token.add(child);
+ }
+
+ if (ch0 == '(') {
+ boolean capturingParens = true;
+ commit(token, 1);
+ if (ch0 == '?' && ch1 == ':') {
+ capturingParens = false;
+ commit(token, 2);
+ }
+
+ child = disjunction();
+ if (child != null) {
+ token.add(child);
+ if (ch0 == ')') {
+ final Token atom = commit(token, 1);
+ if (capturingParens) {
+ caps.add(new Capture(negativeLookaheadLevel));
+ }
+ return atom;
+ }
+ }
+ }
+
+ restart(startIn, startOut);
+ return null;
+ }
+
+ /*
+ * PatternCharacter ::
+ * SourceCharacter but not any of: ^$\.*+?()[]{}|
+ */
+ @SuppressWarnings("fallthrough")
+ private Token patternCharacter() {
+ if (atEOF()) {
+ return null;
+ }
+
+ switch (ch0) {
+ case '^':
+ case '$':
+ case '\\':
+ case '.':
+ case '*':
+ case '+':
+ case '?':
+ case '(':
+ case ')':
+ case '[':
+ case '|':
+ return null;
+
+ case '}':
+ case ']':
+ final int n = expected.get(ch0);
+ if (n != 0) {
+ return null;
+ }
+
+ case '{':
+ // if not a valid quantifier escape curly brace to match itself
+ // this ensures compatibility with other JS implementations
+ final Token quant = quantifierPrefix();
+ return (quant == null) ? commit(new Token(Token.Type.PATTERN_CHARACTER).add("\\"), 1) : null;
+
+ default:
+ return commit(new Token(Token.Type.PATTERN_CHARACTER), 1); // SOURCECHARACTER
+ }
+ }
+
+ /*
+ * AtomEscape ::
+ * DecimalEscape
+ * CharacterEscape
+ * CharacterClassEscape
+ */
+ private Token atomEscape() {
+ final Token token = new Token(Token.Type.ATOM_ESCAPE);
+ Token child;
+
+ child = decimalEscape();
+ if (child != null) {
+ return token.add(child);
+ }
+
+ child = characterClassEscape();
+ if (child != null) {
+ return token.add(child);
+ }
+
+ child = characterEscape();
+ if (child != null) {
+ return token.add(child);
+ }
+
+
+ return null;
+ }
+
+ /*
+ * CharacterEscape ::
+ * ControlEscape
+ * c ControlLetter
+ * HexEscapeSequence
+ * UnicodeEscapeSequence
+ * IdentityEscape
+ */
+ private Token characterEscape() {
+ final int startIn = position;
+ final int startOut = sb.length();
+
+ final Token token = new Token(Token.Type.CHARACTER_ESCAPE);
+ Token child;
+
+ child = controlEscape();
+ if (child != null) {
+ return token.add(child);
+ }
+
+ if (ch0 == 'c') {
+ commit(token, 1);
+ child = controlLetter();
+ if (child != null) {
+ return token.add(child);
+ }
+ restart(startIn, startOut);
+ }
+
+ child = hexEscapeSequence();
+ if (child != null) {
+ return token.add(child);
+ }
+
+ child = unicodeEscapeSequence();
+ if (child != null) {
+ return token.add(child);
+ }
+
+ child = identityEscape();
+ if (child != null) {
+ return token.add(child);
+ }
+
+ restart(startIn, startOut);
+
+ return null;
+ }
+
+ private boolean scanEscapeSequence(final char leader, final int length, final Token token) {
+ final int startIn = position;
+ final int startOut = sb.length();
+
+ if (ch0 != leader) {
+ return false;
+ }
+
+ commit(token, 1);
+ for (int i = 0; i < length; i++) {
+ final char ch0l = Character.toLowerCase(ch0);
+ if ((ch0l >= 'a' && ch0l <= 'f') || isDecimalDigit(ch0)) {
+ commit(token, 1);
+ } else {
+ restart(startIn, startOut);
+ return false;
+ }
+ }
+
+ return true;
+ }
+
+ private Token hexEscapeSequence() {
+ final Token token = new Token(Token.Type.HEX_ESCAPESEQUENCE);
+ if (scanEscapeSequence('x', 2, token)) {
+ return token;
+ }
+ return null;
+ }
+
+ private Token unicodeEscapeSequence() {
+ final Token token = new Token(Token.Type.UNICODE_ESCAPESEQUENCE);
+ if (scanEscapeSequence('u', 4, token)) {
+ return token;
+ }
+ return null;
+ }
+
+ /*
+ * ControlEscape ::
+ * one of fnrtv
+ */
+ private Token controlEscape() {
+ switch (ch0) {
+ case 'f':
+ case 'n':
+ case 'r':
+ case 't':
+ case 'v':
+ return commit(new Token(Token.Type.CONTROL_ESCAPE), 1);
+
+ default:
+ return null;
+ }
+ }
+
+ /*
+ * ControlLetter ::
+ * one of abcdefghijklmnopqrstuvwxyz
+ * ABCDEFGHIJKLMNOPQRSTUVWXYZ
+ */
+ private Token controlLetter() {
+ final char c = Character.toUpperCase(ch0);
+ if (c >= 'A' && c <= 'Z') {
+ final Token token = new Token(Token.Type.CONTROL_LETTER);
+ commit(token, 1);
+ return token;
+ }
+ return null;
+ /*
+ Token token = new Token(Token.Type.CONTROL_LETTER);
+ commit(null, 1);//add original char to builder not to token
+ this.neverMatches = c < 'A' || c > 'Z';
+ return token.add(""+c);*/
+ }
+
+ /*
+ * IdentityEscape ::
+ * SourceCharacter but not IdentifierPart
+ * <ZWJ> (200c)
+ * <ZWNJ> (200d)
+ */
+ private Token identityEscape() {
+ final Token token = new Token(Token.Type.IDENTITY_ESCAPE);
+ commit(token, 1);
+ return token;
+ }
+
+ /*
+ * DecimalEscape ::
+ * DecimalIntegerLiteral [lookahead DecimalDigit]
+ */
+ private Token decimalEscape() {
+ final Token token = new Token(Token.Type.DECIMAL_ESCAPE);
+ final int startIn = position;
+ final int startOut = sb.length();
+
+ if (ch0 == '0' && !isDecimalDigit(ch1)) {
+ commit(token, 1);
+ token.removeLast();
+ // DecimalEscape :: 0. If i is zero, return the EscapeValue consisting of a <NUL> character (Unicodevalue0000);
+ return token.add("\u0000");
+ }
+
+ if (isDecimalDigit(ch0)) {
+ while (isDecimalDigit(ch0)) {
+ commit(token, 1);
+ }
+ return token;
+ }
+
+ restart(startIn, startOut);
+
+ return null;
+ }
+
+ /*
+ * CharacterClassEscape ::
+ * one of dDsSwW
+ */
+ private Token characterClassEscape() {
+ switch (ch0) {
+ case 's':
+ case 'S':
+ case 'd':
+ case 'D':
+ case 'w':
+ case 'W':
+ return commit(new Token(Token.Type.CHARACTERCLASS_ESCAPE), 1);
+
+ default:
+ return null;
+ }
+ }
+
+ /*
+ * CharacterClass ::
+ * [ [lookahead {^}] ClassRanges ]
+ * [ ^ ClassRanges ]
+ */
+ private Token characterClass() {
+ final int startIn = position;
+ final int startOut = sb.length();
+ final Token token = new Token(Token.Type.CHARACTERCLASS);
+
+ if (ch0 == '[') {
+ push(']');
+ commit(token, 1);
+
+ if (ch0 == '^') {
+ commit(token, 1);
+ }
+
+ final Token child = classRanges();
+ if (child != null && ch0 == ']') {
+ pop(']');
+ token.add(child);
+ return commit(token, 1);
+ }
+ }
+
+ restart(startIn, startOut);
+ return null;
+ }
+
+ /*
+ * ClassRanges ::
+ * [empty]
+ * NonemptyClassRanges
+ */
+ private Token classRanges() {
+ return new Token(Token.Type.CLASSRANGES).add(nonemptyClassRanges());
+ }
+
+ /*
+ * NonemptyClassRanges ::
+ * ClassAtom
+ * ClassAtom NonemptyClassRangesNoDash
+ * ClassAtom - ClassAtom ClassRanges
+ */
+ private Token nonemptyClassRanges() {
+ final int startIn = position;
+ final int startOut = sb.length();
+ final Token token = new Token(Token.Type.NON_EMPTY_CLASSRANGES);
+ Token child;
+
+ child = classAtom();
+ if (child != null) {
+ token.add(child);
+
+ if (ch0 == '-') {
+ commit(token, 1);
+
+ final Token child1 = classAtom();
+ final Token child2 = classRanges();
+ if (child1 != null && child2 != null) {
+ token.add(child1);
+ token.add(child2);
+
+ return token;
+ }
+ }
+
+ child = nonemptyClassRangesNoDash();
+ if (child != null) {
+ token.add(child);
+ return token;
+ }
+
+ return token;
+ }
+
+ restart(startIn, startOut);
+ return null;
+ }
+
+ /*
+ * NonemptyClassRangesNoDash ::
+ * ClassAtom
+ * ClassAtomNoDash NonemptyClassRangesNoDash
+ * ClassAtomNoDash - ClassAtom ClassRanges
+ */
+ private Token nonemptyClassRangesNoDash() {
+ final int startIn = position;
+ final int startOut = sb.length();
+ final Token token = new Token(Token.Type.NON_EMPTY_CLASSRANGES_NODASH);
+ Token child;
+
+ child = classAtomNoDash();
+ if (child != null) {
+ token.add(child);
+
+ // need to check dash first, as for e.g. [a-b|c-d] will otherwise parse - as an atom
+ if (ch0 == '-') {
+ commit(token, 1);
+
+ final Token child1 = classAtom();
+ final Token child2 = classRanges();
+ if (child1 != null && child2 != null) {
+ token.add(child1);
+ return token.add(child2);
+ }
+ //fallthru
+ }
+
+ child = nonemptyClassRangesNoDash();
+ if (child != null) {
+ token.add(child);
+ }
+ return token; // still a class atom
+ }
+
+ child = classAtom();
+ if (child != null) {
+ return token.add(child);
+ }
+
+ restart(startIn, startOut);
+ return null;
+ }
+
+ /*
+ * ClassAtom : - ClassAtomNoDash
+ */
+ private Token classAtom() {
+ final Token token = new Token(Token.Type.CLASSATOM);
+
+ if (ch0 == '-') {
+ return commit(token, 1);
+ }
+
+ final Token child = classAtomNoDash();
+ if (child != null) {
+ return token.add(child);
+ }
+
+ return null;
+ }
+
+ /*
+ * ClassAtomNoDash ::
+ * SourceCharacter but not one of \ or ] or -
+ * \ ClassEscape
+ */
+ private Token classAtomNoDash() {
+ final int startIn = position;
+ final int startOut = sb.length();
+ final Token token = new Token(Token.Type.CLASSATOM_NODASH);
+
+ switch (ch0) {
+ case ']':
+ case '-':
+ case '\0':
+ return null;
+
+ case '[':
+ // unescaped left square bracket - add escape
+ return commit(token.add("\\"), 1);
+
+ case '\\':
+ commit(token, 1);
+ final Token child = classEscape();
+ if (child != null) {
+ return token.add(child);
+ }
+
+ restart(startIn, startOut);
+ return null;
+
+ default:
+ return commit(token, 1);
+ }
+ }
+
+ /*
+ * ClassEscape ::
+ * DecimalEscape
+ * b
+ * CharacterEscape
+ * CharacterClassEscape
+ */
+ private Token classEscape() {
+ final Token token = new Token(Token.Type.CLASS_ESCAPE);
+ Token child;
+
+ child = decimalEscape();
+ if (child != null) {
+ return token.add(child);
+ }
+
+ if (ch0 == 'b') {
+ return commit(token, 1);
+ }
+
+ child = characterEscape();
+ if (child != null) {
+ return token.add(child);
+ }
+
+ child = characterClassEscape();
+ if (child != null) {
+ return token.add(child);
+ }
+
+ return null;
+ }
+
+ /*
+ * DecimalDigits
+ */
+ private Token decimalDigits() {
+ if (!isDecimalDigit(ch0)) {
+ return null;
+ }
+
+ final Token token = new Token(Token.Type.DECIMALDIGITS);
+ while (isDecimalDigit(ch0)) {
+ commit(token, 1);
+ }
+
+ return token;
+ }
+
+ private static boolean isDecimalDigit(final char ch) {
+ return ch >= '0' && ch <= '9';
+ }
+}