8009230: Nashorn rejects extended RegExp syntax accepted by all major JS engines
Reviewed-by: jlaskey, lagergren
--- a/nashorn/src/jdk/nashorn/internal/runtime/regexp/RegExpScanner.java Thu Apr 04 18:32:00 2013 +0200
+++ b/nashorn/src/jdk/nashorn/internal/runtime/regexp/RegExpScanner.java Fri Apr 05 19:50:10 2013 +0200
@@ -26,11 +26,10 @@
package jdk.nashorn.internal.runtime.regexp;
import java.util.HashMap;
-import java.util.LinkedHashSet;
+import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
-import java.util.Set;
import java.util.regex.PatternSyntaxException;
import jdk.nashorn.internal.parser.Lexer;
@@ -58,7 +57,7 @@
private final List<Capture> caps = new LinkedList<>();
/** Forward references to capturing parenthesis to be resolved later.*/
- private final Set<Integer> forwardReferences = new LinkedHashSet<>();
+ private final LinkedList<Integer> forwardReferences = new LinkedList<>();
/** Current level of zero-width negative lookahead assertions. */
private int negativeLookaheadLevel;
@@ -104,10 +103,20 @@
return;
}
- for (final Integer ref : forwardReferences) {
- if (ref.intValue() > caps.size()) {
- neverMatches = true;
- break;
+ Iterator<Integer> iterator = forwardReferences.descendingIterator();
+ while (iterator.hasNext()) {
+ final int pos = iterator.next();
+ final int num = iterator.next();
+ if (num > caps.size()) {
+ // A reference to a non-existing group should never match. If the number is smaller than 8,
+ // emit the equivalent \x0N escape instead, to be compatible with the octal escapes of other engines.
+ if (num < 8) {
+ String escape = "\\x0" + num;
+ sb.insert(pos, escape);
+ } else {
+ neverMatches = true;
+ break;
+ }
}
}
@@ -402,6 +411,10 @@
if (ch0 == '}') {
pop('}');
commit(1);
+ } else {
+ // Bad quantifier should be rejected but is accepted by all major engines
+ restart(startIn, startOut);
+ return false;
}
return true;
@@ -637,7 +650,16 @@
throw new RuntimeException("\\ at end of pattern"); // will be converted to PatternSyntaxException
}
// ES 5.1 A.7 requires "not IdentifierPart" here but all major engines accept any character here.
- if (NON_IDENT_ESCAPES.indexOf(ch0) == -1) {
+ if (ch0 == 'c') {
+ // Ignore invalid control letter escape if within a character class
+ if (inCharClass && ch1 != ']') {
+ sb.setLength(sb.length() - 1);
+ skip(2);
+ return true;
+ } else {
+ sb.append('\\'); // Treat invalid \c control sequence as \\c
+ }
+ } else if (NON_IDENT_ESCAPES.indexOf(ch0) == -1) {
sb.setLength(sb.length() - 1);
}
return commit(1);
@@ -677,8 +699,9 @@
// Forward reference to a capture group. Forward references are always undefined so we
// can omit it from the output buffer. Additionally, if the capture group does not exist
// the whole regexp becomes invalid, so register the reference for later processing.
+ sb.setLength(sb.length() - 1);
forwardReferences.add(num);
- sb.setLength(sb.length() - 1);
+ forwardReferences.add(sb.length());
skip(1);
return true;
}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/nashorn/test/script/basic/JDK-8009230.js Fri Apr 05 19:50:10 2013 +0200
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/**
+ * JDK-8009230: Nashorn rejects extended RegExp syntax accepted by all major JS engines
+ *
+ * @test
+ * @run
+ */
+
+
+// Invalid ControlEscape/IdentityEscape character treated as literal.
+print(/\z/.exec("z")); // Invalid escape, same as /z/
+// Incomplete/Invalid ControlEscape treated as "\\c"
+print(/\c/.exec("\\c")); // same as /\\c/
+print(/\c2/.exec("\\c2")); // same as /\\c2/
+print(/\C/.exec("C")); // same as /C/
+print(/\C2/.exec("C2")); // same as /C2/
+// Incomplete HexEscapeSequence escape treated as "x".
+print(/\x/.exec("x")); // incomplete x-escape
+print(/\x1/.exec("x1")); // incomplete x-escape
+print(/\x1z/.exec("x1z")); // incomplete x-escape
+// Incomplete UnicodeEscapeSequence escape treated as "u".
+print(/\u/.exec("u")); // incomplete u-escape
+print(/\uz/.exec("uz")); // incomplete u-escape
+print(/\u1/.exec("u1")); // incomplete u-escape
+print(/\u1z/.exec("u1z")); // incomplete u-escape
+print(/\u12/.exec("u12")); // incomplete u-escape
+print(/\u12z/.exec("u12z")); // incomplete u-escape
+print(/\u123/.exec("u123")); // incomplete u-escape
+print(/\u123z/.exec("u123z")); // incomplete u-escape
+// Bad quantifier range:
+print(/x{z/.exec("x{z")); // same as /x\{z/
+print(/x{1z/.exec("x{1z")); // same as /x\{1z/
+print(/x{1,z/.exec("x{1,z")); // same as /x\{1,z/
+print(/x{1,2z/.exec("x{1,2z")); // same as /x\{1,2z/
+print(/x{10000,20000z/.exec("x{10000,20000z")); // same as /x\{10000,20000z/
+// Note: detecting that such a quantifier is invalid requires arbitrary lookahead,
+// except in Mozilla's engine, which limits the size of the numbers.
+
+// Octal escapes with a leading zero.
+/\012/; // same as /\x0a/
+
+// Nonexisting back-references smaller than 8 treated as octal escapes:
+print(/\5/.exec("\u0005")); // same as /\x05/
+print(/\7/.exec("\u0007")); // same as /\x07/
+print(/\8/.exec("\u0008")); // does not match
+
+// Invalid PatternCharacter accepted unescaped
+print(/]/.exec("]"));
+print(/{/.exec("{"));
+print(/}/.exec("}"));
+
+// Bad escapes also inside CharacterClass.
+print(/[\z]/.exec("z"));
+print(/[\c]/.exec("c"));
+print(/[\c2]/.exec("c"));
+print(/[\x]/.exec("x"));
+print(/[\x1]/.exec("x1"));
+print(/[\x1z]/.exec("x1z"));
+print(/[\u]/.exec("u"));
+print(/[\uz]/.exec("u"));
+print(/[\u1]/.exec("u"));
+print(/[\u1z]/.exec("u"));
+print(/[\u12]/.exec("u"));
+print(/[\u12z]/.exec("u"));
+print(/[\u123]/.exec("u"));
+print(/[\u123z]/.exec("u"));
+print(/[\012]/.exec("0"));
+print(/[\5]/.exec("5"));
+// And in addition:
+print(/[\B]/.exec("B"));
+print(/()()[\2]/.exec("")); // Valid backreference should be invalid.
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/nashorn/test/script/basic/JDK-8009230.js.EXPECTED Fri Apr 05 19:50:10 2013 +0200
@@ -0,0 +1,45 @@
+z
+\c
+\c2
+C
+C2
+x
+x1
+x1z
+u
+uz
+u1
+u1z
+u12
+u12z
+u123
+u123z
+x{z
+x{1z
+x{1,z
+x{1,2z
+x{10000,20000z
+
+
+null
+]
+{
+}
+z
+c
+null
+x
+x
+x
+u
+u
+u
+u
+u
+u
+u
+u
+null
+null
+B
+null