8009230: Nashorn rejects extended RegExp syntax accepted by all major JS engines
author hannesw
Fri, 05 Apr 2013 19:50:10 +0200
changeset 16781 41eadf003eff
parent 16780 f23743ec1a93
child 16782 acacf013d08a
8009230: Nashorn rejects extended RegExp syntax accepted by all major JS engines Reviewed-by: jlaskey, lagergren
nashorn/src/jdk/nashorn/internal/runtime/regexp/RegExpScanner.java
nashorn/test/script/basic/JDK-8009230.js
nashorn/test/script/basic/JDK-8009230.js.EXPECTED
--- a/nashorn/src/jdk/nashorn/internal/runtime/regexp/RegExpScanner.java	Thu Apr 04 18:32:00 2013 +0200
+++ b/nashorn/src/jdk/nashorn/internal/runtime/regexp/RegExpScanner.java	Fri Apr 05 19:50:10 2013 +0200
@@ -26,11 +26,10 @@
 package jdk.nashorn.internal.runtime.regexp;
 
 import java.util.HashMap;
-import java.util.LinkedHashSet;
+import java.util.Iterator;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
-import java.util.Set;
 import java.util.regex.PatternSyntaxException;
 
 import jdk.nashorn.internal.parser.Lexer;
@@ -58,7 +57,7 @@
     private final List<Capture> caps = new LinkedList<>();
 
     /** Forward references to capturing parenthesis to be resolved later.*/
-    private final Set<Integer> forwardReferences = new LinkedHashSet<>();
+    private final LinkedList<Integer> forwardReferences = new LinkedList<>();
 
     /** Current level of zero-width negative lookahead assertions. */
     private int negativeLookaheadLevel;
@@ -104,10 +103,20 @@
             return;
         }
 
-        for (final Integer ref : forwardReferences) {
-            if (ref.intValue() > caps.size()) {
-                neverMatches = true;
-                break;
+        Iterator<Integer> iterator = forwardReferences.descendingIterator();
+        while (iterator.hasNext()) {
+            final int pos = iterator.next();
+            final int num = iterator.next();
+            if (num > caps.size()) {
+                // A reference to a non-existent group should never match; if it is smaller than 8,
+                // convert it to an octal escape to be compatible with other engines.
+                if (num < 8) {
+                    String escape = "\\x0" + num;
+                    sb.insert(pos, escape);
+                } else {
+                    neverMatches = true;
+                    break;
+                }
             }
         }
 
@@ -402,6 +411,10 @@
             if (ch0 == '}') {
                 pop('}');
                 commit(1);
+            } else {
+                // Bad quantifier should be rejected but is accepted by all major engines
+                restart(startIn, startOut);
+                return false;
             }
 
             return true;
@@ -637,7 +650,16 @@
             throw new RuntimeException("\\ at end of pattern"); // will be converted to PatternSyntaxException
         }
         // ES 5.1 A.7 requires "not IdentifierPart" here but all major engines accept any character here.
-        if (NON_IDENT_ESCAPES.indexOf(ch0) == -1) {
+        if (ch0 == 'c') {
+            // Ignore invalid control letter escape if within a character class
+            if (inCharClass && ch1 != ']') {
+                sb.setLength(sb.length() - 1);
+                skip(2);
+                return true;
+            } else {
+                sb.append('\\'); // Treat invalid \c control sequence as \\c
+            }
+        } else if (NON_IDENT_ESCAPES.indexOf(ch0) == -1) {
             sb.setLength(sb.length() - 1);
         }
         return commit(1);
@@ -677,8 +699,9 @@
                     // Forward reference to a capture group. Forward references are always undefined so we
                     // can omit it from the output buffer. Additionally, if the capture group does not exist
                     // the whole regexp becomes invalid, so register the reference for later processing.
+                    sb.setLength(sb.length() - 1);
                     forwardReferences.add(num);
-                    sb.setLength(sb.length() - 1);
+                    forwardReferences.add(sb.length());
                     skip(1);
                     return true;
                 }
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/nashorn/test/script/basic/JDK-8009230.js	Fri Apr 05 19:50:10 2013 +0200
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/**
+ * JDK-8009230: Nashorn rejects extended RegExp syntax accepted by all major JS engines
+ *
+ * @test
+ * @run
+ */
+
+
+// Invalid ControlEscape/IdentityEscape character treated as literal.
+print(/\z/.exec("z"));  // Invalid escape, same as /z/
+// Incomplete/Invalid ControlEscape treated as "\\c"
+print(/\c/.exec("\\c"));  // same as /\\c/
+print(/\c2/.exec("\\c2"));  // same as /\\c2/
+print(/\C/.exec("C"));  // same as /C/
+print(/\C2/.exec("C2"));  // same as /C2/
+// Incomplete HexEscapeSequence escape treated as "x".
+print(/\x/.exec("x"));  // incomplete x-escape
+print(/\x1/.exec("x1"));  // incomplete x-escape
+print(/\x1z/.exec("x1z"));  // incomplete x-escape
+// Incomplete UnicodeEscapeSequence escape treated as "u".
+print(/\u/.exec("u"));  // incomplete u-escape
+print(/\uz/.exec("uz"));  // incomplete u-escape
+print(/\u1/.exec("u1"));  // incomplete u-escape
+print(/\u1z/.exec("u1z"));  // incomplete u-escape
+print(/\u12/.exec("u12"));  // incomplete u-escape
+print(/\u12z/.exec("u12z"));  // incomplete u-escape
+print(/\u123/.exec("u123"));  // incomplete u-escape
+print(/\u123z/.exec("u123z"));  // incomplete u-escape
+// Bad quantifier range:
+print(/x{z/.exec("x{z"));  // same as /x\{z/
+print(/x{1z/.exec("x{1z"));  // same as /x\{1z/
+print(/x{1,z/.exec("x{1,z"));  // same as /x\{1,z/
+print(/x{1,2z/.exec("x{1,2z"));  // same as /x\{1,2z/
+print(/x{10000,20000z/.exec("x{10000,20000z"));  // same as /x\{10000,20000z/
+// Note: Determining the invalidity requires arbitrary lookahead,
+// except in Mozilla, which limits the numbers.
+
+// Zero-initialized Octal escapes.
+/\012/;    // same as /\x0a/
+
+// Nonexisting back-references smaller than 8 treated as octal escapes:
+print(/\5/.exec("\u0005"));  // same as /\x05/
+print(/\7/.exec("\u0007"));  // same as /\x07/
+print(/\8/.exec("\u0008"));  // does not match
+
+// Invalid PatternCharacter accepted unescaped
+print(/]/.exec("]"));
+print(/{/.exec("{"));
+print(/}/.exec("}"));
+
+// Bad escapes also inside CharacterClass.
+print(/[\z]/.exec("z"));
+print(/[\c]/.exec("c"));
+print(/[\c2]/.exec("c"));
+print(/[\x]/.exec("x"));
+print(/[\x1]/.exec("x1"));
+print(/[\x1z]/.exec("x1z"));
+print(/[\u]/.exec("u"));
+print(/[\uz]/.exec("u"));
+print(/[\u1]/.exec("u"));
+print(/[\u1z]/.exec("u"));
+print(/[\u12]/.exec("u"));
+print(/[\u12z]/.exec("u"));
+print(/[\u123]/.exec("u"));
+print(/[\u123z]/.exec("u"));
+print(/[\012]/.exec("0"));
+print(/[\5]/.exec("5"));
+// And in addition:
+print(/[\B]/.exec("B"));
+print(/()()[\2]/.exec(""));  // \2 names an existing group, but a back-reference is invalid inside a character class.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/nashorn/test/script/basic/JDK-8009230.js.EXPECTED	Fri Apr 05 19:50:10 2013 +0200
@@ -0,0 +1,45 @@
+z
+\c
+\c2
+C
+C2
+x
+x1
+x1z
+u
+uz
+u1
+u1z
+u12
+u12z
+u123
+u123z
+x{z
+x{1z
+x{1,z
+x{1,2z
+x{10000,20000z
+
+
+null
+]
+{
+}
+z
+c
+null
+x
+x
+x
+u
+u
+u
+u
+u
+u
+u
+u
+null
+null
+B
+null