8011714: Regexp decimal escape handling still not correct
author hannesw
Wed, 10 Apr 2013 14:00:11 +0200
changeset 16938 1a8ffed97564
parent 16782 acacf013d08a
child 16939 9e3a9eda5775
8011714: Regexp decimal escape handling still not correct
Reviewed-by: lagergren, attila
nashorn/src/jdk/nashorn/internal/runtime/regexp/RegExpScanner.java
nashorn/test/script/basic/JDK-8011714.js
nashorn/test/script/basic/JDK-8011714.js.EXPECTED
--- a/nashorn/src/jdk/nashorn/internal/runtime/regexp/RegExpScanner.java	Tue Apr 09 08:36:32 2013 -0300
+++ b/nashorn/src/jdk/nashorn/internal/runtime/regexp/RegExpScanner.java	Wed Apr 10 14:00:11 2013 +0200
@@ -108,15 +108,11 @@
             final int pos = iterator.next();
             final int num = iterator.next();
             if (num > caps.size()) {
-                // Non-existing reference should never match, if smaller than 8 convert to octal escape
-                // to be compatible with other engines.
-                if (num < 8) {
-                    String escape = "\\x0" + num;
-                    sb.insert(pos, escape);
-                } else {
-                    neverMatches = true;
-                    break;
-                }
+                // Non-existing backreference. If the number begins with a valid octal convert it to
+                // Unicode escape and append the rest to a literal character sequence.
+                final StringBuilder buffer = new StringBuilder();
+                octalOrLiteral(Integer.toString(num), buffer);
+                sb.insert(pos, buffer);
             }
         }
 
@@ -632,7 +628,7 @@
             // form "\\ca".match([string with ascii 1 at char0]). Translating
             // them to unicode does it though.
             sb.setLength(sb.length() - 1);
-            unicode(c - 'A' + 1);
+            unicode(c - 'A' + 1, sb);
             skip(1);
             return true;
         }
@@ -673,7 +669,7 @@
         final int startIn  = position;
         final int startOut = sb.length();
 
-        if (ch0 == '0' && !isDecimalDigit(ch1)) {
+        if (ch0 == '0' && !isOctalDigit(ch1)) {
             skip(1);
             //  DecimalEscape :: 0. If i is zero, return the EscapeValue consisting of a <NUL> character (Unicodevalue0000);
             sb.append("\u0000");
@@ -681,50 +677,56 @@
         }
 
         if (isDecimalDigit(ch0)) {
-            final int num = ch0 - '0';
 
-            // Single digit escape, treat as backreference.
-            if (!isDecimalDigit(ch1)) {
-                if (num <= caps.size() && caps.get(num - 1).getNegativeLookaheadLevel() > 0) {
-                    //  Captures that live inside a negative lookahead are dead after the
-                    //  lookahead and will be undefined if referenced from outside.
-                    if (caps.get(num - 1).getNegativeLookaheadLevel() > negativeLookaheadLevel) {
-                        sb.setLength(sb.length() - 1);
-                    } else {
-                        sb.append(ch0);
+            if (ch0 == '0') {
+                // We know this is an octal escape.
+                if (inCharClass) {
+                    // Convert octal escape to unicode escape if inside character class.
+                    int octalValue = 0;
+                    while (isOctalDigit(ch0)) {
+                        octalValue = octalValue * 8 + ch0 - '0';
+                        skip(1);
                     }
-                    skip(1);
-                    return true;
-                } else if (num > caps.size()) {
-                    // Forward reference to a capture group. Forward references are always undefined so we
-                    // can omit it from the output buffer. Additionally, if the capture group does not exist
-                    // the whole regexp becomes invalid, so register the reference for later processing.
-                    sb.setLength(sb.length() - 1);
-                    forwardReferences.add(num);
-                    forwardReferences.add(sb.length());
-                    skip(1);
-                    return true;
+
+                    unicode(octalValue, sb);
+
+                } else {
+                    // Copy decimal escape as-is
+                    decimalDigits();
                 }
-            }
-
-            if (inCharClass) {
-                // Convert octal escape to unicode escape if inside character class.
-                StringBuilder digit = new StringBuilder(4);
+            } else {
+                // This should be a backreference, but could also be an octal escape or even a literal string.
+                int decimalValue = 0;
                 while (isDecimalDigit(ch0)) {
-                    digit.append(ch0);
+                    decimalValue = decimalValue * 10 + ch0 - '0';
                     skip(1);
                 }
 
-                int value = Integer.parseInt(digit.toString(), 8); //throws exception that leads to SyntaxError if not octal
-                if (value > 0xff) {
-                    throw new NumberFormatException(digit.toString());
+                if (inCharClass) {
+                    // No backreferences in character classes. Encode as unicode escape or literal char sequence
+                    sb.setLength(sb.length() - 1);
+                    octalOrLiteral(Integer.toString(decimalValue), sb);
+
+                } else if (decimalValue <= caps.size() && caps.get(decimalValue - 1).getNegativeLookaheadLevel() > 0) {
+                    //  Captures that live inside a negative lookahead are dead after the
+                    //  lookahead and will be undefined if referenced from outside.
+                    if (caps.get(decimalValue - 1).getNegativeLookaheadLevel() > negativeLookaheadLevel) {
+                        sb.setLength(sb.length() - 1);
+                    } else {
+                        sb.append(decimalValue);
+                    }
+                } else if (decimalValue > caps.size()) {
+                    // Forward reference to a capture group. Forward references are always undefined so we can omit
+                    // it from the output buffer. However, if the target capture does not exist, we need to rewrite
+                    // the reference as hex escape or literal string, so register the reference for later processing.
+                    sb.setLength(sb.length() - 1);
+                    forwardReferences.add(decimalValue);
+                    forwardReferences.add(sb.length());
+                } else {
+                    // Append as backreference
+                    sb.append(decimalValue);
                 }
 
-                unicode(value);
-
-            } else {
-                // Copy decimal escape as-is
-                decimalDigits();
             }
             return true;
         }
@@ -965,13 +967,41 @@
         return true;
     }
 
-    private void unicode(final int value) {
+    private void unicode(final int value, final StringBuilder buffer) {
         final String hex = Integer.toHexString(value);
-        sb.append('u');
+        buffer.append('u');
         for (int i = 0; i < 4 - hex.length(); i++) {
-            sb.append('0');
+            buffer.append('0');
         }
-        sb.append(hex);
+        buffer.append(hex);
+    }
+
+    // Convert what would have been a backreference into a unicode escape, or a number literal, or both.
+    private void octalOrLiteral(final String numberLiteral, final StringBuilder buffer) {
+        final int length = numberLiteral.length();
+        int octalValue = 0;
+        int pos = 0;
+        // Maximum value for octal escape is 0377 (255) so we stop the loop at 32
+        while (pos < length && octalValue < 0x20) {
+            final char ch = numberLiteral.charAt(pos);
+            if (isOctalDigit(ch)) {
+                octalValue = octalValue * 8 + ch - '0';
+            } else {
+                break;
+            }
+            pos++;
+        }
+        if (octalValue > 0) {
+            buffer.append('\\');
+            unicode(octalValue, buffer);
+            buffer.append(numberLiteral.substring(pos));
+        } else {
+            buffer.append(numberLiteral);
+        }
+    }
+
+    private static boolean isOctalDigit(final char ch) {
+        return ch >= '0' && ch <= '7';
     }
 
     private static boolean isDecimalDigit(final char ch) {
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/nashorn/test/script/basic/JDK-8011714.js	Wed Apr 10 14:00:11 2013 +0200
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/**
+ * JDK-8011714: Regexp decimal escape handling still not correct
+ *
+ * @test
+ * @run
+ */
+
+// \0 should be interpreted as <NUL> character here
+print(/\08/.test("\x008"));
+print(/[\08]/.test("8"));
+print(/[\08]/.test("\x00"));
+
+// Can't be converted to octal thus encoded as literal char sequence
+print(/\8/.exec("\\8"));
+print(/[\8]/.exec("\\"));
+print(/[\8]/.exec("8"));
+
+// 0471 is too high for an octal escape so it is \047 outside a character class
+// and \\471 inside a character class
+print(/\471/.exec("\x271"));
+print(/[\471]/.exec("1"));
+print(/[\471]/.exec("\x27"));
+
+// 0366 is a valid octal escape (246)
+print(/\366/.test("\xf6"));
+print(/[\366]/.test("\xf6"));
+print(/[\366]/.test("\xf6"));
+
+// more tests for conversion of invalid backreferences to octal escapes or literals
+print(/(a)(b)(c)(d)\4/.exec("abcdd"));
+print(/(a)(b)(c)(d)\4x/.exec("abcddx"));
+print(/(a)(b)(c)(d)\47/.exec("abcdd7"));
+print(/(a)(b)(c)(d)\47/.exec("abcd\x27"));
+print(/(a)(b)(c)(d)\47xyz/.exec("abcd\x27xyz"));
+print(/(a)(b)(c)(d)[\47]/.exec("abcd\x27"));
+print(/(a)(b)(c)(d)[\47]xyz/.exec("abcd\x27xyz"));
+print(/(a)(b)(c)(d)\48/.exec("abcd\x048"));
+print(/(a)(b)(c)(d)\48xyz/.exec("abcd\x048xyz"));
+print(/(a)(b)(c)(d)[\48]/.exec("abcd\x04"));
+print(/(a)(b)(c)(d)[\48]xyz/.exec("abcd\x04xyz"));
+print(/(a)(b)(c)(d)\84/.exec("abcd84"));
+print(/(a)(b)(c)(d)\84xyz/.exec("abcd84xyz"));
+print(/(a)(b)(c)(d)[\84]/.exec("abcd8"));
+print(/(a)(b)(c)(d)[\84]xyz/.exec("abcd8xyz"));
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/nashorn/test/script/basic/JDK-8011714.js.EXPECTED	Wed Apr 10 14:00:11 2013 +0200
@@ -0,0 +1,27 @@
+true
+true
+true
+8
+null
+8
+'1
+1
+'
+true
+true
+true
+abcdd,a,b,c,d
+abcddx,a,b,c,d
+null
+abcd',a,b,c,d
+abcd'xyz,a,b,c,d
+abcd',a,b,c,d
+abcd'xyz,a,b,c,d
+abcd8,a,b,c,d
+abcd8xyz,a,b,c,d
+abcd,a,b,c,d
+abcdxyz,a,b,c,d
+abcd84,a,b,c,d
+abcd84xyz,a,b,c,d
+abcd8,a,b,c,d
+abcd8xyz,a,b,c,d