8011714: Regexp decimal escape handling still not correct
Reviewed-by: lagergren, attila
--- a/nashorn/src/jdk/nashorn/internal/runtime/regexp/RegExpScanner.java Tue Apr 09 08:36:32 2013 -0300
+++ b/nashorn/src/jdk/nashorn/internal/runtime/regexp/RegExpScanner.java Wed Apr 10 14:00:11 2013 +0200
@@ -108,15 +108,11 @@
final int pos = iterator.next();
final int num = iterator.next();
if (num > caps.size()) {
- // Non-existing reference should never match, if smaller than 8 convert to octal escape
- // to be compatible with other engines.
- if (num < 8) {
- String escape = "\\x0" + num;
- sb.insert(pos, escape);
- } else {
- neverMatches = true;
- break;
- }
+ // Non-existing backreference. If the number begins with a valid octal escape, convert that part
+ // to a Unicode escape and append the remaining digits as a literal character sequence.
+ final StringBuilder buffer = new StringBuilder();
+ octalOrLiteral(Integer.toString(num), buffer);
+ sb.insert(pos, buffer);
}
}
@@ -632,7 +628,7 @@
// form "\\ca".match([string with ascii 1 at char0]). Translating
// them to unicode does it though.
sb.setLength(sb.length() - 1);
- unicode(c - 'A' + 1);
+ unicode(c - 'A' + 1, sb);
skip(1);
return true;
}
@@ -673,7 +669,7 @@
final int startIn = position;
final int startOut = sb.length();
- if (ch0 == '0' && !isDecimalDigit(ch1)) {
+ if (ch0 == '0' && !isOctalDigit(ch1)) {
skip(1);
// DecimalEscape :: 0. If i is zero, return the EscapeValue consisting of a <NUL> character (Unicodevalue0000);
sb.append("\u0000");
@@ -681,50 +677,56 @@
}
if (isDecimalDigit(ch0)) {
- final int num = ch0 - '0';
- // Single digit escape, treat as backreference.
- if (!isDecimalDigit(ch1)) {
- if (num <= caps.size() && caps.get(num - 1).getNegativeLookaheadLevel() > 0) {
- // Captures that live inside a negative lookahead are dead after the
- // lookahead and will be undefined if referenced from outside.
- if (caps.get(num - 1).getNegativeLookaheadLevel() > negativeLookaheadLevel) {
- sb.setLength(sb.length() - 1);
- } else {
- sb.append(ch0);
+ if (ch0 == '0') {
+ // We know this is an octal escape.
+ if (inCharClass) {
+ // Convert octal escape to unicode escape if inside character class.
+ int octalValue = 0;
+ while (isOctalDigit(ch0)) {
+ octalValue = octalValue * 8 + ch0 - '0';
+ skip(1);
}
- skip(1);
- return true;
- } else if (num > caps.size()) {
- // Forward reference to a capture group. Forward references are always undefined so we
- // can omit it from the output buffer. Additionally, if the capture group does not exist
- // the whole regexp becomes invalid, so register the reference for later processing.
- sb.setLength(sb.length() - 1);
- forwardReferences.add(num);
- forwardReferences.add(sb.length());
- skip(1);
- return true;
+
+ unicode(octalValue, sb);
+
+ } else {
+ // Copy decimal escape as-is
+ decimalDigits();
}
- }
-
- if (inCharClass) {
- // Convert octal escape to unicode escape if inside character class.
- StringBuilder digit = new StringBuilder(4);
+ } else {
+ // This should be a backreference, but could also be an octal escape or even a literal string.
+ int decimalValue = 0;
while (isDecimalDigit(ch0)) {
- digit.append(ch0);
+ decimalValue = decimalValue * 10 + ch0 - '0';
skip(1);
}
- int value = Integer.parseInt(digit.toString(), 8); //throws exception that leads to SyntaxError if not octal
- if (value > 0xff) {
- throw new NumberFormatException(digit.toString());
+ if (inCharClass) {
+ // No backreferences in character classes. Encode as unicode escape or literal char sequence
+ sb.setLength(sb.length() - 1);
+ octalOrLiteral(Integer.toString(decimalValue), sb);
+
+ } else if (decimalValue <= caps.size() && caps.get(decimalValue - 1).getNegativeLookaheadLevel() > 0) {
+ // Captures that live inside a negative lookahead are dead after the
+ // lookahead and will be undefined if referenced from outside.
+ if (caps.get(decimalValue - 1).getNegativeLookaheadLevel() > negativeLookaheadLevel) {
+ sb.setLength(sb.length() - 1);
+ } else {
+ sb.append(decimalValue);
+ }
+ } else if (decimalValue > caps.size()) {
+ // Forward reference to a capture group. Forward references are always undefined so we can omit
+ // it from the output buffer. However, if the target capture does not exist, we need to rewrite
+ // the reference as unicode escape or literal string, so register the reference for later processing.
+ sb.setLength(sb.length() - 1);
+ forwardReferences.add(decimalValue);
+ forwardReferences.add(sb.length());
+ } else {
+ // Append as backreference
+ sb.append(decimalValue);
}
- unicode(value);
-
- } else {
- // Copy decimal escape as-is
- decimalDigits();
}
return true;
}
@@ -965,13 +967,41 @@
return true;
}
- private void unicode(final int value) {
+ private void unicode(final int value, final StringBuilder buffer) {
final String hex = Integer.toHexString(value);
- sb.append('u');
+ buffer.append('u');
for (int i = 0; i < 4 - hex.length(); i++) {
- sb.append('0');
+ buffer.append('0');
}
- sb.append(hex);
+ buffer.append(hex);
+ }
+
+ // Convert what would have been a backreference into a unicode escape, or a number literal, or both.
+ private void octalOrLiteral(final String numberLiteral, final StringBuilder buffer) {
+ final int length = numberLiteral.length();
+ int octalValue = 0;
+ int pos = 0;
+ // Maximum octal escape is 0377 (255): once the value reaches 040 (32), one more digit would exceed it, so stop there
+ while (pos < length && octalValue < 0x20) {
+ final char ch = numberLiteral.charAt(pos);
+ if (isOctalDigit(ch)) {
+ octalValue = octalValue * 8 + ch - '0';
+ } else {
+ break;
+ }
+ pos++;
+ }
+ if (octalValue > 0) {
+ buffer.append('\\');
+ unicode(octalValue, buffer);
+ buffer.append(numberLiteral.substring(pos));
+ } else {
+ buffer.append(numberLiteral);
+ }
+ }
+
+ private static boolean isOctalDigit(final char ch) {
+ return ch >= '0' && ch <= '7';
}
private static boolean isDecimalDigit(final char ch) {
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/nashorn/test/script/basic/JDK-8011714.js Wed Apr 10 14:00:11 2013 +0200
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/**
+ * JDK-8011714: Regexp decimal escape handling still not correct
+ *
+ * @test
+ * @run
+ */
+
+// \0 should be interpreted as <NUL> character here
+print(/\08/.test("\x008"));
+print(/[\08]/.test("8"));
+print(/[\08]/.test("\x00"));
+
+// \8 can't be converted to an octal escape, so it is encoded as a literal character sequence
+print(/\8/.exec("\\8"));
+print(/[\8]/.exec("\\"));
+print(/[\8]/.exec("8"));
+
+// 0471 is too high for an octal escape, so it is treated as \047 followed by a literal "1",
+// both outside and inside a character class
+print(/\471/.exec("\x271"));
+print(/[\471]/.exec("1"));
+print(/[\471]/.exec("\x27"));
+
+// 0366 is a valid octal escape (246)
+print(/\366/.test("\xf6"));
+print(/[\366]/.test("\xf6"));
+print(/[\366]/.test("\xf6"));
+
+// more tests for conversion of invalid backreferences to octal escapes or literals
+print(/(a)(b)(c)(d)\4/.exec("abcdd"));
+print(/(a)(b)(c)(d)\4x/.exec("abcddx"));
+print(/(a)(b)(c)(d)\47/.exec("abcdd7"));
+print(/(a)(b)(c)(d)\47/.exec("abcd\x27"));
+print(/(a)(b)(c)(d)\47xyz/.exec("abcd\x27xyz"));
+print(/(a)(b)(c)(d)[\47]/.exec("abcd\x27"));
+print(/(a)(b)(c)(d)[\47]xyz/.exec("abcd\x27xyz"));
+print(/(a)(b)(c)(d)\48/.exec("abcd\x048"));
+print(/(a)(b)(c)(d)\48xyz/.exec("abcd\x048xyz"));
+print(/(a)(b)(c)(d)[\48]/.exec("abcd\x04"));
+print(/(a)(b)(c)(d)[\48]xyz/.exec("abcd\x04xyz"));
+print(/(a)(b)(c)(d)\84/.exec("abcd84"));
+print(/(a)(b)(c)(d)\84xyz/.exec("abcd84xyz"));
+print(/(a)(b)(c)(d)[\84]/.exec("abcd8"));
+print(/(a)(b)(c)(d)[\84]xyz/.exec("abcd8xyz"));
+
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/nashorn/test/script/basic/JDK-8011714.js.EXPECTED Wed Apr 10 14:00:11 2013 +0200
@@ -0,0 +1,27 @@
+true
+true
+true
+8
+null
+8
+'1
+1
+'
+true
+true
+true
+abcdd,a,b,c,d
+abcddx,a,b,c,d
+null
+abcd',a,b,c,d
+abcd'xyz,a,b,c,d
+abcd',a,b,c,d
+abcd'xyz,a,b,c,d
+abcd8,a,b,c,d
+abcd8xyz,a,b,c,d
+abcd,a,b,c,d
+abcdxyz,a,b,c,d
+abcd84,a,b,c,d
+abcd84xyz,a,b,c,d
+abcd8,a,b,c,d
+abcd8xyz,a,b,c,d