# HG changeset patch # User hannesw # Date 1365595211 -7200 # Node ID 1a8ffed975647e0f1ddc0a2d78aaa6ce262da183 # Parent acacf013d08ac6ec8c199f64e3e89deec7d34cac 8011714: Regexp decimal escape handling still not correct Reviewed-by: lagergren, attila diff -r acacf013d08a -r 1a8ffed97564 nashorn/src/jdk/nashorn/internal/runtime/regexp/RegExpScanner.java --- a/nashorn/src/jdk/nashorn/internal/runtime/regexp/RegExpScanner.java Tue Apr 09 08:36:32 2013 -0300 +++ b/nashorn/src/jdk/nashorn/internal/runtime/regexp/RegExpScanner.java Wed Apr 10 14:00:11 2013 +0200 @@ -108,15 +108,11 @@ final int pos = iterator.next(); final int num = iterator.next(); if (num > caps.size()) { - // Non-existing reference should never match, if smaller than 8 convert to octal escape - // to be compatible with other engines. - if (num < 8) { - String escape = "\\x0" + num; - sb.insert(pos, escape); - } else { - neverMatches = true; - break; - } + // Non-existing backreference. If the number begins with a valid octal convert it to + // Unicode escape and append the rest to a literal character sequence. + final StringBuilder buffer = new StringBuilder(); + octalOrLiteral(Integer.toString(num), buffer); + sb.insert(pos, buffer); } } @@ -632,7 +628,7 @@ // form "\\ca".match([string with ascii 1 at char0]). Translating // them to unicode does it though. sb.setLength(sb.length() - 1); - unicode(c - 'A' + 1); + unicode(c - 'A' + 1, sb); skip(1); return true; } @@ -673,7 +669,7 @@ final int startIn = position; final int startOut = sb.length(); - if (ch0 == '0' && !isDecimalDigit(ch1)) { + if (ch0 == '0' && !isOctalDigit(ch1)) { skip(1); // DecimalEscape :: 0. If i is zero, return the EscapeValue consisting of a character (Unicodevalue0000); sb.append("\u0000"); @@ -681,50 +677,56 @@ } if (isDecimalDigit(ch0)) { - final int num = ch0 - '0'; - // Single digit escape, treat as backreference. - if (!isDecimalDigit(ch1)) { - if (num <= caps.size() && caps.get(num - 1).getNegativeLookaheadLevel() > 0) { - // Captures that live inside a negative lookahead are dead after the - // lookahead and will be undefined if referenced from outside. - if (caps.get(num - 1).getNegativeLookaheadLevel() > negativeLookaheadLevel) { - sb.setLength(sb.length() - 1); - } else { - sb.append(ch0); + if (ch0 == '0') { + // We know this is an octal escape. + if (inCharClass) { + // Convert octal escape to unicode escape if inside character class. + int octalValue = 0; + while (isOctalDigit(ch0)) { + octalValue = octalValue * 8 + ch0 - '0'; + skip(1); } - skip(1); - return true; - } else if (num > caps.size()) { - // Forward reference to a capture group. Forward references are always undefined so we - // can omit it from the output buffer. Additionally, if the capture group does not exist - // the whole regexp becomes invalid, so register the reference for later processing. - sb.setLength(sb.length() - 1); - forwardReferences.add(num); - forwardReferences.add(sb.length()); - skip(1); - return true; + + unicode(octalValue, sb); + + } else { + // Copy decimal escape as-is + decimalDigits(); } - } - - if (inCharClass) { - // Convert octal escape to unicode escape if inside character class. - StringBuilder digit = new StringBuilder(4); + } else { + // This should be a backreference, but could also be an octal escape or even a literal string. + int decimalValue = 0; while (isDecimalDigit(ch0)) { - digit.append(ch0); + decimalValue = decimalValue * 10 + ch0 - '0'; skip(1); } - int value = Integer.parseInt(digit.toString(), 8); //throws exception that leads to SyntaxError if not octal - if (value > 0xff) { - throw new NumberFormatException(digit.toString()); + if (inCharClass) { + // No backreferences in character classes. Encode as unicode escape or literal char sequence + sb.setLength(sb.length() - 1); + octalOrLiteral(Integer.toString(decimalValue), sb); + + } else if (decimalValue <= caps.size() && caps.get(decimalValue - 1).getNegativeLookaheadLevel() > 0) { + // Captures that live inside a negative lookahead are dead after the + // lookahead and will be undefined if referenced from outside. + if (caps.get(decimalValue - 1).getNegativeLookaheadLevel() > negativeLookaheadLevel) { + sb.setLength(sb.length() - 1); + } else { + sb.append(decimalValue); + } + } else if (decimalValue > caps.size()) { + // Forward reference to a capture group. Forward references are always undefined so we can omit + // it from the output buffer. However, if the target capture does not exist, we need to rewrite + // the reference as hex escape or literal string, so register the reference for later processing. + sb.setLength(sb.length() - 1); + forwardReferences.add(decimalValue); + forwardReferences.add(sb.length()); + } else { + // Append as backreference + sb.append(decimalValue); } - unicode(value); - - } else { - // Copy decimal escape as-is - decimalDigits(); } return true; } @@ -965,13 +967,41 @@ return true; } - private void unicode(final int value) { + private void unicode(final int value, final StringBuilder buffer) { final String hex = Integer.toHexString(value); - sb.append('u'); + buffer.append('u'); for (int i = 0; i < 4 - hex.length(); i++) { - sb.append('0'); + buffer.append('0'); } - sb.append(hex); + buffer.append(hex); + } + + // Convert what would have been a backreference into a unicode escape, or a number literal, or both. + private void octalOrLiteral(final String numberLiteral, final StringBuilder buffer) { + final int length = numberLiteral.length(); + int octalValue = 0; + int pos = 0; + // Maximum value for octal escape is 0377 (255) so we stop the loop at 32 + while (pos < length && octalValue < 0x20) { + final char ch = numberLiteral.charAt(pos); + if (isOctalDigit(ch)) { + octalValue = octalValue * 8 + ch - '0'; + } else { + break; + } + pos++; + } + if (octalValue > 0) { + buffer.append('\\'); + unicode(octalValue, buffer); + buffer.append(numberLiteral.substring(pos)); + } else { + buffer.append(numberLiteral); + } + } + + private static boolean isOctalDigit(final char ch) { + return ch >= '0' && ch <= '7'; } private static boolean isDecimalDigit(final char ch) { diff -r acacf013d08a -r 1a8ffed97564 nashorn/test/script/basic/JDK-8011714.js --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/nashorn/test/script/basic/JDK-8011714.js Wed Apr 10 14:00:11 2013 +0200 @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/** + * JDK-8011714: Regexp decimal escape handling still not correct + * + * @test + * @run + */ + +// \0 should be interpreted as character here +print(/\08/.test("\x008")); +print(/[\08]/.test("8")); +print(/[\08]/.test("\x00")); + +// Can't be converted to octal thus encoded as literal char sequence +print(/\8/.exec("\\8")); +print(/[\8]/.exec("\\")); +print(/[\8]/.exec("8")); + +// 0471 is too high for an octal escape so it is \047 outside a character class +// and \\471 inside a character class +print(/\471/.exec("\x271")); +print(/[\471]/.exec("1")); +print(/[\471]/.exec("\x27")); + +// 0366 is a valid octal escape (246) +print(/\366/.test("\xf6")); +print(/[\366]/.test("\xf6")); +print(/[\366]/.test("\xf6")); + +// more tests for conversion of invalid backreferences to octal escapes or literals +print(/(a)(b)(c)(d)\4/.exec("abcdd")); +print(/(a)(b)(c)(d)\4x/.exec("abcddx")); +print(/(a)(b)(c)(d)\47/.exec("abcdd7")); +print(/(a)(b)(c)(d)\47/.exec("abcd\x27")); +print(/(a)(b)(c)(d)\47xyz/.exec("abcd\x27xyz")); +print(/(a)(b)(c)(d)[\47]/.exec("abcd\x27")); +print(/(a)(b)(c)(d)[\47]xyz/.exec("abcd\x27xyz")); +print(/(a)(b)(c)(d)\48/.exec("abcd\x048")); +print(/(a)(b)(c)(d)\48xyz/.exec("abcd\x048xyz")); +print(/(a)(b)(c)(d)[\48]/.exec("abcd\x04")); +print(/(a)(b)(c)(d)[\48]xyz/.exec("abcd\x04xyz")); +print(/(a)(b)(c)(d)\84/.exec("abcd84")); +print(/(a)(b)(c)(d)\84xyz/.exec("abcd84xyz")); +print(/(a)(b)(c)(d)[\84]/.exec("abcd8")); +print(/(a)(b)(c)(d)[\84]xyz/.exec("abcd8xyz")); + diff -r acacf013d08a -r 1a8ffed97564 nashorn/test/script/basic/JDK-8011714.js.EXPECTED --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/nashorn/test/script/basic/JDK-8011714.js.EXPECTED Wed Apr 10 14:00:11 2013 +0200 @@ -0,0 +1,27 @@ +true +true +true +8 +null +8 +'1 +1 +' +true +true +true +abcdd,a,b,c,d +abcddx,a,b,c,d +null +abcd',a,b,c,d +abcd'xyz,a,b,c,d +abcd',a,b,c,d +abcd'xyz,a,b,c,d +abcd8,a,b,c,d +abcd8xyz,a,b,c,d +abcd,a,b,c,d +abcdxyz,a,b,c,d +abcd84,a,b,c,d +abcd84xyz,a,b,c,d +abcd8,a,b,c,d +abcd8xyz,a,b,c,d