8016681: regex capture behaves differently than on V8
Reviewed-by: lagergren, sundar
--- a/nashorn/src/jdk/nashorn/internal/runtime/regexp/RegExpScanner.java Wed Jul 10 13:25:07 2013 +0530
+++ b/nashorn/src/jdk/nashorn/internal/runtime/regexp/RegExpScanner.java Wed Jul 10 10:54:19 2013 +0200
@@ -57,7 +57,10 @@
private final LinkedList<Integer> forwardReferences = new LinkedList<>();
/** Current level of zero-width negative lookahead assertions. */
- private int negativeLookaheadLevel;
+ private int negLookaheadLevel;
+
+ /** Sequential id of current top-level zero-width negative lookahead assertion. */
+ private int negLookaheadGroup;
/** Are we currently inside a character class? */
private boolean inCharClass = false;
@@ -68,17 +71,18 @@
private static final String NON_IDENT_ESCAPES = "$^*+(){}[]|\\.?-";
private static class Capture {
- /**
- * Zero-width negative lookaheads enclosing the capture.
- */
- private final int negativeLookaheadLevel;
+ /** Nesting level of zero-width negative lookaheads enclosing the capture (0 if none). */
+ private final int negLookaheadLevel;
+ /** Sequential id of the top-level negative lookahead containing the capture. */
+ private final int negLookaheadGroup;
- Capture(final int negativeLookaheadLevel) {
- this.negativeLookaheadLevel = negativeLookaheadLevel;
+ Capture(final int negLookaheadGroup, final int negLookaheadLevel) {
+ this.negLookaheadGroup = negLookaheadGroup;
+ this.negLookaheadLevel = negLookaheadLevel;
}
- public int getNegativeLookaheadLevel() {
- return negativeLookaheadLevel;
+ boolean isContained(final int group, final int level) {
+ // A capture outside any negative lookahead (level 0) can be referenced from anywhere; a capture inside one only from the same top-level lookahead at the same or a deeper nesting level.
+ return this.negLookaheadLevel == 0 || (group == this.negLookaheadGroup && level >= this.negLookaheadLevel);
}
}
@@ -152,7 +156,7 @@
BitVector vec = null;
for (int i = 0; i < caps.size(); i++) {
final Capture cap = caps.get(i);
- if (cap.getNegativeLookaheadLevel() > 0) {
+ if (cap.negLookaheadLevel > 0) {
if (vec == null) {
vec = new BitVector(caps.size() + 1);
}
@@ -311,11 +315,14 @@
commit(3);
if (isNegativeLookahead) {
- negativeLookaheadLevel++;
+ if (negLookaheadLevel == 0) {
+ negLookaheadGroup++;
+ }
+ negLookaheadLevel++;
}
disjunction();
if (isNegativeLookahead) {
- negativeLookaheadLevel--;
+ negLookaheadLevel--;
}
if (ch0 == ')') {
@@ -432,20 +439,17 @@
}
if (ch0 == '(') {
- boolean capturingParens = true;
commit(1);
if (ch0 == '?' && ch1 == ':') {
- capturingParens = false;
commit(2);
+ } else {
+ caps.add(new Capture(negLookaheadGroup, negLookaheadLevel));
}
disjunction();
if (ch0 == ')') {
commit(1);
- if (capturingParens) {
- caps.add(new Capture(negativeLookaheadLevel));
- }
return true;
}
}
@@ -675,24 +679,22 @@
sb.setLength(sb.length() - 1);
octalOrLiteral(Integer.toString(decimalValue), sb);
- } else if (decimalValue <= caps.size() && caps.get(decimalValue - 1).getNegativeLookaheadLevel() > 0) {
- // Captures that live inside a negative lookahead are dead after the
- // lookahead and will be undefined if referenced from outside.
- if (caps.get(decimalValue - 1).getNegativeLookaheadLevel() > negativeLookaheadLevel) {
+ } else if (decimalValue <= caps.size()) {
+ // Captures inside a negative lookahead are undefined when referenced from the outside.
+ if (!caps.get(decimalValue - 1).isContained(negLookaheadGroup, negLookaheadLevel)) {
+ // Reference to capture in negative lookahead, omit from output buffer.
sb.setLength(sb.length() - 1);
} else {
+ // Append backreference to output buffer.
sb.append(decimalValue);
}
- } else if (decimalValue > caps.size()) {
- // Forward reference to a capture group. Forward references are always undefined so we can omit
- // it from the output buffer. However, if the target capture does not exist, we need to rewrite
- // the reference as hex escape or literal string, so register the reference for later processing.
+ } else {
+ // A forward reference to a capture group is always undefined, so we can omit it from the output buffer.
+ // However, if the target capture does not exist, we need to rewrite the reference as hex escape
+ // or literal string, so register the reference for later processing.
sb.setLength(sb.length() - 1);
forwardReferences.add(decimalValue);
forwardReferences.add(sb.length());
- } else {
- // Append as backreference
- sb.append(decimalValue);
}
}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/nashorn/test/script/basic/JDK-8016681.js Wed Jul 10 10:54:19 2013 +0200
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/**
+ * JDK-8016681: regex capture behaves differently than on V8
+ *
+ * @test
+ * @run
+ */
+
+// regexp similar to the one used in marked.js
+/^((?:[^\n]+\n?(?!( *[-*_]){3,} *(?:\n+|$)| *(#{1,6}) *([^\n]+?) *#* *(?:\n+|$)|([^\n]+)\n *(=|-){3,} *\n*))+)\n*/
+ .exec("a\n\nb")
+ .forEach(function(e) { print(e); });
+
+// simplified regexp
+/(x(?!(a))(?!(b))y)/
+ .exec("xy")
+ .forEach(function(e) { print(e); });
+
+// should not match, as the cross-negative-lookahead backreference \2 should be undefined
+print(/(x(?!(a))(?!(b)\2))/.exec("xbc"));
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/nashorn/test/script/basic/JDK-8016681.js.EXPECTED Wed Jul 10 10:54:19 2013 +0200
@@ -0,0 +1,15 @@
+a
+
+
+a
+
+undefined
+undefined
+undefined
+undefined
+undefined
+xy
+xy
+undefined
+undefined
+null