8016681: regex capture behaves differently than on V8
author hannesw
Wed, 10 Jul 2013 10:54:19 +0200
changeset 18863 f582e6cdeae5
parent 18862 8b6a01b38cb8
child 18864 c701b823ed9e
8016681: regex capture behaves differently than on V8
Reviewed-by: lagergren, sundar
nashorn/src/jdk/nashorn/internal/runtime/regexp/RegExpScanner.java
nashorn/test/script/basic/JDK-8016681.js
nashorn/test/script/basic/JDK-8016681.js.EXPECTED
--- a/nashorn/src/jdk/nashorn/internal/runtime/regexp/RegExpScanner.java	Wed Jul 10 13:25:07 2013 +0530
+++ b/nashorn/src/jdk/nashorn/internal/runtime/regexp/RegExpScanner.java	Wed Jul 10 10:54:19 2013 +0200
@@ -57,7 +57,10 @@
     private final LinkedList<Integer> forwardReferences = new LinkedList<>();
 
     /** Current level of zero-width negative lookahead assertions. */
-    private int negativeLookaheadLevel;
+    private int negLookaheadLevel;
+
+    /** Sequential id of current top-level zero-width negative lookahead assertion. */
+    private int negLookaheadGroup;
 
     /** Are we currently inside a character class? */
     private boolean inCharClass = false;
@@ -68,17 +71,18 @@
     private static final String NON_IDENT_ESCAPES = "$^*+(){}[]|\\.?-";
 
     private static class Capture {
-        /**
-         * Zero-width negative lookaheads enclosing the capture.
-         */
-        private final int negativeLookaheadLevel;
+        /** Zero-width negative lookaheads enclosing the capture. */
+        private final int negLookaheadLevel;
+        /** Sequential id of top-level negative lookaheads containing the capture. */
+        private  final int negLookaheadGroup;
 
-        Capture(final int negativeLookaheadLevel) {
-            this.negativeLookaheadLevel = negativeLookaheadLevel;
+        Capture(final int negLookaheadGroup, final int negLookaheadLevel) {
+            this.negLookaheadGroup = negLookaheadGroup;
+            this.negLookaheadLevel = negLookaheadLevel;
         }
 
-        public int getNegativeLookaheadLevel() {
-            return negativeLookaheadLevel;
+        boolean isContained(final int group, final int level) {
+            return group == this.negLookaheadGroup && level >= this.negLookaheadLevel;
         }
 
     }
@@ -152,7 +156,7 @@
         BitVector vec = null;
         for (int i = 0; i < caps.size(); i++) {
             final Capture cap = caps.get(i);
-            if (cap.getNegativeLookaheadLevel() > 0) {
+            if (cap.negLookaheadLevel > 0) {
                 if (vec == null) {
                     vec = new BitVector(caps.size() + 1);
                 }
@@ -311,11 +315,14 @@
             commit(3);
 
             if (isNegativeLookahead) {
-                negativeLookaheadLevel++;
+                if (negLookaheadLevel == 0) {
+                    negLookaheadGroup++;
+                }
+                negLookaheadLevel++;
             }
             disjunction();
             if (isNegativeLookahead) {
-                negativeLookaheadLevel--;
+                negLookaheadLevel--;
             }
 
             if (ch0 == ')') {
@@ -432,20 +439,17 @@
         }
 
         if (ch0 == '(') {
-            boolean capturingParens = true;
             commit(1);
             if (ch0 == '?' && ch1 == ':') {
-                capturingParens = false;
                 commit(2);
+            } else {
+                caps.add(new Capture(negLookaheadGroup, negLookaheadLevel));
             }
 
             disjunction();
 
             if (ch0 == ')') {
                 commit(1);
-                if (capturingParens) {
-                    caps.add(new Capture(negativeLookaheadLevel));
-                }
                 return true;
             }
         }
@@ -675,24 +679,22 @@
                     sb.setLength(sb.length() - 1);
                     octalOrLiteral(Integer.toString(decimalValue), sb);
 
-                } else if (decimalValue <= caps.size() && caps.get(decimalValue - 1).getNegativeLookaheadLevel() > 0) {
-                    //  Captures that live inside a negative lookahead are dead after the
-                    //  lookahead and will be undefined if referenced from outside.
-                    if (caps.get(decimalValue - 1).getNegativeLookaheadLevel() > negativeLookaheadLevel) {
+                } else if (decimalValue <= caps.size()) {
+                    //  Captures inside a negative lookahead are undefined when referenced from the outside.
+                    if (!caps.get(decimalValue - 1).isContained(negLookaheadGroup, negLookaheadLevel)) {
+                        // Reference to capture in negative lookahead, omit from output buffer.
                         sb.setLength(sb.length() - 1);
                     } else {
+                        // Append backreference to output buffer.
                         sb.append(decimalValue);
                     }
-                } else if (decimalValue > caps.size()) {
-                    // Forward reference to a capture group. Forward references are always undefined so we can omit
-                    // it from the output buffer. However, if the target capture does not exist, we need to rewrite
-                    // the reference as hex escape or literal string, so register the reference for later processing.
+                } else {
+                    // Forward references to a capture group are always undefined so we can omit them from the output buffer.
+                    // However, if the target capture does not exist, we need to rewrite the reference as hex escape
+                    // or literal string, so register the reference for later processing.
                     sb.setLength(sb.length() - 1);
                     forwardReferences.add(decimalValue);
                     forwardReferences.add(sb.length());
-                } else {
-                    // Append as backreference
-                    sb.append(decimalValue);
                 }
 
             }
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/nashorn/test/script/basic/JDK-8016681.js	Wed Jul 10 10:54:19 2013 +0200
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ * 
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ * 
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ * 
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ * 
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/**
+ * JDK-8016681: regex capture behaves differently than on V8
+ *
+ * @test
+ * @run
+ */
+
+// regexp similar to the one used in marked.js
+/^((?:[^\n]+\n?(?!( *[-*_]){3,} *(?:\n+|$)| *(#{1,6}) *([^\n]+?) *#* *(?:\n+|$)|([^\n]+)\n *(=|-){3,} *\n*))+)\n*/
+    .exec("a\n\nb")
+    .forEach(function(e) { print(e); });
+
+// simplified regexp
+/(x(?!(a))(?!(b))y)/
+    .exec("xy")
+    .forEach(function(e) { print(e); });
+
+// should not match as cross-negative-lookahead backreference \2 should be undefined
+print(/(x(?!(a))(?!(b)\2))/.exec("xbc"));
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/nashorn/test/script/basic/JDK-8016681.js.EXPECTED	Wed Jul 10 10:54:19 2013 +0200
@@ -0,0 +1,15 @@
+a
+
+
+a
+
+undefined
+undefined
+undefined
+undefined
+undefined
+xy
+xy
+undefined
+undefined
+null