8030202: Nashorn: Multiple RegExp#ignoreCase issues
author hannesw
Thu, 22 May 2014 17:51:56 +0200
changeset 24588 36f65e9b2f4c
parent 24587 c17e03fc8cc4
child 24589 6d4c7e566c59
8030202: Nashorn: Multiple RegExp#ignoreCase issues Reviewed-by: sundar, jlaskey
nashorn/src/jdk/nashorn/internal/runtime/regexp/joni/Analyser.java
nashorn/src/jdk/nashorn/internal/runtime/regexp/joni/ApplyCaseFold.java
nashorn/src/jdk/nashorn/internal/runtime/regexp/joni/ByteCodeMachine.java
nashorn/src/jdk/nashorn/internal/runtime/regexp/joni/EncodingHelper.java
nashorn/src/jdk/nashorn/internal/runtime/regexp/joni/SearchAlgorithm.java
nashorn/test/script/basic/JDK-8030202.js
nashorn/test/script/basic/JDK-8030202.js.EXPECTED
--- a/nashorn/src/jdk/nashorn/internal/runtime/regexp/joni/Analyser.java	Thu May 22 11:12:29 2014 +0200
+++ b/nashorn/src/jdk/nashorn/internal/runtime/regexp/joni/Analyser.java	Thu May 22 17:51:56 2014 +0200
@@ -771,7 +771,7 @@
 
         while (value < end) {
             int ovalue = value;
-            buf = Character.toLowerCase(chars[value++]);
+            buf = EncodingHelper.toLowerCase(chars[value++]);
 
             if (chars[ovalue] != buf) {
 
@@ -779,7 +779,7 @@
                 System.arraycopy(chars, sn.p, sbuf, 0, ovalue - sn.p);
                 value = ovalue;
                 while (value < end) {
-                    buf = Character.toLowerCase(chars[value++]);
+                    buf = EncodingHelper.toLowerCase(chars[value++]);
                     if (sp >= sbuf.length) {
                         char[]tmp = new char[sbuf.length << 1];
                         System.arraycopy(sbuf, 0, tmp, 0, sbuf.length);
--- a/nashorn/src/jdk/nashorn/internal/runtime/regexp/joni/ApplyCaseFold.java	Thu May 22 11:12:29 2014 +0200
+++ b/nashorn/src/jdk/nashorn/internal/runtime/regexp/joni/ApplyCaseFold.java	Thu May 22 17:51:56 2014 +0200
@@ -20,70 +20,42 @@
 package jdk.nashorn.internal.runtime.regexp.joni;
 
 import jdk.nashorn.internal.runtime.regexp.joni.ast.CClassNode;
-import jdk.nashorn.internal.runtime.regexp.joni.ast.ConsAltNode;
-import jdk.nashorn.internal.runtime.regexp.joni.ast.StringNode;
 
 final class ApplyCaseFold {
 
     // i_apply_case_fold
-    public void apply(int from, int[]to, int length, Object o) {
+    public void apply(int from, int to, Object o) {
         ApplyCaseFoldArg arg = (ApplyCaseFoldArg)o;
 
         ScanEnvironment env = arg.env;
         CClassNode cc = arg.cc;
         BitSet bs = cc.bs;
 
-        if (length == 1) {
-            boolean inCC = cc.isCodeInCC(from);
+        boolean inCC = cc.isCodeInCC(from);
 
-            if (Config.CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS) {
-                if ((inCC && !cc.isNot()) || (!inCC && cc.isNot())) {
-                    if (to[0] >= BitSet.SINGLE_BYTE_SIZE) {
-                        cc.addCodeRange(env, to[0], to[0]);
-                    } else {
-                        /* /(?i:[^A-C])/.match("a") ==> fail. */
-                        bs.set(to[0]);
-                    }
+        if (Config.CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS) {
+            if ((inCC && !cc.isNot()) || (!inCC && cc.isNot())) {
+                if (to >= BitSet.SINGLE_BYTE_SIZE) {
+                    cc.addCodeRange(env, to, to);
+                } else {
+                    /* /(?i:[^A-C])/.match("a") ==> fail. */
+                    bs.set(to);
                 }
-            } else {
-                if (inCC) {
-                    if (to[0] >= BitSet.SINGLE_BYTE_SIZE) {
-                        if (cc.isNot()) cc.clearNotFlag();
-                        cc.addCodeRange(env, to[0], to[0]);
+            }
+        } else {
+            if (inCC) {
+                if (to >= BitSet.SINGLE_BYTE_SIZE) {
+                    if (cc.isNot()) cc.clearNotFlag();
+                    cc.addCodeRange(env, to, to);
+                } else {
+                    if (cc.isNot()) {
+                        bs.clear(to);
                     } else {
-                        if (cc.isNot()) {
-                            bs.clear(to[0]);
-                        } else {
-                            bs.set(to[0]);
-                        }
+                        bs.set(to);
                     }
                 }
-            } // CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
-
-        } else {
-            if (cc.isCodeInCC(from) && (!Config.CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS || !cc.isNot())) {
-                StringNode node = null;
-                for (int i=0; i<length; i++) {
-                    if (i == 0) {
-                        node = new StringNode();
-                        /* char-class expanded multi-char only
-                        compare with string folded at match time. */
-                        node.setAmbig();
-                    }
-                    node.catCode(to[i]);
-                }
-
-                ConsAltNode alt = ConsAltNode.newAltNode(node, null);
-
-                if (arg.tail == null) {
-                    arg.altRoot = alt;
-                } else {
-                    arg.tail.setCdr(alt);
-                }
-                arg.tail = alt;
             }
-
-        }
+        } // CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS
 
     }
 
--- a/nashorn/src/jdk/nashorn/internal/runtime/regexp/joni/ByteCodeMachine.java	Thu May 22 11:12:29 2014 +0200
+++ b/nashorn/src/jdk/nashorn/internal/runtime/regexp/joni/ByteCodeMachine.java	Thu May 22 17:51:56 2014 +0200
@@ -58,8 +58,8 @@
         int end1 = s1 + mbLen;
 
         while (s1 < end1) {
-            char c1 = Character.toLowerCase(chars[s1++]);
-            char c2 = Character.toLowerCase(chars[s2++]);
+            char c1 = EncodingHelper.toLowerCase(chars[s1++]);
+            char c2 = EncodingHelper.toLowerCase(chars[s2++]);
 
             if (c1 != c2) {
                 return false;
@@ -367,7 +367,7 @@
     }
 
     private void opExact1IC() {
-        if (s >= range || code[ip] != Character.toLowerCase(chars[s++])) {opFail(); return;}
+        if (s >= range || code[ip] != EncodingHelper.toLowerCase(chars[s++])) {opFail(); return;}
         ip++;
         sprev = sbegin; // break;
     }
@@ -380,10 +380,10 @@
             char[] bs = regex.templates[code[ip++]];
             int ps = code[ip++];
 
-            while (tlen-- > 0) if (bs[ps++] != Character.toLowerCase(chars[s++])) {opFail(); return;}
+            while (tlen-- > 0) if (bs[ps++] != EncodingHelper.toLowerCase(chars[s++])) {opFail(); return;}
         } else {
 
-            while (tlen-- > 0) if (code[ip++] != Character.toLowerCase(chars[s++])) {opFail(); return;}
+            while (tlen-- > 0) if (code[ip++] != EncodingHelper.toLowerCase(chars[s++])) {opFail(); return;}
         }
         sprev = s - 1;
     }
--- a/nashorn/src/jdk/nashorn/internal/runtime/regexp/joni/EncodingHelper.java	Thu May 22 11:12:29 2014 +0200
+++ b/nashorn/src/jdk/nashorn/internal/runtime/regexp/joni/EncodingHelper.java	Thu May 22 17:51:56 2014 +0200
@@ -93,43 +93,80 @@
        return s;
     }
 
-    public static int mbcToCode(byte[] bytes, int p, int end) {
-        int code = 0;
-        for (int i = p; i < end; i++) {
-            code = (code << 8) | (bytes[i] & 0xff);
-        }
-        return code;
-    }
-
     public static int mbcodeStartPosition() {
         return 0x80;
     }
 
     public static char[] caseFoldCodesByString(int flag, char c) {
-        if (Character.isUpperCase(c)) {
-            return new char[] {Character.toLowerCase(c)};
-        } else if (Character.isLowerCase(c)) {
-            return new char[] {Character.toUpperCase(c)};
-        } else {
-            return EMPTYCHARS;
+        char[] codes = EMPTYCHARS;
+        final char upper = toUpperCase(c);
+
+        if (upper != toLowerCase(upper)) {
+            int count = 0;
+            char ch = 0;
+
+            do {
+                final char u = toUpperCase(ch);
+                if (u == upper && ch != c) {
+                    // Almost all characters will return array of length 1, very few 2 or 3, so growing by one is fine.
+                    codes = count == 0 ? new char[1] : Arrays.copyOf(codes, count + 1);
+                    codes[count++] = ch;
+                }
+            } while (ch++ < 0xffff);
         }
+        return codes;
     }
 
     public static void applyAllCaseFold(int flag, ApplyCaseFold fun, Object arg) {
-        int[] code = new int[1];
-
         for (int c = 0; c < 0xffff; c++) {
-            if (Character.getType(c) == Character.LOWERCASE_LETTER) {
+            if (Character.isLowerCase(c)) {
+                final int upper = toUpperCase(c);
 
-                int upper = code[0] = Character.toUpperCase(c);
-                fun.apply(c, code, 1, arg);
+                if (upper != c) {
+                    fun.apply(c, upper, arg);
+                }
+            }
+        }
 
-                code[0] = c;
-                fun.apply(upper, code, 1, arg);
+        // Some characters have multiple lower case variants, hence we need to do a second run
+        for (int c = 0; c < 0xffff; c++) {
+            if (Character.isLowerCase(c)) {
+                final int upper = toUpperCase(c);
+
+                if (upper != c) {
+                    fun.apply(upper, c, arg);
+                }
             }
         }
     }
 
+    public static char toLowerCase(char c) {
+        return (char)toLowerCase((int)c);
+    }
+
+    public static int toLowerCase(int c) {
+        if (c < 128) {
+            return ('A' <= c && c <= 'Z') ? (c + ('a' - 'A')) : c;
+        }
+        // Do not convert non-ASCII upper case character to ASCII lower case.
+        int lower = Character.toLowerCase(c);
+        return (lower < 128) ? c : lower;
+
+    }
+
+    public static char toUpperCase(char c) {
+        return (char)toUpperCase((int)c);
+    }
+
+    public static int toUpperCase(int c) {
+        if (c < 128) {
+            return ('a' <= c && c <= 'z') ? c + ('A' - 'a') : c;
+        }
+        // Do not convert non-ASCII lower case character to ASCII upper case.
+        int upper = Character.toUpperCase(c);
+        return (upper < 128) ? c : upper;
+    }
+
     public static int[] ctypeCodeRange(int ctype, IntHolder sbOut) {
         sbOut.value = 0x100; // use bitset for codes smaller than 256
         int[] range = null;
--- a/nashorn/src/jdk/nashorn/internal/runtime/regexp/joni/SearchAlgorithm.java	Thu May 22 11:12:29 2014 +0200
+++ b/nashorn/src/jdk/nashorn/internal/runtime/regexp/joni/SearchAlgorithm.java	Thu May 22 17:51:56 2014 +0200
@@ -168,7 +168,7 @@
                                        char[] chars, int p, int end) {
 
             while (tP < tEnd) {
-                if (t[tP++] != Character.toLowerCase(chars[p++])) return false;
+                if (t[tP++] != EncodingHelper.toLowerCase(chars[p++])) return false;
             }
             return true;
         }
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/nashorn/test/script/basic/JDK-8030202.js	Thu May 22 17:51:56 2014 +0200
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2010, 2014, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ * 
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ * 
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ * 
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ * 
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/**
+ * JDK-8030202: Nashorn: Multiple RegExp#ignoreCase issues
+ *
+ * @test
+ * @run
+ */
+
+print(/\u2160/i.test("\u2170"));
+print(/[\u2160]/i.test("\u2170"));
+print(/\u2170/i.test("\u2160"));
+print(/[\u2170]/i.test("\u2160"));
+
+print(/\u0130/i.test("\u0069"));
+print(/[\u0130]/i.test("\u0069"));
+print(/\u0069/i.test("\u0130"));
+print(/[\u0069]/i.test("\u0130"));
+
+print(/\u1e9e/i.test("\u00df"));
+print(/[\u1e9e]/i.test("\u00df"));
+print(/\u00df/i.test("\u1e9e"));
+print(/[\u00df]/i.test("\u1e9e"));
+
+print(/[^\u1e9e]/i.test("\u00df"));
+print(/[^\u00df]/i.test("\u1e9e"));
+
+print(/\u0345{4}/i.test("\u0345\u0399\u03b9\u1fbe"));
+print(/\u0399{4}/i.test("\u0345\u0399\u03b9\u1fbe"));
+print(/\u03b9{4}/i.test("\u0345\u0399\u03b9\u1fbe"));
+print(/\u1fbe{4}/i.test("\u0345\u0399\u03b9\u1fbe"));
+
+print(/[\u0345]{4}/i.test("\u0345\u0399\u03b9\u1fbe"));
+print(/[\u0399]{4}/i.test("\u0345\u0399\u03b9\u1fbe"));
+print(/[\u03b9]{4}/i.test("\u0345\u0399\u03b9\u1fbe"));
+print(/[\u1fbe]{4}/i.test("\u0345\u0399\u03b9\u1fbe"));
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/nashorn/test/script/basic/JDK-8030202.js.EXPECTED	Thu May 22 17:51:56 2014 +0200
@@ -0,0 +1,22 @@
+true
+true
+true
+true
+false
+false
+false
+false
+false
+false
+false
+false
+true
+true
+true
+true
+true
+true
+true
+true
+true
+true