6919132: Regex \P{Lu} selects half of a surrogate pari
authorsherman
Fri, 05 Feb 2010 00:10:42 -0800
changeset 4820 e32f5d2ba062
parent 4819 c1661a223e19
child 4821 c4b695155c91
6919132: Regex \P{Lu} selects half of a surrogate pari Summary: To use StartS for complement category/block class Reviewed-by: martin, okutsu
jdk/src/share/classes/java/util/regex/Pattern.java
jdk/test/java/util/regex/RegExTest.java
--- a/jdk/src/share/classes/java/util/regex/Pattern.java	Wed Feb 03 17:04:52 2010 +0800
+++ b/jdk/src/share/classes/java/util/regex/Pattern.java	Fri Feb 05 00:10:42 2010 -0800
@@ -861,6 +861,14 @@
     private transient int patternLength;
 
     /**
+     * If the Start node might possibly match supplementary characters.
+     * It is set to true during compiling if
+     * (1) There is supplementary char in pattern, or
+     * (2) There is complement node of Category or Block
+     */
+    private transient boolean hasSupplementary;
+
+    /**
      * Compiles the given regular expression into a pattern.  </p>
      *
      * @param  regex
@@ -1481,7 +1489,7 @@
         // Use double zero to terminate pattern
         temp = new int[patternLength + 2];
 
-        boolean hasSupplementary = false;
+        hasSupplementary = false;
         int c, count = 0;
         // Convert all chars into code points
         for (int x = 0; x < patternLength; x += Character.charCount(c)) {
@@ -1787,7 +1795,8 @@
      * character or unpaired surrogate.
      */
     private static final boolean isSupplementary(int ch) {
-        return ch >= Character.MIN_SUPPLEMENTARY_CODE_POINT || isSurrogate(ch);
+        return ch >= Character.MIN_SUPPLEMENTARY_CODE_POINT ||
+               Character.isSurrogate((char)ch);
     }
 
     /**
@@ -1885,7 +1894,7 @@
                     } else {
                         oneLetter = false;
                     }
-                    node = family(oneLetter).maybeComplement(comp);
+                    node = family(oneLetter, comp);
                 } else {
                     unread();
                     node = atom();
@@ -2001,7 +2010,7 @@
                             unread();
                         else
                             oneLetter = false;
-                        return family(oneLetter).maybeComplement(comp);
+                        return family(oneLetter, comp);
                     }
                 }
                 unread();
@@ -2404,7 +2413,7 @@
                     unread();
                 else
                     oneLetter = false;
-                return family(oneLetter).maybeComplement(comp);
+                return family(oneLetter, comp);
             } else { // ordinary escape
                 unread();
                 ch = escape(true, true);
@@ -2450,9 +2459,12 @@
     /**
      * Parses a Unicode character family and returns its representative node.
      */
-    private CharProperty family(boolean singleLetter) {
+    private CharProperty family(boolean singleLetter,
+                                boolean maybeComplement)
+    {
         next();
         String name;
+        CharProperty node;
 
         if (singleLetter) {
             int c = temp[cursor];
@@ -2477,12 +2489,18 @@
         }
 
         if (name.startsWith("In")) {
-            return unicodeBlockPropertyFor(name.substring(2));
+            node = unicodeBlockPropertyFor(name.substring(2));
         } else {
             if (name.startsWith("Is"))
                 name = name.substring(2);
-            return charPropertyNodeFor(name);
-        }
+            node = charPropertyNodeFor(name);
+        }
+        if (maybeComplement) {
+            if (node instanceof Category || node instanceof Block)
+                hasSupplementary = true;
+            node = node.complement();
+        }
+        return node;
     }
 
     /**
@@ -2495,9 +2513,7 @@
         } catch (IllegalArgumentException iae) {
             throw error("Unknown character block name {" + name + "}");
         }
-        return new CharProperty() {
-                boolean isSatisfiedBy(int ch) {
-                    return block == Character.UnicodeBlock.of(ch);}};
+        return new Block(block);
     }
 
     /**
@@ -2968,13 +2984,6 @@
     // Utility methods for code point support
     //
 
-    /**
-     * Tests a surrogate value.
-     */
-    private static final boolean isSurrogate(int c) {
-        return c >= Character.MIN_HIGH_SURROGATE && c <= Character.MAX_LOW_SURROGATE;
-    }
-
     private static final int countChars(CharSequence seq, int index,
                                         int lengthInCodePoints) {
         // optimization
@@ -3174,20 +3183,17 @@
                 matcher.hitEnd = true;
                 return false;
             }
-            boolean ret = false;
             int guard = matcher.to - minLength;
             for (; i <= guard; i++) {
-                if (ret = next.match(matcher, i, seq))
-                    break;
-                if (i == guard)
-                    matcher.hitEnd = true;
+                if (next.match(matcher, i, seq)) {
+                    matcher.first = i;
+                    matcher.groups[0] = matcher.first;
+                    matcher.groups[1] = matcher.last;
+                    return true;
+                }
             }
-            if (ret) {
-                matcher.first = i;
-                matcher.groups[0] = matcher.first;
-                matcher.groups[1] = matcher.last;
-            }
-            return ret;
+            matcher.hitEnd = true;
+            return false;
         }
         boolean study(TreeInfo info) {
             next.study(info);
@@ -3209,27 +3215,28 @@
                 matcher.hitEnd = true;
                 return false;
             }
-            boolean ret = false;
             int guard = matcher.to - minLength;
             while (i <= guard) {
-                if ((ret = next.match(matcher, i, seq)) || i == guard)
+                //if ((ret = next.match(matcher, i, seq)) || i == guard)
+                if (next.match(matcher, i, seq)) {
+                    matcher.first = i;
+                    matcher.groups[0] = matcher.first;
+                    matcher.groups[1] = matcher.last;
+                    return true;
+                }
+                if (i == guard)
                     break;
                 // Optimization to move to the next character. This is
                 // faster than countChars(seq, i, 1).
                 if (Character.isHighSurrogate(seq.charAt(i++))) {
-                    if (i < seq.length() && Character.isLowSurrogate(seq.charAt(i))) {
+                    if (i < seq.length() &&
+                        Character.isLowSurrogate(seq.charAt(i))) {
                         i++;
                     }
                 }
-                if (i == guard)
-                    matcher.hitEnd = true;
             }
-            if (ret) {
-                matcher.first = i;
-                matcher.groups[0] = matcher.first;
-                matcher.groups[1] = matcher.last;
-            }
-            return ret;
+            matcher.hitEnd = true;
+            return false;
         }
     }
 
@@ -3461,9 +3468,6 @@
                     boolean isSatisfiedBy(int ch) {
                         return ! CharProperty.this.isSatisfiedBy(ch);}};
         }
-        CharProperty maybeComplement(boolean complement) {
-            return complement ? complement() : this;
-        }
         boolean match(Matcher matcher, int i, CharSequence seq) {
             if (i < matcher.to) {
                 int ch = Character.codePointAt(seq, i);
@@ -3548,6 +3552,20 @@
         }
     }
 
+
+    /**
+     * Node class that matches a Unicode block.
+     */
+    static final class Block extends CharProperty {
+        final Character.UnicodeBlock block;
+        Block(Character.UnicodeBlock block) {
+            this.block = block;
+        }
+        boolean isSatisfiedBy(int ch) {
+            return block == Character.UnicodeBlock.of(ch);
+        }
+    }
+
     /**
      * Node class that matches a Unicode category.
      */
--- a/jdk/test/java/util/regex/RegExTest.java	Wed Feb 03 17:04:52 2010 +0800
+++ b/jdk/test/java/util/regex/RegExTest.java	Fri Feb 05 00:10:42 2010 -0800
@@ -32,7 +32,7 @@
  * 4872664 4803179 4892980 4900747 4945394 4938995 4979006 4994840 4997476
  * 5013885 5003322 4988891 5098443 5110268 6173522 4829857 5027748 6376940
  * 6358731 6178785 6284152 6231989 6497148 6486934 6233084 6504326 6635133
- * 6350801 6676425 6878475
+ * 6350801 6676425 6878475 6919132
  */
 
 import java.util.regex.*;
@@ -134,6 +134,7 @@
         toMatchResultTest();
         surrogatesInClassTest();
         namedGroupCaptureTest();
+        nonBmpClassComplementTest();
 
         if (failure)
             throw new RuntimeException("Failure in the RE handling.");
@@ -365,7 +366,6 @@
         m.find();
         if (!m.hitEnd())
             failCount++;
-
         report("hitEnd from a Slice");
     }
 
@@ -3514,4 +3514,29 @@
                           null);
         report("NamedGroupCapture");
     }
+
+    // This is for bug 6969132
+    private static void nonBmpClassComplementTest() throws Exception {
+        Pattern p = Pattern.compile("\\P{Lu}");
+        Matcher m = p.matcher(new String(new int[] {0x1d400}, 0, 1));
+        if (m.find() && m.start() == 1)
+            failCount++;
+
+        // from a unicode category
+        p = Pattern.compile("\\P{Lu}");
+        m = p.matcher(new String(new int[] {0x1d400}, 0, 1));
+        if (m.find())
+            failCount++;
+        if (!m.hitEnd())
+            failCount++;
+
+        // block
+        p = Pattern.compile("\\P{InMathematicalAlphanumericSymbols}");
+        m = p.matcher(new String(new int[] {0x1d400}, 0, 1));
+        if (m.find() && m.start() == 1)
+            failCount++;
+
+        report("NonBmpClassComplement");
+    }
+
 }