6919132: Regex \P{Lu} selects half of a surrogate pair
Summary: Use StartS for complemented category/block classes
Reviewed-by: martin, okutsu
--- a/jdk/src/share/classes/java/util/regex/Pattern.java Wed Feb 03 17:04:52 2010 +0800
+++ b/jdk/src/share/classes/java/util/regex/Pattern.java Fri Feb 05 00:10:42 2010 -0800
@@ -861,6 +861,14 @@
private transient int patternLength;
/**
+ * Whether the Start node might possibly match supplementary characters.
+ * It is set to true during compilation if
+ * (1) there is a supplementary char in the pattern, or
+ * (2) there is a complement node of a Category or Block
+ */
+ private transient boolean hasSupplementary;
+
+ /**
* Compiles the given regular expression into a pattern. </p>
*
* @param regex
@@ -1481,7 +1489,7 @@
// Use double zero to terminate pattern
temp = new int[patternLength + 2];
- boolean hasSupplementary = false;
+ hasSupplementary = false;
int c, count = 0;
// Convert all chars into code points
for (int x = 0; x < patternLength; x += Character.charCount(c)) {
@@ -1787,7 +1795,8 @@
* character or unpaired surrogate.
*/
private static final boolean isSupplementary(int ch) {
- return ch >= Character.MIN_SUPPLEMENTARY_CODE_POINT || isSurrogate(ch);
+ return ch >= Character.MIN_SUPPLEMENTARY_CODE_POINT ||
+ Character.isSurrogate((char)ch);
}
/**
@@ -1885,7 +1894,7 @@
} else {
oneLetter = false;
}
- node = family(oneLetter).maybeComplement(comp);
+ node = family(oneLetter, comp);
} else {
unread();
node = atom();
@@ -2001,7 +2010,7 @@
unread();
else
oneLetter = false;
- return family(oneLetter).maybeComplement(comp);
+ return family(oneLetter, comp);
}
}
unread();
@@ -2404,7 +2413,7 @@
unread();
else
oneLetter = false;
- return family(oneLetter).maybeComplement(comp);
+ return family(oneLetter, comp);
} else { // ordinary escape
unread();
ch = escape(true, true);
@@ -2450,9 +2459,12 @@
/**
* Parses a Unicode character family and returns its representative node.
*/
- private CharProperty family(boolean singleLetter) {
+ private CharProperty family(boolean singleLetter,
+ boolean maybeComplement)
+ {
next();
String name;
+ CharProperty node;
if (singleLetter) {
int c = temp[cursor];
@@ -2477,12 +2489,18 @@
}
if (name.startsWith("In")) {
- return unicodeBlockPropertyFor(name.substring(2));
+ node = unicodeBlockPropertyFor(name.substring(2));
} else {
if (name.startsWith("Is"))
name = name.substring(2);
- return charPropertyNodeFor(name);
- }
+ node = charPropertyNodeFor(name);
+ }
+ if (maybeComplement) {
+ if (node instanceof Category || node instanceof Block)
+ hasSupplementary = true;
+ node = node.complement();
+ }
+ return node;
}
/**
@@ -2495,9 +2513,7 @@
} catch (IllegalArgumentException iae) {
throw error("Unknown character block name {" + name + "}");
}
- return new CharProperty() {
- boolean isSatisfiedBy(int ch) {
- return block == Character.UnicodeBlock.of(ch);}};
+ return new Block(block);
}
/**
@@ -2968,13 +2984,6 @@
// Utility methods for code point support
//
- /**
- * Tests a surrogate value.
- */
- private static final boolean isSurrogate(int c) {
- return c >= Character.MIN_HIGH_SURROGATE && c <= Character.MAX_LOW_SURROGATE;
- }
-
private static final int countChars(CharSequence seq, int index,
int lengthInCodePoints) {
// optimization
@@ -3174,20 +3183,17 @@
matcher.hitEnd = true;
return false;
}
- boolean ret = false;
int guard = matcher.to - minLength;
for (; i <= guard; i++) {
- if (ret = next.match(matcher, i, seq))
- break;
- if (i == guard)
- matcher.hitEnd = true;
+ if (next.match(matcher, i, seq)) {
+ matcher.first = i;
+ matcher.groups[0] = matcher.first;
+ matcher.groups[1] = matcher.last;
+ return true;
+ }
}
- if (ret) {
- matcher.first = i;
- matcher.groups[0] = matcher.first;
- matcher.groups[1] = matcher.last;
- }
- return ret;
+ matcher.hitEnd = true;
+ return false;
}
boolean study(TreeInfo info) {
next.study(info);
@@ -3209,27 +3215,28 @@
matcher.hitEnd = true;
return false;
}
- boolean ret = false;
int guard = matcher.to - minLength;
while (i <= guard) {
- if ((ret = next.match(matcher, i, seq)) || i == guard)
+ //if ((ret = next.match(matcher, i, seq)) || i == guard)
+ if (next.match(matcher, i, seq)) {
+ matcher.first = i;
+ matcher.groups[0] = matcher.first;
+ matcher.groups[1] = matcher.last;
+ return true;
+ }
+ if (i == guard)
break;
// Optimization to move to the next character. This is
// faster than countChars(seq, i, 1).
if (Character.isHighSurrogate(seq.charAt(i++))) {
- if (i < seq.length() && Character.isLowSurrogate(seq.charAt(i))) {
+ if (i < seq.length() &&
+ Character.isLowSurrogate(seq.charAt(i))) {
i++;
}
}
- if (i == guard)
- matcher.hitEnd = true;
}
- if (ret) {
- matcher.first = i;
- matcher.groups[0] = matcher.first;
- matcher.groups[1] = matcher.last;
- }
- return ret;
+ matcher.hitEnd = true;
+ return false;
}
}
@@ -3461,9 +3468,6 @@
boolean isSatisfiedBy(int ch) {
return ! CharProperty.this.isSatisfiedBy(ch);}};
}
- CharProperty maybeComplement(boolean complement) {
- return complement ? complement() : this;
- }
boolean match(Matcher matcher, int i, CharSequence seq) {
if (i < matcher.to) {
int ch = Character.codePointAt(seq, i);
@@ -3548,6 +3552,20 @@
}
}
+
+ /**
+ * Node class that matches a Unicode block.
+ */
+ static final class Block extends CharProperty {
+ final Character.UnicodeBlock block;
+ Block(Character.UnicodeBlock block) {
+ this.block = block;
+ }
+ boolean isSatisfiedBy(int ch) {
+ return block == Character.UnicodeBlock.of(ch);
+ }
+ }
+
/**
* Node class that matches a Unicode category.
*/
--- a/jdk/test/java/util/regex/RegExTest.java Wed Feb 03 17:04:52 2010 +0800
+++ b/jdk/test/java/util/regex/RegExTest.java Fri Feb 05 00:10:42 2010 -0800
@@ -32,7 +32,7 @@
* 4872664 4803179 4892980 4900747 4945394 4938995 4979006 4994840 4997476
* 5013885 5003322 4988891 5098443 5110268 6173522 4829857 5027748 6376940
* 6358731 6178785 6284152 6231989 6497148 6486934 6233084 6504326 6635133
- * 6350801 6676425 6878475
+ * 6350801 6676425 6878475 6919132
*/
import java.util.regex.*;
@@ -134,6 +134,7 @@
toMatchResultTest();
surrogatesInClassTest();
namedGroupCaptureTest();
+ nonBmpClassComplementTest();
if (failure)
throw new RuntimeException("Failure in the RE handling.");
@@ -365,7 +366,6 @@
m.find();
if (!m.hitEnd())
failCount++;
-
report("hitEnd from a Slice");
}
@@ -3514,4 +3514,29 @@
null);
report("NamedGroupCapture");
}
+
+ // This is for bug 6919132
+ private static void nonBmpClassComplementTest() throws Exception {
+ Pattern p = Pattern.compile("\\P{Lu}");
+ Matcher m = p.matcher(new String(new int[] {0x1d400}, 0, 1));
+ if (m.find() && m.start() == 1)
+ failCount++;
+
+ // from a unicode category
+ p = Pattern.compile("\\P{Lu}");
+ m = p.matcher(new String(new int[] {0x1d400}, 0, 1));
+ if (m.find())
+ failCount++;
+ if (!m.hitEnd())
+ failCount++;
+
+ // block
+ p = Pattern.compile("\\P{InMathematicalAlphanumericSymbols}");
+ m = p.matcher(new String(new int[] {0x1d400}, 0, 1));
+ if (m.find() && m.start() == 1)
+ failCount++;
+
+ report("NonBmpClassComplement");
+ }
+
}