8225061: Performance regression in Regex
Reviewed-by: naoto, alanb
Contributed-by: claes.redestad@oracle.com, naoto.sato@oracle.com
--- a/make/jdk/src/classes/build/tools/generateemojidata/GenerateEmojiData.java Fri May 31 17:27:28 2019 -0700
+++ b/make/jdk/src/classes/build/tools/generateemojidata/GenerateEmojiData.java Sat Jun 01 03:18:23 2019 +0200
@@ -67,27 +67,28 @@
},
ArrayList<Range>::addAll);
+
// make the code point conditions
- String extPictCodePoints = extPictRanges.stream()
- .map(r -> {
- if (r.start == r.last) {
- return (" ".repeat(12) + "cp == 0x" + toHexString(r.start));
- } else if (r.start == r.last - 1) {
- return " ".repeat(12) + "cp == 0x" + toHexString(r.start) + " ||\n" +
- " ".repeat(12) + "cp == 0x" + toHexString(r.last);
- } else {
- return " ".repeat(11) + "(cp >= 0x" + toHexString(r.start) +
- " && cp <= 0x" + toHexString(r.last) + ")";
- }
- })
- .collect(Collectors.joining(" ||\n")) + ";\n";
+ // only very few codepoints below 0x2000 are "emojis", so separate them
+ // out to generate a fast-path check that can be efficiently inlined
+ String lowExtPictCodePoints = extPictRanges.stream()
+ .takeWhile(r -> r.last < 0x2000)
+ .map(r -> rangeToString(r))
+ .collect(Collectors.joining(" ||\n", "", ";\n"));
+
+ String highExtPictCodePoints = extPictRanges.stream()
+ .dropWhile(r -> r.last < 0x2000)
+ .map(r -> rangeToString(r))
+ .collect(Collectors.joining(" ||\n", "", ";\n"));
// Generate EmojiData.java file
Files.write(Paths.get(args[2]),
Files.lines(Paths.get(args[0]))
.flatMap(l -> {
- if (l.equals("%%%EXTPICT%%%")) {
- return Stream.of(extPictCodePoints);
+ if (l.equals("%%%EXTPICT_LOW%%%")) {
+ return Stream.of(lowExtPictCodePoints);
+ } else if (l.equals("%%%EXTPICT_HIGH%%%")) {
+ return Stream.of(highExtPictCodePoints);
} else {
return Stream.of(l);
}
@@ -99,6 +100,18 @@
}
}
+ static String rangeToString(Range r) {
+ if (r.start == r.last) {
+ return (" ".repeat(16) + "cp == 0x" + toHexString(r.start));
+ } else if (r.start == r.last - 1) {
+ return " ".repeat(16) + "cp == 0x" + toHexString(r.start) + " ||\n" +
+ " ".repeat(16) + "cp == 0x" + toHexString(r.last);
+ } else {
+ return " ".repeat(15) + "(cp >= 0x" + toHexString(r.start) +
+ " && cp <= 0x" + toHexString(r.last) + ")";
+ }
+ }
+
static int toInt(String hexStr) {
return Integer.parseUnsignedInt(hexStr, 16);
}
--- a/src/java.base/share/classes/java/util/regex/EmojiData.java.template Fri May 31 17:27:28 2019 -0700
+++ b/src/java.base/share/classes/java/util/regex/EmojiData.java.template Sat Jun 01 03:18:23 2019 +0200
@@ -40,7 +40,16 @@
* @return true if {@code cp} is an extended pictographic
*/
static boolean isExtendedPictographic(int cp) {
+ if (cp < 0x2000) {
+ return
+%%%EXTPICT_LOW%%%
+ } else {
+ return isHigh(cp);
+ }
+ }
+
+ private static boolean isHigh(int cp) {
return
-%%%EXTPICT%%%
+%%%EXTPICT_HIGH%%%
}
}
--- a/src/java.base/share/classes/java/util/regex/Grapheme.java Fri May 31 17:27:28 2019 -0700
+++ b/src/java.base/share/classes/java/util/regex/Grapheme.java Sat Jun 01 03:18:23 2019 +0200
@@ -30,6 +30,19 @@
final class Grapheme {
/**
+ * Determines if there is an extended grapheme cluster boundary between two
+ * continuing characters {@code cp1} and {@code cp2}.
+ * <p>
+ * See Unicode Standard Annex #29 Unicode Text Segmentation for the specification
+ * for the extended grapheme cluster boundary rules
+ * <p>
+ * Note: this method does not take care of stateful breaking.
+ */
+ static boolean isBoundary(int cp1, int cp2) {
+ return rules[getType(cp1)][getType(cp2)];
+ }
+
+ /**
* Look for the next extended grapheme cluster boundary in a CharSequence. It assumes
* the start of the char sequence is a boundary.
* <p>
@@ -50,12 +63,12 @@
int ret = Character.charCount(ch0);
int ch1;
// indicates whether gb11 or gb12 is underway
- boolean gb11 = EmojiData.isExtendedPictographic(ch0);
- int riCount = getType(ch0) == RI ? 1 : 0;
+ int t0 = getGraphemeType(ch0);
+ int riCount = t0 == RI ? 1 : 0;
+ boolean gb11 = t0 == EXTENDED_PICTOGRAPHIC;
while (ret < limit) {
ch1 = Character.codePointAt(src, ret);
- int t0 = getType(ch0);
- int t1 = getType(ch1);
+ int t1 = getGraphemeType(ch1);
if (gb11 && t0 == ZWJ && t1 == EXTENDED_PICTOGRAPHIC) {
gb11 = false;
@@ -65,13 +78,14 @@
if (ret > off) {
break;
} else {
- gb11 = EmojiData.isExtendedPictographic(ch1);
+ gb11 = t1 == EXTENDED_PICTOGRAPHIC;
riCount = 0;
}
}
- riCount += getType(ch1) == RI ? 1 : 0;
- ch0 = ch1;
+ riCount += (t1 == RI) ? 1 : 0;
+ t0 = t1;
+
ret += Character.charCount(ch1);
}
return ret;
@@ -163,6 +177,20 @@
cp == 0xAA7B || cp == 0xAA7D;
}
+ private static int getGraphemeType(int cp) {
+ if (cp < 0x007F) { // ASCII
+ if (cp < 32) { // Control characters
+ if (cp == 0x000D)
+ return CR;
+ if (cp == 0x000A)
+ return LF;
+ return CONTROL;
+ }
+ return OTHER;
+ }
+ return getType(cp);
+ }
+
@SuppressWarnings("fallthrough")
private static int getType(int cp) {
if (EmojiData.isExtendedPictographic(cp)) {
@@ -171,12 +199,6 @@
int type = Character.getType(cp);
switch(type) {
- case Character.CONTROL:
- if (cp == 0x000D)
- return CR;
- if (cp == 0x000A)
- return LF;
- return CONTROL;
case Character.UNASSIGNED:
// NOTE: #tr29 lists "Unassigned and Default_Ignorable_Code_Point" as Control
// but GraphemeBreakTest.txt lists u+0378/reserved-0378 as "Other"
@@ -184,6 +206,7 @@
if (cp == 0x0378)
return OTHER;
+ case Character.CONTROL:
case Character.LINE_SEPARATOR:
case Character.PARAGRAPH_SEPARATOR:
case Character.SURROGATE:
--- a/src/java.base/share/classes/java/util/regex/Pattern.java Fri May 31 17:27:28 2019 -0700
+++ b/src/java.base/share/classes/java/util/regex/Pattern.java Sat Jun 01 03:18:23 2019 +0200
@@ -3973,7 +3973,16 @@
if (i < matcher.to) {
int ch0 = Character.codePointAt(seq, i);
int n = Character.charCount(ch0);
- int j = Grapheme.nextBoundary(seq, i, matcher.to);
+ int j = i + n;
+ // Fast check if it's necessary to call Normalizer;
+ // testing Grapheme.isBoundary is enough for this case
+ while (j < matcher.to) {
+ int ch1 = Character.codePointAt(seq, j);
+ if (Grapheme.isBoundary(ch0, ch1))
+ break;
+ ch0 = ch1;
+ j += Character.charCount(ch1);
+ }
if (i + n == j) { // single, assume nfc cp
if (predicate.is(ch0))
return next.match(matcher, j, seq);
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test/micro/org/openjdk/bench/java/util/regex/PatternBench.java Sat Jun 01 03:18:23 2019 +0200
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+package org.openjdk.bench.java.util.regex;
+
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+
+import java.util.concurrent.TimeUnit;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@State(Scope.Thread)
+public class PatternBench {
+
+ public String fileTestString;
+ public String flagsString;
+
+
+ public Pattern graphemePattern;
+ public Pattern jmodPattern;
+ public Pattern jmodCanonicalPattern;
+
+ public Pattern pattern;
+
+ @Setup
+ public void setup() {
+ flagsString = "\ud83c\udde6\ud83c\uddec\ud83c\uddec\ud83c\udde6\ud83c\uddfa\ud83c\uddf8\ud83c\uddeb\ud83c\uddf7";
+ fileTestString = "META-INF/providers/org.openjdk.foo_hotspot_nodes_PluginFactory_EndLockScopeNode";
+ graphemePattern = Pattern.compile("\\b{g}");
+
+ String jmodRegex = "^.*(?:(?:_the\\.[^/]*)|(?:_[^/]*\\.marker)|(?:[^/]*\\.diz)|(?:[^/]*\\.debuginfo)|(?:[^/]*\\.dSYM/.*)|(?:[^/]*\\.dSYM)|(?:[^/]*\\.pdb)|(?:[^/]*\\.map))$";
+ jmodCanonicalPattern = Pattern.compile(jmodRegex, Pattern.CANON_EQ);
+ jmodPattern = Pattern.compile(jmodRegex);
+ }
+
+ @Benchmark
+ @Warmup(iterations = 3)
+ @Measurement(iterations = 3)
+ public int splitFlags() {
+ return graphemePattern.split(flagsString).length;
+ }
+
+ @Benchmark
+ @Warmup(iterations = 3)
+ @Measurement(iterations = 3)
+ public boolean canonicalJmodMatch() {
+ return jmodCanonicalPattern.matcher(fileTestString).matches();
+ }
+
+ @Benchmark
+ @Warmup(iterations = 3)
+ @Measurement(iterations = 3)
+ public boolean normalJmodMatch() {
+ return jmodPattern.matcher(fileTestString).matches();
+ }
+}