8225061: Performance regression in Regex
authorredestad
Sat, 01 Jun 2019 03:18:23 +0200
changeset 55141 db105c4c5776
parent 55140 d4890c3721be
child 55142 e2dbcc6ed36d
8225061: Performance regression in Regex Reviewed-by: naoto, alanb Contributed-by: claes.redestad@oracle.com, naoto.sato@oracle.com
make/jdk/src/classes/build/tools/generateemojidata/GenerateEmojiData.java
src/java.base/share/classes/java/util/regex/EmojiData.java.template
src/java.base/share/classes/java/util/regex/Grapheme.java
src/java.base/share/classes/java/util/regex/Pattern.java
test/micro/org/openjdk/bench/java/util/regex/PatternBench.java
--- a/make/jdk/src/classes/build/tools/generateemojidata/GenerateEmojiData.java	Fri May 31 17:27:28 2019 -0700
+++ b/make/jdk/src/classes/build/tools/generateemojidata/GenerateEmojiData.java	Sat Jun 01 03:18:23 2019 +0200
@@ -67,27 +67,28 @@
                     },
                     ArrayList<Range>::addAll);
 
+
             // make the code point conditions
-            String extPictCodePoints = extPictRanges.stream()
-                .map(r -> {
-                    if (r.start == r.last) {
-                        return (" ".repeat(12) + "cp == 0x" + toHexString(r.start));
-                    } else  if (r.start == r.last - 1) {
-                        return " ".repeat(12) + "cp == 0x" + toHexString(r.start) + " ||\n" +
-                               " ".repeat(12) + "cp == 0x" + toHexString(r.last);
-                    } else {
-                        return " ".repeat(11) + "(cp >= 0x" + toHexString(r.start) +
-                               " && cp <= 0x" + toHexString(r.last) + ")";
-                    }
-                })
-                .collect(Collectors.joining(" ||\n")) + ";\n";
+            // only very few codepoints below 0x2000 are "emojis", so separate them
+            // out to generate a fast-path check that can be efficiently inlined
+            String lowExtPictCodePoints = extPictRanges.stream()
+                    .takeWhile(r -> r.last < 0x2000)
+                    .map(r -> rangeToString(r))
+                    .collect(Collectors.joining(" ||\n", "", ";\n"));
+
+            String highExtPictCodePoints = extPictRanges.stream()
+                    .dropWhile(r -> r.last < 0x2000)
+                    .map(r -> rangeToString(r))
+                    .collect(Collectors.joining(" ||\n", "", ";\n"));
 
             // Generate EmojiData.java file
             Files.write(Paths.get(args[2]),
                 Files.lines(Paths.get(args[0]))
                     .flatMap(l -> {
-                        if (l.equals("%%%EXTPICT%%%")) {
-                            return Stream.of(extPictCodePoints);
+                        if (l.equals("%%%EXTPICT_LOW%%%")) {
+                            return Stream.of(lowExtPictCodePoints);
+                        } else if (l.equals("%%%EXTPICT_HIGH%%%")) {
+                            return Stream.of(highExtPictCodePoints);
                         } else {
                             return Stream.of(l);
                         }
@@ -99,6 +100,18 @@
         }
     }
 
+    static String rangeToString(Range r) {
+        if (r.start == r.last) {
+            return (" ".repeat(16) + "cp == 0x" + toHexString(r.start));
+        } else  if (r.start == r.last - 1) {
+            return " ".repeat(16) + "cp == 0x" + toHexString(r.start) + " ||\n" +
+                    " ".repeat(16) + "cp == 0x" + toHexString(r.last);
+        } else {
+            return " ".repeat(15) + "(cp >= 0x" + toHexString(r.start) +
+                    " && cp <= 0x" + toHexString(r.last) + ")";
+        }
+    }
+
     static int toInt(String hexStr) {
         return Integer.parseUnsignedInt(hexStr, 16);
     }
--- a/src/java.base/share/classes/java/util/regex/EmojiData.java.template	Fri May 31 17:27:28 2019 -0700
+++ b/src/java.base/share/classes/java/util/regex/EmojiData.java.template	Sat Jun 01 03:18:23 2019 +0200
@@ -40,7 +40,16 @@
      * @return true if {@code cp} is an extended pictographic
      */
     static boolean isExtendedPictographic(int cp) {
+        if (cp < 0x2000) {
+            return
+%%%EXTPICT_LOW%%%
+        } else {
+            return isHigh(cp);
+        }
+    }
+
+    private static boolean isHigh(int cp) {
         return
-%%%EXTPICT%%%
+%%%EXTPICT_HIGH%%%
     }
 }
--- a/src/java.base/share/classes/java/util/regex/Grapheme.java	Fri May 31 17:27:28 2019 -0700
+++ b/src/java.base/share/classes/java/util/regex/Grapheme.java	Sat Jun 01 03:18:23 2019 +0200
@@ -30,6 +30,19 @@
 final class Grapheme {
 
     /**
+     * Determines if there is an extended  grapheme cluster boundary between two
+     * continuing characters {@code cp1} and {@code cp2}.
+     * <p>
+     * See Unicode Standard Annex #29 Unicode Text Segmentation for the specification
+     * for the extended grapheme cluster boundary rules
+     * <p>
+     * Note: this method does not take care of stateful breaking.
+     */
+    static boolean isBoundary(int cp1, int cp2) {
+        return rules[getType(cp1)][getType(cp2)];
+    }
+
+    /**
      * Look for the next extended grapheme cluster boundary in a CharSequence. It assumes
      * the start of the char sequence is a boundary.
      * <p>
@@ -50,12 +63,12 @@
         int ret = Character.charCount(ch0);
         int ch1;
         // indicates whether gb11 or gb12 is underway
-        boolean gb11 = EmojiData.isExtendedPictographic(ch0);
-        int riCount = getType(ch0) == RI ? 1 : 0;
+        int t0 = getGraphemeType(ch0);
+        int riCount = t0 == RI ? 1 : 0;
+        boolean gb11 = t0 == EXTENDED_PICTOGRAPHIC;
         while (ret < limit) {
             ch1 = Character.codePointAt(src, ret);
-            int t0 = getType(ch0);
-            int t1 = getType(ch1);
+            int t1 = getGraphemeType(ch1);
 
             if (gb11 && t0 == ZWJ && t1 == EXTENDED_PICTOGRAPHIC) {
                 gb11 = false;
@@ -65,13 +78,14 @@
                 if (ret > off) {
                     break;
                 } else {
-                    gb11 = EmojiData.isExtendedPictographic(ch1);
+                    gb11 = t1 == EXTENDED_PICTOGRAPHIC;
                     riCount = 0;
                 }
             }
 
-            riCount += getType(ch1) == RI ? 1 : 0;
-            ch0 = ch1;
+            riCount += (t1 == RI) ? 1 : 0;
+            t0 = t1;
+
             ret += Character.charCount(ch1);
         }
         return ret;
@@ -163,6 +177,20 @@
                cp == 0xAA7B || cp == 0xAA7D;
     }
 
+    private static int getGraphemeType(int cp) {
+        if (cp < 0x007F) { // ASCII
+            if (cp < 32) { // Control characters
+                if (cp == 0x000D)
+                    return CR;
+                if (cp == 0x000A)
+                    return LF;
+                return CONTROL;
+            }
+            return OTHER;
+        }
+        return getType(cp);
+    }
+
     @SuppressWarnings("fallthrough")
     private static int getType(int cp) {
         if (EmojiData.isExtendedPictographic(cp)) {
@@ -171,12 +199,6 @@
 
         int type = Character.getType(cp);
         switch(type) {
-        case Character.CONTROL:
-            if (cp == 0x000D)
-                return CR;
-            if (cp == 0x000A)
-                return LF;
-            return CONTROL;
         case Character.UNASSIGNED:
             // NOTE: #tr29 lists "Unassigned and Default_Ignorable_Code_Point" as Control
             // but GraphemeBreakTest.txt lists u+0378/reserved-0378 as "Other"
@@ -184,6 +206,7 @@
             if (cp == 0x0378)
                 return OTHER;
 
+        case Character.CONTROL:
         case Character.LINE_SEPARATOR:
         case Character.PARAGRAPH_SEPARATOR:
         case Character.SURROGATE:
--- a/src/java.base/share/classes/java/util/regex/Pattern.java	Fri May 31 17:27:28 2019 -0700
+++ b/src/java.base/share/classes/java/util/regex/Pattern.java	Sat Jun 01 03:18:23 2019 +0200
@@ -3973,7 +3973,16 @@
             if (i < matcher.to) {
                 int ch0 = Character.codePointAt(seq, i);
                 int n = Character.charCount(ch0);
-                int j = Grapheme.nextBoundary(seq, i, matcher.to);
+                int j = i + n;
+                // Fast check if it's necessary to call Normalizer;
+                // testing Grapheme.isBoundary is enough for this case
+                while (j < matcher.to) {
+                    int ch1 = Character.codePointAt(seq, j);
+                    if (Grapheme.isBoundary(ch0, ch1))
+                        break;
+                    ch0 = ch1;
+                    j += Character.charCount(ch1);
+                }
                 if (i + n == j) {    // single, assume nfc cp
                     if (predicate.is(ch0))
                         return next.match(matcher, j, seq);
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/micro/org/openjdk/bench/java/util/regex/PatternBench.java	Sat Jun 01 03:18:23 2019 +0200
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+package org.openjdk.bench.java.util.regex;
+
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+
+import java.util.concurrent.TimeUnit;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@State(Scope.Thread)
+public class PatternBench {
+
+    public String fileTestString;
+    public String flagsString;
+
+
+    public Pattern graphemePattern;
+    public Pattern jmodPattern;
+    public Pattern jmodCanonicalPattern;
+
+    public Pattern pattern;
+
+    @Setup
+    public void setup() {
+        flagsString = "\ud83c\udde6\ud83c\uddec\ud83c\uddec\ud83c\udde6\ud83c\uddfa\ud83c\uddf8\ud83c\uddeb\ud83c\uddf7";
+        fileTestString = "META-INF/providers/org.openjdk.foo_hotspot_nodes_PluginFactory_EndLockScopeNode";
+        graphemePattern = Pattern.compile("\\b{g}");
+
+        String jmodRegex = "^.*(?:(?:_the\\.[^/]*)|(?:_[^/]*\\.marker)|(?:[^/]*\\.diz)|(?:[^/]*\\.debuginfo)|(?:[^/]*\\.dSYM/.*)|(?:[^/]*\\.dSYM)|(?:[^/]*\\.pdb)|(?:[^/]*\\.map))$";
+        jmodCanonicalPattern = Pattern.compile(jmodRegex, Pattern.CANON_EQ);
+        jmodPattern = Pattern.compile(jmodRegex);
+    }
+
+    @Benchmark
+    @Warmup(iterations = 3)
+    @Measurement(iterations = 3)
+    public int splitFlags() {
+        return graphemePattern.split(flagsString).length;
+    }
+
+    @Benchmark
+    @Warmup(iterations = 3)
+    @Measurement(iterations = 3)
+    public boolean canonicalJmodMatch() {
+        return jmodCanonicalPattern.matcher(fileTestString).matches();
+    }
+
+    @Benchmark
+    @Warmup(iterations = 3)
+    @Measurement(iterations = 3)
+    public boolean normalJmodMatch() {
+        return jmodPattern.matcher(fileTestString).matches();
+    }
+}