make/jdk/src/classes/build/tools/generateemojidata/GenerateEmojiData.java
author tschatzl
Fri, 22 Nov 2019 10:03:38 +0100
changeset 59220 72e15d757e6c
parent 55141 db105c4c5776
permissions -rw-r--r--
8234000: Make HeapRegion::bottom/end/hrm_index const Reviewed-by: kbarrett, sjohanss
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
55013
8dae495a59e7 8221431: Support for Unicode 12.1
naoto
parents:
diff changeset
     1
/*
8dae495a59e7 8221431: Support for Unicode 12.1
naoto
parents:
diff changeset
     2
 * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
8dae495a59e7 8221431: Support for Unicode 12.1
naoto
parents:
diff changeset
     3
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
8dae495a59e7 8221431: Support for Unicode 12.1
naoto
parents:
diff changeset
     4
 *
8dae495a59e7 8221431: Support for Unicode 12.1
naoto
parents:
diff changeset
     5
 * This code is free software; you can redistribute it and/or modify it
8dae495a59e7 8221431: Support for Unicode 12.1
naoto
parents:
diff changeset
     6
 * under the terms of the GNU General Public License version 2 only, as
8dae495a59e7 8221431: Support for Unicode 12.1
naoto
parents:
diff changeset
     7
 * published by the Free Software Foundation.  Oracle designates this
8dae495a59e7 8221431: Support for Unicode 12.1
naoto
parents:
diff changeset
     8
 * particular file as subject to the "Classpath" exception as provided
8dae495a59e7 8221431: Support for Unicode 12.1
naoto
parents:
diff changeset
     9
 * by Oracle in the LICENSE file that accompanied this code.
8dae495a59e7 8221431: Support for Unicode 12.1
naoto
parents:
diff changeset
    10
 *
8dae495a59e7 8221431: Support for Unicode 12.1
naoto
parents:
diff changeset
    11
 * This code is distributed in the hope that it will be useful, but WITHOUT
8dae495a59e7 8221431: Support for Unicode 12.1
naoto
parents:
diff changeset
    12
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
8dae495a59e7 8221431: Support for Unicode 12.1
naoto
parents:
diff changeset
    13
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
8dae495a59e7 8221431: Support for Unicode 12.1
naoto
parents:
diff changeset
    14
 * version 2 for more details (a copy is included in the LICENSE file that
8dae495a59e7 8221431: Support for Unicode 12.1
naoto
parents:
diff changeset
    15
 * accompanied this code).
8dae495a59e7 8221431: Support for Unicode 12.1
naoto
parents:
diff changeset
    16
 *
8dae495a59e7 8221431: Support for Unicode 12.1
naoto
parents:
diff changeset
    17
 * You should have received a copy of the GNU General Public License version
8dae495a59e7 8221431: Support for Unicode 12.1
naoto
parents:
diff changeset
    18
 * 2 along with this work; if not, write to the Free Software Foundation,
8dae495a59e7 8221431: Support for Unicode 12.1
naoto
parents:
diff changeset
    19
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
8dae495a59e7 8221431: Support for Unicode 12.1
naoto
parents:
diff changeset
    20
 *
8dae495a59e7 8221431: Support for Unicode 12.1
naoto
parents:
diff changeset
    21
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
8dae495a59e7 8221431: Support for Unicode 12.1
naoto
parents:
diff changeset
    22
 * or visit www.oracle.com if you need additional information or have any
8dae495a59e7 8221431: Support for Unicode 12.1
naoto
parents:
diff changeset
    23
 * questions.
8dae495a59e7 8221431: Support for Unicode 12.1
naoto
parents:
diff changeset
    24
 */
8dae495a59e7 8221431: Support for Unicode 12.1
naoto
parents:
diff changeset
    25
8dae495a59e7 8221431: Support for Unicode 12.1
naoto
parents:
diff changeset
    26
package build.tools.generateemojidata;
8dae495a59e7 8221431: Support for Unicode 12.1
naoto
parents:
diff changeset
    27
8dae495a59e7 8221431: Support for Unicode 12.1
naoto
parents:
diff changeset
    28
import java.io.IOException;
8dae495a59e7 8221431: Support for Unicode 12.1
naoto
parents:
diff changeset
    29
import java.nio.file.Files;
8dae495a59e7 8221431: Support for Unicode 12.1
naoto
parents:
diff changeset
    30
import java.nio.file.Paths;
8dae495a59e7 8221431: Support for Unicode 12.1
naoto
parents:
diff changeset
    31
import java.nio.file.StandardOpenOption;
8dae495a59e7 8221431: Support for Unicode 12.1
naoto
parents:
diff changeset
    32
import java.util.ArrayList;
8dae495a59e7 8221431: Support for Unicode 12.1
naoto
parents:
diff changeset
    33
import java.util.List;
8dae495a59e7 8221431: Support for Unicode 12.1
naoto
parents:
diff changeset
    34
import java.util.function.Predicate;
8dae495a59e7 8221431: Support for Unicode 12.1
naoto
parents:
diff changeset
    35
import java.util.stream.Collectors;
8dae495a59e7 8221431: Support for Unicode 12.1
naoto
parents:
diff changeset
    36
import java.util.stream.Stream;
8dae495a59e7 8221431: Support for Unicode 12.1
naoto
parents:
diff changeset
    37
8dae495a59e7 8221431: Support for Unicode 12.1
naoto
parents:
diff changeset
    38
/**
8dae495a59e7 8221431: Support for Unicode 12.1
naoto
parents:
diff changeset
    39
 * Generate EmojiData.java
8dae495a59e7 8221431: Support for Unicode 12.1
naoto
parents:
diff changeset
    40
 *    args[0]: Full path string to the template file
8dae495a59e7 8221431: Support for Unicode 12.1
naoto
parents:
diff changeset
    41
 *    args[1]: Full path string to the directory that contains "emoji-data.txt"
8dae495a59e7 8221431: Support for Unicode 12.1
naoto
parents:
diff changeset
    42
 *    args[2]: Full path string to the generated .java file
8dae495a59e7 8221431: Support for Unicode 12.1
naoto
parents:
diff changeset
    43
 */
8dae495a59e7 8221431: Support for Unicode 12.1
naoto
parents:
diff changeset
    44
public class GenerateEmojiData {

    /** Ranges whose last code point is below this value go into the "low" fast-path condition. */
    private static final int LOW_RANGE_LIMIT = 0x2000;

    /** Indentation emitted in front of a single code point comparison. */
    private static final String CP_INDENT = " ".repeat(16);

    /** Indentation emitted in front of a parenthesized range comparison. */
    private static final String RANGE_INDENT = " ".repeat(15);

    /**
     * Reads "emoji-data.txt", builds the code point conditions for the
     * Extended_Pictographic ranges and writes the output file by substituting
     * the {@code %%%EXTPICT_LOW%%%} and {@code %%%EXTPICT_HIGH%%%} lines of the
     * template.
     *
     * @param args args[0]: template file, args[1]: directory that contains
     *             "emoji-data.txt", args[2]: file to generate
     * @throws IllegalArgumentException if fewer than three arguments are given
     * @throws java.io.UncheckedIOException if reading an input or writing the
     *         output fails, so that the failure is visible to the invoking build
     */
    public static void main(String[] args) {
        if (args.length < 3) {
            throw new IllegalArgumentException(
                "Usage: GenerateEmojiData <template file> <emoji-data.txt directory> <output file>");
        }

        try {
            List<Range> extPictRanges = readExtPictRanges(args[1]);

            // make the code point conditions
            // only very few codepoints below 0x2000 are "emojis", so separate them
            // out to generate a fast-path check that can be efficiently inlined
            String lowExtPictCodePoints = extPictRanges.stream()
                    .takeWhile(r -> r.last < LOW_RANGE_LIMIT)
                    .map(GenerateEmojiData::rangeToString)
                    .collect(Collectors.joining(" ||\n", "", ";\n"));

            String highExtPictCodePoints = extPictRanges.stream()
                    .dropWhile(r -> r.last < LOW_RANGE_LIMIT)
                    .map(GenerateEmojiData::rangeToString)
                    .collect(Collectors.joining(" ||\n", "", ";\n"));

            // Read the whole template first (and close it) before the output is opened.
            List<String> generated;
            try (Stream<String> template = Files.lines(Paths.get(args[0]))) {
                generated = template
                        .map(l -> {
                            if (l.equals("%%%EXTPICT_LOW%%%")) {
                                return lowExtPictCodePoints;
                            } else if (l.equals("%%%EXTPICT_HIGH%%%")) {
                                return highExtPictCodePoints;
                            } else {
                                return l;
                            }
                        })
                        .collect(Collectors.toList());
            }

            // Generate the output file
            Files.write(Paths.get(args[2]), generated,
                StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
        } catch (IOException e) {
            // Do not swallow the error: a generator that produced nothing must not exit normally.
            throw new java.io.UncheckedIOException("Failed to generate " + args[2], e);
        }
    }

    /**
     * Reads the Extended_Pictographic entries of "emoji-data.txt" in the given
     * directory, sorted by start code point, with directly adjacent ranges
     * collapsed into one.
     */
    private static List<Range> readExtPictRanges(String dataDir) throws IOException {
        List<Range> sorted;
        // Files.lines keeps the file open until the stream is closed.
        try (Stream<String> lines = Files.lines(Paths.get(dataDir, "emoji-data.txt"))) {
            sorted = lines
                    .filter(Predicate.not(l -> l.startsWith("#") || l.isBlank()))
                    .filter(l -> l.contains("; Extended_Pictograph"))
                    // keep only the leading "XXXX" or "XXXX..YYYY" field
                    .map(l -> new Range(l.replaceFirst(" .*", "")))
                    .sorted()
                    .collect(Collectors.toList());
        }

        // collapsing consecutive pictographic ranges
        List<Range> collapsed = new ArrayList<>();
        for (Range r : sorted) {
            int lastIndex = collapsed.size() - 1;
            if (lastIndex >= 0) {
                Range lastRange = collapsed.get(lastIndex);
                if (lastRange.last + 1 == r.start) {
                    collapsed.set(lastIndex, new Range(lastRange.start, r.last));
                    continue;
                }
            }
            collapsed.add(r);
        }
        return collapsed;
    }

    /**
     * Renders a range as a Java boolean expression on a variable named
     * {@code cp}: one equality test for a single code point, two equality tests
     * joined by {@code ||} for a range of exactly two code points, and a
     * parenthesized bounds check otherwise.
     *
     * @param r the range to render
     * @return the indented expression text, without a trailing operator
     */
    static String rangeToString(Range r) {
        if (r.start == r.last) {
            return CP_INDENT + "cp == 0x" + toHexString(r.start);
        } else if (r.start == r.last - 1) {
            return CP_INDENT + "cp == 0x" + toHexString(r.start) + " ||\n" +
                    CP_INDENT + "cp == 0x" + toHexString(r.last);
        } else {
            return RANGE_INDENT + "(cp >= 0x" + toHexString(r.start) +
                    " && cp <= 0x" + toHexString(r.last) + ")";
        }
    }

    /**
     * Parses a hexadecimal string as an unsigned int.
     *
     * @param hexStr hexadecimal digits without a prefix
     * @return the parsed value
     * @throws NumberFormatException if {@code hexStr} is not valid unsigned hex
     */
    static int toInt(String hexStr) {
        return Integer.parseUnsignedInt(hexStr, 16);
    }

    /**
     * Formats a code point as upper-case hexadecimal, left-padded with zeros to
     * at least four digits.
     *
     * @param cp the code point
     * @return the hexadecimal text, without a prefix
     */
    static String toHexString(int cp) {
        String ret = Integer.toUnsignedString(cp, 16).toUpperCase();
        if (ret.length() < 4) {
            ret = "0".repeat(4 - ret.length()) + ret;
        }
        return ret;
    }

    /**
     * An inclusive code point range, ordered by its start code point only.
     */
    static class Range implements Comparable<Range> {
        int start;
        int last;

        /**
         * Creates a range from its inclusive bounds.
         */
        Range (int start, int last) {
            this.start = start;
            this.last = last;
        }

        /**
         * Parses "XXXX" or "XXXX..YYYY" (hexadecimal); anything from a
         * whitespace-preceded '#' onwards, and from a ';' after the upper
         * bound, is ignored.
         */
        Range (String input) {
            input = input.replaceFirst("\\s#.*", "");
            start = toInt(input.replaceFirst("[\\s\\.].*", ""));
            last = input.contains("..") ?
                    toInt(input.replaceFirst(".*\\.\\.", "")
                            .replaceFirst(";.*", "").trim())
                    : start;
        }

        @Override
        public String toString() {
            return "Start: " + toHexString(start) + ", Last: " + toHexString(last);
        }

        @Override
        public int compareTo(Range other) {
            return Integer.compare(start, other.start);
        }
    }
}