make/jdk/src/classes/build/tools/generatecharacter/CharacterName.java
author pliden
Thu, 26 Sep 2019 13:56:58 +0200
changeset 58355 de246fd65587
parent 47216 71c04702a3d5
permissions -rw-r--r--
8231294: ZGC: vmTestbase/nsk/jvmti/ResourceExhausted/resexhausted002 fails Reviewed-by: shade, dholmes
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
5610
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
     1
package build.tools.generatecharacter;
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
     2
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
     3
import java.io.*;
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
     4
import java.nio.*;
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
     5
import java.util.*;
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
     6
import java.util.zip.*;
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
     7
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
     8
public class CharacterName {
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
     9
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    10
    public static void main(String[] args) {
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    11
        FileReader reader = null;
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    12
        try {
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    13
            if (args.length != 2) {
19864
41c6dfb2022e 7186311: (props) "Unicode" is misspelled as "Uniocde" in JavaDoc and error message
sherman
parents: 5610
diff changeset
    14
                System.err.println("Usage: java CharacterName UnicodeData.txt uniName.dat");
5610
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    15
                System.exit(1);
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    16
            }
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    17
            reader = new FileReader(args[0]);
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    18
            BufferedReader bfr = new BufferedReader(reader);
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    19
            String line = null;
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    20
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    21
            StringBuilder namePool = new StringBuilder();
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    22
            byte[] cpPoolBytes = new byte[0x100000];
35783
2690535d72cc 7071819: To support Extended Grapheme Clusters in Regex
sherman
parents: 21805
diff changeset
    23
            boolean[] cpBlocks = new boolean[(Character.MAX_CODE_POINT + 1) >> 8];
2690535d72cc 7071819: To support Extended Grapheme Clusters in Regex
sherman
parents: 21805
diff changeset
    24
            int bkNum = 0;
5610
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    25
            ByteBuffer cpBB = ByteBuffer.wrap(cpPoolBytes);
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    26
            int lastCp = 0;
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    27
            int cpNum = 0;
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    28
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    29
            while ((line = bfr.readLine()) != null) {
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    30
                if (line.startsWith("#"))
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    31
                    continue;
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    32
                UnicodeSpec spec = UnicodeSpec.parse(line);
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    33
                if (spec != null) {
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    34
                    int cp = spec.getCodePoint();
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    35
                    String name = spec.getName();
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    36
                    if (name.equals("<control>") && spec.getOldName() != null) {
35783
2690535d72cc 7071819: To support Extended Grapheme Clusters in Regex
sherman
parents: 21805
diff changeset
    37
                        if (cp == 0x7)  // <control>BELL -> BEL; u+1f514 <-> BELL
2690535d72cc 7071819: To support Extended Grapheme Clusters in Regex
sherman
parents: 21805
diff changeset
    38
                            name = "BEL";
2690535d72cc 7071819: To support Extended Grapheme Clusters in Regex
sherman
parents: 21805
diff changeset
    39
                        else if (spec.getOldName().length() != 0)
5610
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    40
                            name = spec.getOldName();
35783
2690535d72cc 7071819: To support Extended Grapheme Clusters in Regex
sherman
parents: 21805
diff changeset
    41
                        /*
2690535d72cc 7071819: To support Extended Grapheme Clusters in Regex
sherman
parents: 21805
diff changeset
    42
                           3 "figment" characters from NameAliases.txt
2690535d72cc 7071819: To support Extended Grapheme Clusters in Regex
sherman
parents: 21805
diff changeset
    43
                           Several documented labels for C1 control code points which
2690535d72cc 7071819: To support Extended Grapheme Clusters in Regex
sherman
parents: 21805
diff changeset
    44
                           were never actually approved in any standard...but were
2690535d72cc 7071819: To support Extended Grapheme Clusters in Regex
sherman
parents: 21805
diff changeset
    45
                           implemented in Perl regex.
2690535d72cc 7071819: To support Extended Grapheme Clusters in Regex
sherman
parents: 21805
diff changeset
    46
                           0080;PADDING CHARACTER;figment
2690535d72cc 7071819: To support Extended Grapheme Clusters in Regex
sherman
parents: 21805
diff changeset
    47
                           0081;HIGH OCTET PRESET;figment
2690535d72cc 7071819: To support Extended Grapheme Clusters in Regex
sherman
parents: 21805
diff changeset
    48
                           0099;SINGLE GRAPHIC CHARACTER INTRODUCER;figment
2690535d72cc 7071819: To support Extended Grapheme Clusters in Regex
sherman
parents: 21805
diff changeset
    49
                        */
2690535d72cc 7071819: To support Extended Grapheme Clusters in Regex
sherman
parents: 21805
diff changeset
    50
                        else if (cp == 0x80)
2690535d72cc 7071819: To support Extended Grapheme Clusters in Regex
sherman
parents: 21805
diff changeset
    51
                            name = "PADDING CHARACTER";
2690535d72cc 7071819: To support Extended Grapheme Clusters in Regex
sherman
parents: 21805
diff changeset
    52
                        else if (cp == 0x81)
2690535d72cc 7071819: To support Extended Grapheme Clusters in Regex
sherman
parents: 21805
diff changeset
    53
                            name = "HIGH OCTET PRESET";
2690535d72cc 7071819: To support Extended Grapheme Clusters in Regex
sherman
parents: 21805
diff changeset
    54
                        else if (cp == 0x99)
2690535d72cc 7071819: To support Extended Grapheme Clusters in Regex
sherman
parents: 21805
diff changeset
    55
                            name = "SINGLE GRAPHIC CHARACTER INTRODUCER";
5610
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    56
                        else
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    57
                            continue;
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    58
                    } else if (name.startsWith("<")) {
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    59
                        /*
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    60
                          3400    <CJK Ideograph Extension A, First>
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    61
                          4db5    <CJK Ideograph Extension A, Last>
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    62
                          4e00    <CJK Ideograph, First>
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    63
                          9fc3    <CJK Ideograph, Last>
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    64
                          ac00    <Hangul Syllable, First>
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    65
                          d7a3    <Hangul Syllable, Last>
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    66
                          d800    <Non Private Use High Surrogate, First>
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    67
                          db7f    <Non Private Use High Surrogate, Last>
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    68
                          db80    <Private Use High Surrogate, First>
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    69
                          dbff    <Private Use High Surrogate, Last>
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    70
                          dc00    <Low Surrogate, First>
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    71
                          dfff    <Low Surrogate, Last>
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    72
                          e000    <Private Use, First>
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    73
                          f8ff    <Private Use, Last>
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    74
                         20000    <CJK Ideograph Extension B, First>
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    75
                         2a6d6    <CJK Ideograph Extension B, Last>
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    76
                         f0000    <Plane 15 Private Use, First>
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    77
                         ffffd    <Plane 15 Private Use, Last>
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    78
                        */
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    79
                        continue;
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    80
                    }
35783
2690535d72cc 7071819: To support Extended Grapheme Clusters in Regex
sherman
parents: 21805
diff changeset
    81
                    cpNum++;
2690535d72cc 7071819: To support Extended Grapheme Clusters in Regex
sherman
parents: 21805
diff changeset
    82
                    if (!cpBlocks[cp >> 8]) {
2690535d72cc 7071819: To support Extended Grapheme Clusters in Regex
sherman
parents: 21805
diff changeset
    83
                        cpBlocks[cp >> 8] = true;
2690535d72cc 7071819: To support Extended Grapheme Clusters in Regex
sherman
parents: 21805
diff changeset
    84
                        bkNum++;
2690535d72cc 7071819: To support Extended Grapheme Clusters in Regex
sherman
parents: 21805
diff changeset
    85
                    }
5610
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    86
                    if (cp == lastCp + 1) {
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    87
                        cpBB.put((byte)name.length());
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    88
                    } else {
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    89
                        cpBB.put((byte)0);  // segment start flag
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    90
                        cpBB.putInt((name.length() << 24) | (cp & 0xffffff));
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    91
                    }
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    92
                    namePool.append(name);
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    93
                    lastCp = cp;
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    94
                }
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    95
            }
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    96
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    97
            byte[] namePoolBytes = namePool.toString().getBytes("ASCII");
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    98
            int cpLen = cpBB.position();
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
    99
            int total = cpLen + namePoolBytes.length;
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
   100
            DataOutputStream dos = new DataOutputStream(
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
   101
                                       new DeflaterOutputStream(
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
   102
                                           new FileOutputStream(args[1])));
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
   103
            dos.writeInt(total);  // total
35783
2690535d72cc 7071819: To support Extended Grapheme Clusters in Regex
sherman
parents: 21805
diff changeset
   104
            dos.writeInt(bkNum);  // bkNum;
2690535d72cc 7071819: To support Extended Grapheme Clusters in Regex
sherman
parents: 21805
diff changeset
   105
            dos.writeInt(cpNum);  // cpNum
5610
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
   106
            dos.writeInt(cpLen);  // nameOff
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
   107
            dos.write(cpPoolBytes, 0, cpLen);
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
   108
            dos.write(namePoolBytes);
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
   109
            dos.close();
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
   110
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
   111
        } catch (Throwable e) {
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
   112
            System.out.println("Unexpected exception:");
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
   113
            e.printStackTrace();
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
   114
        } finally {
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
   115
            if (reader != null) {
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
   116
                try {
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
   117
                    reader.close();
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
   118
                } catch (Throwable ee) { ee.printStackTrace(); }
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
   119
            }
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
   120
        }
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
   121
    }
fd2427610c7f 6945564: Unicode script support in Character class
sherman
parents:
diff changeset
   122
}