author | pliden |
Thu, 26 Sep 2019 13:56:58 +0200 | |
changeset 58355 | de246fd65587 |
parent 47216 | 71c04702a3d5 |
permissions | -rw-r--r-- |
5610 | 1 |
package build.tools.generatecharacter; |
2 |
||
3 |
import java.io.*; |
|
4 |
import java.nio.*; |
|
5 |
import java.util.*; |
|
6 |
import java.util.zip.*; |
|
7 |
||
8 |
public class CharacterName { |
|
9 |
||
10 |
public static void main(String[] args) { |
|
11 |
FileReader reader = null; |
|
12 |
try { |
|
13 |
if (args.length != 2) { |
|
19864
41c6dfb2022e
7186311: (props) "Unicode" is misspelled as "Uniocde" in JavaDoc and error message
sherman
parents:
5610
diff
changeset
|
14 |
System.err.println("Usage: java CharacterName UnicodeData.txt uniName.dat"); |
5610 | 15 |
System.exit(1); |
16 |
} |
|
17 |
reader = new FileReader(args[0]); |
|
18 |
BufferedReader bfr = new BufferedReader(reader); |
|
19 |
String line = null; |
|
20 |
||
21 |
StringBuilder namePool = new StringBuilder(); |
|
22 |
byte[] cpPoolBytes = new byte[0x100000]; |
|
35783
2690535d72cc
7071819: To support Extended Grapheme Clusters in Regex
sherman
parents:
21805
diff
changeset
|
23 |
boolean[] cpBlocks = new boolean[(Character.MAX_CODE_POINT + 1) >> 8]; |
2690535d72cc
7071819: To support Extended Grapheme Clusters in Regex
sherman
parents:
21805
diff
changeset
|
24 |
int bkNum = 0; |
5610 | 25 |
ByteBuffer cpBB = ByteBuffer.wrap(cpPoolBytes); |
26 |
int lastCp = 0; |
|
27 |
int cpNum = 0; |
|
28 |
||
29 |
while ((line = bfr.readLine()) != null) { |
|
30 |
if (line.startsWith("#")) |
|
31 |
continue; |
|
32 |
UnicodeSpec spec = UnicodeSpec.parse(line); |
|
33 |
if (spec != null) { |
|
34 |
int cp = spec.getCodePoint(); |
|
35 |
String name = spec.getName(); |
|
36 |
if (name.equals("<control>") && spec.getOldName() != null) { |
|
35783
2690535d72cc
7071819: To support Extended Grapheme Clusters in Regex
sherman
parents:
21805
diff
changeset
|
37 |
if (cp == 0x7) // <control>BELL -> BEL; u+1f514 <-> BELL |
2690535d72cc
7071819: To support Extended Grapheme Clusters in Regex
sherman
parents:
21805
diff
changeset
|
38 |
name = "BEL"; |
2690535d72cc
7071819: To support Extended Grapheme Clusters in Regex
sherman
parents:
21805
diff
changeset
|
39 |
else if (spec.getOldName().length() != 0) |
5610 | 40 |
name = spec.getOldName(); |
35783
2690535d72cc
7071819: To support Extended Grapheme Clusters in Regex
sherman
parents:
21805
diff
changeset
|
41 |
/* |
2690535d72cc
7071819: To support Extended Grapheme Clusters in Regex
sherman
parents:
21805
diff
changeset
|
42 |
3 "figment" characters from NameAliases.txt |
2690535d72cc
7071819: To support Extended Grapheme Clusters in Regex
sherman
parents:
21805
diff
changeset
|
43 |
Several documented labels for C1 control code points which |
2690535d72cc
7071819: To support Extended Grapheme Clusters in Regex
sherman
parents:
21805
diff
changeset
|
44 |
were never actually approved in any standard...but were |
2690535d72cc
7071819: To support Extended Grapheme Clusters in Regex
sherman
parents:
21805
diff
changeset
|
45 |
implemented in Perl regex. |
2690535d72cc
7071819: To support Extended Grapheme Clusters in Regex
sherman
parents:
21805
diff
changeset
|
46 |
0080;PADDING CHARACTER;figment |
2690535d72cc
7071819: To support Extended Grapheme Clusters in Regex
sherman
parents:
21805
diff
changeset
|
47 |
0081;HIGH OCTET PRESET;figment |
2690535d72cc
7071819: To support Extended Grapheme Clusters in Regex
sherman
parents:
21805
diff
changeset
|
48 |
0099;SINGLE GRAPHIC CHARACTER INTRODUCER;figment |
2690535d72cc
7071819: To support Extended Grapheme Clusters in Regex
sherman
parents:
21805
diff
changeset
|
49 |
*/ |
2690535d72cc
7071819: To support Extended Grapheme Clusters in Regex
sherman
parents:
21805
diff
changeset
|
50 |
else if (cp == 0x80) |
2690535d72cc
7071819: To support Extended Grapheme Clusters in Regex
sherman
parents:
21805
diff
changeset
|
51 |
name = "PADDING CHARACTER"; |
2690535d72cc
7071819: To support Extended Grapheme Clusters in Regex
sherman
parents:
21805
diff
changeset
|
52 |
else if (cp == 0x81) |
2690535d72cc
7071819: To support Extended Grapheme Clusters in Regex
sherman
parents:
21805
diff
changeset
|
53 |
name = "HIGH OCTET PRESET"; |
2690535d72cc
7071819: To support Extended Grapheme Clusters in Regex
sherman
parents:
21805
diff
changeset
|
54 |
else if (cp == 0x99) |
2690535d72cc
7071819: To support Extended Grapheme Clusters in Regex
sherman
parents:
21805
diff
changeset
|
55 |
name = "SINGLE GRAPHIC CHARACTER INTRODUCER"; |
5610 | 56 |
else |
57 |
continue; |
|
58 |
} else if (name.startsWith("<")) { |
|
59 |
/* |
|
60 |
3400 <CJK Ideograph Extension A, First> |
|
61 |
4db5 <CJK Ideograph Extension A, Last> |
|
62 |
4e00 <CJK Ideograph, First> |
|
63 |
9fc3 <CJK Ideograph, Last> |
|
64 |
ac00 <Hangul Syllable, First> |
|
65 |
d7a3 <Hangul Syllable, Last> |
|
66 |
d800 <Non Private Use High Surrogate, First> |
|
67 |
db7f <Non Private Use High Surrogate, Last> |
|
68 |
db80 <Private Use High Surrogate, First> |
|
69 |
dbff <Private Use High Surrogate, Last> |
|
70 |
dc00 <Low Surrogate, First> |
|
71 |
dfff <Low Surrogate, Last> |
|
72 |
e000 <Private Use, First> |
|
73 |
f8ff <Private Use, Last> |
|
74 |
20000 <CJK Ideograph Extension B, First> |
|
75 |
2a6d6 <CJK Ideograph Extension B, Last> |
|
76 |
f0000 <Plane 15 Private Use, First> |
|
77 |
ffffd <Plane 15 Private Use, Last> |
|
78 |
*/ |
|
79 |
continue; |
|
80 |
} |
|
35783
2690535d72cc
7071819: To support Extended Grapheme Clusters in Regex
sherman
parents:
21805
diff
changeset
|
81 |
cpNum++; |
2690535d72cc
7071819: To support Extended Grapheme Clusters in Regex
sherman
parents:
21805
diff
changeset
|
82 |
if (!cpBlocks[cp >> 8]) { |
2690535d72cc
7071819: To support Extended Grapheme Clusters in Regex
sherman
parents:
21805
diff
changeset
|
83 |
cpBlocks[cp >> 8] = true; |
2690535d72cc
7071819: To support Extended Grapheme Clusters in Regex
sherman
parents:
21805
diff
changeset
|
84 |
bkNum++; |
2690535d72cc
7071819: To support Extended Grapheme Clusters in Regex
sherman
parents:
21805
diff
changeset
|
85 |
} |
5610 | 86 |
if (cp == lastCp + 1) { |
87 |
cpBB.put((byte)name.length()); |
|
88 |
} else { |
|
89 |
cpBB.put((byte)0); // segment start flag |
|
90 |
cpBB.putInt((name.length() << 24) | (cp & 0xffffff)); |
|
91 |
} |
|
92 |
namePool.append(name); |
|
93 |
lastCp = cp; |
|
94 |
} |
|
95 |
} |
|
96 |
||
97 |
byte[] namePoolBytes = namePool.toString().getBytes("ASCII"); |
|
98 |
int cpLen = cpBB.position(); |
|
99 |
int total = cpLen + namePoolBytes.length; |
|
100 |
DataOutputStream dos = new DataOutputStream( |
|
101 |
new DeflaterOutputStream( |
|
102 |
new FileOutputStream(args[1]))); |
|
103 |
dos.writeInt(total); // total |
|
35783
2690535d72cc
7071819: To support Extended Grapheme Clusters in Regex
sherman
parents:
21805
diff
changeset
|
104 |
dos.writeInt(bkNum); // bkNum; |
2690535d72cc
7071819: To support Extended Grapheme Clusters in Regex
sherman
parents:
21805
diff
changeset
|
105 |
dos.writeInt(cpNum); // cpNum |
5610 | 106 |
dos.writeInt(cpLen); // nameOff |
107 |
dos.write(cpPoolBytes, 0, cpLen); |
|
108 |
dos.write(namePoolBytes); |
|
109 |
dos.close(); |
|
110 |
||
111 |
} catch (Throwable e) { |
|
112 |
System.out.println("Unexpected exception:"); |
|
113 |
e.printStackTrace(); |
|
114 |
} finally { |
|
115 |
if (reader != null) { |
|
116 |
try { |
|
117 |
reader.close(); |
|
118 |
} catch (Throwable ee) { ee.printStackTrace(); } |
|
119 |
} |
|
120 |
} |
|
121 |
} |
|
122 |
} |