|
1 package build.tools.generatecharacter; |
|
2 |
|
3 import java.util.regex.*; |
|
4 import java.util.*; |
|
5 import java.io.*; |
|
6 |
|
/**
 * Build-time generator for the Unicode script lookup tables.
 *
 * <p>Reads a script data file (lines of the form
 * {@code XXXX[..YYYY] ; Name # comment}), and writes to standard output:
 * <ol>
 *   <li>one line per code point listed in the file, giving the code point
 *       and its upper-cased script name, followed by</li>
 *   <li>the source of a {@code Scripts} class containing a
 *       {@code UnicodeScript} enum plus the parallel {@code scriptStarts} /
 *       {@code scripts} lookup arrays.</li>
 * </ol>
 *
 * <p>Internally a range is an {@code int[3]}: {@code [0]} first code point,
 * {@code [1]} last code point (inclusive), {@code [2]} index of the script
 * name, or {@code -1} for "UNKNOWN".
 */
public class CharacterScript {

    // generate the code needed for j.l.C.UnicodeScript

    /** Hook for test output; currently disabled (prints nothing). */
    static void fortest(String fmt, Object... o) {
        //System.out.printf(fmt, o);
    }

    /** Writes formatted text of the generated result to standard output. */
    static void print(String fmt, Object... o) {
        System.out.printf(fmt, o);
    }

    /** Hook for debug tracing; currently disabled (prints nothing). */
    static void debug(String fmt, Object... o) {
        //System.out.printf(fmt, o);
    }

    /**
     * Entry point.
     *
     * @param args exactly one element: the path of the script data file.
     *             Only {@code args[0]} is read; all generated text goes to
     *             standard output.
     */
    public static void main(String args[]){
        try {
            if (args.length != 1) {
                System.out.println("java CharacterScript script.txt out");
                System.exit(1);
            }

            Map<String,Integer> scriptMap = new HashMap<String,Integer>();
            List<int[]> scripts = readRanges(args[0], scriptMap);
            if (scripts.isEmpty()) {
                // Nothing to generate; say so instead of failing later with
                // an obscure NullPointerException.
                System.err.println("CharacterScript: no script ranges found in "
                                   + args[0]);
                return;
            }

            debug("-----------------%n");
            debug("Total scripts=%s%n", scriptMap.size());
            debug("-----------------%n%n");

            // Invert the name -> index map into an index -> name array.
            String[] names = new String[scriptMap.size()];
            for (Map.Entry<String,Integer> entry : scriptMap.entrySet()) {
                names[entry.getValue().intValue()] = entry.getKey();
            }

            // Per-code-point listing, in file order (i.e. before sorting).
            for (int[] range : scripts) {
                String name = names[range[2]].toUpperCase(Locale.US);
                for (int cp = range[0]; cp <= range[1]; cp++) {
                    System.out.printf("%05X %s%n", cp, name);
                }
            }

            // Order the ranges by their first code point. An explicit
            // three-way comparison avoids the overflow-prone subtraction idiom.
            Collections.sort(scripts, new Comparator<int[]>() {
                public int compare(int[] a1, int[] a2) {
                    return (a1[0] < a2[0]) ? -1 : ((a1[0] == a2[0]) ? 0 : 1);
                }
            });

            List<int[]> list = consolidate(scripts);

            for (int[] a : list) {
                debug("0x%05x, 0x%05x %s%n", a[0], a[1], scriptName(names, a[2]));
            }
            debug("--->total=%d%n", list.size());

            printScriptsClass(names, list);

        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Parses the script data file into a list of ranges, merging directly
     * adjacent lines that name the same script.
     *
     * @param path      file to read (decoded as UTF-8)
     * @param scriptMap receives each script name mapped to a dense index,
     *                  assigned in order of first appearance
     * @return the ranges in file order; empty if no line matched
     * @throws IOException if the file cannot be read
     */
    private static List<int[]> readRanges(String path, Map<String,Integer> scriptMap)
        throws IOException
    {
        Matcher m = Pattern.compile("(\\p{XDigit}+)(?:\\.{2}(\\p{XDigit}+))?\\s+;\\s+(\\w+)\\s+#.*").matcher("");
        List<int[]> ranges = new ArrayList<int[]>();

        // The range currently being accumulated; prevS == -1 means "none yet".
        int prevS = -1;
        int prevE = -1;
        String prevN = null;

        BufferedReader sbfr = new BufferedReader(
            new InputStreamReader(new FileInputStream(path), "UTF-8"));
        try {
            String line;
            while ((line = sbfr.readLine()) != null) {
                if (line.length() <= 1 || line.charAt(0) == '#') {
                    continue;
                }
                m.reset(line);
                if (!m.matches()) {
                    debug("Warning: Unrecognized line <%s>%n", line);
                    continue;
                }
                int start = Integer.parseInt(m.group(1), 16);
                int end = (m.group(2) == null) ? start
                                               : Integer.parseInt(m.group(2), 16);
                String name = m.group(3);
                if (name.equals(prevN) && start == prevE + 1) {
                    // Same script and contiguous: extend the pending range.
                    prevE = end;
                } else {
                    if (prevS != -1) {
                        addRange(ranges, scriptMap, prevS, prevE, prevN);
                    }
                    prevS = start; prevE = end; prevN = name;
                }
            }
        } finally {
            sbfr.close();
        }

        // Flush the last pending range, if the file contained any.
        if (prevS != -1) {
            addRange(ranges, scriptMap, prevS, prevE, prevN);
        }
        return ranges;
    }

    /** Appends one range, registering its script name in the map if new. */
    private static void addRange(List<int[]> ranges, Map<String,Integer> scriptMap,
                                 int start, int end, String name)
    {
        Integer id = scriptMap.get(name);
        if (id == null) {
            id = Integer.valueOf(scriptMap.size());
            scriptMap.put(name, id);
        }
        ranges.add(new int[] { start, end, id.intValue() });
        debug("%x-%x\t%s%n", start, end, name);
    }

    /**
     * Consolidation: there are lots of "reserved" code points embedded in
     * those otherwise "sequential" blocks. To make the lookup table smaller,
     * we combine those separated segments with the assumption that the lookup
     * implementation checks
     * {@code Character.getType() != Character.UNASSIGNED}
     * first (return UNKNOWN for unassigned).
     *
     * <p>A gap containing at least one assigned code point becomes an explicit
     * UNKNOWN range (script index {@code -1}). A gap of only unassigned code
     * points is absorbed into the preceding range; if the following range has
     * the same script the two are merged. Ranges from {@code sorted} are
     * reused and may have their end point modified in place.
     *
     * @param sorted non-empty list of ranges ordered by first code point
     * @return the consolidated ranges, still ordered by first code point
     */
    private static List<int[]> consolidate(List<int[]> sorted) {
        List<int[]> list = new ArrayList<int[]>();
        int[] last = sorted.get(0);
        list.add(last);

        for (int i = 1; i < sorted.size(); i++) {
            int[] cur = sorted.get(i);
            if (cur[0] != (last[1] + 1)) {
                boolean isNotUnassigned = false;
                for (int cp = last[1] + 1; cp < cur[0]; cp++) {
                    if (Character.getType(cp) != Character.UNASSIGNED) {
                        isNotUnassigned = true;
                        debug("Warning: [%x] is ASSIGNED but in NON script%n", cp);
                        break;
                    }
                }
                if (isNotUnassigned) {
                    // surrogates only?
                    list.add(new int[] { last[1] + 1, cur[0] - 1, -1 /* unknown */ });
                } else if (last[2] == cur[2]) {
                    // combine
                    last[1] = cur[1];
                    continue;
                } else {
                    // expand last
                    last[1] = cur[0] - 1;
                }
            }
            list.add(cur);
            last = cur;
        }
        return list;
    }

    /** Returns the enum constant name for a script index ({@code -1} is UNKNOWN). */
    private static String scriptName(String[] names, int index) {
        return (index == -1) ? "UNKNOWN" : names[index].toUpperCase(Locale.US);
    }

    /**
     * Emits the generated {@code Scripts} class: the enum constants followed by
     * the {@code scriptStarts} and {@code scripts} arrays. If the last range
     * stops short of {@code Character.MAX_CODE_POINT}, a final UNKNOWN entry
     * covering the remainder is appended to both arrays.
     *
     * @param names script names indexed by script index
     * @param list  non-empty consolidated ranges ordered by first code point
     */
    private static void printScriptsClass(String[] names, List<int[]> list) {
        //////////////////OUTPUT//////////////////////////////////
        print("public class Scripts {%n%n");
        print(" public static enum UnicodeScript {%n");
        for (int i = 0; i < names.length; i++) {
            print(" /**%n * Unicode script \"%s\".%n */%n", names[i]);
            print(" %s,%n%n", names[i].toUpperCase(Locale.US));
        }
        print(" /**%n * Unicode script \"Unknown\".%n */%n UNKNOWN;%n%n");

        int[] last = list.get(list.size() - 1);
        boolean needsTail = last[1] != Character.MAX_CODE_POINT;

        // lookup table
        print(" private static final int[] scriptStarts = {%n");
        for (int[] a : list) {
            String name = scriptName(names, a[2]);
            if (a[0] < 0x10000) {
                print(" 0x%04X, // %04X..%04X; %s%n",
                      a[0], a[0], a[1], name);
            } else {
                print(" 0x%05X, // %05X..%05X; %s%n",
                      a[0], a[0], a[1], name);
            }
        }
        if (needsTail) {
            print(" 0x%05X // %05X..%06X; %s%n",
                  last[1] + 1, last[1] + 1, Character.MAX_CODE_POINT,
                  "UNKNOWN");
        }
        print("%n };%n%n");

        print(" private static final UnicodeScript[] scripts = {%n");
        for (int[] a : list) {
            print(" %s,%n", scriptName(names, a[2]));
        }
        if (needsTail) {
            print(" UNKNOWN%n");
        }
        print(" };%n");
        print(" }%n");
        print("}%n");
    }
}