|
1 /* |
|
2 * Copyright (C) 2012 Grigori Goronzy <greg@kinoho.net> |
|
3 * |
|
4 * Permission to use, copy, modify, and/or distribute this software for any |
|
5 * purpose with or without fee is hereby granted, provided that the above |
|
6 * copyright notice and this permission notice appear in all copies. |
|
7 * |
|
8 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
|
9 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
|
10 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR |
|
11 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
|
12 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN |
|
13 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF |
|
14 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
|
15 */ |
|
16 |
|
17 #include <stdio.h> |
|
18 #include <stdlib.h> |
|
19 #include "ucdn.h" |
|
20 |
|
/*
 * Property record for one class of code points, as returned by
 * get_ucd_record().  Keep the member order as is: the table this type
 * describes (ucd_records, presumably generated into unicodedata_db.h)
 * is defined outside this file section.
 */
typedef struct {
    unsigned char category;             /* read by ucdn_get_general_category() */
    unsigned char combining;            /* read by ucdn_get_combining_class() */
    unsigned char bidi_class;           /* read by ucdn_get_bidi_class() */
    unsigned char mirrored;             /* non-zero enables the lookup in ucdn_mirror() */
    unsigned char east_asian_width;     /* read by ucdn_get_east_asian_width() */
    unsigned char normalization_check;  /* not read by any accessor visible here */
    unsigned char script;               /* read by ucdn_get_script() */
} UCDRecord;
|
30 |
|
/*
 * One entry of the mirroring table searched by ucdn_mirror(): the
 * character `from` is replaced by `to`.  Both members are unsigned
 * short, so only values representable in that type can be stored.
 */
typedef struct {
    unsigned short from;    /* search key */
    unsigned short to;      /* replacement returned on a match */
} MirrorPair;
|
34 |
|
/*
 * One run of the sparse-to-dense mapping used by get_comp_index():
 * code points start .. start + count (inclusive) map to the consecutive
 * dense indices index .. index + count.
 */
typedef struct {
    unsigned int start;     /* first code point of the run; 0 terminates a table */
    short count;            /* last code point of the run is start + count */
    short index;            /* dense index assigned to `start` */
} Reindex;
|
39 |
|
40 #include "unicodedata_db.h" |
|
41 |
|
/*
 * Constants for the arithmetic Hangul syllable (de)composition done by
 * hangul_pair_decompose() and hangul_pair_compose().
 */
#define SBASE   0xAC00  /* first precomposed syllable */
#define LBASE   0x1100  /* first leading consonant */
#define VBASE   0x1161  /* first vowel */
#define TBASE   0x11A7  /* trailing consonant index 0, i.e. "no trailing consonant" */
#define LCOUNT  19      /* number of leading consonants */
#define VCOUNT  21      /* number of vowels */
#define TCOUNT  28      /* number of trailing slots, including "none" */
#define NCOUNT  (VCOUNT * TCOUNT)   /* syllables per leading consonant: 588 */
#define SCOUNT  (LCOUNT * NCOUNT)   /* total precomposed syllables: 11172 */
|
52 |
|
53 static const UCDRecord *get_ucd_record(uint32_t code) |
|
54 { |
|
55 int index, offset; |
|
56 |
|
57 if (code >= 0x110000) |
|
58 index = 0; |
|
59 else { |
|
60 index = index0[code >> (SHIFT1+SHIFT2)] << SHIFT1; |
|
61 offset = (code >> SHIFT2) & ((1<<SHIFT1) - 1); |
|
62 index = index1[index + offset] << SHIFT2; |
|
63 offset = code & ((1<<SHIFT2) - 1); |
|
64 index = index2[index + offset]; |
|
65 } |
|
66 |
|
67 return &ucd_records[index]; |
|
68 } |
|
69 |
|
70 static const unsigned short *get_decomp_record(uint32_t code) |
|
71 { |
|
72 int index, offset; |
|
73 |
|
74 if (code >= 0x110000) |
|
75 index = 0; |
|
76 else { |
|
77 index = decomp_index0[code >> (DECOMP_SHIFT1+DECOMP_SHIFT2)] |
|
78 << DECOMP_SHIFT1; |
|
79 offset = (code >> DECOMP_SHIFT2) & ((1<<DECOMP_SHIFT1) - 1); |
|
80 index = decomp_index1[index + offset] << DECOMP_SHIFT2; |
|
81 offset = code & ((1<<DECOMP_SHIFT2) - 1); |
|
82 index = decomp_index2[index + offset]; |
|
83 } |
|
84 |
|
85 return &decomp_data[index]; |
|
86 } |
|
87 |
|
88 static int get_comp_index(uint32_t code, const Reindex *idx) |
|
89 { |
|
90 int i; |
|
91 |
|
92 for (i = 0; idx[i].start; i++) { |
|
93 const Reindex *cur = &idx[i]; |
|
94 if (code < cur->start) |
|
95 return -1; |
|
96 if (code <= cur->start + cur->count) { |
|
97 return cur->index + (code - cur->start); |
|
98 } |
|
99 } |
|
100 |
|
101 return -1; |
|
102 } |
|
103 |
|
104 static int compare_mp(const void *a, const void *b) |
|
105 { |
|
106 MirrorPair *mpa = (MirrorPair *)a; |
|
107 MirrorPair *mpb = (MirrorPair *)b; |
|
108 return mpa->from - mpb->from; |
|
109 } |
|
110 |
|
111 static int hangul_pair_decompose(uint32_t code, uint32_t *a, uint32_t *b) |
|
112 { |
|
113 int si = code - SBASE; |
|
114 |
|
115 if (si < 0 || si >= SCOUNT) |
|
116 return 0; |
|
117 |
|
118 if (si % TCOUNT) { |
|
119 /* LV,T */ |
|
120 *a = SBASE + (si / TCOUNT) * TCOUNT; |
|
121 *b = TBASE + (si % TCOUNT); |
|
122 return 3; |
|
123 } else { |
|
124 /* L,V */ |
|
125 *a = LBASE + (si / NCOUNT); |
|
126 *b = VBASE + (si % NCOUNT) / TCOUNT; |
|
127 return 2; |
|
128 } |
|
129 } |
|
130 |
|
131 static int hangul_pair_compose(uint32_t *code, uint32_t a, uint32_t b) |
|
132 { |
|
133 if (b < VBASE || b >= (TBASE + TCOUNT)) |
|
134 return 0; |
|
135 |
|
136 if ((a < LBASE || a >= (LBASE + LCOUNT)) |
|
137 && (a < SBASE || a >= (SBASE + SCOUNT))) |
|
138 return 0; |
|
139 |
|
140 if (a >= SBASE) { |
|
141 /* LV,T */ |
|
142 *code = a + (b - TBASE); |
|
143 return 3; |
|
144 } else { |
|
145 /* L,V */ |
|
146 int li = a - LBASE; |
|
147 int vi = b - VBASE; |
|
148 *code = SBASE + li * NCOUNT + vi * TCOUNT; |
|
149 return 2; |
|
150 } |
|
151 } |
|
152 |
|
/*
 * Decode one code point from a UTF-16 sequence and advance *code_ptr
 * past the units consumed.
 *
 * A unit in the lead-surrogate range 0xD800..0xDBFF is combined with
 * the unit that follows it (two units consumed); every other unit is
 * returned unchanged (one unit consumed).  The unit after a lead
 * surrogate is not validated.
 */
static uint32_t decode_utf16(const unsigned short **code_ptr)
{
    const unsigned short *code = *code_ptr;
    uint32_t lead = code[0];

    /*
     * Mask with 0xfc00 so that exactly 0xD800..0xDBFF matches.  A
     * narrower mask such as 0xd800 would also match 0xF800..0xFFFF and
     * misread ordinary BMP units (e.g. U+FB49) as surrogate pairs.
     */
    if ((lead & 0xfc00) != 0xd800) {
        *code_ptr += 1;
        return lead;
    }

    *code_ptr += 2;
    return 0x10000 + ((uint32_t)code[1] - 0xdc00) +
        ((lead - 0xd800) << 10);
}
|
166 |
|
167 const char *ucdn_get_unicode_version(void) |
|
168 { |
|
169 return UNIDATA_VERSION; |
|
170 } |
|
171 |
|
172 int ucdn_get_combining_class(uint32_t code) |
|
173 { |
|
174 return get_ucd_record(code)->combining; |
|
175 } |
|
176 |
|
177 int ucdn_get_east_asian_width(uint32_t code) |
|
178 { |
|
179 return get_ucd_record(code)->east_asian_width; |
|
180 } |
|
181 |
|
182 int ucdn_get_general_category(uint32_t code) |
|
183 { |
|
184 return get_ucd_record(code)->category; |
|
185 } |
|
186 |
|
187 int ucdn_get_bidi_class(uint32_t code) |
|
188 { |
|
189 return get_ucd_record(code)->bidi_class; |
|
190 } |
|
191 |
|
192 int ucdn_get_mirrored(uint32_t code) |
|
193 { |
|
194 return get_ucd_record(code)->mirrored; |
|
195 } |
|
196 |
|
197 int ucdn_get_script(uint32_t code) |
|
198 { |
|
199 return get_ucd_record(code)->script; |
|
200 } |
|
201 |
|
202 uint32_t ucdn_mirror(uint32_t code) |
|
203 { |
|
204 MirrorPair mp = {0}; |
|
205 MirrorPair *res; |
|
206 |
|
207 if (get_ucd_record(code)->mirrored == 0) |
|
208 return code; |
|
209 |
|
210 mp.from = code; |
|
211 res = bsearch(&mp, mirror_pairs, BIDI_MIRROR_LEN, sizeof(MirrorPair), |
|
212 compare_mp); |
|
213 |
|
214 if (res == NULL) |
|
215 return code; |
|
216 else |
|
217 return res->to; |
|
218 } |
|
219 |
|
/*
 * Decompose a code point into a pair.
 *
 * Hangul syllables are split arithmetically; everything else is looked
 * up in the decomposition table.  The record's header word holds the
 * entry count in its high byte; records whose low byte is non-zero are
 * rejected here (presumably those are compatibility decompositions --
 * ucdn_compat_decompose() accepts them).
 *
 * Returns 1 and fills *a and *b on success; *b is 0 when the record
 * has a single entry.  Returns 0 when there is no such decomposition,
 * in which case *a and *b are left untouched.
 */
int ucdn_decompose(uint32_t code, uint32_t *a, uint32_t *b)
{
    const unsigned short *cursor;
    unsigned short header;
    int count;

    if (hangul_pair_decompose(code, a, b) != 0)
        return 1;

    cursor = get_decomp_record(code);
    header = *cursor++;
    count = header >> 8;

    if (count == 0 || (header & 0xff) != 0)
        return 0;

    *a = decode_utf16(&cursor);
    *b = (count > 1) ? decode_utf16(&cursor) : 0;

    return 1;
}
|
243 |
|
244 int ucdn_compose(uint32_t *code, uint32_t a, uint32_t b) |
|
245 { |
|
246 int l, r, index, indexi, offset; |
|
247 |
|
248 if (hangul_pair_compose(code, a, b)) |
|
249 return 1; |
|
250 |
|
251 l = get_comp_index(a, nfc_first); |
|
252 r = get_comp_index(b, nfc_last); |
|
253 |
|
254 if (l < 0 || r < 0) |
|
255 return 0; |
|
256 |
|
257 indexi = l * TOTAL_LAST + r; |
|
258 index = comp_index0[indexi >> (COMP_SHIFT1+COMP_SHIFT2)] << COMP_SHIFT1; |
|
259 offset = (indexi >> COMP_SHIFT2) & ((1<<COMP_SHIFT1) - 1); |
|
260 index = comp_index1[index + offset] << COMP_SHIFT2; |
|
261 offset = indexi & ((1<<COMP_SHIFT2) - 1); |
|
262 *code = comp_data[index + offset]; |
|
263 |
|
264 return *code != 0; |
|
265 } |
|
266 |
|
/*
 * Write the full table decomposition of a code point to `decomposed`.
 *
 * The entry count comes from the high byte of the record's header word,
 * and the low byte is ignored, so every record type is accepted.
 * Hangul syllables get no arithmetic treatment here.
 *
 * `decomposed` must have room for every entry written; the largest
 * count that occurs in the table is not visible from this function.
 *
 * Returns the number of code points written, or 0 when the record is
 * empty (nothing is written in that case).
 */
int ucdn_compat_decompose(uint32_t code, uint32_t *decomposed)
{
    const unsigned short *cursor = get_decomp_record(code);
    const int count = cursor[0] >> 8;
    uint32_t *out = decomposed;
    int remaining;

    if (count == 0)
        return 0;

    cursor++;   /* step over the header word */
    for (remaining = count; remaining > 0; remaining--)
        *out++ = decode_utf16(&cursor);

    return count;
}