|
1 /* |
|
2 * Copyright 2004-2005 Sun Microsystems, Inc. All Rights Reserved. |
|
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
|
4 * |
|
5 * This code is free software; you can redistribute it and/or modify it |
|
6 * under the terms of the GNU General Public License version 2 only, as |
|
7 * published by the Free Software Foundation. Sun designates this |
|
8 * particular file as subject to the "Classpath" exception as provided |
|
9 * by Sun in the LICENSE file that accompanied this code. |
|
10 * |
|
11 * This code is distributed in the hope that it will be useful, but WITHOUT |
|
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
|
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
|
14 * version 2 for more details (a copy is included in the LICENSE file that |
|
15 * accompanied this code). |
|
16 * |
|
17 * You should have received a copy of the GNU General Public License version |
|
18 * 2 along with this work; if not, write to the Free Software Foundation, |
|
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
|
20 * |
|
21 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, |
|
22 * CA 95054 USA or visit www.sun.com if you need additional information or |
|
23 * have any questions. |
|
24 */ |
|
25 |
|
26 /* Misc functions for conversion of Unicode and UTF-8 and platform encoding */ |
|
27 |
|
28 #include <stdio.h> |
|
29 #include <stddef.h> |
|
30 #include <stdlib.h> |
|
31 #include <stdarg.h> |
|
32 #include <string.h> |
|
33 #include <ctype.h> |
|
34 |
|
35 #include "jni.h" |
|
36 |
|
37 #include "utf.h" |
|
38 |
|
/*
 * Fatal error handler.
 *
 * Writes one line to stderr naming the reporting source file, the line
 * number and the message, then terminates the process with abort().
 * This routine does not return.
 */
void
utfError(char *file, int line, char *message)
{
    (void)fprintf(stderr,
                  "UTF ERROR [\"%s\":%d]: %s\n",
                  file,
                  line,
                  message);
    abort();
}
|
48 |
|
49 /* |
|
50 * Convert UTF-8 to UTF-16 |
|
51 * Returns length or -1 if output overflows. |
|
52 */ |
|
53 int JNICALL |
|
54 utf8ToUtf16(struct UtfInst *ui, jbyte *utf8, int len, unsigned short *output, int outputMaxLen) |
|
55 { |
|
56 int outputLen; |
|
57 int i; |
|
58 |
|
59 UTF_ASSERT(utf8); |
|
60 UTF_ASSERT(len>=0); |
|
61 UTF_ASSERT(output); |
|
62 UTF_ASSERT(outputMaxLen>0); |
|
63 |
|
64 i = 0; |
|
65 outputLen = 0; |
|
66 while ( i<len ) { |
|
67 unsigned code, x, y, z; |
|
68 |
|
69 if ( outputLen >= outputMaxLen ) { |
|
70 return -1; |
|
71 } |
|
72 x = (unsigned char)utf8[i++]; |
|
73 code = x; |
|
74 if ( (x & 0xE0)==0xE0 ) { |
|
75 y = (unsigned char)utf8[i++]; |
|
76 z = (unsigned char)utf8[i++]; |
|
77 code = ((x & 0xF)<<12) + ((y & 0x3F)<<6) + (z & 0x3F); |
|
78 } else if ( (x & 0xC0)==0xC0 ) { |
|
79 y = (unsigned char)utf8[i++]; |
|
80 code = ((x & 0x1F)<<6) + (y & 0x3F); |
|
81 } |
|
82 output[outputLen++] = code; |
|
83 } |
|
84 return outputLen; |
|
85 } |
|
86 |
|
87 /* |
|
88 * Convert UTF-16 to UTF-8 Modified |
|
89 * Returns length or -1 if output overflows. |
|
90 */ |
|
91 int JNICALL |
|
92 utf16ToUtf8m(struct UtfInst *ui, unsigned short *utf16, int len, jbyte *output, int outputMaxLen) |
|
93 { |
|
94 int i; |
|
95 int outputLen; |
|
96 |
|
97 UTF_ASSERT(utf16); |
|
98 UTF_ASSERT(len>=0); |
|
99 UTF_ASSERT(output); |
|
100 UTF_ASSERT(outputMaxLen>0); |
|
101 |
|
102 outputLen = 0; |
|
103 for (i = 0; i < len; i++) { |
|
104 unsigned code; |
|
105 |
|
106 code = utf16[i]; |
|
107 if ( code >= 0x0001 && code <= 0x007F ) { |
|
108 output[outputLen++] = code; |
|
109 } else if ( code == 0 || ( code >= 0x0080 && code <= 0x07FF ) ) { |
|
110 output[outputLen++] = ((code>>6) & 0x1F) | 0xC0; |
|
111 output[outputLen++] = (code & 0x3F) | 0x80; |
|
112 } else if ( code >= 0x0800 && code <= 0xFFFF ) { |
|
113 output[outputLen++] = ((code>>12) & 0x0F) | 0xE0; |
|
114 output[outputLen++] = ((code>>6) & 0x3F) | 0x80; |
|
115 output[outputLen++] = (code & 0x3F) | 0x80; |
|
116 } |
|
117 if ( outputLen > outputMaxLen ) { |
|
118 return -1; |
|
119 } |
|
120 } |
|
121 output[outputLen] = 0; |
|
122 return outputLen; |
|
123 } |
|
124 |
|
125 int JNICALL |
|
126 utf16ToUtf8s(struct UtfInst *ui, unsigned short *utf16, int len, jbyte *output, int outputMaxLen) |
|
127 { |
|
128 return -1; /* FIXUP */ |
|
129 } |
|
130 |
|
131 /* Determine length of this Standard UTF-8 in Modified UTF-8. |
|
132 * Validation is done of the basic UTF encoding rules, returns |
|
133 * length (no change) when errors are detected in the UTF encoding. |
|
134 * |
|
135 * Note: Accepts Modified UTF-8 also, no verification on the |
|
136 * correctness of Standard UTF-8 is done. e,g, 0xC080 input is ok. |
|
137 */ |
|
138 int JNICALL |
|
139 utf8sToUtf8mLength(struct UtfInst *ui, jbyte *string, int length) |
|
140 { |
|
141 int newLength; |
|
142 int i; |
|
143 |
|
144 newLength = 0; |
|
145 for ( i = 0 ; i < length ; i++ ) { |
|
146 unsigned byte; |
|
147 |
|
148 byte = (unsigned char)string[i]; |
|
149 if ( (byte & 0x80) == 0 ) { /* 1byte encoding */ |
|
150 newLength++; |
|
151 if ( byte == 0 ) { |
|
152 newLength++; /* We gain one byte in length on NULL bytes */ |
|
153 } |
|
154 } else if ( (byte & 0xE0) == 0xC0 ) { /* 2byte encoding */ |
|
155 /* Check encoding of following bytes */ |
|
156 if ( (i+1) >= length || (string[i+1] & 0xC0) != 0x80 ) { |
|
157 break; /* Error condition */ |
|
158 } |
|
159 i++; /* Skip next byte */ |
|
160 newLength += 2; |
|
161 } else if ( (byte & 0xF0) == 0xE0 ) { /* 3byte encoding */ |
|
162 /* Check encoding of following bytes */ |
|
163 if ( (i+2) >= length || (string[i+1] & 0xC0) != 0x80 |
|
164 || (string[i+2] & 0xC0) != 0x80 ) { |
|
165 break; /* Error condition */ |
|
166 } |
|
167 i += 2; /* Skip next two bytes */ |
|
168 newLength += 3; |
|
169 } else if ( (byte & 0xF8) == 0xF0 ) { /* 4byte encoding */ |
|
170 /* Check encoding of following bytes */ |
|
171 if ( (i+3) >= length || (string[i+1] & 0xC0) != 0x80 |
|
172 || (string[i+2] & 0xC0) != 0x80 |
|
173 || (string[i+3] & 0xC0) != 0x80 ) { |
|
174 break; /* Error condition */ |
|
175 } |
|
176 i += 3; /* Skip next 3 bytes */ |
|
177 newLength += 6; /* 4byte encoding turns into 2 3byte ones */ |
|
178 } else { |
|
179 break; /* Error condition */ |
|
180 } |
|
181 } |
|
182 if ( i != length ) { |
|
183 /* Error in finding new length, return old length so no conversion */ |
|
184 /* FIXUP: ERROR_MESSAGE? */ |
|
185 return length; |
|
186 } |
|
187 return newLength; |
|
188 } |
|
189 |
|
190 /* Convert Standard UTF-8 to Modified UTF-8. |
|
191 * Assumes the UTF-8 encoding was validated by utf8mLength() above. |
|
192 * |
|
193 * Note: Accepts Modified UTF-8 also, no verification on the |
|
194 * correctness of Standard UTF-8 is done. e,g, 0xC080 input is ok. |
|
195 */ |
|
196 void JNICALL |
|
197 utf8sToUtf8m(struct UtfInst *ui, jbyte *string, int length, jbyte *newString, int newLength) |
|
198 { |
|
199 int i; |
|
200 int j; |
|
201 |
|
202 j = 0; |
|
203 for ( i = 0 ; i < length ; i++ ) { |
|
204 unsigned byte1; |
|
205 |
|
206 byte1 = (unsigned char)string[i]; |
|
207 |
|
208 /* NULL bytes and bytes starting with 11110xxx are special */ |
|
209 if ( (byte1 & 0x80) == 0 ) { /* 1byte encoding */ |
|
210 if ( byte1 == 0 ) { |
|
211 /* Bits out: 11000000 10000000 */ |
|
212 newString[j++] = (jbyte)0xC0; |
|
213 newString[j++] = (jbyte)0x80; |
|
214 } else { |
|
215 /* Single byte */ |
|
216 newString[j++] = byte1; |
|
217 } |
|
218 } else if ( (byte1 & 0xE0) == 0xC0 ) { /* 2byte encoding */ |
|
219 newString[j++] = byte1; |
|
220 newString[j++] = string[++i]; |
|
221 } else if ( (byte1 & 0xF0) == 0xE0 ) { /* 3byte encoding */ |
|
222 newString[j++] = byte1; |
|
223 newString[j++] = string[++i]; |
|
224 newString[j++] = string[++i]; |
|
225 } else if ( (byte1 & 0xF8) == 0xF0 ) { /* 4byte encoding */ |
|
226 /* Beginning of 4byte encoding, turn into 2 3byte encodings */ |
|
227 unsigned byte2, byte3, byte4, u21; |
|
228 |
|
229 /* Bits in: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ |
|
230 byte2 = (unsigned char)string[++i]; |
|
231 byte3 = (unsigned char)string[++i]; |
|
232 byte4 = (unsigned char)string[++i]; |
|
233 /* Reconstruct full 21bit value */ |
|
234 u21 = (byte1 & 0x07) << 18; |
|
235 u21 += (byte2 & 0x3F) << 12; |
|
236 u21 += (byte3 & 0x3F) << 6; |
|
237 u21 += (byte4 & 0x3F); |
|
238 /* Bits out: 11101101 1010xxxx 10xxxxxx */ |
|
239 newString[j++] = (jbyte)0xED; |
|
240 newString[j++] = (jbyte)(0xA0 + (((u21 >> 16) - 1) & 0x0F)); |
|
241 newString[j++] = (jbyte)(0x80 + ((u21 >> 10) & 0x3F)); |
|
242 /* Bits out: 11101101 1011xxxx 10xxxxxx */ |
|
243 newString[j++] = (jbyte)0xED; |
|
244 newString[j++] = (jbyte)(0xB0 + ((u21 >> 6) & 0x0F)); |
|
245 newString[j++] = byte4; |
|
246 } |
|
247 } |
|
248 UTF_ASSERT(i==length); |
|
249 UTF_ASSERT(j==newLength); |
|
250 newString[j] = (jbyte)0; |
|
251 } |
|
252 |
|
253 /* Given a Modified UTF-8 string, calculate the Standard UTF-8 length. |
|
254 * Basic validation of the UTF encoding rules is done, and length is |
|
255 * returned (no change) when errors are detected. |
|
256 * |
|
257 * Note: No validation is made that this is indeed Modified UTF-8 coming in. |
|
258 * |
|
259 */ |
|
260 int JNICALL |
|
261 utf8mToUtf8sLength(struct UtfInst *ui, jbyte *string, int length) |
|
262 { |
|
263 int newLength; |
|
264 int i; |
|
265 |
|
266 newLength = 0; |
|
267 for ( i = 0 ; i < length ; i++ ) { |
|
268 unsigned byte1, byte2, byte3, byte4, byte5, byte6; |
|
269 |
|
270 byte1 = (unsigned char)string[i]; |
|
271 if ( (byte1 & 0x80) == 0 ) { /* 1byte encoding */ |
|
272 newLength++; |
|
273 } else if ( (byte1 & 0xE0) == 0xC0 ) { /* 2byte encoding */ |
|
274 /* Check encoding of following bytes */ |
|
275 if ( (i+1) >= length || (string[i+1] & 0xC0) != 0x80 ) { |
|
276 break; /* Error condition */ |
|
277 } |
|
278 byte2 = (unsigned char)string[++i]; |
|
279 if ( byte1 != 0xC0 || byte2 != 0x80 ) { |
|
280 newLength += 2; /* Normal 2byte encoding, not 0xC080 */ |
|
281 } else { |
|
282 newLength++; /* We will turn 0xC080 into 0 */ |
|
283 } |
|
284 } else if ( (byte1 & 0xF0) == 0xE0 ) { /* 3byte encoding */ |
|
285 /* Check encoding of following bytes */ |
|
286 if ( (i+2) >= length || (string[i+1] & 0xC0) != 0x80 |
|
287 || (string[i+2] & 0xC0) != 0x80 ) { |
|
288 break; /* Error condition */ |
|
289 } |
|
290 byte2 = (unsigned char)string[++i]; |
|
291 byte3 = (unsigned char)string[++i]; |
|
292 newLength += 3; |
|
293 /* Possible process a second 3byte encoding */ |
|
294 if ( (i+3) < length && byte1 == 0xED && (byte2 & 0xF0) == 0xA0 ) { |
|
295 /* See if this is a pair of 3byte encodings */ |
|
296 byte4 = (unsigned char)string[i+1]; |
|
297 byte5 = (unsigned char)string[i+2]; |
|
298 byte6 = (unsigned char)string[i+3]; |
|
299 if ( byte4 == 0xED && (byte5 & 0xF0) == 0xB0 ) { |
|
300 /* Check encoding of 3rd byte */ |
|
301 if ( (byte6 & 0xC0) != 0x80 ) { |
|
302 break; /* Error condition */ |
|
303 } |
|
304 newLength++; /* New string will have 4byte encoding */ |
|
305 i += 3; /* Skip next 3 bytes */ |
|
306 } |
|
307 } |
|
308 } else { |
|
309 break; /* Error condition */ |
|
310 } |
|
311 } |
|
312 if ( i != length ) { |
|
313 /* Error in UTF encoding */ |
|
314 /* FIXUP: ERROR_MESSAGE()? */ |
|
315 return length; |
|
316 } |
|
317 return newLength; |
|
318 } |
|
319 |
|
320 /* Convert a Modified UTF-8 string into a Standard UTF-8 string |
|
321 * It is assumed that this string has been validated in terms of the |
|
322 * basic UTF encoding rules by utf8Length() above. |
|
323 * |
|
324 * Note: No validation is made that this is indeed Modified UTF-8 coming in. |
|
325 * |
|
326 */ |
|
327 void JNICALL |
|
328 utf8mToUtf8s(struct UtfInst *ui, jbyte *string, int length, jbyte *newString, int newLength) |
|
329 { |
|
330 int i; |
|
331 int j; |
|
332 |
|
333 j = 0; |
|
334 for ( i = 0 ; i < length ; i++ ) { |
|
335 unsigned byte1, byte2, byte3, byte4, byte5, byte6; |
|
336 |
|
337 byte1 = (unsigned char)string[i]; |
|
338 if ( (byte1 & 0x80) == 0 ) { /* 1byte encoding */ |
|
339 /* Single byte */ |
|
340 newString[j++] = byte1; |
|
341 } else if ( (byte1 & 0xE0) == 0xC0 ) { /* 2byte encoding */ |
|
342 byte2 = (unsigned char)string[++i]; |
|
343 if ( byte1 != 0xC0 || byte2 != 0x80 ) { |
|
344 newString[j++] = byte1; |
|
345 newString[j++] = byte2; |
|
346 } else { |
|
347 newString[j++] = 0; |
|
348 } |
|
349 } else if ( (byte1 & 0xF0) == 0xE0 ) { /* 3byte encoding */ |
|
350 byte2 = (unsigned char)string[++i]; |
|
351 byte3 = (unsigned char)string[++i]; |
|
352 if ( i+3 < length && byte1 == 0xED && (byte2 & 0xF0) == 0xA0 ) { |
|
353 /* See if this is a pair of 3byte encodings */ |
|
354 byte4 = (unsigned char)string[i+1]; |
|
355 byte5 = (unsigned char)string[i+2]; |
|
356 byte6 = (unsigned char)string[i+3]; |
|
357 if ( byte4 == 0xED && (byte5 & 0xF0) == 0xB0 ) { |
|
358 unsigned u21; |
|
359 |
|
360 /* Bits in: 11101101 1010xxxx 10xxxxxx */ |
|
361 /* Bits in: 11101101 1011xxxx 10xxxxxx */ |
|
362 i += 3; |
|
363 |
|
364 /* Reconstruct 21 bit code */ |
|
365 u21 = ((byte2 & 0x0F) + 1) << 16; |
|
366 u21 += (byte3 & 0x3F) << 10; |
|
367 u21 += (byte5 & 0x0F) << 6; |
|
368 u21 += (byte6 & 0x3F); |
|
369 |
|
370 /* Bits out: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ |
|
371 |
|
372 /* Convert to 4byte encoding */ |
|
373 newString[j++] = 0xF0 + ((u21 >> 18) & 0x07); |
|
374 newString[j++] = 0x80 + ((u21 >> 12) & 0x3F); |
|
375 newString[j++] = 0x80 + ((u21 >> 6) & 0x3F); |
|
376 newString[j++] = 0x80 + (u21 & 0x3F); |
|
377 continue; |
|
378 } |
|
379 } |
|
380 /* Normal 3byte encoding */ |
|
381 newString[j++] = byte1; |
|
382 newString[j++] = byte2; |
|
383 newString[j++] = byte3; |
|
384 } |
|
385 } |
|
386 UTF_ASSERT(i==length); |
|
387 UTF_ASSERT(j==newLength); |
|
388 newString[j] = 0; |
|
389 } |
|
390 |
|
391 /* ================================================================= */ |
|
392 |
|
393 #if 1 /* Test program */ |
|
394 |
|
/*
 * Convert any byte array into a printable string.
 *
 *   ui            UTF instance (not referenced by this routine)
 *   bytes         input bytes
 *   len           number of input bytes
 *   output        receives the printable text plus a 0 terminator
 *   outputMaxLen  capacity of output in bytes, terminator included
 *
 * Bytes up to 0x7f that are printable and not control characters are
 * copied, every other byte is written as the 4 characters \xNN (two
 * lowercase hex digits).
 *
 * Returns the length of the text (terminator not counted), or -1 if the
 * text plus its terminator does not fit in outputMaxLen. Nothing is
 * ever stored at or beyond output[outputMaxLen].
 */
static int
bytesToPrintable(struct UtfInst *ui, char *bytes, int len, char *output, int outputMaxLen)
{
    int outputLen;
    int i;

    UTF_ASSERT(bytes);
    UTF_ASSERT(len>=0);
    UTF_ASSERT(output);
    UTF_ASSERT(outputMaxLen>=0);

    outputLen = 0;
    for ( i=0; i<len ; i++ ) {
        unsigned byte;
        int      room;

        /* Go through unsigned char, a plain char may be negative and
         *   would otherwise turn into a value way above 0xFF.
         */
        byte = (unsigned char)bytes[i];
        room = outputMaxLen - outputLen;
        if ( byte <= 0x7f && isprint(byte) && !iscntrl(byte) ) {
            /* Need 1 character and the terminator */
            if ( room < 2 ) {
                return -1;
            }
            output[outputLen++] = (char)byte;
        } else {
            /* Need 4 characters and the terminator sprintf adds */
            if ( room < 5 ) {
                return -1;
            }
            (void)sprintf(output+outputLen,"\\x%02x",byte);
            outputLen += 4;
        }
    }
    /* Only possible to fail here when nothing was written at all */
    if ( outputLen >= outputMaxLen ) {
        return -1;
    }
    output[outputLen] = 0;
    return outputLen;
}
|
428 |
|
429 static void |
|
430 test(void) |
|
431 { |
|
432 static char *strings[] = { |
|
433 "characters", |
|
434 "abcdefghijklmnopqrstuvwxyz", |
|
435 "0123456789", |
|
436 "!@#$%^&*()_+=-{}[]:;", |
|
437 NULL }; |
|
438 int i; |
|
439 struct UtfInst *ui; |
|
440 |
|
441 ui = utfInitialize(NULL); |
|
442 |
|
443 i = 0; |
|
444 while ( strings[i] != NULL ) { |
|
445 char *str; |
|
446 #define MAX 1024 |
|
447 char buf0[MAX]; |
|
448 char buf1[MAX]; |
|
449 char buf2[MAX]; |
|
450 unsigned short buf3[MAX]; |
|
451 int len1; |
|
452 int len2; |
|
453 int len3; |
|
454 |
|
455 str = strings[i]; |
|
456 |
|
457 (void)bytesToPrintable(ui, str, (int)strlen(str), buf0, 1024); |
|
458 |
|
459 len1 = utf8FromPlatform(ui, str, (int)strlen(str), (jbyte*)buf1, 1024); |
|
460 |
|
461 UTF_ASSERT(len1==(int)strlen(str)); |
|
462 |
|
463 len3 = utf8ToUtf16(ui, (jbyte*)buf1, len1, (jchar*)buf3, 1024); |
|
464 |
|
465 UTF_ASSERT(len3==len1); |
|
466 |
|
467 len1 = utf16ToUtf8m(ui, (jchar*)buf3, len3, (jbyte*)buf1, 1024); |
|
468 |
|
469 UTF_ASSERT(len1==len3); |
|
470 UTF_ASSERT(strcmp(str, buf1) == 0); |
|
471 |
|
472 len2 = utf8ToPlatform(ui, (jbyte*)buf1, len1, buf2, 1024); |
|
473 |
|
474 UTF_ASSERT(len2==len1); |
|
475 UTF_ASSERT(strcmp(str, buf2) == 0); |
|
476 |
|
477 i++; |
|
478 } |
|
479 |
|
480 utfTerminate(ui, NULL); |
|
481 |
|
482 } |
|
483 |
|
/*
 * Entry point of the test program.
 * The command line is ignored; test() is run once and 0 is returned.
 */
int
main(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    test();
    return 0;
}
|
490 |
|
491 #endif |