1 /* |
|
2 * Copyright (c) 2004, 2013, Oracle and/or its affiliates. All rights reserved. |
|
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
|
4 * |
|
5 * This code is free software; you can redistribute it and/or modify it |
|
6 * under the terms of the GNU General Public License version 2 only, as |
|
7 * published by the Free Software Foundation. Oracle designates this |
|
8 * particular file as subject to the "Classpath" exception as provided |
|
9 * by Oracle in the LICENSE file that accompanied this code. |
|
10 * |
|
11 * This code is distributed in the hope that it will be useful, but WITHOUT |
|
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
|
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
|
14 * version 2 for more details (a copy is included in the LICENSE file that |
|
15 * accompanied this code). |
|
16 * |
|
17 * You should have received a copy of the GNU General Public License version |
|
18 * 2 along with this work; if not, write to the Free Software Foundation, |
|
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
|
20 * |
|
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
|
22 * or visit www.oracle.com if you need additional information or have any |
|
23 * questions. |
|
24 */ |
|
25 |
|
26 /* Misc functions for conversion of Unicode and UTF-8 and platform encoding */ |
|
27 |
|
28 #include <stdio.h> |
|
29 #include <stddef.h> |
|
30 #include <stdlib.h> |
|
31 #include <stdarg.h> |
|
32 #include <string.h> |
|
33 #include <ctype.h> |
|
34 |
|
35 #include "jni.h" |
|
36 |
|
37 #include "utf.h" |
|
38 |
|
/*
 * Fatal error reporter.
 * Writes the reporting source file name, line number and message to
 * stderr in the form  UTF ERROR ["file":line]: message  and then
 * terminates the process with abort(). This function does not return.
 */
void
utfError(char *file, int line, char *message)
{
    FILE *sink = stderr;

    (void)fprintf(sink, "UTF ERROR [\"%s\":%d]: %s\n", file, line, message);
    abort();
}
|
48 |
|
49 /* |
|
50 * Convert UTF-8 to UTF-16 |
|
51 * Returns length or -1 if output overflows. |
|
52 */ |
|
53 int JNICALL |
|
54 utf8ToUtf16(struct UtfInst *ui, jbyte *utf8, int len, unsigned short *output, int outputMaxLen) |
|
55 { |
|
56 int outputLen; |
|
57 int i; |
|
58 |
|
59 UTF_ASSERT(utf8); |
|
60 UTF_ASSERT(len>=0); |
|
61 UTF_ASSERT(output); |
|
62 UTF_ASSERT(outputMaxLen>0); |
|
63 |
|
64 i = 0; |
|
65 outputLen = 0; |
|
66 while ( i<len ) { |
|
67 unsigned code, x, y, z; |
|
68 |
|
69 if ( outputLen >= outputMaxLen ) { |
|
70 return -1; |
|
71 } |
|
72 x = (unsigned char)utf8[i++]; |
|
73 code = x; |
|
74 if ( (x & 0xE0)==0xE0 ) { |
|
75 y = (unsigned char)utf8[i++]; |
|
76 z = (unsigned char)utf8[i++]; |
|
77 code = ((x & 0xF)<<12) + ((y & 0x3F)<<6) + (z & 0x3F); |
|
78 } else if ( (x & 0xC0)==0xC0 ) { |
|
79 y = (unsigned char)utf8[i++]; |
|
80 code = ((x & 0x1F)<<6) + (y & 0x3F); |
|
81 } |
|
82 output[outputLen++] = code; |
|
83 } |
|
84 return outputLen; |
|
85 } |
|
86 |
|
87 /* |
|
88 * Convert UTF-16 to UTF-8 Modified |
|
89 * Returns length or -1 if output overflows. |
|
90 */ |
|
91 int JNICALL |
|
92 utf16ToUtf8m(struct UtfInst *ui, unsigned short *utf16, int len, jbyte *output, int outputMaxLen) |
|
93 { |
|
94 int i; |
|
95 int outputLen; |
|
96 |
|
97 UTF_ASSERT(utf16); |
|
98 UTF_ASSERT(len>=0); |
|
99 UTF_ASSERT(output); |
|
100 UTF_ASSERT(outputMaxLen>0); |
|
101 |
|
102 outputLen = 0; |
|
103 for (i = 0; i < len; i++) { |
|
104 unsigned code; |
|
105 |
|
106 code = utf16[i]; |
|
107 if ( code >= 0x0001 && code <= 0x007F ) { |
|
108 if ( outputLen + 1 >= outputMaxLen ) { |
|
109 return -1; |
|
110 } |
|
111 output[outputLen++] = code; |
|
112 } else if ( code == 0 || ( code >= 0x0080 && code <= 0x07FF ) ) { |
|
113 if ( outputLen + 2 >= outputMaxLen ) { |
|
114 return -1; |
|
115 } |
|
116 output[outputLen++] = ((code>>6) & 0x1F) | 0xC0; |
|
117 output[outputLen++] = (code & 0x3F) | 0x80; |
|
118 } else if ( code >= 0x0800 && code <= 0xFFFF ) { |
|
119 if ( outputLen + 3 >= outputMaxLen ) { |
|
120 return -1; |
|
121 } |
|
122 output[outputLen++] = ((code>>12) & 0x0F) | 0xE0; |
|
123 output[outputLen++] = ((code>>6) & 0x3F) | 0x80; |
|
124 output[outputLen++] = (code & 0x3F) | 0x80; |
|
125 } |
|
126 } |
|
127 output[outputLen] = 0; |
|
128 return outputLen; |
|
129 } |
|
130 |
|
131 int JNICALL |
|
132 utf16ToUtf8s(struct UtfInst *ui, unsigned short *utf16, int len, jbyte *output, int outputMaxLen) |
|
133 { |
|
134 return -1; /* FIXUP */ |
|
135 } |
|
136 |
|
137 /* Determine length of this Standard UTF-8 in Modified UTF-8. |
|
138 * Validation is done of the basic UTF encoding rules, returns |
|
139 * length (no change) when errors are detected in the UTF encoding. |
|
140 * |
|
141 * Note: Accepts Modified UTF-8 also, no verification on the |
|
142 * correctness of Standard UTF-8 is done. e,g, 0xC080 input is ok. |
|
143 */ |
|
144 int JNICALL |
|
145 utf8sToUtf8mLength(struct UtfInst *ui, jbyte *string, int length) |
|
146 { |
|
147 int newLength; |
|
148 int i; |
|
149 |
|
150 newLength = 0; |
|
151 for ( i = 0 ; i < length ; i++ ) { |
|
152 unsigned byte; |
|
153 |
|
154 byte = (unsigned char)string[i]; |
|
155 if ( (byte & 0x80) == 0 ) { /* 1byte encoding */ |
|
156 newLength++; |
|
157 if ( byte == 0 ) { |
|
158 newLength++; /* We gain one byte in length on NULL bytes */ |
|
159 } |
|
160 } else if ( (byte & 0xE0) == 0xC0 ) { /* 2byte encoding */ |
|
161 /* Check encoding of following bytes */ |
|
162 if ( (i+1) >= length || (string[i+1] & 0xC0) != 0x80 ) { |
|
163 break; /* Error condition */ |
|
164 } |
|
165 i++; /* Skip next byte */ |
|
166 newLength += 2; |
|
167 } else if ( (byte & 0xF0) == 0xE0 ) { /* 3byte encoding */ |
|
168 /* Check encoding of following bytes */ |
|
169 if ( (i+2) >= length || (string[i+1] & 0xC0) != 0x80 |
|
170 || (string[i+2] & 0xC0) != 0x80 ) { |
|
171 break; /* Error condition */ |
|
172 } |
|
173 i += 2; /* Skip next two bytes */ |
|
174 newLength += 3; |
|
175 } else if ( (byte & 0xF8) == 0xF0 ) { /* 4byte encoding */ |
|
176 /* Check encoding of following bytes */ |
|
177 if ( (i+3) >= length || (string[i+1] & 0xC0) != 0x80 |
|
178 || (string[i+2] & 0xC0) != 0x80 |
|
179 || (string[i+3] & 0xC0) != 0x80 ) { |
|
180 break; /* Error condition */ |
|
181 } |
|
182 i += 3; /* Skip next 3 bytes */ |
|
183 newLength += 6; /* 4byte encoding turns into 2 3byte ones */ |
|
184 } else { |
|
185 break; /* Error condition */ |
|
186 } |
|
187 } |
|
188 if ( i != length ) { |
|
189 /* Error in finding new length, return old length so no conversion */ |
|
190 /* FIXUP: ERROR_MESSAGE? */ |
|
191 return length; |
|
192 } |
|
193 return newLength; |
|
194 } |
|
195 |
|
196 /* Convert Standard UTF-8 to Modified UTF-8. |
|
197 * Assumes the UTF-8 encoding was validated by utf8mLength() above. |
|
198 * |
|
199 * Note: Accepts Modified UTF-8 also, no verification on the |
|
200 * correctness of Standard UTF-8 is done. e,g, 0xC080 input is ok. |
|
201 */ |
|
202 void JNICALL |
|
203 utf8sToUtf8m(struct UtfInst *ui, jbyte *string, int length, jbyte *newString, int newLength) |
|
204 { |
|
205 int i; |
|
206 int j; |
|
207 |
|
208 j = 0; |
|
209 for ( i = 0 ; i < length ; i++ ) { |
|
210 unsigned byte1; |
|
211 |
|
212 byte1 = (unsigned char)string[i]; |
|
213 |
|
214 /* NULL bytes and bytes starting with 11110xxx are special */ |
|
215 if ( (byte1 & 0x80) == 0 ) { /* 1byte encoding */ |
|
216 if ( byte1 == 0 ) { |
|
217 /* Bits out: 11000000 10000000 */ |
|
218 newString[j++] = (jbyte)0xC0; |
|
219 newString[j++] = (jbyte)0x80; |
|
220 } else { |
|
221 /* Single byte */ |
|
222 newString[j++] = byte1; |
|
223 } |
|
224 } else if ( (byte1 & 0xE0) == 0xC0 ) { /* 2byte encoding */ |
|
225 newString[j++] = byte1; |
|
226 newString[j++] = string[++i]; |
|
227 } else if ( (byte1 & 0xF0) == 0xE0 ) { /* 3byte encoding */ |
|
228 newString[j++] = byte1; |
|
229 newString[j++] = string[++i]; |
|
230 newString[j++] = string[++i]; |
|
231 } else if ( (byte1 & 0xF8) == 0xF0 ) { /* 4byte encoding */ |
|
232 /* Beginning of 4byte encoding, turn into 2 3byte encodings */ |
|
233 unsigned byte2, byte3, byte4, u21; |
|
234 |
|
235 /* Bits in: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ |
|
236 byte2 = (unsigned char)string[++i]; |
|
237 byte3 = (unsigned char)string[++i]; |
|
238 byte4 = (unsigned char)string[++i]; |
|
239 /* Reconstruct full 21bit value */ |
|
240 u21 = (byte1 & 0x07) << 18; |
|
241 u21 += (byte2 & 0x3F) << 12; |
|
242 u21 += (byte3 & 0x3F) << 6; |
|
243 u21 += (byte4 & 0x3F); |
|
244 /* Bits out: 11101101 1010xxxx 10xxxxxx */ |
|
245 newString[j++] = (jbyte)0xED; |
|
246 newString[j++] = (jbyte)(0xA0 + (((u21 >> 16) - 1) & 0x0F)); |
|
247 newString[j++] = (jbyte)(0x80 + ((u21 >> 10) & 0x3F)); |
|
248 /* Bits out: 11101101 1011xxxx 10xxxxxx */ |
|
249 newString[j++] = (jbyte)0xED; |
|
250 newString[j++] = (jbyte)(0xB0 + ((u21 >> 6) & 0x0F)); |
|
251 newString[j++] = byte4; |
|
252 } |
|
253 } |
|
254 UTF_ASSERT(i==length); |
|
255 UTF_ASSERT(j==newLength); |
|
256 newString[j] = (jbyte)0; |
|
257 } |
|
258 |
|
259 /* Given a Modified UTF-8 string, calculate the Standard UTF-8 length. |
|
260 * Basic validation of the UTF encoding rules is done, and length is |
|
261 * returned (no change) when errors are detected. |
|
262 * |
|
263 * Note: No validation is made that this is indeed Modified UTF-8 coming in. |
|
264 * |
|
265 */ |
|
266 int JNICALL |
|
267 utf8mToUtf8sLength(struct UtfInst *ui, jbyte *string, int length) |
|
268 { |
|
269 int newLength; |
|
270 int i; |
|
271 |
|
272 newLength = 0; |
|
273 for ( i = 0 ; i < length ; i++ ) { |
|
274 unsigned byte1, byte2, byte3, byte4, byte5, byte6; |
|
275 |
|
276 byte1 = (unsigned char)string[i]; |
|
277 if ( (byte1 & 0x80) == 0 ) { /* 1byte encoding */ |
|
278 newLength++; |
|
279 } else if ( (byte1 & 0xE0) == 0xC0 ) { /* 2byte encoding */ |
|
280 /* Check encoding of following bytes */ |
|
281 if ( (i+1) >= length || (string[i+1] & 0xC0) != 0x80 ) { |
|
282 break; /* Error condition */ |
|
283 } |
|
284 byte2 = (unsigned char)string[++i]; |
|
285 if ( byte1 != 0xC0 || byte2 != 0x80 ) { |
|
286 newLength += 2; /* Normal 2byte encoding, not 0xC080 */ |
|
287 } else { |
|
288 newLength++; /* We will turn 0xC080 into 0 */ |
|
289 } |
|
290 } else if ( (byte1 & 0xF0) == 0xE0 ) { /* 3byte encoding */ |
|
291 /* Check encoding of following bytes */ |
|
292 if ( (i+2) >= length || (string[i+1] & 0xC0) != 0x80 |
|
293 || (string[i+2] & 0xC0) != 0x80 ) { |
|
294 break; /* Error condition */ |
|
295 } |
|
296 byte2 = (unsigned char)string[++i]; |
|
297 byte3 = (unsigned char)string[++i]; |
|
298 newLength += 3; |
|
299 /* Possible process a second 3byte encoding */ |
|
300 if ( (i+3) < length && byte1 == 0xED && (byte2 & 0xF0) == 0xA0 ) { |
|
301 /* See if this is a pair of 3byte encodings */ |
|
302 byte4 = (unsigned char)string[i+1]; |
|
303 byte5 = (unsigned char)string[i+2]; |
|
304 byte6 = (unsigned char)string[i+3]; |
|
305 if ( byte4 == 0xED && (byte5 & 0xF0) == 0xB0 ) { |
|
306 /* Check encoding of 3rd byte */ |
|
307 if ( (byte6 & 0xC0) != 0x80 ) { |
|
308 break; /* Error condition */ |
|
309 } |
|
310 newLength++; /* New string will have 4byte encoding */ |
|
311 i += 3; /* Skip next 3 bytes */ |
|
312 } |
|
313 } |
|
314 } else { |
|
315 break; /* Error condition */ |
|
316 } |
|
317 } |
|
318 if ( i != length ) { |
|
319 /* Error in UTF encoding */ |
|
320 /* FIXUP: ERROR_MESSAGE()? */ |
|
321 return length; |
|
322 } |
|
323 return newLength; |
|
324 } |
|
325 |
|
326 /* Convert a Modified UTF-8 string into a Standard UTF-8 string |
|
327 * It is assumed that this string has been validated in terms of the |
|
328 * basic UTF encoding rules by utf8Length() above. |
|
329 * |
|
330 * Note: No validation is made that this is indeed Modified UTF-8 coming in. |
|
331 * |
|
332 */ |
|
333 void JNICALL |
|
334 utf8mToUtf8s(struct UtfInst *ui, jbyte *string, int length, jbyte *newString, int newLength) |
|
335 { |
|
336 int i; |
|
337 int j; |
|
338 |
|
339 j = 0; |
|
340 for ( i = 0 ; i < length ; i++ ) { |
|
341 unsigned byte1, byte2, byte3, byte4, byte5, byte6; |
|
342 |
|
343 byte1 = (unsigned char)string[i]; |
|
344 if ( (byte1 & 0x80) == 0 ) { /* 1byte encoding */ |
|
345 /* Single byte */ |
|
346 newString[j++] = byte1; |
|
347 } else if ( (byte1 & 0xE0) == 0xC0 ) { /* 2byte encoding */ |
|
348 byte2 = (unsigned char)string[++i]; |
|
349 if ( byte1 != 0xC0 || byte2 != 0x80 ) { |
|
350 newString[j++] = byte1; |
|
351 newString[j++] = byte2; |
|
352 } else { |
|
353 newString[j++] = 0; |
|
354 } |
|
355 } else if ( (byte1 & 0xF0) == 0xE0 ) { /* 3byte encoding */ |
|
356 byte2 = (unsigned char)string[++i]; |
|
357 byte3 = (unsigned char)string[++i]; |
|
358 if ( i+3 < length && byte1 == 0xED && (byte2 & 0xF0) == 0xA0 ) { |
|
359 /* See if this is a pair of 3byte encodings */ |
|
360 byte4 = (unsigned char)string[i+1]; |
|
361 byte5 = (unsigned char)string[i+2]; |
|
362 byte6 = (unsigned char)string[i+3]; |
|
363 if ( byte4 == 0xED && (byte5 & 0xF0) == 0xB0 ) { |
|
364 unsigned u21; |
|
365 |
|
366 /* Bits in: 11101101 1010xxxx 10xxxxxx */ |
|
367 /* Bits in: 11101101 1011xxxx 10xxxxxx */ |
|
368 i += 3; |
|
369 |
|
370 /* Reconstruct 21 bit code */ |
|
371 u21 = ((byte2 & 0x0F) + 1) << 16; |
|
372 u21 += (byte3 & 0x3F) << 10; |
|
373 u21 += (byte5 & 0x0F) << 6; |
|
374 u21 += (byte6 & 0x3F); |
|
375 |
|
376 /* Bits out: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ |
|
377 |
|
378 /* Convert to 4byte encoding */ |
|
379 newString[j++] = 0xF0 + ((u21 >> 18) & 0x07); |
|
380 newString[j++] = 0x80 + ((u21 >> 12) & 0x3F); |
|
381 newString[j++] = 0x80 + ((u21 >> 6) & 0x3F); |
|
382 newString[j++] = 0x80 + (u21 & 0x3F); |
|
383 continue; |
|
384 } |
|
385 } |
|
386 /* Normal 3byte encoding */ |
|
387 newString[j++] = byte1; |
|
388 newString[j++] = byte2; |
|
389 newString[j++] = byte3; |
|
390 } |
|
391 } |
|
392 UTF_ASSERT(i==length); |
|
393 UTF_ASSERT(j==newLength); |
|
394 newString[j] = 0; |
|
395 } |
|
396 |
|
397 /* ================================================================= */ |
|
398 |
|
399 #ifdef COMPILE_WITH_UTF_TEST /* Test program */ |
|
400 |
|
/*
 * Convert any byte array into a printable string.
 *
 * Printable 7 bit characters are copied as they are; every other byte is
 * written as a 4 character escape of the form \xNN (two lowercase hex
 * digits). The result is always terminated with a 0 byte, for which one
 * byte of the output capacity is kept in reserve.
 *
 *   ui            Not used by this function.
 *   bytes         Input bytes (must not be NULL).
 *   len           Number of input bytes (>= 0).
 *   output        Receives the printable text (must not be NULL).
 *   outputMaxLen  Capacity of output in bytes, terminator included.
 *
 * Returns length (terminator not counted) or -1 if output overflows,
 * which includes an outputMaxLen too small to hold the terminator.
 */
static int
bytesToPrintable(struct UtfInst *ui, char *bytes, int len, char *output, int outputMaxLen)
{
    int outputLen;
    int i;

    UTF_ASSERT(bytes);
    UTF_ASSERT(len>=0);
    UTF_ASSERT(output);
    UTF_ASSERT(outputMaxLen>=0);

    if ( outputMaxLen < 1 ) {
        return -1; /* No room even for the terminating 0 */
    }

    outputLen = 0;
    for ( i=0; i<len ; i++ ) {
        unsigned byte;

        /* Go through unsigned char: where plain char is signed, a byte
         * >= 0x80 would otherwise sign-extend to 0xFFFFFFxx and be
         * formatted as 10 characters instead of the 4 accounted for.
         */
        byte = (unsigned char)bytes[i];
        if ( byte <= 0x7f && isprint(byte) && !iscntrl(byte) ) {
            if ( outputLen + 1 >= outputMaxLen ) {
                return -1;
            }
            output[outputLen++] = (char)byte;
        } else {
            if ( outputLen + 4 >= outputMaxLen ) {
                return -1;
            }
            /* byte <= 0xFF, so exactly 4 characters plus the 0 are written,
             * all of which fit thanks to the check above.
             */
            (void)sprintf(output+outputLen,"\\x%02x",byte);
            outputLen += 4;
        }
    }
    output[outputLen] = 0;
    return outputLen;
}
|
437 |
|
438 static void |
|
439 test(void) |
|
440 { |
|
441 static char *strings[] = { |
|
442 "characters", |
|
443 "abcdefghijklmnopqrstuvwxyz", |
|
444 "0123456789", |
|
445 "!@#$%^&*()_+=-{}[]:;", |
|
446 NULL }; |
|
447 int i; |
|
448 struct UtfInst *ui; |
|
449 |
|
450 ui = utfInitialize(NULL); |
|
451 |
|
452 i = 0; |
|
453 while ( strings[i] != NULL ) { |
|
454 char *str; |
|
455 #define MAX 1024 |
|
456 char buf0[MAX]; |
|
457 char buf1[MAX]; |
|
458 char buf2[MAX]; |
|
459 unsigned short buf3[MAX]; |
|
460 int len1; |
|
461 int len2; |
|
462 int len3; |
|
463 |
|
464 str = strings[i]; |
|
465 |
|
466 (void)bytesToPrintable(ui, str, (int)strlen(str), buf0, 1024); |
|
467 |
|
468 len1 = utf8FromPlatform(ui, str, (int)strlen(str), (jbyte*)buf1, 1024); |
|
469 |
|
470 UTF_ASSERT(len1==(int)strlen(str)); |
|
471 |
|
472 len3 = utf8ToUtf16(ui, (jbyte*)buf1, len1, (jchar*)buf3, 1024); |
|
473 |
|
474 UTF_ASSERT(len3==len1); |
|
475 |
|
476 len1 = utf16ToUtf8m(ui, (jchar*)buf3, len3, (jbyte*)buf1, 1024); |
|
477 |
|
478 UTF_ASSERT(len1==len3); |
|
479 UTF_ASSERT(strcmp(str, buf1) == 0); |
|
480 |
|
481 len2 = utf8ToPlatform(ui, (jbyte*)buf1, len1, buf2, 1024); |
|
482 |
|
483 UTF_ASSERT(len2==len1); |
|
484 UTF_ASSERT(strcmp(str, buf2) == 0); |
|
485 |
|
486 i++; |
|
487 } |
|
488 |
|
489 utfTerminate(ui, NULL); |
|
490 |
|
491 } |
|
492 |
|
/*
 * Entry point of the stand-alone test program: runs test() once and
 * exits with status 0. The command line is ignored.
 */
int
main(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    test();
    return 0;
}
|
499 |
|
500 #endif |
|