/*
 * Copyright 2004-2005 Sun Microsystems, Inc. All Rights Reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Sun designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Sun in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
 * CA 95054 USA or visit www.sun.com if you need additional information or
 * have any questions.
 */

/* Misc functions for conversion of Unicode and UTF-8 and platform encoding */
+ − 27
+ − 28
#include <stdio.h>
+ − 29
#include <stddef.h>
+ − 30
#include <stdlib.h>
+ − 31
#include <stdarg.h>
+ − 32
#include <string.h>
+ − 33
#include <ctype.h>
+ − 34
+ − 35
#include "jni.h"
+ − 36
+ − 37
#include "utf.h"
+ − 38
+ − 39
/*
 * Fatal error reporter.
 *
 *   file     Name of the source file reporting the error.
 *   line     Line number within that file.
 *   message  Text describing the failure.
 *
 * Writes one line to stderr and then calls abort(); it never returns
 * to the caller.
 */
void
utfError(char *file, int line, char *message)
{
    (void)fprintf(stderr,
                  "UTF ERROR [\"%s\":%d]: %s\n",
                  file,
                  line,
                  message);
    abort();
}
+ − 48
+ − 49
/*
+ − 50
* Convert UTF-8 to UTF-16
+ − 51
* Returns length or -1 if output overflows.
+ − 52
*/
+ − 53
int JNICALL
+ − 54
utf8ToUtf16(struct UtfInst *ui, jbyte *utf8, int len, unsigned short *output, int outputMaxLen)
+ − 55
{
+ − 56
int outputLen;
+ − 57
int i;
+ − 58
+ − 59
UTF_ASSERT(utf8);
+ − 60
UTF_ASSERT(len>=0);
+ − 61
UTF_ASSERT(output);
+ − 62
UTF_ASSERT(outputMaxLen>0);
+ − 63
+ − 64
i = 0;
+ − 65
outputLen = 0;
+ − 66
while ( i<len ) {
+ − 67
unsigned code, x, y, z;
+ − 68
+ − 69
if ( outputLen >= outputMaxLen ) {
+ − 70
return -1;
+ − 71
}
+ − 72
x = (unsigned char)utf8[i++];
+ − 73
code = x;
+ − 74
if ( (x & 0xE0)==0xE0 ) {
+ − 75
y = (unsigned char)utf8[i++];
+ − 76
z = (unsigned char)utf8[i++];
+ − 77
code = ((x & 0xF)<<12) + ((y & 0x3F)<<6) + (z & 0x3F);
+ − 78
} else if ( (x & 0xC0)==0xC0 ) {
+ − 79
y = (unsigned char)utf8[i++];
+ − 80
code = ((x & 0x1F)<<6) + (y & 0x3F);
+ − 81
}
+ − 82
output[outputLen++] = code;
+ − 83
}
+ − 84
return outputLen;
+ − 85
}
+ − 86
+ − 87
/*
+ − 88
* Convert UTF-16 to UTF-8 Modified
+ − 89
* Returns length or -1 if output overflows.
+ − 90
*/
+ − 91
int JNICALL
+ − 92
utf16ToUtf8m(struct UtfInst *ui, unsigned short *utf16, int len, jbyte *output, int outputMaxLen)
+ − 93
{
+ − 94
int i;
+ − 95
int outputLen;
+ − 96
+ − 97
UTF_ASSERT(utf16);
+ − 98
UTF_ASSERT(len>=0);
+ − 99
UTF_ASSERT(output);
+ − 100
UTF_ASSERT(outputMaxLen>0);
+ − 101
+ − 102
outputLen = 0;
+ − 103
for (i = 0; i < len; i++) {
+ − 104
unsigned code;
+ − 105
+ − 106
code = utf16[i];
+ − 107
if ( code >= 0x0001 && code <= 0x007F ) {
+ − 108
output[outputLen++] = code;
+ − 109
} else if ( code == 0 || ( code >= 0x0080 && code <= 0x07FF ) ) {
+ − 110
output[outputLen++] = ((code>>6) & 0x1F) | 0xC0;
+ − 111
output[outputLen++] = (code & 0x3F) | 0x80;
+ − 112
} else if ( code >= 0x0800 && code <= 0xFFFF ) {
+ − 113
output[outputLen++] = ((code>>12) & 0x0F) | 0xE0;
+ − 114
output[outputLen++] = ((code>>6) & 0x3F) | 0x80;
+ − 115
output[outputLen++] = (code & 0x3F) | 0x80;
+ − 116
}
+ − 117
if ( outputLen > outputMaxLen ) {
+ − 118
return -1;
+ − 119
}
+ − 120
}
+ − 121
output[outputLen] = 0;
+ − 122
return outputLen;
+ − 123
}
+ − 124
+ − 125
int JNICALL
+ − 126
utf16ToUtf8s(struct UtfInst *ui, unsigned short *utf16, int len, jbyte *output, int outputMaxLen)
+ − 127
{
+ − 128
return -1; /* FIXUP */
+ − 129
}
+ − 130
+ − 131
/* Determine length of this Standard UTF-8 in Modified UTF-8.
+ − 132
* Validation is done of the basic UTF encoding rules, returns
+ − 133
* length (no change) when errors are detected in the UTF encoding.
+ − 134
*
+ − 135
* Note: Accepts Modified UTF-8 also, no verification on the
+ − 136
* correctness of Standard UTF-8 is done. e,g, 0xC080 input is ok.
+ − 137
*/
+ − 138
int JNICALL
+ − 139
utf8sToUtf8mLength(struct UtfInst *ui, jbyte *string, int length)
+ − 140
{
+ − 141
int newLength;
+ − 142
int i;
+ − 143
+ − 144
newLength = 0;
+ − 145
for ( i = 0 ; i < length ; i++ ) {
+ − 146
unsigned byte;
+ − 147
+ − 148
byte = (unsigned char)string[i];
+ − 149
if ( (byte & 0x80) == 0 ) { /* 1byte encoding */
+ − 150
newLength++;
+ − 151
if ( byte == 0 ) {
+ − 152
newLength++; /* We gain one byte in length on NULL bytes */
+ − 153
}
+ − 154
} else if ( (byte & 0xE0) == 0xC0 ) { /* 2byte encoding */
+ − 155
/* Check encoding of following bytes */
+ − 156
if ( (i+1) >= length || (string[i+1] & 0xC0) != 0x80 ) {
+ − 157
break; /* Error condition */
+ − 158
}
+ − 159
i++; /* Skip next byte */
+ − 160
newLength += 2;
+ − 161
} else if ( (byte & 0xF0) == 0xE0 ) { /* 3byte encoding */
+ − 162
/* Check encoding of following bytes */
+ − 163
if ( (i+2) >= length || (string[i+1] & 0xC0) != 0x80
+ − 164
|| (string[i+2] & 0xC0) != 0x80 ) {
+ − 165
break; /* Error condition */
+ − 166
}
+ − 167
i += 2; /* Skip next two bytes */
+ − 168
newLength += 3;
+ − 169
} else if ( (byte & 0xF8) == 0xF0 ) { /* 4byte encoding */
+ − 170
/* Check encoding of following bytes */
+ − 171
if ( (i+3) >= length || (string[i+1] & 0xC0) != 0x80
+ − 172
|| (string[i+2] & 0xC0) != 0x80
+ − 173
|| (string[i+3] & 0xC0) != 0x80 ) {
+ − 174
break; /* Error condition */
+ − 175
}
+ − 176
i += 3; /* Skip next 3 bytes */
+ − 177
newLength += 6; /* 4byte encoding turns into 2 3byte ones */
+ − 178
} else {
+ − 179
break; /* Error condition */
+ − 180
}
+ − 181
}
+ − 182
if ( i != length ) {
+ − 183
/* Error in finding new length, return old length so no conversion */
+ − 184
/* FIXUP: ERROR_MESSAGE? */
+ − 185
return length;
+ − 186
}
+ − 187
return newLength;
+ − 188
}
+ − 189
+ − 190
/* Convert Standard UTF-8 to Modified UTF-8.
+ − 191
* Assumes the UTF-8 encoding was validated by utf8mLength() above.
+ − 192
*
+ − 193
* Note: Accepts Modified UTF-8 also, no verification on the
+ − 194
* correctness of Standard UTF-8 is done. e,g, 0xC080 input is ok.
+ − 195
*/
+ − 196
void JNICALL
+ − 197
utf8sToUtf8m(struct UtfInst *ui, jbyte *string, int length, jbyte *newString, int newLength)
+ − 198
{
+ − 199
int i;
+ − 200
int j;
+ − 201
+ − 202
j = 0;
+ − 203
for ( i = 0 ; i < length ; i++ ) {
+ − 204
unsigned byte1;
+ − 205
+ − 206
byte1 = (unsigned char)string[i];
+ − 207
+ − 208
/* NULL bytes and bytes starting with 11110xxx are special */
+ − 209
if ( (byte1 & 0x80) == 0 ) { /* 1byte encoding */
+ − 210
if ( byte1 == 0 ) {
+ − 211
/* Bits out: 11000000 10000000 */
+ − 212
newString[j++] = (jbyte)0xC0;
+ − 213
newString[j++] = (jbyte)0x80;
+ − 214
} else {
+ − 215
/* Single byte */
+ − 216
newString[j++] = byte1;
+ − 217
}
+ − 218
} else if ( (byte1 & 0xE0) == 0xC0 ) { /* 2byte encoding */
+ − 219
newString[j++] = byte1;
+ − 220
newString[j++] = string[++i];
+ − 221
} else if ( (byte1 & 0xF0) == 0xE0 ) { /* 3byte encoding */
+ − 222
newString[j++] = byte1;
+ − 223
newString[j++] = string[++i];
+ − 224
newString[j++] = string[++i];
+ − 225
} else if ( (byte1 & 0xF8) == 0xF0 ) { /* 4byte encoding */
+ − 226
/* Beginning of 4byte encoding, turn into 2 3byte encodings */
+ − 227
unsigned byte2, byte3, byte4, u21;
+ − 228
+ − 229
/* Bits in: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
+ − 230
byte2 = (unsigned char)string[++i];
+ − 231
byte3 = (unsigned char)string[++i];
+ − 232
byte4 = (unsigned char)string[++i];
+ − 233
/* Reconstruct full 21bit value */
+ − 234
u21 = (byte1 & 0x07) << 18;
+ − 235
u21 += (byte2 & 0x3F) << 12;
+ − 236
u21 += (byte3 & 0x3F) << 6;
+ − 237
u21 += (byte4 & 0x3F);
+ − 238
/* Bits out: 11101101 1010xxxx 10xxxxxx */
+ − 239
newString[j++] = (jbyte)0xED;
+ − 240
newString[j++] = (jbyte)(0xA0 + (((u21 >> 16) - 1) & 0x0F));
+ − 241
newString[j++] = (jbyte)(0x80 + ((u21 >> 10) & 0x3F));
+ − 242
/* Bits out: 11101101 1011xxxx 10xxxxxx */
+ − 243
newString[j++] = (jbyte)0xED;
+ − 244
newString[j++] = (jbyte)(0xB0 + ((u21 >> 6) & 0x0F));
+ − 245
newString[j++] = byte4;
+ − 246
}
+ − 247
}
+ − 248
UTF_ASSERT(i==length);
+ − 249
UTF_ASSERT(j==newLength);
+ − 250
newString[j] = (jbyte)0;
+ − 251
}
+ − 252
+ − 253
/* Given a Modified UTF-8 string, calculate the Standard UTF-8 length.
+ − 254
* Basic validation of the UTF encoding rules is done, and length is
+ − 255
* returned (no change) when errors are detected.
+ − 256
*
+ − 257
* Note: No validation is made that this is indeed Modified UTF-8 coming in.
+ − 258
*
+ − 259
*/
+ − 260
int JNICALL
+ − 261
utf8mToUtf8sLength(struct UtfInst *ui, jbyte *string, int length)
+ − 262
{
+ − 263
int newLength;
+ − 264
int i;
+ − 265
+ − 266
newLength = 0;
+ − 267
for ( i = 0 ; i < length ; i++ ) {
+ − 268
unsigned byte1, byte2, byte3, byte4, byte5, byte6;
+ − 269
+ − 270
byte1 = (unsigned char)string[i];
+ − 271
if ( (byte1 & 0x80) == 0 ) { /* 1byte encoding */
+ − 272
newLength++;
+ − 273
} else if ( (byte1 & 0xE0) == 0xC0 ) { /* 2byte encoding */
+ − 274
/* Check encoding of following bytes */
+ − 275
if ( (i+1) >= length || (string[i+1] & 0xC0) != 0x80 ) {
+ − 276
break; /* Error condition */
+ − 277
}
+ − 278
byte2 = (unsigned char)string[++i];
+ − 279
if ( byte1 != 0xC0 || byte2 != 0x80 ) {
+ − 280
newLength += 2; /* Normal 2byte encoding, not 0xC080 */
+ − 281
} else {
+ − 282
newLength++; /* We will turn 0xC080 into 0 */
+ − 283
}
+ − 284
} else if ( (byte1 & 0xF0) == 0xE0 ) { /* 3byte encoding */
+ − 285
/* Check encoding of following bytes */
+ − 286
if ( (i+2) >= length || (string[i+1] & 0xC0) != 0x80
+ − 287
|| (string[i+2] & 0xC0) != 0x80 ) {
+ − 288
break; /* Error condition */
+ − 289
}
+ − 290
byte2 = (unsigned char)string[++i];
+ − 291
byte3 = (unsigned char)string[++i];
+ − 292
newLength += 3;
+ − 293
/* Possible process a second 3byte encoding */
+ − 294
if ( (i+3) < length && byte1 == 0xED && (byte2 & 0xF0) == 0xA0 ) {
+ − 295
/* See if this is a pair of 3byte encodings */
+ − 296
byte4 = (unsigned char)string[i+1];
+ − 297
byte5 = (unsigned char)string[i+2];
+ − 298
byte6 = (unsigned char)string[i+3];
+ − 299
if ( byte4 == 0xED && (byte5 & 0xF0) == 0xB0 ) {
+ − 300
/* Check encoding of 3rd byte */
+ − 301
if ( (byte6 & 0xC0) != 0x80 ) {
+ − 302
break; /* Error condition */
+ − 303
}
+ − 304
newLength++; /* New string will have 4byte encoding */
+ − 305
i += 3; /* Skip next 3 bytes */
+ − 306
}
+ − 307
}
+ − 308
} else {
+ − 309
break; /* Error condition */
+ − 310
}
+ − 311
}
+ − 312
if ( i != length ) {
+ − 313
/* Error in UTF encoding */
+ − 314
/* FIXUP: ERROR_MESSAGE()? */
+ − 315
return length;
+ − 316
}
+ − 317
return newLength;
+ − 318
}
+ − 319
+ − 320
/* Convert a Modified UTF-8 string into a Standard UTF-8 string
+ − 321
* It is assumed that this string has been validated in terms of the
+ − 322
* basic UTF encoding rules by utf8Length() above.
+ − 323
*
+ − 324
* Note: No validation is made that this is indeed Modified UTF-8 coming in.
+ − 325
*
+ − 326
*/
+ − 327
void JNICALL
+ − 328
utf8mToUtf8s(struct UtfInst *ui, jbyte *string, int length, jbyte *newString, int newLength)
+ − 329
{
+ − 330
int i;
+ − 331
int j;
+ − 332
+ − 333
j = 0;
+ − 334
for ( i = 0 ; i < length ; i++ ) {
+ − 335
unsigned byte1, byte2, byte3, byte4, byte5, byte6;
+ − 336
+ − 337
byte1 = (unsigned char)string[i];
+ − 338
if ( (byte1 & 0x80) == 0 ) { /* 1byte encoding */
+ − 339
/* Single byte */
+ − 340
newString[j++] = byte1;
+ − 341
} else if ( (byte1 & 0xE0) == 0xC0 ) { /* 2byte encoding */
+ − 342
byte2 = (unsigned char)string[++i];
+ − 343
if ( byte1 != 0xC0 || byte2 != 0x80 ) {
+ − 344
newString[j++] = byte1;
+ − 345
newString[j++] = byte2;
+ − 346
} else {
+ − 347
newString[j++] = 0;
+ − 348
}
+ − 349
} else if ( (byte1 & 0xF0) == 0xE0 ) { /* 3byte encoding */
+ − 350
byte2 = (unsigned char)string[++i];
+ − 351
byte3 = (unsigned char)string[++i];
+ − 352
if ( i+3 < length && byte1 == 0xED && (byte2 & 0xF0) == 0xA0 ) {
+ − 353
/* See if this is a pair of 3byte encodings */
+ − 354
byte4 = (unsigned char)string[i+1];
+ − 355
byte5 = (unsigned char)string[i+2];
+ − 356
byte6 = (unsigned char)string[i+3];
+ − 357
if ( byte4 == 0xED && (byte5 & 0xF0) == 0xB0 ) {
+ − 358
unsigned u21;
+ − 359
+ − 360
/* Bits in: 11101101 1010xxxx 10xxxxxx */
+ − 361
/* Bits in: 11101101 1011xxxx 10xxxxxx */
+ − 362
i += 3;
+ − 363
+ − 364
/* Reconstruct 21 bit code */
+ − 365
u21 = ((byte2 & 0x0F) + 1) << 16;
+ − 366
u21 += (byte3 & 0x3F) << 10;
+ − 367
u21 += (byte5 & 0x0F) << 6;
+ − 368
u21 += (byte6 & 0x3F);
+ − 369
+ − 370
/* Bits out: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
+ − 371
+ − 372
/* Convert to 4byte encoding */
+ − 373
newString[j++] = 0xF0 + ((u21 >> 18) & 0x07);
+ − 374
newString[j++] = 0x80 + ((u21 >> 12) & 0x3F);
+ − 375
newString[j++] = 0x80 + ((u21 >> 6) & 0x3F);
+ − 376
newString[j++] = 0x80 + (u21 & 0x3F);
+ − 377
continue;
+ − 378
}
+ − 379
}
+ − 380
/* Normal 3byte encoding */
+ − 381
newString[j++] = byte1;
+ − 382
newString[j++] = byte2;
+ − 383
newString[j++] = byte3;
+ − 384
}
+ − 385
}
+ − 386
UTF_ASSERT(i==length);
+ − 387
UTF_ASSERT(j==newLength);
+ − 388
newString[j] = 0;
+ − 389
}
+ − 390
+ − 391
/* ================================================================= */
+ − 392
+ − 393
#if 1 /* Test program */
+ − 394
+ − 395
/*
 * Convert any byte array into a printable string.
 *
 *   ui            UTF instance; not referenced by this routine.
 *   bytes         Input bytes.
 *   len           Number of input bytes.
 *   output        Receives the printable text plus a zero terminator.
 *   outputMaxLen  Capacity of output in bytes, terminator included.
 *
 * Printable 7bit characters are copied; every other byte is written as
 * the 4 characters \xNN (two lowercase hex digits).
 *
 * Returns the number of characters written (terminator not counted), or
 * -1 if the text plus the terminator does not fit in outputMaxLen bytes.
 */
static int
bytesToPrintable(struct UtfInst *ui, char *bytes, int len, char *output, int outputMaxLen)
{
    int outputLen;
    int i;

    UTF_ASSERT(bytes);
    UTF_ASSERT(len>=0);
    UTF_ASSERT(output);
    UTF_ASSERT(outputMaxLen>=0);

    outputLen = 0;
    for ( i=0; i<len ; i++ ) {
        unsigned byte;
        int      needed;

        /* Go through unsigned char so that bytes above 0x7f are not sign
         *   extended where char is signed; that keeps the value within
         *   0..0xff and the escape below at exactly 4 characters.
         */
        byte   = (unsigned char)bytes[i];
        needed = ( byte <= 0x7f && isprint(byte) && !iscntrl(byte) ) ? 1 : 4;

        /* Need room for this item and for the terminator */
        if ( needed >= (outputMaxLen - outputLen) ) {
            return -1;
        }
        if ( needed == 1 ) {
            output[outputLen++] = (char)byte;
        } else {
            /* Stores 4 characters and a zero, the space was checked above */
            (void)sprintf(output+outputLen,"\\x%02x",byte);
            outputLen += 4;
        }
    }
    /* Only reachable without room when len==0 and outputMaxLen==0 */
    if ( outputLen >= outputMaxLen ) {
        return -1;
    }
    output[outputLen] = 0;
    return outputLen;
}
+ − 428
+ − 429
static void
+ − 430
test(void)
+ − 431
{
+ − 432
static char *strings[] = {
+ − 433
"characters",
+ − 434
"abcdefghijklmnopqrstuvwxyz",
+ − 435
"0123456789",
+ − 436
"!@#$%^&*()_+=-{}[]:;",
+ − 437
NULL };
+ − 438
int i;
+ − 439
struct UtfInst *ui;
+ − 440
+ − 441
ui = utfInitialize(NULL);
+ − 442
+ − 443
i = 0;
+ − 444
while ( strings[i] != NULL ) {
+ − 445
char *str;
+ − 446
#define MAX 1024
+ − 447
char buf0[MAX];
+ − 448
char buf1[MAX];
+ − 449
char buf2[MAX];
+ − 450
unsigned short buf3[MAX];
+ − 451
int len1;
+ − 452
int len2;
+ − 453
int len3;
+ − 454
+ − 455
str = strings[i];
+ − 456
+ − 457
(void)bytesToPrintable(ui, str, (int)strlen(str), buf0, 1024);
+ − 458
+ − 459
len1 = utf8FromPlatform(ui, str, (int)strlen(str), (jbyte*)buf1, 1024);
+ − 460
+ − 461
UTF_ASSERT(len1==(int)strlen(str));
+ − 462
+ − 463
len3 = utf8ToUtf16(ui, (jbyte*)buf1, len1, (jchar*)buf3, 1024);
+ − 464
+ − 465
UTF_ASSERT(len3==len1);
+ − 466
+ − 467
len1 = utf16ToUtf8m(ui, (jchar*)buf3, len3, (jbyte*)buf1, 1024);
+ − 468
+ − 469
UTF_ASSERT(len1==len3);
+ − 470
UTF_ASSERT(strcmp(str, buf1) == 0);
+ − 471
+ − 472
len2 = utf8ToPlatform(ui, (jbyte*)buf1, len1, buf2, 1024);
+ − 473
+ − 474
UTF_ASSERT(len2==len1);
+ − 475
UTF_ASSERT(strcmp(str, buf2) == 0);
+ − 476
+ − 477
i++;
+ − 478
}
+ − 479
+ − 480
utfTerminate(ui, NULL);
+ − 481
+ − 482
}
+ − 483
+ − 484
/*
 * Entry point of the test program: runs test() once and returns 0.
 * The command line arguments are ignored.
 */
int
main(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    test();

    return 0;
}
+ − 490
+ − 491
#endif