jdk/src/share/npt/utf.c
changeset 2 90ce3da70b43
child 5506 202f599c92aa
equal deleted inserted replaced
0:fd16c54261b3 2:90ce3da70b43
       
     1 /*
       
     2  * Copyright 2004-2005 Sun Microsystems, Inc.  All Rights Reserved.
       
     3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
       
     4  *
       
     5  * This code is free software; you can redistribute it and/or modify it
       
     6  * under the terms of the GNU General Public License version 2 only, as
       
     7  * published by the Free Software Foundation.  Sun designates this
       
     8  * particular file as subject to the "Classpath" exception as provided
       
     9  * by Sun in the LICENSE file that accompanied this code.
       
    10  *
       
    11  * This code is distributed in the hope that it will be useful, but WITHOUT
       
    12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
       
    13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
       
    14  * version 2 for more details (a copy is included in the LICENSE file that
       
    15  * accompanied this code).
       
    16  *
       
    17  * You should have received a copy of the GNU General Public License version
       
    18  * 2 along with this work; if not, write to the Free Software Foundation,
       
    19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
       
    20  *
       
    21  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
       
    22  * CA 95054 USA or visit www.sun.com if you need additional information or
       
    23  * have any questions.
       
    24  */
       
    25 
       
    26 /* Misc functions for conversion of Unicode and UTF-8 and platform encoding */
       
    27 
       
    28 #include <stdio.h>
       
    29 #include <stddef.h>
       
    30 #include <stdlib.h>
       
    31 #include <stdarg.h>
       
    32 #include <string.h>
       
    33 #include <ctype.h>
       
    34 
       
    35 #include "jni.h"
       
    36 
       
    37 #include "utf.h"
       
    38 
       
    39 /*
       
    40  * Error handler
       
    41  */
       
    42 void
       
    43 utfError(char *file, int line, char *message)
       
    44 {
       
    45     (void)fprintf(stderr, "UTF ERROR [\"%s\":%d]: %s\n", file, line, message);
       
    46     abort();
       
    47 }
       
    48 
       
    49 /*
       
    50  * Convert UTF-8 to UTF-16
       
    51  *    Returns length or -1 if output overflows.
       
    52  */
       
    53 int JNICALL
       
    54 utf8ToUtf16(struct UtfInst *ui, jbyte *utf8, int len, unsigned short *output, int outputMaxLen)
       
    55 {
       
    56     int outputLen;
       
    57     int i;
       
    58 
       
    59     UTF_ASSERT(utf8);
       
    60     UTF_ASSERT(len>=0);
       
    61     UTF_ASSERT(output);
       
    62     UTF_ASSERT(outputMaxLen>0);
       
    63 
       
    64     i = 0;
       
    65     outputLen = 0;
       
    66     while ( i<len ) {
       
    67         unsigned code, x, y, z;
       
    68 
       
    69         if ( outputLen >= outputMaxLen ) {
       
    70             return -1;
       
    71         }
       
    72         x = (unsigned char)utf8[i++];
       
    73         code = x;
       
    74         if ( (x & 0xE0)==0xE0 ) {
       
    75             y = (unsigned char)utf8[i++];
       
    76             z = (unsigned char)utf8[i++];
       
    77             code = ((x & 0xF)<<12) + ((y & 0x3F)<<6) + (z & 0x3F);
       
    78         } else if ( (x & 0xC0)==0xC0 ) {
       
    79             y = (unsigned char)utf8[i++];
       
    80             code = ((x & 0x1F)<<6) + (y & 0x3F);
       
    81         }
       
    82         output[outputLen++] = code;
       
    83     }
       
    84     return outputLen;
       
    85 }
       
    86 
       
    87 /*
       
    88  * Convert UTF-16 to UTF-8 Modified
       
    89  *    Returns length or -1 if output overflows.
       
    90  */
       
    91 int JNICALL
       
    92 utf16ToUtf8m(struct UtfInst *ui, unsigned short *utf16, int len, jbyte *output, int outputMaxLen)
       
    93 {
       
    94     int i;
       
    95     int outputLen;
       
    96 
       
    97     UTF_ASSERT(utf16);
       
    98     UTF_ASSERT(len>=0);
       
    99     UTF_ASSERT(output);
       
   100     UTF_ASSERT(outputMaxLen>0);
       
   101 
       
   102     outputLen = 0;
       
   103     for (i = 0; i < len; i++) {
       
   104         unsigned code;
       
   105 
       
   106         code = utf16[i];
       
   107         if ( code >= 0x0001 && code <= 0x007F ) {
       
   108             output[outputLen++] = code;
       
   109         } else if ( code == 0 || ( code >= 0x0080 && code <= 0x07FF ) ) {
       
   110             output[outputLen++] = ((code>>6) & 0x1F) | 0xC0;
       
   111             output[outputLen++] = (code & 0x3F) | 0x80;
       
   112         } else if ( code >= 0x0800 && code <= 0xFFFF ) {
       
   113             output[outputLen++] = ((code>>12) & 0x0F) | 0xE0;
       
   114             output[outputLen++] = ((code>>6) & 0x3F) | 0x80;
       
   115             output[outputLen++] = (code & 0x3F) | 0x80;
       
   116         }
       
   117         if ( outputLen > outputMaxLen ) {
       
   118             return -1;
       
   119         }
       
   120     }
       
   121     output[outputLen] = 0;
       
   122     return outputLen;
       
   123 }
       
   124 
       
   125 int JNICALL
       
   126 utf16ToUtf8s(struct UtfInst *ui, unsigned short *utf16, int len, jbyte *output, int outputMaxLen)
       
   127 {
       
   128     return -1; /* FIXUP */
       
   129 }
       
   130 
       
   131 /* Determine length of this Standard UTF-8 in Modified UTF-8.
       
   132  *    Validation is done of the basic UTF encoding rules, returns
       
   133  *    length (no change) when errors are detected in the UTF encoding.
       
   134  *
       
   135  *    Note: Accepts Modified UTF-8 also, no verification on the
       
   136  *          correctness of Standard UTF-8 is done. e,g, 0xC080 input is ok.
       
   137  */
       
   138 int JNICALL
       
   139 utf8sToUtf8mLength(struct UtfInst *ui, jbyte *string, int length)
       
   140 {
       
   141     int newLength;
       
   142     int i;
       
   143 
       
   144     newLength = 0;
       
   145     for ( i = 0 ; i < length ; i++ ) {
       
   146         unsigned byte;
       
   147 
       
   148         byte = (unsigned char)string[i];
       
   149         if ( (byte & 0x80) == 0 ) { /* 1byte encoding */
       
   150             newLength++;
       
   151             if ( byte == 0 ) {
       
   152                 newLength++; /* We gain one byte in length on NULL bytes */
       
   153             }
       
   154         } else if ( (byte & 0xE0) == 0xC0 ) { /* 2byte encoding */
       
   155             /* Check encoding of following bytes */
       
   156             if ( (i+1) >= length || (string[i+1] & 0xC0) != 0x80 ) {
       
   157                 break; /* Error condition */
       
   158             }
       
   159             i++; /* Skip next byte */
       
   160             newLength += 2;
       
   161         } else if ( (byte & 0xF0) == 0xE0 ) { /* 3byte encoding */
       
   162             /* Check encoding of following bytes */
       
   163             if ( (i+2) >= length || (string[i+1] & 0xC0) != 0x80
       
   164                                  || (string[i+2] & 0xC0) != 0x80 ) {
       
   165                 break; /* Error condition */
       
   166             }
       
   167             i += 2; /* Skip next two bytes */
       
   168             newLength += 3;
       
   169         } else if ( (byte & 0xF8) == 0xF0 ) { /* 4byte encoding */
       
   170             /* Check encoding of following bytes */
       
   171             if ( (i+3) >= length || (string[i+1] & 0xC0) != 0x80
       
   172                                  || (string[i+2] & 0xC0) != 0x80
       
   173                                  || (string[i+3] & 0xC0) != 0x80 ) {
       
   174                 break; /* Error condition */
       
   175             }
       
   176             i += 3; /* Skip next 3 bytes */
       
   177             newLength += 6; /* 4byte encoding turns into 2 3byte ones */
       
   178         } else {
       
   179             break; /* Error condition */
       
   180         }
       
   181     }
       
   182     if ( i != length ) {
       
   183         /* Error in finding new length, return old length so no conversion */
       
   184         /* FIXUP: ERROR_MESSAGE? */
       
   185         return length;
       
   186     }
       
   187     return newLength;
       
   188 }
       
   189 
       
   190 /* Convert Standard UTF-8 to Modified UTF-8.
       
   191  *    Assumes the UTF-8 encoding was validated by utf8mLength() above.
       
   192  *
       
   193  *    Note: Accepts Modified UTF-8 also, no verification on the
       
   194  *          correctness of Standard UTF-8 is done. e,g, 0xC080 input is ok.
       
   195  */
       
   196 void JNICALL
       
   197 utf8sToUtf8m(struct UtfInst *ui, jbyte *string, int length, jbyte *newString, int newLength)
       
   198 {
       
   199     int i;
       
   200     int j;
       
   201 
       
   202     j = 0;
       
   203     for ( i = 0 ; i < length ; i++ ) {
       
   204         unsigned byte1;
       
   205 
       
   206         byte1 = (unsigned char)string[i];
       
   207 
       
   208         /* NULL bytes and bytes starting with 11110xxx are special */
       
   209         if ( (byte1 & 0x80) == 0 ) { /* 1byte encoding */
       
   210             if ( byte1 == 0 ) {
       
   211                 /* Bits out: 11000000 10000000 */
       
   212                 newString[j++] = (jbyte)0xC0;
       
   213                 newString[j++] = (jbyte)0x80;
       
   214             } else {
       
   215                 /* Single byte */
       
   216                 newString[j++] = byte1;
       
   217             }
       
   218         } else if ( (byte1 & 0xE0) == 0xC0 ) { /* 2byte encoding */
       
   219             newString[j++] = byte1;
       
   220             newString[j++] = string[++i];
       
   221         } else if ( (byte1 & 0xF0) == 0xE0 ) { /* 3byte encoding */
       
   222             newString[j++] = byte1;
       
   223             newString[j++] = string[++i];
       
   224             newString[j++] = string[++i];
       
   225         } else if ( (byte1 & 0xF8) == 0xF0 ) { /* 4byte encoding */
       
   226             /* Beginning of 4byte encoding, turn into 2 3byte encodings */
       
   227             unsigned byte2, byte3, byte4, u21;
       
   228 
       
   229             /* Bits in: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
       
   230             byte2 = (unsigned char)string[++i];
       
   231             byte3 = (unsigned char)string[++i];
       
   232             byte4 = (unsigned char)string[++i];
       
   233             /* Reconstruct full 21bit value */
       
   234             u21  = (byte1 & 0x07) << 18;
       
   235             u21 += (byte2 & 0x3F) << 12;
       
   236             u21 += (byte3 & 0x3F) << 6;
       
   237             u21 += (byte4 & 0x3F);
       
   238             /* Bits out: 11101101 1010xxxx 10xxxxxx */
       
   239             newString[j++] = (jbyte)0xED;
       
   240             newString[j++] = (jbyte)(0xA0 + (((u21 >> 16) - 1) & 0x0F));
       
   241             newString[j++] = (jbyte)(0x80 + ((u21 >> 10) & 0x3F));
       
   242             /* Bits out: 11101101 1011xxxx 10xxxxxx */
       
   243             newString[j++] = (jbyte)0xED;
       
   244             newString[j++] = (jbyte)(0xB0 + ((u21 >>  6) & 0x0F));
       
   245             newString[j++] = byte4;
       
   246         }
       
   247     }
       
   248     UTF_ASSERT(i==length);
       
   249     UTF_ASSERT(j==newLength);
       
   250     newString[j] = (jbyte)0;
       
   251 }
       
   252 
       
   253 /* Given a Modified UTF-8 string, calculate the Standard UTF-8 length.
       
   254  *   Basic validation of the UTF encoding rules is done, and length is
       
   255  *   returned (no change) when errors are detected.
       
   256  *
       
   257  *   Note: No validation is made that this is indeed Modified UTF-8 coming in.
       
   258  *
       
   259  */
       
   260 int JNICALL
       
   261 utf8mToUtf8sLength(struct UtfInst *ui, jbyte *string, int length)
       
   262 {
       
   263     int newLength;
       
   264     int i;
       
   265 
       
   266     newLength = 0;
       
   267     for ( i = 0 ; i < length ; i++ ) {
       
   268         unsigned byte1, byte2, byte3, byte4, byte5, byte6;
       
   269 
       
   270         byte1 = (unsigned char)string[i];
       
   271         if ( (byte1 & 0x80) == 0 ) { /* 1byte encoding */
       
   272             newLength++;
       
   273         } else if ( (byte1 & 0xE0) == 0xC0 ) { /* 2byte encoding */
       
   274             /* Check encoding of following bytes */
       
   275             if ( (i+1) >= length || (string[i+1] & 0xC0) != 0x80 ) {
       
   276                 break; /* Error condition */
       
   277             }
       
   278             byte2 = (unsigned char)string[++i];
       
   279             if ( byte1 != 0xC0 || byte2 != 0x80 ) {
       
   280                 newLength += 2; /* Normal 2byte encoding, not 0xC080 */
       
   281             } else {
       
   282                 newLength++;    /* We will turn 0xC080 into 0 */
       
   283             }
       
   284         } else if ( (byte1 & 0xF0) == 0xE0 ) { /* 3byte encoding */
       
   285             /* Check encoding of following bytes */
       
   286             if ( (i+2) >= length || (string[i+1] & 0xC0) != 0x80
       
   287                                  || (string[i+2] & 0xC0) != 0x80 ) {
       
   288                 break; /* Error condition */
       
   289             }
       
   290             byte2 = (unsigned char)string[++i];
       
   291             byte3 = (unsigned char)string[++i];
       
   292             newLength += 3;
       
   293             /* Possible process a second 3byte encoding */
       
   294             if ( (i+3) < length && byte1 == 0xED && (byte2 & 0xF0) == 0xA0 ) {
       
   295                 /* See if this is a pair of 3byte encodings */
       
   296                 byte4 = (unsigned char)string[i+1];
       
   297                 byte5 = (unsigned char)string[i+2];
       
   298                 byte6 = (unsigned char)string[i+3];
       
   299                 if ( byte4 == 0xED && (byte5 & 0xF0) == 0xB0 ) {
       
   300                     /* Check encoding of 3rd byte */
       
   301                     if ( (byte6 & 0xC0) != 0x80 ) {
       
   302                         break; /* Error condition */
       
   303                     }
       
   304                     newLength++; /* New string will have 4byte encoding */
       
   305                     i += 3;       /* Skip next 3 bytes */
       
   306                 }
       
   307             }
       
   308         } else {
       
   309             break; /* Error condition */
       
   310         }
       
   311     }
       
   312     if ( i != length ) {
       
   313         /* Error in UTF encoding */
       
   314         /*  FIXUP: ERROR_MESSAGE()? */
       
   315         return length;
       
   316     }
       
   317     return newLength;
       
   318 }
       
   319 
       
   320 /* Convert a Modified UTF-8 string into a Standard UTF-8 string
       
   321  *   It is assumed that this string has been validated in terms of the
       
   322  *   basic UTF encoding rules by utf8Length() above.
       
   323  *
       
   324  *   Note: No validation is made that this is indeed Modified UTF-8 coming in.
       
   325  *
       
   326  */
       
   327 void JNICALL
       
   328 utf8mToUtf8s(struct UtfInst *ui, jbyte *string, int length, jbyte *newString, int newLength)
       
   329 {
       
   330     int i;
       
   331     int j;
       
   332 
       
   333     j = 0;
       
   334     for ( i = 0 ; i < length ; i++ ) {
       
   335         unsigned byte1, byte2, byte3, byte4, byte5, byte6;
       
   336 
       
   337         byte1 = (unsigned char)string[i];
       
   338         if ( (byte1 & 0x80) == 0 ) { /* 1byte encoding */
       
   339             /* Single byte */
       
   340             newString[j++] = byte1;
       
   341         } else if ( (byte1 & 0xE0) == 0xC0 ) { /* 2byte encoding */
       
   342             byte2 = (unsigned char)string[++i];
       
   343             if ( byte1 != 0xC0 || byte2 != 0x80 ) {
       
   344                 newString[j++] = byte1;
       
   345                 newString[j++] = byte2;
       
   346             } else {
       
   347                 newString[j++] = 0;
       
   348             }
       
   349         } else if ( (byte1 & 0xF0) == 0xE0 ) { /* 3byte encoding */
       
   350             byte2 = (unsigned char)string[++i];
       
   351             byte3 = (unsigned char)string[++i];
       
   352             if ( i+3 < length && byte1 == 0xED && (byte2 & 0xF0) == 0xA0 ) {
       
   353                 /* See if this is a pair of 3byte encodings */
       
   354                 byte4 = (unsigned char)string[i+1];
       
   355                 byte5 = (unsigned char)string[i+2];
       
   356                 byte6 = (unsigned char)string[i+3];
       
   357                 if ( byte4 == 0xED && (byte5 & 0xF0) == 0xB0 ) {
       
   358                     unsigned u21;
       
   359 
       
   360                     /* Bits in: 11101101 1010xxxx 10xxxxxx */
       
   361                     /* Bits in: 11101101 1011xxxx 10xxxxxx */
       
   362                     i += 3;
       
   363 
       
   364                     /* Reconstruct 21 bit code */
       
   365                     u21  = ((byte2 & 0x0F) + 1) << 16;
       
   366                     u21 += (byte3 & 0x3F) << 10;
       
   367                     u21 += (byte5 & 0x0F) << 6;
       
   368                     u21 += (byte6 & 0x3F);
       
   369 
       
   370                     /* Bits out: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
       
   371 
       
   372                     /* Convert to 4byte encoding */
       
   373                     newString[j++] = 0xF0 + ((u21 >> 18) & 0x07);
       
   374                     newString[j++] = 0x80 + ((u21 >> 12) & 0x3F);
       
   375                     newString[j++] = 0x80 + ((u21 >>  6) & 0x3F);
       
   376                     newString[j++] = 0x80 + (u21 & 0x3F);
       
   377                     continue;
       
   378                 }
       
   379             }
       
   380             /* Normal 3byte encoding */
       
   381             newString[j++] = byte1;
       
   382             newString[j++] = byte2;
       
   383             newString[j++] = byte3;
       
   384         }
       
   385     }
       
   386     UTF_ASSERT(i==length);
       
   387     UTF_ASSERT(j==newLength);
       
   388     newString[j] = 0;
       
   389 }
       
   390 
       
   391 /* ================================================================= */
       
   392 
       
   393 #if 1  /* Test program */
       
   394 
       
   395 /*
       
   396  * Convert any byte array into a printable string.
       
   397  *    Returns length or -1 if output overflows.
       
   398  */
       
   399 static int
       
   400 bytesToPrintable(struct UtfInst *ui, char *bytes, int len, char *output, int outputMaxLen)
       
   401 {
       
   402     int outputLen;
       
   403     int i;
       
   404 
       
   405     UTF_ASSERT(bytes);
       
   406     UTF_ASSERT(len>=0);
       
   407     UTF_ASSERT(output);
       
   408     UTF_ASSERT(outputMaxLen>=0);
       
   409 
       
   410     outputLen = 0;
       
   411     for ( i=0; i<len ; i++ ) {
       
   412         unsigned byte;
       
   413 
       
   414         byte = bytes[i];
       
   415         if ( outputLen >= outputMaxLen ) {
       
   416             return -1;
       
   417         }
       
   418         if ( byte <= 0x7f && isprint(byte) && !iscntrl(byte) ) {
       
   419             output[outputLen++] = (char)byte;
       
   420         } else {
       
   421             (void)sprintf(output+outputLen,"\\x%02x",byte);
       
   422             outputLen += 4;
       
   423         }
       
   424     }
       
   425     output[outputLen] = 0;
       
   426     return outputLen;
       
   427 }
       
   428 
       
   429 static void
       
   430 test(void)
       
   431 {
       
   432     static char *strings[] = {
       
   433                 "characters",
       
   434                 "abcdefghijklmnopqrstuvwxyz",
       
   435                 "0123456789",
       
   436                 "!@#$%^&*()_+=-{}[]:;",
       
   437                 NULL };
       
   438     int i;
       
   439     struct UtfInst *ui;
       
   440 
       
   441     ui = utfInitialize(NULL);
       
   442 
       
   443     i = 0;
       
   444     while ( strings[i] != NULL ) {
       
   445         char *str;
       
   446         #define MAX 1024
       
   447         char buf0[MAX];
       
   448         char buf1[MAX];
       
   449         char buf2[MAX];
       
   450         unsigned short buf3[MAX];
       
   451         int len1;
       
   452         int len2;
       
   453         int len3;
       
   454 
       
   455         str = strings[i];
       
   456 
       
   457         (void)bytesToPrintable(ui, str, (int)strlen(str), buf0, 1024);
       
   458 
       
   459         len1 = utf8FromPlatform(ui, str, (int)strlen(str), (jbyte*)buf1, 1024);
       
   460 
       
   461         UTF_ASSERT(len1==(int)strlen(str));
       
   462 
       
   463         len3 = utf8ToUtf16(ui, (jbyte*)buf1, len1, (jchar*)buf3, 1024);
       
   464 
       
   465         UTF_ASSERT(len3==len1);
       
   466 
       
   467         len1 = utf16ToUtf8m(ui, (jchar*)buf3, len3, (jbyte*)buf1, 1024);
       
   468 
       
   469         UTF_ASSERT(len1==len3);
       
   470         UTF_ASSERT(strcmp(str, buf1) == 0);
       
   471 
       
   472         len2 = utf8ToPlatform(ui, (jbyte*)buf1, len1, buf2, 1024);
       
   473 
       
   474         UTF_ASSERT(len2==len1);
       
   475         UTF_ASSERT(strcmp(str, buf2) == 0);
       
   476 
       
   477         i++;
       
   478     }
       
   479 
       
   480     utfTerminate(ui, NULL);
       
   481 
       
   482 }
       
   483 
       
   484 int
       
   485 main(int argc, char **argv)
       
   486 {
       
   487     test();
       
   488     return 0;
       
   489 }
       
   490 
       
   491 #endif