jdk/src/share/classes/sun/io/CharToByteUTF8.java
changeset 10372 2f6d68f22eae
parent 10321 64f7ee2f31dd
parent 10371 7da2112e4236
child 10373 d4c5e59b82f8
equal deleted inserted replaced
10321:64f7ee2f31dd 10372:2f6d68f22eae
     1 /*
       
     2  * Copyright (c) 1996, 1997, Oracle and/or its affiliates. All rights reserved.
       
     3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
       
     4  *
       
     5  * This code is free software; you can redistribute it and/or modify it
       
     6  * under the terms of the GNU General Public License version 2 only, as
       
     7  * published by the Free Software Foundation.  Oracle designates this
       
     8  * particular file as subject to the "Classpath" exception as provided
       
     9  * by Oracle in the LICENSE file that accompanied this code.
       
    10  *
       
    11  * This code is distributed in the hope that it will be useful, but WITHOUT
       
    12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
       
    13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
       
    14  * version 2 for more details (a copy is included in the LICENSE file that
       
    15  * accompanied this code).
       
    16  *
       
    17  * You should have received a copy of the GNU General Public License version
       
    18  * 2 along with this work; if not, write to the Free Software Foundation,
       
    19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
       
    20  *
       
    21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
       
    22  * or visit www.oracle.com if you need additional information or have any
       
    23  * questions.
       
    24  */
       
    25 package sun.io;
       
    26 
       
    27 
       
    28 /**
       
    29  * UCS2 (UTF16) -> UCS Transformation Format 8 (UTF-8) converter
       
    30  * It's represented like below.
       
    31  *
       
    32  * # Bits   Bit pattern
       
    33  * 1    7   0xxxxxxx
       
    34  * 2   11   110xxxxx 10xxxxxx
       
    35  * 3   16   1110xxxx 10xxxxxx 10xxxxxx
       
    36  * 4   21   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
       
    37  * 5   26   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
       
    38  * 6   31   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
       
    39  *
       
    40  *     UCS2 uses 1-3 / UTF16 uses 1-4 / UCS4 uses 1-6
       
    41  */
       
    42 
       
    43 public class CharToByteUTF8 extends CharToByteConverter {
       
    44 
       
    45     private char highHalfZoneCode;
       
    46 
       
    47     public int flush(byte[] output, int outStart, int outEnd)
       
    48         throws MalformedInputException
       
    49     {
       
    50         if (highHalfZoneCode != 0) {
       
    51             highHalfZoneCode = 0;
       
    52             badInputLength = 0;
       
    53             throw new MalformedInputException();
       
    54         }
       
    55         byteOff = charOff = 0;
       
    56         return 0;
       
    57     }
       
    58 
       
    59     /**
       
    60      * Character conversion
       
    61      */
       
    62     public int convert(char[] input, int inOff, int inEnd,
       
    63                        byte[] output, int outOff, int outEnd)
       
    64         throws ConversionBufferFullException, MalformedInputException
       
    65     {
       
    66         char inputChar;
       
    67         byte[] outputByte = new byte[6];
       
    68         int inputSize;
       
    69         int outputSize;
       
    70 
       
    71         charOff = inOff;
       
    72         byteOff = outOff;
       
    73 
       
    74         if (highHalfZoneCode != 0) {
       
    75             inputChar = highHalfZoneCode;
       
    76             highHalfZoneCode = 0;
       
    77             if (input[inOff] >= 0xdc00 && input[inOff] <= 0xdfff) {
       
    78                 // This is legal UTF16 sequence.
       
    79                 int ucs4 = (highHalfZoneCode - 0xd800) * 0x400
       
    80                     + (input[inOff] - 0xdc00) + 0x10000;
       
    81                 output[0] = (byte)(0xf0 | ((ucs4 >> 18)) & 0x07);
       
    82                 output[1] = (byte)(0x80 | ((ucs4 >> 12) & 0x3f));
       
    83                 output[2] = (byte)(0x80 | ((ucs4 >> 6) & 0x3f));
       
    84                 output[3] = (byte)(0x80 | (ucs4 & 0x3f));
       
    85                 charOff++;
       
    86                 highHalfZoneCode = 0;
       
    87             } else {
       
    88                 // This is illegal UTF16 sequence.
       
    89                 badInputLength = 0;
       
    90                 throw new MalformedInputException();
       
    91             }
       
    92         }
       
    93 
       
    94         while(charOff < inEnd) {
       
    95             inputChar = input[charOff];
       
    96             if (inputChar < 0x80) {
       
    97                 outputByte[0] = (byte)inputChar;
       
    98                 inputSize = 1;
       
    99                 outputSize = 1;
       
   100             } else if (inputChar < 0x800) {
       
   101                 outputByte[0] = (byte)(0xc0 | ((inputChar >> 6) & 0x1f));
       
   102                 outputByte[1] = (byte)(0x80 | (inputChar & 0x3f));
       
   103                 inputSize = 1;
       
   104                 outputSize = 2;
       
   105             } else if (inputChar >= 0xd800 && inputChar <= 0xdbff) {
       
   106                 // this is <high-half zone code> in UTF-16
       
   107                 if (charOff + 1 >= inEnd) {
       
   108                     highHalfZoneCode = inputChar;
       
   109                     break;
       
   110                 }
       
   111                 // check next char is valid <low-half zone code>
       
   112                 char lowChar = input[charOff + 1];
       
   113                 if (lowChar < 0xdc00 || lowChar > 0xdfff) {
       
   114                     badInputLength = 1;
       
   115                     throw new MalformedInputException();
       
   116                 }
       
   117                 int ucs4 = (inputChar - 0xd800) * 0x400 + (lowChar - 0xdc00)
       
   118                     + 0x10000;
       
   119                 outputByte[0] = (byte)(0xf0 | ((ucs4 >> 18)) & 0x07);
       
   120                 outputByte[1] = (byte)(0x80 | ((ucs4 >> 12) & 0x3f));
       
   121                 outputByte[2] = (byte)(0x80 | ((ucs4 >> 6) & 0x3f));
       
   122                 outputByte[3] = (byte)(0x80 | (ucs4 & 0x3f));
       
   123                 outputSize = 4;
       
   124                 inputSize = 2;
       
   125             } else {
       
   126                 outputByte[0] = (byte)(0xe0 | ((inputChar >> 12)) & 0x0f);
       
   127                 outputByte[1] = (byte)(0x80 | ((inputChar >> 6) & 0x3f));
       
   128                 outputByte[2] = (byte)(0x80 | (inputChar & 0x3f));
       
   129                 inputSize = 1;
       
   130                 outputSize = 3;
       
   131             }
       
   132             if (byteOff + outputSize > outEnd) {
       
   133                 throw new ConversionBufferFullException();
       
   134             }
       
   135             for (int i = 0; i < outputSize; i++) {
       
   136                 output[byteOff++] = outputByte[i];
       
   137             }
       
   138             charOff += inputSize;
       
   139         }
       
   140         return byteOff - outOff;
       
   141     }
       
   142 
       
   143     public boolean canConvert(char ch) {
       
   144         return true;
       
   145     }
       
   146 
       
   147     public int getMaxBytesPerChar() {
       
   148         return 3;
       
   149     }
       
   150 
       
   151     public void reset() {
       
   152         byteOff = charOff = 0;
       
   153         highHalfZoneCode = 0;
       
   154     }
       
   155 
       
   156     public String getCharacterEncoding() {
       
   157         return "UTF8";
       
   158     }
       
   159 }