src/java.base/share/classes/java/lang/StringDecoderUTF8.java
changeset 48262 daf3b49f4839
parent 48261 43edfde828ab
child 48263 a559b7cd1dea
equal deleted inserted replaced
48261:43edfde828ab 48262:daf3b49f4839
     1 /*
       
     2  * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
       
     3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
       
     4  *
       
     5  * This code is free software; you can redistribute it and/or modify it
       
     6  * under the terms of the GNU General Public License version 2 only, as
       
     7  * published by the Free Software Foundation.  Oracle designates this
       
     8  * particular file as subject to the "Classpath" exception as provided
       
     9  * by Oracle in the LICENSE file that accompanied this code.
       
    10  *
       
    11  * This code is distributed in the hope that it will be useful, but WITHOUT
       
    12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
       
    13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
       
    14  * version 2 for more details (a copy is included in the LICENSE file that
       
    15  * accompanied this code).
       
    16  *
       
    17  * You should have received a copy of the GNU General Public License version
       
    18  * 2 along with this work; if not, write to the Free Software Foundation,
       
    19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
       
    20  *
       
    21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
       
    22  * or visit www.oracle.com if you need additional information or have any
       
    23  * questions.
       
    24  */
       
    25 
       
    26 package java.lang;
       
    27 
       
    28 import java.nio.charset.Charset;
       
    29 import java.util.Arrays;
       
    30 
       
    31 import static java.lang.String.LATIN1;
       
    32 import static java.lang.String.UTF16;
       
    33 import static java.lang.String.COMPACT_STRINGS;
       
    34 import static java.lang.Character.isSurrogate;
       
    35 import static java.lang.Character.highSurrogate;
       
    36 import static java.lang.Character.lowSurrogate;
       
    37 import static java.lang.Character.isSupplementaryCodePoint;
       
    38 import static java.lang.StringUTF16.putChar;
       
    39 
       
    40 class StringDecoderUTF8 extends StringCoding.StringDecoder {
       
    41 
       
    42     StringDecoderUTF8(Charset cs, String rcn) {
       
    43         super(cs, rcn);
       
    44     }
       
    45 
       
    46     private static boolean isNotContinuation(int b) {
       
    47         return (b & 0xc0) != 0x80;
       
    48     }
       
    49 
       
    50     private static boolean isMalformed3(int b1, int b2, int b3) {
       
    51         return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
       
    52                (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80;
       
    53     }
       
    54 
       
    55     private static boolean isMalformed3_2(int b1, int b2) {
       
    56         return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
       
    57                (b2 & 0xc0) != 0x80;
       
    58     }
       
    59 
       
    60     private static boolean isMalformed4(int b2, int b3, int b4) {
       
    61         return (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80 ||
       
    62                (b4 & 0xc0) != 0x80;
       
    63     }
       
    64 
       
    65     private static boolean isMalformed4_2(int b1, int b2) {
       
    66         return (b1 == 0xf0 && (b2  < 0x90 || b2 > 0xbf)) ||
       
    67                (b1 == 0xf4 && (b2 & 0xf0) != 0x80) ||
       
    68                (b2 & 0xc0) != 0x80;
       
    69     }
       
    70 
       
    71     private static boolean isMalformed4_3(int b3) {
       
    72         return (b3 & 0xc0) != 0x80;
       
    73     }
       
    74 
       
    75     // for nb == 3/4
       
    76     private static int malformedN(byte[] src, int sp, int nb) {
       
    77         if (nb == 3) {
       
    78             int b1 = src[sp++];
       
    79             int b2 = src[sp++];    // no need to lookup b3
       
    80             return ((b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
       
    81                     isNotContinuation(b2)) ? 1 : 2;
       
    82         } else if (nb == 4) { // we don't care the speed here
       
    83             int b1 = src[sp++] & 0xff;
       
    84             int b2 = src[sp++] & 0xff;
       
    85             if (b1 > 0xf4 ||
       
    86                 (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) ||
       
    87                 (b1 == 0xf4 && (b2 & 0xf0) != 0x80) ||
       
    88                 isNotContinuation(b2))
       
    89                 return 1;
       
    90             if (isNotContinuation(src[sp++]))
       
    91                 return 2;
       
    92             return 3;
       
    93         }
       
    94         assert false;
       
    95         return -1;
       
    96     }
       
    97 
       
    98     private static char repl = '\ufffd';
       
    99 
       
   100     StringCoding.Result decode(byte[] src, int sp, int len) {
       
   101         return decode(src, sp, len, result);
       
   102     }
       
   103 
       
   104     static StringCoding.Result decode(byte[] src, int sp, int len,
       
   105                                       StringCoding.Result ret) {
       
   106         int sl = sp + len;
       
   107         byte[] dst = new byte[len];
       
   108         int dp = 0;
       
   109         if (COMPACT_STRINGS) {   // Latin1 only loop
       
   110             while (sp < sl) {
       
   111                 int b1 = src[sp];
       
   112                 if (b1 >= 0) {
       
   113                     dst[dp++] = (byte)b1;
       
   114                     sp++;
       
   115                     continue;
       
   116                 }
       
   117                 if ((b1 == (byte)0xc2 || b1 == (byte)0xc3) &&
       
   118                     sp + 1 < sl) {
       
   119                     int b2 = src[sp + 1];
       
   120                     if (!isNotContinuation(b2)) {
       
   121                         dst[dp++] = (byte)(((b1 << 6) ^ b2)^
       
   122                                            (((byte) 0xC0 << 6) ^
       
   123                                            ((byte) 0x80 << 0)));
       
   124                         sp += 2;
       
   125                         continue;
       
   126                     }
       
   127                 }
       
   128                 // anything not a latin1, including the repl
       
   129                 // we have to go with the utf16
       
   130                 break;
       
   131             }
       
   132             if (sp == sl) {
       
   133                 if (dp != dst.length) {
       
   134                     dst = Arrays.copyOf(dst, dp);
       
   135                 }
       
   136                 return ret.with(dst, LATIN1);
       
   137             }
       
   138         }
       
   139         if (dp == 0) {
       
   140             dst = new byte[len << 1];
       
   141         } else {
       
   142             byte[] buf = new byte[len << 1];
       
   143             StringLatin1.inflate(dst, 0, buf, 0, dp);
       
   144             dst = buf;
       
   145         }
       
   146         while (sp < sl) {
       
   147             int b1 = src[sp++];
       
   148             if (b1 >= 0) {
       
   149                 putChar(dst, dp++, (char) b1);
       
   150             } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) {
       
   151                 if (sp < sl) {
       
   152                     int b2 = src[sp++];
       
   153                     if (isNotContinuation(b2)) {
       
   154                         putChar(dst, dp++, repl);
       
   155                         sp--;
       
   156                     } else {
       
   157                         putChar(dst, dp++, (char)(((b1 << 6) ^ b2)^
       
   158                                                   (((byte) 0xC0 << 6) ^
       
   159                                                   ((byte) 0x80 << 0))));
       
   160                     }
       
   161                     continue;
       
   162                 }
       
   163                 putChar(dst, dp++, repl);
       
   164                 break;
       
   165             } else if ((b1 >> 4) == -2) {
       
   166                 if (sp + 1 < sl) {
       
   167                     int b2 = src[sp++];
       
   168                     int b3 = src[sp++];
       
   169                     if (isMalformed3(b1, b2, b3)) {
       
   170                         putChar(dst, dp++, repl);
       
   171                         sp -= 3;
       
   172                         sp += malformedN(src, sp, 3);
       
   173                     } else {
       
   174                         char c = (char)((b1 << 12) ^
       
   175                                         (b2 <<  6) ^
       
   176                                         (b3 ^
       
   177                                          (((byte) 0xE0 << 12) ^
       
   178                                          ((byte) 0x80 <<  6) ^
       
   179                                          ((byte) 0x80 <<  0))));
       
   180                         putChar(dst, dp++, isSurrogate(c) ?  repl : c);
       
   181                     }
       
   182                     continue;
       
   183                 }
       
   184                 if (sp  < sl && isMalformed3_2(b1, src[sp])) {
       
   185                     putChar(dst, dp++, repl);
       
   186                     continue;
       
   187                 }
       
   188                 putChar(dst, dp++, repl);
       
   189                 break;
       
   190             } else if ((b1 >> 3) == -2) {
       
   191                 if (sp + 2 < sl) {
       
   192                     int b2 = src[sp++];
       
   193                     int b3 = src[sp++];
       
   194                     int b4 = src[sp++];
       
   195                     int uc = ((b1 << 18) ^
       
   196                               (b2 << 12) ^
       
   197                               (b3 <<  6) ^
       
   198                               (b4 ^
       
   199                                (((byte) 0xF0 << 18) ^
       
   200                                ((byte) 0x80 << 12) ^
       
   201                                ((byte) 0x80 <<  6) ^
       
   202                                ((byte) 0x80 <<  0))));
       
   203                     if (isMalformed4(b2, b3, b4) ||
       
   204                         !isSupplementaryCodePoint(uc)) { // shortest form check
       
   205                         putChar(dst, dp++, repl);
       
   206                         sp -= 4;
       
   207                         sp += malformedN(src, sp, 4);
       
   208                     } else {
       
   209                         putChar(dst, dp++, highSurrogate(uc));
       
   210                         putChar(dst, dp++, lowSurrogate(uc));
       
   211                     }
       
   212                     continue;
       
   213                 }
       
   214                 b1 &= 0xff;
       
   215                 if (b1 > 0xf4 ||
       
   216                     sp  < sl && isMalformed4_2(b1, src[sp] & 0xff)) {
       
   217                     putChar(dst, dp++, repl);
       
   218                     continue;
       
   219                 }
       
   220                 sp++;
       
   221                 putChar(dst, dp++, repl);
       
   222                 if (sp  < sl && isMalformed4_3(src[sp])) {
       
   223                     continue;
       
   224                 }
       
   225                 break;
       
   226             } else {
       
   227                 putChar(dst, dp++, repl);
       
   228             }
       
   229         }
       
   230         if (dp != len) {
       
   231             dst = Arrays.copyOf(dst, dp << 1);
       
   232         }
       
   233         return ret.with(dst, UTF16);
       
   234     }
       
   235 }