src/java.base/share/classes/sun/nio/cs/UTF_8.java
author sherman
Wed, 13 Dec 2017 07:51:57 -0800
changeset 48262 daf3b49f4839
parent 47216 71c04702a3d5
child 49443 e5679a6661d6
permissions -rw-r--r--
8184947: ZipCoder performance improvements Reviewed-by: martin, redestad
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
     1
/*
45894
995421c69f66 8184665: Skip name and alias checks for standard Charsets
redestad
parents: 33663
diff changeset
     2
 * Copyright (c) 2000, 2017, Oracle and/or its affiliates. All rights reserved.
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
     3
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
90ce3da70b43 Initial load
duke
parents:
diff changeset
     4
 *
90ce3da70b43 Initial load
duke
parents:
diff changeset
     5
 * This code is free software; you can redistribute it and/or modify it
90ce3da70b43 Initial load
duke
parents:
diff changeset
     6
 * under the terms of the GNU General Public License version 2 only, as
5506
202f599c92aa 6943119: Rebrand source copyright notices
ohair
parents: 3714
diff changeset
     7
 * published by the Free Software Foundation.  Oracle designates this
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
     8
 * particular file as subject to the "Classpath" exception as provided
5506
202f599c92aa 6943119: Rebrand source copyright notices
ohair
parents: 3714
diff changeset
     9
 * by Oracle in the LICENSE file that accompanied this code.
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
    10
 *
90ce3da70b43 Initial load
duke
parents:
diff changeset
    11
 * This code is distributed in the hope that it will be useful, but WITHOUT
90ce3da70b43 Initial load
duke
parents:
diff changeset
    12
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
90ce3da70b43 Initial load
duke
parents:
diff changeset
    13
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
90ce3da70b43 Initial load
duke
parents:
diff changeset
    14
 * version 2 for more details (a copy is included in the LICENSE file that
90ce3da70b43 Initial load
duke
parents:
diff changeset
    15
 * accompanied this code).
90ce3da70b43 Initial load
duke
parents:
diff changeset
    16
 *
90ce3da70b43 Initial load
duke
parents:
diff changeset
    17
 * You should have received a copy of the GNU General Public License version
90ce3da70b43 Initial load
duke
parents:
diff changeset
    18
 * 2 along with this work; if not, write to the Free Software Foundation,
90ce3da70b43 Initial load
duke
parents:
diff changeset
    19
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
90ce3da70b43 Initial load
duke
parents:
diff changeset
    20
 *
5506
202f599c92aa 6943119: Rebrand source copyright notices
ohair
parents: 3714
diff changeset
    21
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
202f599c92aa 6943119: Rebrand source copyright notices
ohair
parents: 3714
diff changeset
    22
 * or visit www.oracle.com if you need additional information or have any
202f599c92aa 6943119: Rebrand source copyright notices
ohair
parents: 3714
diff changeset
    23
 * questions.
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
    24
 */
90ce3da70b43 Initial load
duke
parents:
diff changeset
    25
90ce3da70b43 Initial load
duke
parents:
diff changeset
    26
package sun.nio.cs;
90ce3da70b43 Initial load
duke
parents:
diff changeset
    27
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
    28
import java.nio.Buffer;
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
    29
import java.nio.ByteBuffer;
90ce3da70b43 Initial load
duke
parents:
diff changeset
    30
import java.nio.CharBuffer;
90ce3da70b43 Initial load
duke
parents:
diff changeset
    31
import java.nio.charset.Charset;
90ce3da70b43 Initial load
duke
parents:
diff changeset
    32
import java.nio.charset.CharsetDecoder;
90ce3da70b43 Initial load
duke
parents:
diff changeset
    33
import java.nio.charset.CharsetEncoder;
90ce3da70b43 Initial load
duke
parents:
diff changeset
    34
import java.nio.charset.CoderResult;
9547
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 7668
diff changeset
    35
import java.nio.charset.CodingErrorAction;
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
    36
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
    37
/* Legal UTF-8 Byte Sequences
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
    38
 *
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
    39
 * #    Code Points      Bits   Bit/Byte pattern
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
    40
 * 1                     7      0xxxxxxx
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
    41
 *      U+0000..U+007F          00..7F
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
    42
 *
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
    43
 * 2                     11     110xxxxx    10xxxxxx
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
    44
 *      U+0080..U+07FF          C2..DF      80..BF
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
    45
 *
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
    46
 * 3                     16     1110xxxx    10xxxxxx    10xxxxxx
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
    47
 *      U+0800..U+0FFF          E0          A0..BF      80..BF
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
    48
 *      U+1000..U+FFFF          E1..EF      80..BF      80..BF
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
    49
 *
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
    50
 * 4                     21     11110xxx    10xxxxxx    10xxxxxx    10xxxxxx
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
    51
 *     U+10000..U+3FFFF         F0          90..BF      80..BF      80..BF
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
    52
 *     U+40000..U+FFFFF         F1..F3      80..BF      80..BF      80..BF
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
    53
 *    U+100000..U+10FFFF        F4          80..8F      80..BF      80..BF
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
    54
 *
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
    55
 */
90ce3da70b43 Initial load
duke
parents:
diff changeset
    56
47026
94c45ad89b9c 8186517: sun.nio.cs.StandardCharsets$Aliases and Classes can be lazily loaded
redestad
parents: 45894
diff changeset
    57
public final class UTF_8 extends Unicode {
94c45ad89b9c 8186517: sun.nio.cs.StandardCharsets$Aliases and Classes can be lazily loaded
redestad
parents: 45894
diff changeset
    58
94c45ad89b9c 8186517: sun.nio.cs.StandardCharsets$Aliases and Classes can be lazily loaded
redestad
parents: 45894
diff changeset
    59
    // Statically initialized instance of this charset.
    public static final UTF_8 INSTANCE = new UTF_8();
94c45ad89b9c 8186517: sun.nio.cs.StandardCharsets$Aliases and Classes can be lazily loaded
redestad
parents: 45894
diff changeset
    60
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
    61
    // Passes the charset name "UTF-8" and the alias list obtained from
    // StandardCharsets.aliases_UTF_8() to the Unicode superclass constructor.
    public UTF_8() {
        super("UTF-8", StandardCharsets.aliases_UTF_8());
    }
90ce3da70b43 Initial load
duke
parents:
diff changeset
    64
90ce3da70b43 Initial load
duke
parents:
diff changeset
    65
    public String historicalName() {
90ce3da70b43 Initial load
duke
parents:
diff changeset
    66
        return "UTF8";
90ce3da70b43 Initial load
duke
parents:
diff changeset
    67
    }
90ce3da70b43 Initial load
duke
parents:
diff changeset
    68
90ce3da70b43 Initial load
duke
parents:
diff changeset
    69
    public CharsetDecoder newDecoder() {
90ce3da70b43 Initial load
duke
parents:
diff changeset
    70
        return new Decoder(this);
90ce3da70b43 Initial load
duke
parents:
diff changeset
    71
    }
90ce3da70b43 Initial load
duke
parents:
diff changeset
    72
90ce3da70b43 Initial load
duke
parents:
diff changeset
    73
    public CharsetEncoder newEncoder() {
90ce3da70b43 Initial load
duke
parents:
diff changeset
    74
        return new Encoder(this);
90ce3da70b43 Initial load
duke
parents:
diff changeset
    75
    }
90ce3da70b43 Initial load
duke
parents:
diff changeset
    76
47026
94c45ad89b9c 8186517: sun.nio.cs.StandardCharsets$Aliases and Classes can be lazily loaded
redestad
parents: 45894
diff changeset
    77
    static final void updatePositions(Buffer src, int sp,
10898
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
    78
                                              Buffer dst, int dp) {
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
    79
        src.position(sp - src.arrayOffset());
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
    80
        dst.position(dp - dst.arrayOffset());
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
    81
    }
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
    82
48262
daf3b49f4839 8184947: ZipCoder performance improvements
sherman
parents: 47216
diff changeset
    83
    private static class Decoder extends CharsetDecoder {
daf3b49f4839 8184947: ZipCoder performance improvements
sherman
parents: 47216
diff changeset
    84
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
    85
        // Creates a decoder for cs, passing 1.0f to CharsetDecoder for both
        // the average and the maximum number of chars produced per input
        // byte.
        private Decoder(Charset cs) {
            super(cs, 1.0f, 1.0f);
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
    88
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
    89
        private static boolean isNotContinuation(int b) {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
    90
            return (b & 0xc0) != 0x80;
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
    91
        }
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
    92
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
    93
        //  [E0]     [A0..BF] [80..BF]
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
    94
        //  [E1..EF] [80..BF] [80..BF]
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
    95
        private static boolean isMalformed3(int b1, int b2, int b3) {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
    96
            return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
    97
                   (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80;
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
    98
        }
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
    99
10898
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   100
        // only used when there is only one byte left in src buffer
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   101
        private static boolean isMalformed3_2(int b1, int b2) {
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   102
            return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   103
                   (b2 & 0xc0) != 0x80;
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   104
        }
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   105
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   106
        //  [F0]     [90..BF] [80..BF] [80..BF]
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   107
        //  [F1..F3] [80..BF] [80..BF] [80..BF]
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   108
        //  [F4]     [80..8F] [80..BF] [80..BF]
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   109
        //  only check 80-be range here, the [0xf0,0x80...] and [0xf4,0x90-...]
5986
04eb44085c00 6934265: Add public method Character.isBmpCodePoint
martin
parents: 5506
diff changeset
   110
        //  will be checked by Character.isSupplementaryCodePoint(uc)
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   111
        private static boolean isMalformed4(int b2, int b3, int b4) {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   112
            return (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80 ||
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   113
                   (b4 & 0xc0) != 0x80;
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   114
        }
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   115
23880
7d6b060131d3 8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents: 23010
diff changeset
   116
        // only used when there is less than 4 bytes left in src buffer.
7d6b060131d3 8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents: 23010
diff changeset
   117
        // both b1 and b2 should be "& 0xff" before passed in.
10898
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   118
        private static boolean isMalformed4_2(int b1, int b2) {
23880
7d6b060131d3 8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents: 23010
diff changeset
   119
            return (b1 == 0xf0 && (b2  < 0x90 || b2 > 0xbf)) ||
7d6b060131d3 8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents: 23010
diff changeset
   120
                   (b1 == 0xf4 && (b2 & 0xf0) != 0x80) ||
10898
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   121
                   (b2 & 0xc0) != 0x80;
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   122
        }
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   123
23880
7d6b060131d3 8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents: 23010
diff changeset
   124
        // tests if b1 and b2 are malformed as the first 2 bytes of a
7d6b060131d3 8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents: 23010
diff changeset
   125
        // legal`4-byte utf-8 byte sequence.
7d6b060131d3 8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents: 23010
diff changeset
   126
        // only used when there is less than 4 bytes left in src buffer,
7d6b060131d3 8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents: 23010
diff changeset
   127
        // after isMalformed4_2 has been invoked.
10898
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   128
        private static boolean isMalformed4_3(int b3) {
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   129
            return (b3 & 0xc0) != 0x80;
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   130
        }
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   131
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   132
        private static CoderResult lookupN(ByteBuffer src, int n)
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   133
        {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   134
            for (int i = 1; i < n; i++) {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   135
               if (isNotContinuation(src.get()))
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   136
                   return CoderResult.malformedForLength(i);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   137
            }
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   138
            return CoderResult.malformedForLength(n);
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
   139
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   140
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   141
        private static CoderResult malformedN(ByteBuffer src, int nb) {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   142
            switch (nb) {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   143
            case 1:
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   144
            case 2:                    // always 1
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   145
                return CoderResult.malformedForLength(1);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   146
            case 3:
10898
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   147
                int b1 = src.get();
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   148
                int b2 = src.get();    // no need to lookup b3
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   149
                return CoderResult.malformedForLength(
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   150
                    ((b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
10898
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   151
                     isNotContinuation(b2)) ? 1 : 2);
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   152
            case 4:  // we don't care the speed here
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   153
                b1 = src.get() & 0xff;
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   154
                b2 = src.get() & 0xff;
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   155
                if (b1 > 0xf4 ||
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   156
                    (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) ||
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   157
                    (b1 == 0xf4 && (b2 & 0xf0) != 0x80) ||
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   158
                    isNotContinuation(b2))
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   159
                    return CoderResult.malformedForLength(1);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   160
                if (isNotContinuation(src.get()))
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   161
                    return CoderResult.malformedForLength(2);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   162
                return CoderResult.malformedForLength(3);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   163
            default:
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   164
                assert false;
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   165
                return null;
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   166
            }
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   167
        }
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   168
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   169
        private static CoderResult malformed(ByteBuffer src, int sp,
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   170
                                             CharBuffer dst, int dp,
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   171
                                             int nb)
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   172
        {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   173
            src.position(sp - src.arrayOffset());
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   174
            CoderResult cr = malformedN(src, nb);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   175
            updatePositions(src, sp, dst, dp);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   176
            return cr;
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   177
        }
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   178
10898
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   179
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   180
        private static CoderResult malformed(ByteBuffer src,
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   181
                                             int mark, int nb)
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   182
        {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   183
            src.position(mark);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   184
            CoderResult cr = malformedN(src, nb);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   185
            src.position(mark);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   186
            return cr;
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   187
        }
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   188
10898
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   189
        private static CoderResult malformedForLength(ByteBuffer src,
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   190
                                                      int sp,
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   191
                                                      CharBuffer dst,
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   192
                                                      int dp,
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   193
                                                      int malformedNB)
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   194
        {
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   195
            updatePositions(src, sp, dst, dp);
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   196
            return CoderResult.malformedForLength(malformedNB);
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   197
        }
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   198
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   199
        private static CoderResult malformedForLength(ByteBuffer src,
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   200
                                                      int mark,
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   201
                                                      int malformedNB)
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   202
        {
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   203
            src.position(mark);
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   204
            return CoderResult.malformedForLength(malformedNB);
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   205
        }
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   206
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   207
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   208
        private static CoderResult xflow(Buffer src, int sp, int sl,
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   209
                                         Buffer dst, int dp, int nb) {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   210
            updatePositions(src, sp, dst, dp);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   211
            return (nb == 0 || sl - sp < nb)
10898
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   212
                   ? CoderResult.UNDERFLOW : CoderResult.OVERFLOW;
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   213
        }
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   214
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   215
        private static CoderResult xflow(Buffer src, int mark, int nb) {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   216
            src.position(mark);
10898
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   217
            return (nb == 0 || src.remaining() < nb)
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   218
                   ? CoderResult.UNDERFLOW : CoderResult.OVERFLOW;
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   219
        }
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
   220
90ce3da70b43 Initial load
duke
parents:
diff changeset
   221
        /**
         * Decodes UTF-8 bytes into chars by reading and writing the
         * buffers' backing arrays directly.
         *
         * A leading run of ASCII bytes is copied in a tight loop; after
         * that each sequence is classified by its lead byte as 1, 2, 3 or
         * 4 bytes long. Every exit goes through xflow, malformed or
         * malformedForLength with the current sp/dp.
         *
         * NOTE(review): malformed, malformedForLength and the
         * isMalformed / isNotContinuation predicates are defined outside
         * this view; their exact contracts should be confirmed there.
         *
         * @param src source buffer; src.array() must be accessible
         * @param dst destination buffer; dst.array() must be accessible
         * @return the result produced by xflow or by one of the
         *         malformed-input helpers
         */
        private CoderResult decodeArrayLoop(ByteBuffer src,
90ce3da70b43 Initial load
duke
parents:
diff changeset
   222
                                            CharBuffer dst)
90ce3da70b43 Initial load
duke
parents:
diff changeset
   223
        {
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   224
            // This method is optimized for ASCII input.
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
   225
            // sp/sl and dp/dl are absolute indices into the backing arrays.
            byte[] sa = src.array();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   226
            int sp = src.arrayOffset() + src.position();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   227
            int sl = src.arrayOffset() + src.limit();
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   228
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
   229
            char[] da = dst.array();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   230
            int dp = dst.arrayOffset() + dst.position();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   231
            int dl = dst.arrayOffset() + dst.limit();
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   232
            // Bound for the fast loop: the smaller of remaining input and remaining output.
            int dlASCII = dp + Math.min(sl - sp, dl - dp);
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
   233
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   234
            // ASCII only loop
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   235
            while (dp < dlASCII && sa[sp] >= 0)
5992
15c59951d875 6934271: Better handling of longer utf-8 sequences
martin
parents: 5991
diff changeset
   236
                da[dp++] = (char) sa[sp++];
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   237
            // General loop: dispatch on the lead byte.
            while (sp < sl) {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   238
                // Sign-extended byte: every non-ASCII lead byte is negative here.
                int b1 = sa[sp];
5992
15c59951d875 6934271: Better handling of longer utf-8 sequences
martin
parents: 5991
diff changeset
   239
                if (b1 >= 0) {
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   240
                    // 1 byte, 7 bits: 0xxxxxxx
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   241
                    if (dp >= dl)
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   242
                        return xflow(src, sp, sl, dst, dp, 1);
5992
15c59951d875 6934271: Better handling of longer utf-8 sequences
martin
parents: 5991
diff changeset
   243
                    da[dp++] = (char) b1;
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   244
                    sp++;
10898
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   245
                } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) {
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   246
                    // 2 bytes, 11 bits: 110xxxxx 10xxxxxx
10898
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   247
                    //                   [C2..DF] [80..BF]
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   248
                    if (sl - sp < 2 || dp >= dl)
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   249
                        return xflow(src, sp, sl, dst, dp, 2);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   250
                    int b2 = sa[sp + 1];
10898
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   251
                    // Now we check the first byte of 2-byte sequence as
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   252
                    //     if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0)
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   253
                    // no longer need to check b1 against c1 & c0 for
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   254
                    // malformed as we did in previous version
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   255
                    //   (b1 & 0x1e) == 0x0 || (b2 & 0xc0) != 0x80;
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   256
                    // only need to check the second byte b2.
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   257
                    if (isNotContinuation(b2))
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   258
                        return malformedForLength(src, sp, dst, dp, 1);
5992
15c59951d875 6934271: Better handling of longer utf-8 sequences
martin
parents: 5991
diff changeset
   259
                    // XOR with the equally shifted marker bytes cancels the 110/10
                    // prefix bits together with the sign extension of b1 and b2.
                    da[dp++] = (char) (((b1 << 6) ^ b2)
15c59951d875 6934271: Better handling of longer utf-8 sequences
martin
parents: 5991
diff changeset
   260
                                       ^
15c59951d875 6934271: Better handling of longer utf-8 sequences
martin
parents: 5991
diff changeset
   261
                                       (((byte) 0xC0 << 6) ^
15c59951d875 6934271: Better handling of longer utf-8 sequences
martin
parents: 5991
diff changeset
   262
                                        ((byte) 0x80 << 0)));
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   263
                    sp += 2;
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   264
                } else if ((b1 >> 4) == -2) {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   265
                    // 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
10898
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   266
                    int srcRemaining = sl - sp;
01a232fa2ddb 7096080: UTF-8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   267
                    if (srcRemaining < 3 || dp >= dl) {
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   268
                        // A second byte that is already present and rejected is
                        // reported as malformed rather than as under/overflow.
                        if (srcRemaining > 1 && isMalformed3_2(b1, sa[sp + 1]))
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   269
                            return malformedForLength(src, sp, dst, dp, 1);
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   270
                        return xflow(src, sp, sl, dst, dp, 3);
10898
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   271
                    }
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   272
                    int b2 = sa[sp + 1];
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   273
                    int b3 = sa[sp + 2];
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   274
                    if (isMalformed3(b1, b2, b3))
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   275
                        return malformed(src, sp, dst, dp, 3);
10898
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   276
                    char c = (char)
5992
15c59951d875 6934271: Better handling of longer utf-8 sequences
martin
parents: 5991
diff changeset
   277
                        ((b1 << 12) ^
15c59951d875 6934271: Better handling of longer utf-8 sequences
martin
parents: 5991
diff changeset
   278
                         (b2 <<  6) ^
15c59951d875 6934271: Better handling of longer utf-8 sequences
martin
parents: 5991
diff changeset
   279
                         (b3 ^
15c59951d875 6934271: Better handling of longer utf-8 sequences
martin
parents: 5991
diff changeset
   280
                          (((byte) 0xE0 << 12) ^
15c59951d875 6934271: Better handling of longer utf-8 sequences
martin
parents: 5991
diff changeset
   281
                           ((byte) 0x80 <<  6) ^
15c59951d875 6934271: Better handling of longer utf-8 sequences
martin
parents: 5991
diff changeset
   282
                           ((byte) 0x80 <<  0))));
10898
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   283
                    // A 3-byte sequence that decodes to a surrogate is rejected.
                    if (Character.isSurrogate(c))
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   284
                        return malformedForLength(src, sp, dst, dp, 3);
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   285
                    da[dp++] = c;
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   286
                    sp += 3;
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   287
                } else if ((b1 >> 3) == -2) {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   288
                    // 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
10898
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   289
                    int srcRemaining = sl - sp;
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   290
                    // Two output slots are needed: the result is written as a surrogate pair.
                    if (srcRemaining < 4 || dl - dp < 2) {
23880
7d6b060131d3 8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents: 23010
diff changeset
   291
                        // Make b1 unsigned (0..255) for the range check; this path always returns.
                        b1 &= 0xff;
7d6b060131d3 8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents: 23010
diff changeset
   292
                        if (b1 > 0xf4 ||
7d6b060131d3 8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents: 23010
diff changeset
   293
                            srcRemaining > 1 && isMalformed4_2(b1, sa[sp + 1] & 0xff))
10898
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   294
                            return malformedForLength(src, sp, dst, dp, 1);
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   295
                        if (srcRemaining > 2 && isMalformed4_3(sa[sp + 2]))
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   296
                            return malformedForLength(src, sp, dst, dp, 2);
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   297
                        return xflow(src, sp, sl, dst, dp, 4);
10898
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   298
                    }
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   299
                    int b2 = sa[sp + 1];
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   300
                    int b3 = sa[sp + 2];
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   301
                    int b4 = sa[sp + 3];
5992
15c59951d875 6934271: Better handling of longer utf-8 sequences
martin
parents: 5991
diff changeset
   302
                    int uc = ((b1 << 18) ^
15c59951d875 6934271: Better handling of longer utf-8 sequences
martin
parents: 5991
diff changeset
   303
                              (b2 << 12) ^
15c59951d875 6934271: Better handling of longer utf-8 sequences
martin
parents: 5991
diff changeset
   304
                              (b3 <<  6) ^
15c59951d875 6934271: Better handling of longer utf-8 sequences
martin
parents: 5991
diff changeset
   305
                              (b4 ^
15c59951d875 6934271: Better handling of longer utf-8 sequences
martin
parents: 5991
diff changeset
   306
                               (((byte) 0xF0 << 18) ^
15c59951d875 6934271: Better handling of longer utf-8 sequences
martin
parents: 5991
diff changeset
   307
                                ((byte) 0x80 << 12) ^
15c59951d875 6934271: Better handling of longer utf-8 sequences
martin
parents: 5991
diff changeset
   308
                                ((byte) 0x80 <<  6) ^
15c59951d875 6934271: Better handling of longer utf-8 sequences
martin
parents: 5991
diff changeset
   309
                                ((byte) 0x80 <<  0))));
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   310
                    if (isMalformed4(b2, b3, b4) ||
5986
04eb44085c00 6934265: Add public method Character.isBmpCodePoint
martin
parents: 5506
diff changeset
   311
                        // shortest form check
04eb44085c00 6934265: Add public method Character.isBmpCodePoint
martin
parents: 5506
diff changeset
   312
                        !Character.isSupplementaryCodePoint(uc)) {
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   313
                        return malformed(src, sp, dst, dp, 4);
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
   314
                    }
5991
288afdbbca28 6933322: Add methods highSurrogate(), lowSurrogate() to class Character
martin
parents: 5986
diff changeset
   315
                    da[dp++] = Character.highSurrogate(uc);
288afdbbca28 6933322: Add methods highSurrogate(), lowSurrogate() to class Character
martin
parents: 5986
diff changeset
   316
                    da[dp++] = Character.lowSurrogate(uc);
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   317
                    sp += 4;
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   318
                } else
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   319
                    // Lead byte matches none of the patterns tested above.
                    return malformed(src, sp, dst, dp, 1);
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
   320
            }
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   321
            // All input consumed: nb == 0 makes xflow return UNDERFLOW.
            return xflow(src, sp, sl, dst, dp, 0);
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
   322
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   323
90ce3da70b43 Initial load
duke
parents:
diff changeset
   324
        /**
         * Decodes UTF-8 bytes into chars using only the buffers' relative
         * get/put operations (no backing-array access).
         *
         * mark holds the position of the first byte of the sequence being
         * decoded and is advanced only after that sequence's chars have
         * been written. The 3-argument xflow sets src's position back to
         * mark before returning.
         *
         * NOTE(review): malformed, malformedForLength and the
         * isMalformed / isNotContinuation predicates are defined outside
         * this view; presumably the malformed helpers also reposition src
         * to mark, but confirm there.
         *
         * @param src source buffer, read from its current position
         * @param dst destination buffer, written at its current position
         * @return the result produced by xflow or by one of the
         *         malformed-input helpers
         */
        private CoderResult decodeBufferLoop(ByteBuffer src,
90ce3da70b43 Initial load
duke
parents:
diff changeset
   325
                                             CharBuffer dst)
90ce3da70b43 Initial load
duke
parents:
diff changeset
   326
        {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   327
            int mark = src.position();
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   328
            int limit = src.limit();
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   329
            while (mark < limit) {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   330
                // Relative read: moves src's position one past mark; b1 is sign-extended.
                int b1 = src.get();
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   331
                if (b1 >= 0) {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   332
                    // 1 byte, 7 bits: 0xxxxxxx
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   333
                    if (dst.remaining() < 1)
5992
15c59951d875 6934271: Better handling of longer utf-8 sequences
martin
parents: 5991
diff changeset
   334
                        return xflow(src, mark, 1); // overflow
15c59951d875 6934271: Better handling of longer utf-8 sequences
martin
parents: 5991
diff changeset
   335
                    dst.put((char) b1);
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   336
                    mark++;
10898
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   337
                } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) {
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   338
                    // 2 bytes, 11 bits: 110xxxxx 10xxxxxx
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   339
                    if (limit - mark < 2|| dst.remaining() < 1)
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   340
                        return xflow(src, mark, 2);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   341
                    int b2 = src.get();
10898
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   342
                    if (isNotContinuation(b2))
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   343
                        return malformedForLength(src, mark, 1);
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   344
                     // XOR with the equally shifted marker bytes cancels the 110/10
                     // prefix bits together with the sign extension of b1 and b2.
                     dst.put((char) (((b1 << 6) ^ b2)
5992
15c59951d875 6934271: Better handling of longer utf-8 sequences
martin
parents: 5991
diff changeset
   345
                                    ^
15c59951d875 6934271: Better handling of longer utf-8 sequences
martin
parents: 5991
diff changeset
   346
                                    (((byte) 0xC0 << 6) ^
15c59951d875 6934271: Better handling of longer utf-8 sequences
martin
parents: 5991
diff changeset
   347
                                     ((byte) 0x80 << 0))));
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   348
                    mark += 2;
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   349
                } else if ((b1 >> 4) == -2) {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   350
                    // 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
10898
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   351
                    int srcRemaining = limit - mark;
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   352
                    if (srcRemaining < 3 || dst.remaining() < 1) {
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   353
                        // A second byte that is already present and rejected is
                        // reported as malformed rather than as under/overflow.
                        if (srcRemaining > 1 && isMalformed3_2(b1, src.get()))
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   354
                            return malformedForLength(src, mark, 1);
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   355
                        return xflow(src, mark, 3);
10898
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   356
                    }
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   357
                    int b2 = src.get();
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   358
                    int b3 = src.get();
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   359
                    if (isMalformed3(b1, b2, b3))
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   360
                        return malformed(src, mark, 3);
10898
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   361
                    char c = (char)
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   362
                        ((b1 << 12) ^
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   363
                         (b2 <<  6) ^
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   364
                         (b3 ^
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   365
                          (((byte) 0xE0 << 12) ^
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   366
                           ((byte) 0x80 <<  6) ^
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   367
                           ((byte) 0x80 <<  0))));
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   368
                    // A 3-byte sequence that decodes to a surrogate is rejected.
                    if (Character.isSurrogate(c))
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   369
                        return malformedForLength(src, mark, 3);
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   370
                    dst.put(c);
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   371
                    mark += 3;
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   372
                } else if ((b1 >> 3) == -2) {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   373
                    // 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
10898
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   374
                    int srcRemaining = limit - mark;
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   375
                    // Two output slots are needed: the result is written as a surrogate pair.
                    if (srcRemaining < 4 || dst.remaining() < 2) {
23880
7d6b060131d3 8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents: 23010
diff changeset
   376
                        // Make b1 unsigned (0..255) for the range check; this path always returns.
                        b1 &= 0xff;
7d6b060131d3 8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents: 23010
diff changeset
   377
                        if (b1 > 0xf4 ||
7d6b060131d3 8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents: 23010
diff changeset
   378
                            srcRemaining > 1 && isMalformed4_2(b1, src.get() & 0xff))
10898
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   379
                            return malformedForLength(src, mark, 1);
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   380
                        if (srcRemaining > 2 && isMalformed4_3(src.get()))
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   381
                            return malformedForLength(src, mark, 2);
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   382
                        return xflow(src, mark, 4);
10898
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   383
                    }
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   384
                    int b2 = src.get();
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   385
                    int b3 = src.get();
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   386
                    int b4 = src.get();
5992
15c59951d875 6934271: Better handling of longer utf-8 sequences
martin
parents: 5991
diff changeset
   387
                    int uc = ((b1 << 18) ^
15c59951d875 6934271: Better handling of longer utf-8 sequences
martin
parents: 5991
diff changeset
   388
                              (b2 << 12) ^
15c59951d875 6934271: Better handling of longer utf-8 sequences
martin
parents: 5991
diff changeset
   389
                              (b3 <<  6) ^
15c59951d875 6934271: Better handling of longer utf-8 sequences
martin
parents: 5991
diff changeset
   390
                              (b4 ^
15c59951d875 6934271: Better handling of longer utf-8 sequences
martin
parents: 5991
diff changeset
   391
                               (((byte) 0xF0 << 18) ^
15c59951d875 6934271: Better handling of longer utf-8 sequences
martin
parents: 5991
diff changeset
   392
                                ((byte) 0x80 << 12) ^
15c59951d875 6934271: Better handling of longer utf-8 sequences
martin
parents: 5991
diff changeset
   393
                                ((byte) 0x80 <<  6) ^
15c59951d875 6934271: Better handling of longer utf-8 sequences
martin
parents: 5991
diff changeset
   394
                                ((byte) 0x80 <<  0))));
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   395
                    if (isMalformed4(b2, b3, b4) ||
5986
04eb44085c00 6934265: Add public method Character.isBmpCodePoint
martin
parents: 5506
diff changeset
   396
                        // shortest form check
04eb44085c00 6934265: Add public method Character.isBmpCodePoint
martin
parents: 5506
diff changeset
   397
                        !Character.isSupplementaryCodePoint(uc)) {
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   398
                        return malformed(src, mark, 4);
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
   399
                    }
5991
288afdbbca28 6933322: Add methods highSurrogate(), lowSurrogate() to class Character
martin
parents: 5986
diff changeset
   400
                    dst.put(Character.highSurrogate(uc));
288afdbbca28 6933322: Add methods highSurrogate(), lowSurrogate() to class Character
martin
parents: 5986
diff changeset
   401
                    dst.put(Character.lowSurrogate(uc));
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   402
                    mark += 4;
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   403
                } else {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   404
                    // Lead byte matches none of the patterns tested above.
                    return malformed(src, mark, 1);
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
   405
                }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   406
            }
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   407
            // All input consumed: nb == 0 makes xflow return UNDERFLOW.
            return xflow(src, mark, 0);
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
   408
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   409
90ce3da70b43 Initial load
duke
parents:
diff changeset
   410
        /**
         * Decoding entry point: picks the array-based loop when both
         * buffers expose a backing array, otherwise the loop that uses
         * only relative buffer operations.
         *
         * @param src bytes to decode
         * @param dst receives the decoded chars
         * @return the result of whichever loop was run
         */
        protected CoderResult decodeLoop(ByteBuffer src,
90ce3da70b43 Initial load
duke
parents:
diff changeset
   411
                                         CharBuffer dst)
90ce3da70b43 Initial load
duke
parents:
diff changeset
   412
        {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   413
            if (src.hasArray() && dst.hasArray())
90ce3da70b43 Initial load
duke
parents:
diff changeset
   414
                return decodeArrayLoop(src, dst);
90ce3da70b43 Initial load
duke
parents:
diff changeset
   415
            else
90ce3da70b43 Initial load
duke
parents:
diff changeset
   416
                return decodeBufferLoop(src, dst);
90ce3da70b43 Initial load
duke
parents:
diff changeset
   417
        }
9547
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 7668
diff changeset
   418
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 7668
diff changeset
   419
        /**
         * Returns a ByteBuffer positioned at sp, creating one over ba only
         * when the caller has none yet.
         *
         * @param bb existing buffer to reuse, or null to wrap ba
         * @param ba byte array to wrap when bb is null
         * @param sp position to set on the returned buffer
         * @return bb if it was non-null, otherwise the new wrapper of ba
         */
        private static ByteBuffer getByteBuffer(ByteBuffer bb, byte[] ba, int sp)
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 7668
diff changeset
   420
        {
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 7668
diff changeset
   421
            if (bb == null)
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 7668
diff changeset
   422
                bb = ByteBuffer.wrap(ba);
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 7668
diff changeset
   423
            bb.position(sp);
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 7668
diff changeset
   424
            return bb;
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 7668
diff changeset
   425
        }
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
   426
    }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   427
48262
daf3b49f4839 8184947: ZipCoder performance improvements
sherman
parents: 47216
diff changeset
   428
    private static final class Encoder extends CharsetEncoder {
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
   429
90ce3da70b43 Initial load
duke
parents:
diff changeset
   430
        private Encoder(Charset cs) {
7282
b1f801518f89 6957230: CharsetEncoder.maxBytesPerChar() reports 4 for UTF-8; should be 3
sherman
parents: 5992
diff changeset
   431
            super(cs, 1.1f, 3.0f);
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
   432
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   433
90ce3da70b43 Initial load
duke
parents:
diff changeset
   434
        public boolean canEncode(char c) {
3714
6a4eb8f53f91 6860431: Character.isSurrogate(char ch)
martin
parents: 1092
diff changeset
   435
            return !Character.isSurrogate(c);
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
   436
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   437
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   438
        public boolean isLegalReplacement(byte[] repl) {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   439
            return ((repl.length == 1 && repl[0] >= 0) ||
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   440
                    super.isLegalReplacement(repl));
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   441
        }
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
   442
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   443
        private static CoderResult overflow(CharBuffer src, int sp,
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   444
                                            ByteBuffer dst, int dp) {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   445
            updatePositions(src, sp, dst, dp);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   446
            return CoderResult.OVERFLOW;
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   447
        }
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   448
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   449
        private static CoderResult overflow(CharBuffer src, int mark) {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   450
            src.position(mark);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   451
            return CoderResult.OVERFLOW;
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   452
        }
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   453
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   454
        private Surrogate.Parser sgp;
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
   455
        private CoderResult encodeArrayLoop(CharBuffer src,
90ce3da70b43 Initial load
duke
parents:
diff changeset
   456
                                            ByteBuffer dst)
90ce3da70b43 Initial load
duke
parents:
diff changeset
   457
        {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   458
            char[] sa = src.array();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   459
            int sp = src.arrayOffset() + src.position();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   460
            int sl = src.arrayOffset() + src.limit();
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   461
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
   462
            byte[] da = dst.array();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   463
            int dp = dst.arrayOffset() + dst.position();
90ce3da70b43 Initial load
duke
parents:
diff changeset
   464
            int dl = dst.arrayOffset() + dst.limit();
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   465
            int dlASCII = dp + Math.min(sl - sp, dl - dp);
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
   466
5992
15c59951d875 6934271: Better handling of longer utf-8 sequences
martin
parents: 5991
diff changeset
   467
            // ASCII only loop
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   468
            while (dp < dlASCII && sa[sp] < '\u0080')
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   469
                da[dp++] = (byte) sa[sp++];
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   470
            while (sp < sl) {
3714
6a4eb8f53f91 6860431: Character.isSurrogate(char ch)
martin
parents: 1092
diff changeset
   471
                char c = sa[sp];
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   472
                if (c < 0x80) {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   473
                    // Have at most seven bits
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   474
                    if (dp >= dl)
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   475
                        return overflow(src, sp, dst, dp);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   476
                    da[dp++] = (byte)c;
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   477
                } else if (c < 0x800) {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   478
                    // 2 bytes, 11 bits
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   479
                    if (dl - dp < 2)
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   480
                        return overflow(src, sp, dst, dp);
5992
15c59951d875 6934271: Better handling of longer utf-8 sequences
martin
parents: 5991
diff changeset
   481
                    da[dp++] = (byte)(0xc0 | (c >> 6));
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   482
                    da[dp++] = (byte)(0x80 | (c & 0x3f));
3714
6a4eb8f53f91 6860431: Character.isSurrogate(char ch)
martin
parents: 1092
diff changeset
   483
                } else if (Character.isSurrogate(c)) {
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   484
                    // Have a surrogate pair
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   485
                    if (sgp == null)
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   486
                        sgp = new Surrogate.Parser();
3714
6a4eb8f53f91 6860431: Character.isSurrogate(char ch)
martin
parents: 1092
diff changeset
   487
                    int uc = sgp.parse(c, sa, sp, sl);
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   488
                    if (uc < 0) {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   489
                        updatePositions(src, sp, dst, dp);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   490
                        return sgp.error();
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
   491
                    }
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   492
                    if (dl - dp < 4)
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   493
                        return overflow(src, sp, dst, dp);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   494
                    da[dp++] = (byte)(0xf0 | ((uc >> 18)));
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   495
                    da[dp++] = (byte)(0x80 | ((uc >> 12) & 0x3f));
5992
15c59951d875 6934271: Better handling of longer utf-8 sequences
martin
parents: 5991
diff changeset
   496
                    da[dp++] = (byte)(0x80 | ((uc >>  6) & 0x3f));
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   497
                    da[dp++] = (byte)(0x80 | (uc & 0x3f));
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   498
                    sp++;  // 2 chars
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   499
                } else {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   500
                    // 3 bytes, 16 bits
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   501
                    if (dl - dp < 3)
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   502
                        return overflow(src, sp, dst, dp);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   503
                    da[dp++] = (byte)(0xe0 | ((c >> 12)));
5992
15c59951d875 6934271: Better handling of longer utf-8 sequences
martin
parents: 5991
diff changeset
   504
                    da[dp++] = (byte)(0x80 | ((c >>  6) & 0x3f));
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   505
                    da[dp++] = (byte)(0x80 | (c & 0x3f));
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
   506
                }
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   507
                sp++;
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
   508
            }
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   509
            updatePositions(src, sp, dst, dp);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   510
            return CoderResult.UNDERFLOW;
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
   511
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   512
90ce3da70b43 Initial load
duke
parents:
diff changeset
   513
        private CoderResult encodeBufferLoop(CharBuffer src,
90ce3da70b43 Initial load
duke
parents:
diff changeset
   514
                                             ByteBuffer dst)
90ce3da70b43 Initial load
duke
parents:
diff changeset
   515
        {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   516
            int mark = src.position();
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   517
            while (src.hasRemaining()) {
3714
6a4eb8f53f91 6860431: Character.isSurrogate(char ch)
martin
parents: 1092
diff changeset
   518
                char c = src.get();
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   519
                if (c < 0x80) {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   520
                    // Have at most seven bits
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   521
                    if (!dst.hasRemaining())
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   522
                        return overflow(src, mark);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   523
                    dst.put((byte)c);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   524
                } else if (c < 0x800) {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   525
                    // 2 bytes, 11 bits
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   526
                    if (dst.remaining() < 2)
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   527
                        return overflow(src, mark);
5992
15c59951d875 6934271: Better handling of longer utf-8 sequences
martin
parents: 5991
diff changeset
   528
                    dst.put((byte)(0xc0 | (c >> 6)));
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   529
                    dst.put((byte)(0x80 | (c & 0x3f)));
3714
6a4eb8f53f91 6860431: Character.isSurrogate(char ch)
martin
parents: 1092
diff changeset
   530
                } else if (Character.isSurrogate(c)) {
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   531
                    // Have a surrogate pair
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   532
                    if (sgp == null)
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   533
                        sgp = new Surrogate.Parser();
3714
6a4eb8f53f91 6860431: Character.isSurrogate(char ch)
martin
parents: 1092
diff changeset
   534
                    int uc = sgp.parse(c, src);
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   535
                    if (uc < 0) {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   536
                        src.position(mark);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   537
                        return sgp.error();
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
   538
                    }
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   539
                    if (dst.remaining() < 4)
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   540
                        return overflow(src, mark);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   541
                    dst.put((byte)(0xf0 | ((uc >> 18))));
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   542
                    dst.put((byte)(0x80 | ((uc >> 12) & 0x3f)));
5992
15c59951d875 6934271: Better handling of longer utf-8 sequences
martin
parents: 5991
diff changeset
   543
                    dst.put((byte)(0x80 | ((uc >>  6) & 0x3f)));
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   544
                    dst.put((byte)(0x80 | (uc & 0x3f)));
5992
15c59951d875 6934271: Better handling of longer utf-8 sequences
martin
parents: 5991
diff changeset
   545
                    mark++;  // 2 chars
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   546
                } else {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   547
                    // 3 bytes, 16 bits
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   548
                    if (dst.remaining() < 3)
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   549
                        return overflow(src, mark);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   550
                    dst.put((byte)(0xe0 | ((c >> 12))));
5992
15c59951d875 6934271: Better handling of longer utf-8 sequences
martin
parents: 5991
diff changeset
   551
                    dst.put((byte)(0x80 | ((c >>  6) & 0x3f)));
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   552
                    dst.put((byte)(0x80 | (c & 0x3f)));
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
   553
                }
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   554
                mark++;
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
   555
            }
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   556
            src.position(mark);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents: 715
diff changeset
   557
            return CoderResult.UNDERFLOW;
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
   558
        }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   559
90ce3da70b43 Initial load
duke
parents:
diff changeset
   560
        protected final CoderResult encodeLoop(CharBuffer src,
90ce3da70b43 Initial load
duke
parents:
diff changeset
   561
                                               ByteBuffer dst)
90ce3da70b43 Initial load
duke
parents:
diff changeset
   562
        {
90ce3da70b43 Initial load
duke
parents:
diff changeset
   563
            if (src.hasArray() && dst.hasArray())
90ce3da70b43 Initial load
duke
parents:
diff changeset
   564
                return encodeArrayLoop(src, dst);
90ce3da70b43 Initial load
duke
parents:
diff changeset
   565
            else
90ce3da70b43 Initial load
duke
parents:
diff changeset
   566
                return encodeBufferLoop(src, dst);
90ce3da70b43 Initial load
duke
parents:
diff changeset
   567
        }
9547
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 7668
diff changeset
   568
2
90ce3da70b43 Initial load
duke
parents:
diff changeset
   569
    }
90ce3da70b43 Initial load
duke
parents:
diff changeset
   570
}