test/jdk/sun/nio/cs/TestUTF8.java
author jiangli
Wed, 27 Sep 2017 17:55:20 -0400
changeset 47548 664b9d44db74
parent 47216 71c04702a3d5
permissions -rw-r--r--
8068314: "Java fields that are currently set during shared space dumping" comment is incorrect Summary: CDS dump time should also initialize preallocated out_of_memory error messages. Reviewed-by: iklam, hseigel
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
     1
/*
14342
8435a30053c1 7197491: update copyright year to match last edit in jdk8 jdk repository
alanb
parents: 10898
diff changeset
     2
 * Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
     3
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
     4
 *
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
     5
 * This code is free software; you can redistribute it and/or modify it
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
     6
 * under the terms of the GNU General Public License version 2 only, as
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
     7
 * published by the Free Software Foundation.
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
     8
 *
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
     9
 * This code is distributed in the hope that it will be useful, but WITHOUT
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    10
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    11
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    12
 * version 2 for more details (a copy is included in the LICENSE file that
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    13
 * accompanied this code).
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    14
 *
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    15
 * You should have received a copy of the GNU General Public License version
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    16
 * 2 along with this work; if not, write to the Free Software Foundation,
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    17
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    18
 *
5506
202f599c92aa 6943119: Rebrand source copyright notices
ohair
parents: 1091
diff changeset
    19
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
202f599c92aa 6943119: Rebrand source copyright notices
ohair
parents: 1091
diff changeset
    20
 * or visit www.oracle.com if you need additional information or have any
202f599c92aa 6943119: Rebrand source copyright notices
ohair
parents: 1091
diff changeset
    21
 * questions.
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    22
 */
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    23
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    24
/*
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    25
 * @test
23880
7d6b060131d3 8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents: 14342
diff changeset
    26
 * @bug 4486841 7040220 7096080 8039751
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    27
 * @summary Test UTF-8 charset
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    28
 */
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    29
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    30
import java.nio.charset.*;
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    31
import java.nio.*;
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    32
import java.util.*;
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    33
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    34
public class TestUTF8 {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    35
    static char[] decode(byte[] bb, String csn, boolean testDirect)
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    36
        throws Exception {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    37
        CharsetDecoder dec = Charset.forName(csn).newDecoder();
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    38
        ByteBuffer bbf;
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    39
        CharBuffer cbf;
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    40
        if (testDirect) {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    41
            bbf = ByteBuffer.allocateDirect(bb.length);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    42
            cbf = ByteBuffer.allocateDirect(bb.length*2).asCharBuffer();
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    43
            bbf.put(bb).flip();
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    44
        } else {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    45
            bbf = ByteBuffer.wrap(bb);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    46
            cbf = CharBuffer.allocate(bb.length);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    47
        }
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    48
        CoderResult cr = dec.decode(bbf, cbf, true);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    49
        if (cr != CoderResult.UNDERFLOW)
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    50
            throw new RuntimeException("Decoding err: " + csn);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    51
        char[] cc = new char[cbf.position()];
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    52
        cbf.flip(); cbf.get(cc);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    53
        return cc;
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    54
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    55
    }
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    56
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    57
    static CoderResult decodeCR(byte[] bb, String csn, boolean testDirect)
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    58
        throws Exception {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    59
        CharsetDecoder dec = Charset.forName(csn).newDecoder();
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    60
        ByteBuffer bbf;
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    61
        CharBuffer cbf;
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    62
        if (testDirect) {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    63
            bbf = ByteBuffer.allocateDirect(bb.length);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    64
            cbf = ByteBuffer.allocateDirect(bb.length*2).asCharBuffer();
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    65
            bbf.put(bb).flip();
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    66
        } else {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    67
            bbf = ByteBuffer.wrap(bb);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    68
            cbf = CharBuffer.allocate(bb.length);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    69
        }
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    70
        return dec.decode(bbf, cbf, true);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    71
    }
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    72
9547
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 5506
diff changeset
    73
    // copy/paste of the StringCoding.decode()
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 5506
diff changeset
    74
    static char[] decode(Charset cs, byte[] ba, int off, int len) {
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 5506
diff changeset
    75
        CharsetDecoder cd = cs.newDecoder();
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 5506
diff changeset
    76
        int en = (int)(len * cd.maxCharsPerByte());
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 5506
diff changeset
    77
        char[] ca = new char[en];
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 5506
diff changeset
    78
        if (len == 0)
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 5506
diff changeset
    79
            return ca;
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 5506
diff changeset
    80
        cd.onMalformedInput(CodingErrorAction.REPLACE)
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 5506
diff changeset
    81
          .onUnmappableCharacter(CodingErrorAction.REPLACE)
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 5506
diff changeset
    82
          .reset();
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 5506
diff changeset
    83
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 5506
diff changeset
    84
        ByteBuffer bb = ByteBuffer.wrap(ba, off, len);
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 5506
diff changeset
    85
        CharBuffer cb = CharBuffer.wrap(ca);
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 5506
diff changeset
    86
        try {
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 5506
diff changeset
    87
            CoderResult cr = cd.decode(bb, cb, true);
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 5506
diff changeset
    88
            if (!cr.isUnderflow())
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 5506
diff changeset
    89
                cr.throwException();
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 5506
diff changeset
    90
            cr = cd.flush(cb);
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 5506
diff changeset
    91
            if (!cr.isUnderflow())
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 5506
diff changeset
    92
                cr.throwException();
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 5506
diff changeset
    93
        } catch (CharacterCodingException x) {
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 5506
diff changeset
    94
            throw new Error(x);
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 5506
diff changeset
    95
        }
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 5506
diff changeset
    96
        return Arrays.copyOf(ca, cb.position());
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 5506
diff changeset
    97
    }
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 5506
diff changeset
    98
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
    99
    static byte[] encode(char[] cc, String csn, boolean testDirect)
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   100
        throws Exception {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   101
        ByteBuffer bbf;
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   102
        CharBuffer cbf;
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   103
        CharsetEncoder enc = Charset.forName(csn).newEncoder();
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   104
        if (testDirect) {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   105
            bbf = ByteBuffer.allocateDirect(cc.length * 4);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   106
            cbf = ByteBuffer.allocateDirect(cc.length * 2).asCharBuffer();
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   107
            cbf.put(cc).flip();
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   108
        } else {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   109
            bbf = ByteBuffer.allocate(cc.length * 4);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   110
            cbf = CharBuffer.wrap(cc);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   111
        }
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   112
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   113
        CoderResult cr = enc.encode(cbf, bbf, true);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   114
        if (cr != CoderResult.UNDERFLOW)
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   115
            throw new RuntimeException("Encoding err: " + csn);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   116
        byte[] bb = new byte[bbf.position()];
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   117
        bbf.flip(); bbf.get(bb);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   118
        return bb;
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   119
    }
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   120
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   121
    static CoderResult encodeCR(char[] cc, String csn, boolean testDirect)
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   122
        throws Exception {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   123
        ByteBuffer bbf;
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   124
        CharBuffer cbf;
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   125
        CharsetEncoder enc = Charset.forName(csn).newEncoder();
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   126
        if (testDirect) {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   127
            bbf = ByteBuffer.allocateDirect(cc.length * 4);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   128
            cbf = ByteBuffer.allocateDirect(cc.length * 2).asCharBuffer();
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   129
            cbf.put(cc).flip();
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   130
        } else {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   131
            bbf = ByteBuffer.allocate(cc.length * 4);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   132
            cbf = CharBuffer.wrap(cc);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   133
        }
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   134
        return enc.encode(cbf, bbf, true);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   135
    }
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   136
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   137
    static char[] getUTFChars() {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   138
        char[] cc = new char[0x10000 - 0xe000 + 0xd800 + //bmp
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   139
                             (0x110000 - 0x10000) * 2];    //supp
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   140
        int pos = 0;
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   141
        int i = 0;
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   142
        for (i = 0; i < 0xd800; i++)
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   143
            cc[pos++] = (char)i;
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   144
        for (i = 0xe000; i < 0x10000; i++)
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   145
            cc[pos++] = (char)i;
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   146
        for (i = 0x10000; i < 0x110000; i++) {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   147
            pos += Character.toChars(i, cc, pos);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   148
        }
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   149
        return cc;
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   150
    }
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   151
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   152
    static int to3ByteUTF8(char c, byte[] bb, int pos) {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   153
        bb[pos++] = (byte)(0xe0 | ((c >> 12)));
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   154
        bb[pos++] = (byte)(0x80 | ((c >> 06) & 0x3f));
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   155
        bb[pos++] = (byte)(0x80 | ((c >> 00) & 0x3f));
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   156
        return 3;
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   157
    }
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   158
10898
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   159
    static int to4ByteUTF8(int uc, byte[] bb, int pos) {
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   160
        bb[pos++] = (byte)(0xf0 | ((uc >> 18)));
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   161
        bb[pos++] = (byte)(0x80 | ((uc >> 12) & 0x3f));
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   162
        bb[pos++] = (byte)(0x80 | ((uc >>  6) & 0x3f));
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   163
        bb[pos++] = (byte)(0x80 | (uc & 0x3f));
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   164
        return 4;
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   165
    }
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   166
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   167
    static void checkRoundtrip(String csn) throws Exception {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   168
        System.out.printf("    Check roundtrip <%s>...", csn);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   169
        char[] cc = getUTFChars();
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   170
        byte[] bb = encode(cc, csn, false);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   171
        char[] ccO = decode(bb, csn, false);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   172
10898
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   173
        if (!Arrays.equals(cc, ccO))
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   174
            System.out.printf("    non-direct failed");
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   175
        bb = encode(cc, csn, true);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   176
        ccO = decode(bb, csn, true);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   177
        if (!Arrays.equals(cc, ccO)) {
9547
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 5506
diff changeset
   178
            System.out.print("    (direct) failed");
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 5506
diff changeset
   179
        }
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 5506
diff changeset
   180
        // String.getBytes()/toCharArray() goes to ArrayDe/Encoder path
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 5506
diff changeset
   181
        if (!Arrays.equals(bb, new String(cc).getBytes(csn))) {
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 5506
diff changeset
   182
            System.out.printf("    String.getBytes() failed");
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 5506
diff changeset
   183
        }
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 5506
diff changeset
   184
        if (!Arrays.equals(cc, new String(bb, csn).toCharArray())) {
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 5506
diff changeset
   185
            System.out.printf("    String.toCharArray() failed");
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   186
        }
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   187
        System.out.println();
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   188
    }
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   189
10898
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   190
    static void check4ByteSurrs(String csn) throws Exception {
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   191
        System.out.printf("    Check 4-byte Surrogates <%s>...%n", csn);
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   192
        byte[] bb = new byte[(0x110000 - 0x10000) * 4];
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   193
        char[] cc = new char[(0x110000 - 0x10000) * 2];
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   194
        int bpos = 0;
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   195
        int cpos = 0;
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   196
        for (int i = 0x10000; i < 0x110000; i++) {
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   197
            Character.toChars(i, cc, cpos);
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   198
            bpos += to4ByteUTF8(i, bb, bpos);
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   199
            cpos += 2;
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   200
        }
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   201
        checkSurrs(csn, bb, cc);
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   202
    }
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   203
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   204
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   205
    static void checkSurrs(String csn, byte[] bb, char[] cc)
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   206
        throws Exception
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   207
    {
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   208
        char[] ccO = decode(bb, csn, false);
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   209
        if (!Arrays.equals(cc, ccO)) {
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   210
            System.out.printf("    decoding failed%n");
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   211
        }
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   212
        ccO = decode(bb, csn, true);
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   213
        if (!Arrays.equals(cc, ccO)) {
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   214
            System.out.printf("    decoding(direct) failed%n");
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   215
        }
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   216
        if (!Arrays.equals(cc, new String(bb, csn).toCharArray())) {
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   217
            System.out.printf("    String.toCharArray() failed");
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   218
        }
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   219
        if (!Arrays.equals(bb, new String(cc).getBytes(csn))) {
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   220
            System.out.printf("    String.getBytes() failed");
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   221
        }
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   222
    }
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   223
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   224
    static void check6ByteSurrs(String csn) throws Exception {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   225
        System.out.printf("    Check 6-byte Surrogates <%s>...%n", csn);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   226
        byte[] bb = new byte[(0x110000 - 0x10000) * 6];
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   227
        char[] cc = new char[(0x110000 - 0x10000) * 2];
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   228
        int bpos = 0;
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   229
        int cpos = 0;
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   230
        for (int i = 0x10000; i < 0x110000; i++) {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   231
            Character.toChars(i, cc, cpos);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   232
            bpos += to3ByteUTF8(cc[cpos], bb, bpos);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   233
            bpos += to3ByteUTF8(cc[cpos + 1], bb, bpos);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   234
            cpos += 2;
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   235
        }
10898
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   236
        checkSurrs(csn, bb, cc);
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   237
    }
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   238
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   239
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   240
    static void compare(String csn1, String csn2) throws Exception {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   241
        System.out.printf("    Diff <%s> <%s>...%n", csn1, csn2);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   242
        char[] cc = getUTFChars();
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   243
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   244
        byte[] bb1 = encode(cc, csn1, false);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   245
        byte[] bb2 = encode(cc, csn2, false);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   246
        if (!Arrays.equals(bb1, bb2))
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   247
            System.out.printf("        encoding failed%n");
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   248
        char[] cc1 = decode(bb1, csn1, false);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   249
        char[] cc2 = decode(bb1, csn2, false);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   250
        if (!Arrays.equals(cc1, cc2)) {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   251
            System.out.printf("        decoding failed%n");
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   252
        }
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   253
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   254
        bb1 = encode(cc, csn1, true);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   255
        bb2 = encode(cc, csn2, true);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   256
        if (!Arrays.equals(bb1, bb2))
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   257
            System.out.printf("        encoding (direct) failed%n");
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   258
        cc1 = decode(bb1, csn1, true);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   259
        cc2 = decode(bb1, csn2, true);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   260
        if (!Arrays.equals(cc1, cc2)) {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   261
            System.out.printf("        decoding (direct) failed%n");
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   262
        }
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   263
    }
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   264
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   265
    // The first byte is the length of malformed bytes
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   266
    static byte[][] malformed = {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   267
        // One-byte sequences:
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   268
        {1, (byte)0xFF },
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   269
        {1, (byte)0xC0 },
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   270
        {1, (byte)0x80 },
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   271
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   272
        {1, (byte)0xFF, (byte)0xFF}, // all ones
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   273
        {1, (byte)0xA0, (byte)0x80}, // 101x first byte first nibble
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   274
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   275
        // Two-byte sequences:
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   276
        {1, (byte)0xC0, (byte)0x80}, // invalid first byte
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   277
        {1, (byte)0xC1, (byte)0xBF}, // invalid first byte
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   278
        {1, (byte)0xC2, (byte)0x00}, // invalid second byte
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   279
        {1, (byte)0xC2, (byte)0xC0}, // invalid second byte
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   280
        {1, (byte)0xD0, (byte)0x00}, // invalid second byte
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   281
        {1, (byte)0xD0, (byte)0xC0}, // invalid second byte
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   282
        {1, (byte)0xDF, (byte)0x00}, // invalid second byte
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   283
        {1, (byte)0xDF, (byte)0xC0}, // invalid second byte
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   284
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   285
        // Three-byte sequences
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   286
        {1, (byte)0xE0, (byte)0x80, (byte)0x80},  // 111x first byte first nibble
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   287
        {1, (byte)0xE0, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   288
        {1, (byte)0xE0, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   289
        {1, (byte)0xE0, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   290
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   291
        {1, (byte)0xE0, (byte)0xC0, (byte)0xBF }, // invalid second byte
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   292
        {2, (byte)0xE0, (byte)0xA0, (byte)0x7F }, // invalid third byte
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   293
        {2, (byte)0xE0, (byte)0xA0, (byte)0xC0 }, // invalid third byte
23880
7d6b060131d3 8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents: 14342
diff changeset
   294
        {2, (byte)0xE1, (byte)0x80, (byte)0x42},  // invalid third byte
7d6b060131d3 8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents: 14342
diff changeset
   295
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   296
        {1, (byte)0xFF, (byte)0xFF, (byte)0xFF }, // all ones
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   297
        {1, (byte)0xE0, (byte)0xC0, (byte)0x80 }, // invalid second byte
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   298
        {1, (byte)0xE0, (byte)0x80, (byte)0xC0 }, // invalid first byte
10898
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   299
        {1, (byte)0xE0, (byte)0x41,},             // invalid second byte & 2 bytes
23880
7d6b060131d3 8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents: 14342
diff changeset
   300
        {1, (byte)0xE1, (byte)0x40,},             // invalid second byte & 2 bytes
10898
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   301
        {3, (byte)0xED, (byte)0xAE, (byte)0x80 }, // 3 bytes surrogate
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   302
        {3, (byte)0xED, (byte)0xB0, (byte)0x80 }, // 3 bytes surrogate
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   303
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   304
23880
7d6b060131d3 8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents: 14342
diff changeset
   305
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   306
        // Four-byte sequences
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   307
        {1, (byte)0xF0, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   308
        {1, (byte)0xF0, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   309
        {1, (byte)0xF0, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+007F zero-padded
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   310
        {1, (byte)0xF0, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+07FF zero-padded
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   311
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   312
        {1, (byte)0xFF, (byte)0xFF, (byte)0xFF, (byte)0xFF }, // all ones
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   313
        {1, (byte)0xF0, (byte)0x80, (byte)0x80, (byte)0x80},  // invalid second byte
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   314
        {1, (byte)0xF0, (byte)0xC0, (byte)0x80, (byte)0x80 }, // invalid second byte
10898
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   315
        {1, (byte)0xF0, (byte)41 },                           // invalid second byte
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   316
                                                              // & only 2 bytes
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   317
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   318
        {2, (byte)0xF0, (byte)0x90, (byte)0xC0, (byte)0x80 }, // invalid third byte
10898
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   319
        {3, (byte)0xF0, (byte)0x90, (byte)0x80, (byte)0xC0 }, // invalid forth byte
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   320
        {2, (byte)0xF0, (byte)0x90, (byte)0x41 },             // invalid third byte
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   321
                                                              // & 3 bytes input
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   322
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   323
        {1, (byte)0xF1, (byte)0xC0, (byte)0x80, (byte)0x80 }, // invalid second byte
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   324
        {2, (byte)0xF1, (byte)0x80, (byte)0xC0, (byte)0x80 }, // invalid third byte
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   325
        {3, (byte)0xF1, (byte)0x80, (byte)0x80, (byte)0xC0 }, // invalid forth byte
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   326
        {1, (byte)0xF4, (byte)0x90, (byte)0x80, (byte)0xC0 }, // out-range 4-byte
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   327
        {1, (byte)0xF4, (byte)0xC0, (byte)0x80, (byte)0xC0 }, // out-range 4-byte
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   328
        {1, (byte)0xF5, (byte)0x80, (byte)0x80, (byte)0xC0 }, // out-range 4-byte
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   329
23880
7d6b060131d3 8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents: 14342
diff changeset
   330
        // #8039751
7d6b060131d3 8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents: 14342
diff changeset
   331
        {1, (byte)0xF6, (byte)0x80, (byte)0x80, (byte)0x80 }, // out-range 1st byte
7d6b060131d3 8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents: 14342
diff changeset
   332
        {1, (byte)0xF6, (byte)0x80, (byte)0x80,  },
7d6b060131d3 8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents: 14342
diff changeset
   333
        {1, (byte)0xF6, (byte)0x80, },
7d6b060131d3 8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents: 14342
diff changeset
   334
        {1, (byte)0xF6, },
7d6b060131d3 8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents: 14342
diff changeset
   335
        {1, (byte)0xF5, (byte)0x80, (byte)0x80, (byte)0x80 }, // out-range 1st byte
7d6b060131d3 8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents: 14342
diff changeset
   336
        {1, (byte)0xF5, (byte)0x80, (byte)0x80,  },
7d6b060131d3 8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents: 14342
diff changeset
   337
        {1, (byte)0xF5, (byte)0x80,  },
7d6b060131d3 8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents: 14342
diff changeset
   338
        {1, (byte)0xF5  },
7d6b060131d3 8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents: 14342
diff changeset
   339
7d6b060131d3 8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents: 14342
diff changeset
   340
        {1, (byte)0xF4, (byte)0x90, (byte)0x80, (byte)0x80 }, // out-range 2nd byte
7d6b060131d3 8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents: 14342
diff changeset
   341
        {1, (byte)0xF4, (byte)0x90, (byte)0x80 },
7d6b060131d3 8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents: 14342
diff changeset
   342
        {1, (byte)0xF4, (byte)0x90 },
7d6b060131d3 8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents: 14342
diff changeset
   343
7d6b060131d3 8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents: 14342
diff changeset
   344
        {1, (byte)0xF4, (byte)0x7f, (byte)0x80, (byte)0x80 }, // out-range/ascii 2nd byte
7d6b060131d3 8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents: 14342
diff changeset
   345
        {1, (byte)0xF4, (byte)0x7f, (byte)0x80 },
7d6b060131d3 8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents: 14342
diff changeset
   346
        {1, (byte)0xF4, (byte)0x7f },
7d6b060131d3 8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents: 14342
diff changeset
   347
7d6b060131d3 8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents: 14342
diff changeset
   348
        {1, (byte)0xF0, (byte)0x80, (byte)0x80, (byte)0x80 }, // out-range 2nd byte
7d6b060131d3 8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents: 14342
diff changeset
   349
        {1, (byte)0xF0, (byte)0x80, (byte)0x80 },
7d6b060131d3 8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents: 14342
diff changeset
   350
        {1, (byte)0xF0, (byte)0x80 },
7d6b060131d3 8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents: 14342
diff changeset
   351
7d6b060131d3 8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents: 14342
diff changeset
   352
        {1, (byte)0xF0, (byte)0xc0, (byte)0x80, (byte)0x80 }, // out-range 2nd byte
7d6b060131d3 8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents: 14342
diff changeset
   353
        {1, (byte)0xF0, (byte)0xc0, (byte)0x80 },
7d6b060131d3 8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents: 14342
diff changeset
   354
        {1, (byte)0xF0, (byte)0xc0 },
7d6b060131d3 8039751: UTF-8 decoder fails to handle some edge cases correctly
sherman
parents: 14342
diff changeset
   355
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   356
        // Five-byte sequences
10898
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   357
        {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80},  // invalid first byte
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   358
        {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   359
        {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   360
        {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   361
        {1, (byte)0xF8, (byte)0x80, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+FFFF zero-padded
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   362
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   363
        {1, (byte)0xF8, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80},
10898
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   364
        {1, (byte)0xF8, (byte)0x80, (byte)0xC0, (byte)0x80, (byte)0x80 },
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   365
        {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0xC1, (byte)0xBF },
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   366
        {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xC0 },
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   367
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   368
        // Six-byte sequences
10898
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   369
        {1, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   370
        {1, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   371
        {1, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   372
        {1, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+FFFF zero-padded
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   373
        {1, (byte)0xF8, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 },
10898
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   374
        {1, (byte)0xF8, (byte)0x80, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80 },
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   375
        {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0xC1, (byte)0xBF, (byte)0x80 },
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   376
        {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xC0, (byte)0x80 },
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   377
        {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0x80, (byte)0xC0 },
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   378
    };
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   379
10898
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   380
    // In each row, the first byte is the length of the malformed bytes
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   381
    static byte[][] malformed_cesu8 = {
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   382
        // One-byte sequences:
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   383
        {1, (byte)0xFF },
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   384
        {1, (byte)0xC0 },
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   385
        {1, (byte)0x80 },
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   386
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   387
        {1, (byte)0xFF, (byte)0xFF}, // all ones
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   388
        {1, (byte)0xA0, (byte)0x80}, // 101x first byte first nibble
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   389
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   390
        // Two-byte sequences:
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   391
        {1, (byte)0xC0, (byte)0x80}, // invalid first byte
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   392
        {1, (byte)0xC1, (byte)0xBF}, // invalid first byte
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   393
        {1, (byte)0xC2, (byte)0x00}, // invalid second byte
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   394
        {1, (byte)0xC2, (byte)0xC0}, // invalid second byte
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   395
        {1, (byte)0xD0, (byte)0x00}, // invalid second byte
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   396
        {1, (byte)0xD0, (byte)0xC0}, // invalid second byte
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   397
        {1, (byte)0xDF, (byte)0x00}, // invalid second byte
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   398
        {1, (byte)0xDF, (byte)0xC0}, // invalid second byte
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   399
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   400
        // Three-byte sequences
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   401
        {1, (byte)0xE0, (byte)0x80, (byte)0x80},  // 111x first byte first nibble
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   402
        {1, (byte)0xE0, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   403
        {1, (byte)0xE0, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   404
        {1, (byte)0xE0, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   405
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   406
        {1, (byte)0xE0, (byte)0xC0, (byte)0xBF }, // invalid second byte
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   407
        {2, (byte)0xE0, (byte)0xA0, (byte)0x7F }, // invalid third byte
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   408
        {2, (byte)0xE0, (byte)0xA0, (byte)0xC0 }, // invalid third byte
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   409
        {1, (byte)0xFF, (byte)0xFF, (byte)0xFF }, // all ones
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   410
        {1, (byte)0xE0, (byte)0xC0, (byte)0x80 }, // invalid second byte
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   411
        {1, (byte)0xE0, (byte)0x80, (byte)0xC0 }, // invalid first byte
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   412
        {1, (byte)0xE0, (byte)0x41,},             // invalid second byte & 2 bytes
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   413
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   414
        // CESU-8 does not have 4, 5, 6 bytes sequenc
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   415
        // Four-byte sequences
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   416
        {1, (byte)0xF0, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   417
        {1, (byte)0xF0, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   418
        {1, (byte)0xF0, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+007F zero-padded
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   419
        {1, (byte)0xF0, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+07FF zero-padded
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   420
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   421
        {1, (byte)0xFF, (byte)0xFF, (byte)0xFF, (byte)0xFF }, // all ones
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   422
        {1, (byte)0xF0, (byte)0x80, (byte)0x80, (byte)0x80},  // invalid second byte
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   423
        {1, (byte)0xF0, (byte)0xC0, (byte)0x80, (byte)0x80 }, // invalid second byte
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   424
        {1, (byte)0xF0, (byte)41 },                           // invalid second byte
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   425
                                                              // & only 2 bytes
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   426
        {1, (byte)0xF0, (byte)0x90, (byte)0xC0, (byte)0x80 }, // invalid third byte
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   427
        {1, (byte)0xF0, (byte)0x90, (byte)0x80, (byte)0xC0 }, // invalid forth byte
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   428
        {1, (byte)0xF0, (byte)0x90, (byte)0x41 },             // invalid third byte
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   429
                                                              // & 3 bytes input
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   430
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   431
        {1, (byte)0xF1, (byte)0xC0, (byte)0x80, (byte)0x80 }, // invalid second byte
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   432
        {1, (byte)0xF1, (byte)0x80, (byte)0xC0, (byte)0x80 }, // invalid third byte
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   433
        {1, (byte)0xF1, (byte)0x80, (byte)0x80, (byte)0xC0 }, // invalid forth byte
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   434
        {1, (byte)0xF4, (byte)0x90, (byte)0x80, (byte)0xC0 }, // out-range 4-byte
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   435
        {1, (byte)0xF4, (byte)0xC0, (byte)0x80, (byte)0xC0 }, // out-range 4-byte
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   436
        {1, (byte)0xF5, (byte)0x80, (byte)0x80, (byte)0xC0 }, // out-range 4-byte
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   437
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   438
        // Five-byte sequences
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   439
        {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80},  // invalid first byte
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   440
        {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   441
        {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   442
        {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   443
        {1, (byte)0xF8, (byte)0x80, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+FFFF zero-padded
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   444
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   445
        {1, (byte)0xF8, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80},
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   446
        {1, (byte)0xF8, (byte)0x80, (byte)0xC0, (byte)0x80, (byte)0x80 },
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   447
        {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0xC1, (byte)0xBF },
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   448
        {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xC0 },
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   449
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   450
        // Six-byte sequences
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   451
        {1, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   452
        {1, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   453
        {1, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   454
        {1, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+FFFF zero-padded
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   455
        {1, (byte)0xF8, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 },
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   456
        {1, (byte)0xF8, (byte)0x80, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80 },
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   457
        {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0xC1, (byte)0xBF, (byte)0x80 },
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   458
        {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xC0, (byte)0x80 },
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   459
        {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0x80, (byte)0xC0 },
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   460
    };
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   461
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   462
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   463
    static void checkMalformed(String csn, byte[][] malformed) throws Exception {
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   464
        boolean failed = false;
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   465
        System.out.printf("    Check malformed <%s>...%n", csn);
9547
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 5506
diff changeset
   466
        Charset cs = Charset.forName(csn);
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   467
        for (boolean direct: new boolean[] {false, true}) {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   468
            for (byte[] bins : malformed) {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   469
                int mlen = bins[0];
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   470
                byte[] bin = Arrays.copyOfRange(bins, 1, bins.length);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   471
                CoderResult cr = decodeCR(bin, csn, direct);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   472
                String ashex = "";
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   473
                for (int i = 0; i < bin.length; i++) {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   474
                    if (i > 0) ashex += " ";
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   475
                        ashex += Integer.toBinaryString((int)bin[i] & 0xff);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   476
                }
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   477
                if (!cr.isMalformed()) {
9547
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 5506
diff changeset
   478
                    System.out.printf("        FAIL(direct=%b): [%s] not malformed.%n", direct, ashex);
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   479
                    failed = true;
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   480
                } else if (cr.length() != mlen) {
9547
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 5506
diff changeset
   481
                    System.out.printf("        FAIL(direct=%b): [%s] malformed[len=%d].%n", direct, ashex, cr.length());
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 5506
diff changeset
   482
                    failed = true;
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 5506
diff changeset
   483
                }
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 5506
diff changeset
   484
                if (!Arrays.equals(decode(cs, bin, 0, bin.length),
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 5506
diff changeset
   485
                                   new String(bin, csn).toCharArray())) {
454881baaca0 7040220: java/char_encodin Optimize UTF-8 charset for String.getBytes()/new String(byte[])
sherman
parents: 5506
diff changeset
   486
                    System.out.printf("        FAIL(new String(bb, %s)) failed%n", csn);
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   487
                    failed = true;
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   488
                }
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   489
            }
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   490
        }
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   491
        if (failed)
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   492
            throw new RuntimeException("Check malformed failed " + csn);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   493
    }
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   494
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   495
    static boolean check(CharsetDecoder dec, byte[] utf8s, boolean direct, int[] flow) {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   496
        int inPos = flow[0];
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   497
        int inLen = flow[1];
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   498
        int outPos = flow[2];
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   499
        int outLen = flow[3];
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   500
        int expedInPos = flow[4];
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   501
        int expedOutPos = flow[5];
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   502
        CoderResult expedCR = (flow[6]==0)?CoderResult.UNDERFLOW
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   503
                                          :CoderResult.OVERFLOW;
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   504
        ByteBuffer bbf;
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   505
        CharBuffer cbf;
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   506
        if (direct) {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   507
            bbf = ByteBuffer.allocateDirect(inPos + utf8s.length);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   508
            cbf = ByteBuffer.allocateDirect((outPos + outLen)*2).asCharBuffer();
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   509
        } else {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   510
            bbf = ByteBuffer.allocate(inPos + utf8s.length);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   511
            cbf = CharBuffer.allocate(outPos + outLen);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   512
        }
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   513
        bbf.position(inPos);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   514
        bbf.put(utf8s).flip().position(inPos).limit(inPos + inLen);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   515
        cbf.position(outPos);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   516
        dec.reset();
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   517
        CoderResult cr = dec.decode(bbf, cbf, false);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   518
        if (cr != expedCR ||
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   519
            bbf.position() != expedInPos ||
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   520
            cbf.position() != expedOutPos) {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   521
            System.out.printf("Expected(direct=%5b): [", direct);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   522
            for (int i:flow) System.out.print(" " + i);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   523
            System.out.println("]  CR=" + cr +
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   524
                               ", inPos=" + bbf.position() +
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   525
                               ", outPos=" + cbf.position());
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   526
            return false;
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   527
        }
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   528
        return true;
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   529
    }
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   530
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   531
    static void checkUnderOverflow(String csn) throws Exception {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   532
        System.out.printf("    Check under/overflow <%s>...%n", csn);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   533
        CharsetDecoder dec = Charset.forName(csn).newDecoder();
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   534
        boolean failed = false;
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   535
        byte[] utf8s = new String("\u007f\u07ff\ue000\ud800\udc00").getBytes("UTF-8");
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   536
        int    inlen = utf8s.length;
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   537
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   538
        for (int inoff = 0; inoff < 20; inoff++) {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   539
            for (int outoff = 0; outoff < 20; outoff++) {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   540
        int[][] Flows = {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   541
            //inpos, inLen, outPos,  outLen, inPosEP,   outposEP,   under(0)/over(1)
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   542
            {inoff,  inlen, outoff,  1,      inoff + 1, outoff + 1, 1},
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   543
            {inoff,  inlen, outoff,  2,      inoff + 3, outoff + 2, 1},
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   544
            {inoff,  inlen, outoff,  3,      inoff + 6, outoff + 3, 1},
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   545
            {inoff,  inlen, outoff,  4,      inoff + 6, outoff + 3, 1},
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   546
            {inoff,  inlen, outoff,  5,      inoff + 10,outoff + 5, 0},
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   547
             // underflow
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   548
            {inoff,  1,     outoff,  5,      inoff + 1, outoff + 1, 0},
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   549
            {inoff,  2,     outoff,  5,      inoff + 1, outoff + 1, 0},
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   550
            {inoff,  3,     outoff,  5,      inoff + 3, outoff + 2, 0},
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   551
            {inoff,  4,     outoff,  5,      inoff + 3, outoff + 2, 0},
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   552
            {inoff,  5,     outoff,  5,      inoff + 3, outoff + 2, 0},
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   553
            {inoff,  6,     outoff,  5,      inoff + 6, outoff + 3, 0},
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   554
            {inoff,  7,     outoff,  5,      inoff + 6, outoff + 3, 0},
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   555
            {inoff,  8,     outoff,  5,      inoff + 6, outoff + 3, 0},
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   556
            {inoff,  9,     outoff,  5,      inoff + 6, outoff + 3, 0},
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   557
            {inoff,  10,    outoff,  5,      inoff + 10,outoff + 5, 0},
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   558
             // 2-byte underflow/overflow
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   559
            {inoff,  2,     outoff,  1,      inoff + 1, outoff + 1, 0},
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   560
            {inoff,  3,     outoff,  1,      inoff + 1, outoff + 1, 1},
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   561
             // 3-byte underflow/overflow
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   562
            {inoff,  4,     outoff,  2,      inoff + 3, outoff + 2, 0},
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   563
            {inoff,  5,     outoff,  2,      inoff + 3, outoff + 2, 0},
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   564
            {inoff,  6,     outoff,  2,      inoff + 3, outoff + 2, 1},
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   565
             // 4-byte underflow/overflow
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   566
            {inoff,  7,     outoff,  4,      inoff + 6, outoff + 3, 0},
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   567
            {inoff,  8,     outoff,  4,      inoff + 6, outoff + 3, 0},
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   568
            {inoff,  9,     outoff,  4,      inoff + 6, outoff + 3, 0},
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   569
            {inoff,  10,    outoff,  4,      inoff + 6, outoff + 3, 1},
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   570
        };
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   571
        for (boolean direct: new boolean[] {false, true}) {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   572
            for (int[] flow: Flows) {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   573
                if (!check(dec, utf8s, direct, flow))
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   574
                    failed = true;
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   575
            }
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   576
        }}}
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   577
        if (failed)
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   578
            throw new RuntimeException("Check under/overflow failed " + csn);
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   579
    }
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   580
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   581
    public static void main(String[] args) throws Exception {
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   582
        checkRoundtrip("UTF-8");
10898
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   583
        check4ByteSurrs("UTF-8");
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   584
        checkMalformed("UTF-8", malformed);
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   585
        checkUnderOverflow("UTF-8");
10898
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   586
        checkRoundtrip("CESU-8");
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   587
        check6ByteSurrs("CESU-8");
01a232fa2ddb 7096080: UTF8 update and new CESU-8 charset
sherman
parents: 9547
diff changeset
   588
        checkMalformed("CESU-8", malformed_cesu8);
1091
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   589
    }
136d19d6c372 4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
sherman
parents:
diff changeset
   590
}