jdk/test/sun/nio/cs/TestUTF8.java
author never
Mon, 12 Jul 2010 22:27:18 -0700
changeset 5926 a36f90d986b6
parent 5506 202f599c92aa
child 9547 454881baaca0
permissions -rw-r--r--
6968385: malformed xml in sweeper logging Reviewed-by: kvn

/*
 * Copyright (c) 2008, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

/*
 * @test
 * @bug 4486841
 * @summary Test UTF-8 charset
 */

import java.nio.charset.*;
import java.nio.*;
import java.util.*;

public class TestUTF8 {
    static char[] decode(byte[] bb, String csn, boolean testDirect)
        throws Exception {
        CharsetDecoder dec = Charset.forName(csn).newDecoder();
        ByteBuffer bbf;
        CharBuffer cbf;
        if (testDirect) {
            bbf = ByteBuffer.allocateDirect(bb.length);
            cbf = ByteBuffer.allocateDirect(bb.length*2).asCharBuffer();
            bbf.put(bb).flip();
        } else {
            bbf = ByteBuffer.wrap(bb);
            cbf = CharBuffer.allocate(bb.length);
        }
        CoderResult cr = dec.decode(bbf, cbf, true);
        if (cr != CoderResult.UNDERFLOW)
            throw new RuntimeException("Decoding err: " + csn);
        char[] cc = new char[cbf.position()];
        cbf.flip(); cbf.get(cc);
        return cc;

    }

    static CoderResult decodeCR(byte[] bb, String csn, boolean testDirect)
        throws Exception {
        CharsetDecoder dec = Charset.forName(csn).newDecoder();
        ByteBuffer bbf;
        CharBuffer cbf;
        if (testDirect) {
            bbf = ByteBuffer.allocateDirect(bb.length);
            cbf = ByteBuffer.allocateDirect(bb.length*2).asCharBuffer();
            bbf.put(bb).flip();
        } else {
            bbf = ByteBuffer.wrap(bb);
            cbf = CharBuffer.allocate(bb.length);
        }
        return dec.decode(bbf, cbf, true);
    }

    static byte[] encode(char[] cc, String csn, boolean testDirect)
        throws Exception {
        ByteBuffer bbf;
        CharBuffer cbf;
        CharsetEncoder enc = Charset.forName(csn).newEncoder();
        if (testDirect) {
            bbf = ByteBuffer.allocateDirect(cc.length * 4);
            cbf = ByteBuffer.allocateDirect(cc.length * 2).asCharBuffer();
            cbf.put(cc).flip();
        } else {
            bbf = ByteBuffer.allocate(cc.length * 4);
            cbf = CharBuffer.wrap(cc);
        }

        CoderResult cr = enc.encode(cbf, bbf, true);
        if (cr != CoderResult.UNDERFLOW)
            throw new RuntimeException("Encoding err: " + csn);
        byte[] bb = new byte[bbf.position()];
        bbf.flip(); bbf.get(bb);
        return bb;
    }

    static CoderResult encodeCR(char[] cc, String csn, boolean testDirect)
        throws Exception {
        ByteBuffer bbf;
        CharBuffer cbf;
        CharsetEncoder enc = Charset.forName(csn).newEncoder();
        if (testDirect) {
            bbf = ByteBuffer.allocateDirect(cc.length * 4);
            cbf = ByteBuffer.allocateDirect(cc.length * 2).asCharBuffer();
            cbf.put(cc).flip();
        } else {
            bbf = ByteBuffer.allocate(cc.length * 4);
            cbf = CharBuffer.wrap(cc);
        }
        return enc.encode(cbf, bbf, true);
    }

    static char[] getUTFChars() {
        char[] cc = new char[0x10000 - 0xe000 + 0xd800 + //bmp
                             (0x110000 - 0x10000) * 2];    //supp
        int pos = 0;
        int i = 0;
        for (i = 0; i < 0xd800; i++)
            cc[pos++] = (char)i;
        for (i = 0xe000; i < 0x10000; i++)
            cc[pos++] = (char)i;
        for (i = 0x10000; i < 0x110000; i++) {
            pos += Character.toChars(i, cc, pos);
        }
        return cc;
    }

    static int to3ByteUTF8(char c, byte[] bb, int pos) {
        bb[pos++] = (byte)(0xe0 | ((c >> 12)));
        bb[pos++] = (byte)(0x80 | ((c >> 06) & 0x3f));
        bb[pos++] = (byte)(0x80 | ((c >> 00) & 0x3f));
        return 3;
    }

    static void checkRoundtrip(String csn) throws Exception {
        System.out.printf("    Check roundtrip <%s>...", csn);
        char[] cc = getUTFChars();
        byte[] bb = encode(cc, csn, false);
        char[] ccO = decode(bb, csn, false);

        if (!Arrays.equals(cc, ccO)) {
            System.out.printf("    non-direct failed");
        }
        bb = encode(cc, csn, true);
        ccO = decode(bb, csn, true);
        if (!Arrays.equals(cc, ccO)) {
            System.out.printf("    (direct) failed");
        }
        System.out.println();
    }

    static void check6ByteSurrs(String csn) throws Exception {
        System.out.printf("    Check 6-byte Surrogates <%s>...%n", csn);
        byte[] bb = new byte[(0x110000 - 0x10000) * 6];
        char[] cc = new char[(0x110000 - 0x10000) * 2];
        int bpos = 0;
        int cpos = 0;
        for (int i = 0x10000; i < 0x110000; i++) {
            Character.toChars(i, cc, cpos);
            bpos += to3ByteUTF8(cc[cpos], bb, bpos);
            bpos += to3ByteUTF8(cc[cpos + 1], bb, bpos);
            cpos += 2;
        }

        char[] ccO = decode(bb, csn, false);
        if (!Arrays.equals(cc, ccO)) {
            System.out.printf("    decoding failed%n");
        }
        ccO = decode(bb, csn, true);
        if (!Arrays.equals(cc, ccO)) {
            System.out.printf("    decoding(direct) failed%n");
        }
    }

    static void compare(String csn1, String csn2) throws Exception {
        System.out.printf("    Diff <%s> <%s>...%n", csn1, csn2);
        char[] cc = getUTFChars();

        byte[] bb1 = encode(cc, csn1, false);
        byte[] bb2 = encode(cc, csn2, false);
        if (!Arrays.equals(bb1, bb2))
            System.out.printf("        encoding failed%n");
        char[] cc1 = decode(bb1, csn1, false);
        char[] cc2 = decode(bb1, csn2, false);
        if (!Arrays.equals(cc1, cc2)) {
            System.out.printf("        decoding failed%n");
        }

        bb1 = encode(cc, csn1, true);
        bb2 = encode(cc, csn2, true);
        if (!Arrays.equals(bb1, bb2))
            System.out.printf("        encoding (direct) failed%n");
        cc1 = decode(bb1, csn1, true);
        cc2 = decode(bb1, csn2, true);
        if (!Arrays.equals(cc1, cc2)) {
            System.out.printf("        decoding (direct) failed%n");
        }
    }

    // The first byte is the length of malformed bytes
    static byte[][] malformed = {
        // One-byte sequences:
        {1, (byte)0xFF },
        {1, (byte)0xC0 },
        {1, (byte)0x80 },

        {1, (byte)0xFF, (byte)0xFF}, // all ones
        {1, (byte)0xA0, (byte)0x80}, // 101x first byte first nibble

        // Two-byte sequences:
        {1, (byte)0xC0, (byte)0x80}, // invalid first byte
        {1, (byte)0xC1, (byte)0xBF}, // invalid first byte
        {1, (byte)0xC2, (byte)0x00}, // invalid second byte
        {1, (byte)0xC2, (byte)0xC0}, // invalid second byte
        {1, (byte)0xD0, (byte)0x00}, // invalid second byte
        {1, (byte)0xD0, (byte)0xC0}, // invalid second byte
        {1, (byte)0xDF, (byte)0x00}, // invalid second byte
        {1, (byte)0xDF, (byte)0xC0}, // invalid second byte

        // Three-byte sequences
        {1, (byte)0xE0, (byte)0x80, (byte)0x80},  // 111x first byte first nibble
        {1, (byte)0xE0, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
        {1, (byte)0xE0, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
        {1, (byte)0xE0, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded

        {1, (byte)0xE0, (byte)0xC0, (byte)0xBF }, // invalid second byte
        {2, (byte)0xE0, (byte)0xA0, (byte)0x7F }, // invalid third byte
        {2, (byte)0xE0, (byte)0xA0, (byte)0xC0 }, // invalid third byte
        {1, (byte)0xFF, (byte)0xFF, (byte)0xFF }, // all ones
        {1, (byte)0xE0, (byte)0xC0, (byte)0x80 }, // invalid second byte
        {1, (byte)0xE0, (byte)0x80, (byte)0xC0 }, // invalid first byte

        // Four-byte sequences
        {1, (byte)0xF0, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
        {1, (byte)0xF0, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
        {1, (byte)0xF0, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+007F zero-padded
        {1, (byte)0xF0, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+07FF zero-padded

        {1, (byte)0xFF, (byte)0xFF, (byte)0xFF, (byte)0xFF }, // all ones
        {1, (byte)0xF0, (byte)0x80, (byte)0x80, (byte)0x80},  // invalid second byte
        {1, (byte)0xF0, (byte)0xC0, (byte)0x80, (byte)0x80 }, // invalid second byte
        {2, (byte)0xF0, (byte)0x90, (byte)0xC0, (byte)0x80 }, // invalid third byte
        {3, (byte)0xF0, (byte)0x90, (byte)0x80, (byte)0xC0 }, // invalid third byte

        {1, (byte)0xF1, (byte)0xC0, (byte)0x80, (byte)0x80 }, // invalid second byte
        {2, (byte)0xF1, (byte)0x80, (byte)0xC0, (byte)0x80 }, // invalid third byte
        {3, (byte)0xF1, (byte)0x80, (byte)0x80, (byte)0xC0 }, // invalid forth byte
        {1, (byte)0xF4, (byte)0x90, (byte)0x80, (byte)0xC0 }, // out-range 4-byte
        {1, (byte)0xF4, (byte)0xC0, (byte)0x80, (byte)0xC0 }, // out-range 4-byte
        {1, (byte)0xF5, (byte)0x80, (byte)0x80, (byte)0xC0 }, // out-range 4-byte

        // Five-byte sequences
        {5, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80},  // invalid first byte
        {5, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
        {5, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
        {5, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded
        {5, (byte)0xF8, (byte)0x80, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+FFFF zero-padded

        {1, (byte)0xF8, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80},
        {2, (byte)0xF8, (byte)0x80, (byte)0xC0, (byte)0x80, (byte)0x80 },
        {3, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0xC1, (byte)0xBF },
        {4, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xC0 },

        // Six-byte sequences
        {6, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
        {6, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
        {6, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded
        {6, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+FFFF zero-padded
        {1, (byte)0xF8, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 },
        {2, (byte)0xF8, (byte)0x80, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80 },
        {3, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0xC1, (byte)0xBF, (byte)0x80 },
        {4, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xC0, (byte)0x80 },
        {5, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0x80, (byte)0xC0 },
    };

    static void checkMalformed(String csn) throws Exception {
        boolean failed = false;
        System.out.printf("    Check malformed <%s>...%n", csn);
        for (boolean direct: new boolean[] {false, true}) {
            for (byte[] bins : malformed) {
                int mlen = bins[0];
                byte[] bin = Arrays.copyOfRange(bins, 1, bins.length);
                CoderResult cr = decodeCR(bin, csn, direct);
                String ashex = "";
                for (int i = 0; i < bin.length; i++) {
                    if (i > 0) ashex += " ";
                        ashex += Integer.toBinaryString((int)bin[i] & 0xff);
                }
                if (!cr.isMalformed()) {
                    System.out.printf("        FAIL(direct=%b): [%s] not malformed.\n", direct, ashex);
                    failed = true;
                } else if (cr.length() != mlen) {
                    System.out.printf("        FAIL(direct=%b): [%s] malformed[len=%d].\n", direct, ashex, cr.length());
                    failed = true;
                }
            }
        }
        if (failed)
            throw new RuntimeException("Check malformed failed " + csn);
    }

    static boolean check(CharsetDecoder dec, byte[] utf8s, boolean direct, int[] flow) {
        int inPos = flow[0];
        int inLen = flow[1];
        int outPos = flow[2];
        int outLen = flow[3];
        int expedInPos = flow[4];
        int expedOutPos = flow[5];
        CoderResult expedCR = (flow[6]==0)?CoderResult.UNDERFLOW
                                          :CoderResult.OVERFLOW;
        ByteBuffer bbf;
        CharBuffer cbf;
        if (direct) {
            bbf = ByteBuffer.allocateDirect(inPos + utf8s.length);
            cbf = ByteBuffer.allocateDirect((outPos + outLen)*2).asCharBuffer();
        } else {
            bbf = ByteBuffer.allocate(inPos + utf8s.length);
            cbf = CharBuffer.allocate(outPos + outLen);
        }
        bbf.position(inPos);
        bbf.put(utf8s).flip().position(inPos).limit(inPos + inLen);
        cbf.position(outPos);
        dec.reset();
        CoderResult cr = dec.decode(bbf, cbf, false);
        if (cr != expedCR ||
            bbf.position() != expedInPos ||
            cbf.position() != expedOutPos) {
            System.out.printf("Expected(direct=%5b): [", direct);
            for (int i:flow) System.out.print(" " + i);
            System.out.println("]  CR=" + cr +
                               ", inPos=" + bbf.position() +
                               ", outPos=" + cbf.position());
            return false;
        }
        return true;
    }

    static void checkUnderOverflow(String csn) throws Exception {
        System.out.printf("    Check under/overflow <%s>...%n", csn);
        CharsetDecoder dec = Charset.forName(csn).newDecoder();
        boolean failed = false;
        byte[] utf8s = new String("\u007f\u07ff\ue000\ud800\udc00").getBytes("UTF-8");
        int    inlen = utf8s.length;

        for (int inoff = 0; inoff < 20; inoff++) {
            for (int outoff = 0; outoff < 20; outoff++) {
        int[][] Flows = {
            //inpos, inLen, outPos,  outLen, inPosEP,   outposEP,   under(0)/over(1)
            {inoff,  inlen, outoff,  1,      inoff + 1, outoff + 1, 1},
            {inoff,  inlen, outoff,  2,      inoff + 3, outoff + 2, 1},
            {inoff,  inlen, outoff,  3,      inoff + 6, outoff + 3, 1},
            {inoff,  inlen, outoff,  4,      inoff + 6, outoff + 3, 1},
            {inoff,  inlen, outoff,  5,      inoff + 10,outoff + 5, 0},
             // underflow
            {inoff,  1,     outoff,  5,      inoff + 1, outoff + 1, 0},
            {inoff,  2,     outoff,  5,      inoff + 1, outoff + 1, 0},
            {inoff,  3,     outoff,  5,      inoff + 3, outoff + 2, 0},
            {inoff,  4,     outoff,  5,      inoff + 3, outoff + 2, 0},
            {inoff,  5,     outoff,  5,      inoff + 3, outoff + 2, 0},
            {inoff,  6,     outoff,  5,      inoff + 6, outoff + 3, 0},
            {inoff,  7,     outoff,  5,      inoff + 6, outoff + 3, 0},
            {inoff,  8,     outoff,  5,      inoff + 6, outoff + 3, 0},
            {inoff,  9,     outoff,  5,      inoff + 6, outoff + 3, 0},
            {inoff,  10,    outoff,  5,      inoff + 10,outoff + 5, 0},
             // 2-byte underflow/overflow
            {inoff,  2,     outoff,  1,      inoff + 1, outoff + 1, 0},
            {inoff,  3,     outoff,  1,      inoff + 1, outoff + 1, 1},
             // 3-byte underflow/overflow
            {inoff,  4,     outoff,  2,      inoff + 3, outoff + 2, 0},
            {inoff,  5,     outoff,  2,      inoff + 3, outoff + 2, 0},
            {inoff,  6,     outoff,  2,      inoff + 3, outoff + 2, 1},
             // 4-byte underflow/overflow
            {inoff,  7,     outoff,  4,      inoff + 6, outoff + 3, 0},
            {inoff,  8,     outoff,  4,      inoff + 6, outoff + 3, 0},
            {inoff,  9,     outoff,  4,      inoff + 6, outoff + 3, 0},
            {inoff,  10,    outoff,  4,      inoff + 6, outoff + 3, 1},
        };
        for (boolean direct: new boolean[] {false, true}) {
            for (int[] flow: Flows) {
                if (!check(dec, utf8s, direct, flow))
                    failed = true;
            }
        }}}
        if (failed)
            throw new RuntimeException("Check under/overflow failed " + csn);
    }

    public static void main(String[] args) throws Exception {
        checkRoundtrip("UTF-8");
        check6ByteSurrs("UTF-8");
        //compare("UTF-8", "UTF-8-OLD");
        checkMalformed("UTF-8");
        checkUnderOverflow("UTF-8");
    }
}