jdk/src/share/classes/sun/nio/cs/UTF_8.java
author xdono
Wed, 02 Jul 2008 12:55:45 -0700
changeset 715 f16baef3a20e
parent 492 d34fdbd7da77
child 1091 136d19d6c372
permissions -rw-r--r--
6719955: Update copyright year Summary: Update copyright year for files that have been modified in 2008 Reviewed-by: ohair, tbell

/*
 * Copyright 2000-2008 Sun Microsystems, Inc.  All Rights Reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Sun designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Sun in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
 * CA 95054 USA or visit www.sun.com if you need additional information or
 * have any questions.
 */

package sun.nio.cs;

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.BufferOverflowException;
import java.nio.BufferUnderflowException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.MalformedInputException;
import java.nio.charset.UnmappableCharacterException;


/*
 * # Bits   Bit pattern
 * 1    7   0xxxxxxx
 * 2   11   110xxxxx 10xxxxxx
 * 3   16   1110xxxx 10xxxxxx 10xxxxxx
 * 4   21   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 5   26   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 6   31   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * UCS-2 uses 1-3, UTF-16 uses 1-4, UCS-4 uses 1-6
 */

class UTF_8 extends Unicode
{

    /**
     * Constructs the charset object, passing the name "UTF-8" and the alias
     * table {@code StandardCharsets.aliases_UTF_8} to the superclass
     * constructor.
     */
    public UTF_8() {
        super("UTF-8", StandardCharsets.aliases_UTF_8);
    }

    /**
     * Returns the string "UTF8".
     *
     * NOTE(review): presumably this is the pre-NIO name of the encoding as
     * reported by the older java.io/java.lang APIs -- confirm against the
     * interface that declares this method.
     *
     * @return the constant "UTF8"
     */
    public String historicalName() {
        return "UTF8";
    }

    /**
     * Creates a new decoder for this charset.
     *
     * @return a fresh {@code Decoder} instance constructed with this charset
     */
    public CharsetDecoder newDecoder() {
        return new Decoder(this);
    }

    /**
     * Creates a new encoder for this charset.
     *
     * @return a fresh {@code Encoder} instance constructed with this charset
     */
    public CharsetEncoder newEncoder() {
        return new Encoder(this);
    }


    /**
     * Decoder from UTF-8 bytes to UTF-16 chars.
     *
     * Only well-formed UTF-8 as defined by RFC 3629 / the Unicode Standard is
     * accepted.  The following inputs are reported as malformed instead of
     * being decoded:
     *
     *   - stray continuation bytes (10xxxxxx) in lead position;
     *   - non-shortest ("overlong") forms: lead bytes C0/C1, E0 followed by
     *     80..9F, F0 followed by 80..8F;
     *   - three-byte encodings of the surrogate range U+D800..U+DFFF
     *     (ED followed by A0..BF);
     *   - values above U+10FFFF (F4 followed by 90..BF, lead bytes F5..F7);
     *   - the obsolete five- and six-byte forms (lead bytes F8..FD) and the
     *     bytes FE/FF.
     *
     * Malformed-input lengths: when the lead byte, or the lead byte together
     * with the second byte, already rules out a legal sequence the reported
     * length is 1; when a later byte is not a continuation byte the reported
     * length is the number of bytes that preceded it.
     */
    private static class Decoder extends CharsetDecoder {

        /**
         * @param cs the charset this decoder belongs to; average and maximum
         *           chars-per-byte are both declared as 1.0
         */
        private Decoder(Charset cs) {
            super(cs, 1.0f, 1.0f);
        }

        /** Tells whether b has the continuation-byte pattern 10xxxxxx. */
        private static boolean isContinuation(int b) {
            return ((b & 0xc0) == 0x80);
        }

        /**
         * Tells whether a two-byte lead (110xxxxx) is C0 or C1.  Those leads
         * can only produce values below 0x80, i.e. an overlong encoding.
         */
        private static boolean isOverlongLead2(int b1) {
            return ((b1 & 0x1e) == 0);
        }

        /**
         * Tells whether the first two bytes of a three-byte sequence already
         * make it illegal.  b2 must be a continuation byte.
         *
         * The upper ten bits of the 16-bit value are fixed by b1 and b2, which
         * is enough to recognise values below 0x800 (overlong) and values in
         * the surrogate range 0xD800..0xDFFF.
         */
        private static boolean isIllegal3(int b1, int b2) {
            int top = ((b1 & 0x0f) << 6) | (b2 & 0x3f);     // value >> 6
            return (top < 0x20) || (top >= 0x360 && top <= 0x37f);
        }

        /**
         * Tells whether the first two bytes of a four-byte sequence already
         * make it illegal.  b2 must be a continuation byte.
         *
         * The upper nine bits of the 21-bit value are fixed by b1 and b2,
         * which is enough to recognise values below 0x10000 (overlong) and
         * values above 0x10FFFF.
         */
        private static boolean isIllegal4(int b1, int b2) {
            int top = ((b1 & 0x07) << 6) | (b2 & 0x3f);     // value >> 12
            return (top < 0x10) || (top > 0x10f);
        }

        // Writes a supplementary code point as a surrogate pair; a negative
        // return value signals failure, with the result held by error().
        private final Surrogate.Generator sgg = new Surrogate.Generator();

        /**
         * Decodes directly on the backing arrays of both buffers.
         *
         * The buffer positions are updated in the finally clause, so on every
         * exit they reflect exactly the input consumed and output produced by
         * completely decoded sequences.
         *
         * @param src array-backed source of UTF-8 bytes
         * @param dst array-backed destination for chars
         * @return UNDERFLOW when the input is exhausted or ends inside a
         *         sequence, OVERFLOW when dst cannot hold the next character,
         *         or a malformed-input result
         */
        private CoderResult decodeArrayLoop(ByteBuffer src,
                                            CharBuffer dst)
        {
            byte[] sa = src.array();
            int sp = src.arrayOffset() + src.position();
            int sl = src.arrayOffset() + src.limit();
            assert (sp <= sl);
            sp = (sp <= sl ? sp : sl);
            char[] da = dst.array();
            int dp = dst.arrayOffset() + dst.position();
            int dl = dst.arrayOffset() + dst.limit();
            assert (dp <= dl);
            dp = (dp <= dl ? dp : dl);

            try {
                while (sp < sl) {
                    int b1 = sa[sp];        // sign-extended; masked below
                    int b2, b3, b4;
                    switch ((b1 >> 4) & 0x0f) {

                    case 0: case 1: case 2: case 3:
                    case 4: case 5: case 6: case 7:
                        // 1 byte, 7 bits: 0xxxxxxx
                        if (dl - dp < 1)
                            return CoderResult.OVERFLOW;
                        da[dp++] = (char)(b1 & 0x7f);
                        sp++;
                        continue;

                    case 12: case 13:
                        // 2 bytes, 11 bits: 110xxxxx 10xxxxxx
                        // C0 and C1 can never start a legal sequence, so
                        // reject them without waiting for more input.
                        if (isOverlongLead2(b1))
                            return CoderResult.malformedForLength(1);
                        if (sl - sp < 2)
                            return CoderResult.UNDERFLOW;
                        if (dl - dp < 1)
                            return CoderResult.OVERFLOW;
                        if (!isContinuation(b2 = sa[sp + 1]))
                            return CoderResult.malformedForLength(1);
                        da[dp++] = ((char)(((b1 & 0x1f) << 6) |
                                           ((b2 & 0x3f) << 0)));
                        sp += 2;
                        continue;

                    case 14:
                        // 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
                        if (sl - sp < 3)
                            return CoderResult.UNDERFLOW;
                        if (dl - dp < 1)
                            return CoderResult.OVERFLOW;
                        // Overlong forms and surrogates are decided by the
                        // first two bytes, so only the lead byte is reported.
                        if (!isContinuation(b2 = sa[sp + 1])
                            || isIllegal3(b1, b2))
                            return CoderResult.malformedForLength(1);
                        if (!isContinuation(b3 = sa[sp + 2]))
                            return CoderResult.malformedForLength(2);
                        da[dp++] = ((char)(((b1 & 0x0f) << 12) |
                                           ((b2 & 0x3f) << 6) |
                                           ((b3 & 0x3f) << 0)));
                        sp += 3;
                        continue;

                    case 15: {
                        // 4 bytes, 21 bits:
                        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                        // Leads F5..FF either exceed U+10FFFF or belong to
                        // the obsolete 5/6-byte forms.
                        if ((b1 & 0x0f) > 4)
                            return CoderResult.malformedForLength(1);
                        if (sl - sp < 4)
                            return CoderResult.UNDERFLOW;
                        if (!isContinuation(b2 = sa[sp + 1])
                            || isIllegal4(b1, b2))
                            return CoderResult.malformedForLength(1);
                        if (!isContinuation(b3 = sa[sp + 2]))
                            return CoderResult.malformedForLength(2);
                        if (!isContinuation(b4 = sa[sp + 3]))
                            return CoderResult.malformedForLength(3);
                        int uc = (((b1 & 0x07) << 18) |
                                  ((b2 & 0x3f) << 12) |
                                  ((b3 & 0x3f) << 6) |
                                  ((b4 & 0x3f) << 0));

                        // uc is now within U+10000..U+10FFFF; the generator
                        // performs the destination-space check itself.
                        int gn = sgg.generate(uc, 4, da, dp, dl);
                        if (gn < 0)
                            return sgg.error();
                        dp += gn;
                        sp += 4;
                        continue;
                    }

                    default:
                        // 10xxxxxx: continuation byte in lead position
                        return CoderResult.malformedForLength(1);

                    }

                }

                return CoderResult.UNDERFLOW;
            } finally {
                src.position(sp - src.arrayOffset());
                dst.position(dp - dst.arrayOffset());
            }
        }

        /**
         * Decodes through the relative get/put methods, for buffers without
         * an accessible backing array.
         *
         * mark tracks the position just past the last completely decoded
         * sequence; the finally clause rewinds src to it, so bytes read while
         * examining an incomplete or malformed sequence are not consumed.
         *
         * @param src source of UTF-8 bytes
         * @param dst destination for chars
         * @return UNDERFLOW when the input is exhausted or ends inside a
         *         sequence, OVERFLOW when dst cannot hold the next character,
         *         or a malformed-input result
         */
        private CoderResult decodeBufferLoop(ByteBuffer src,
                                             CharBuffer dst)
        {
            int mark = src.position();
            try {
                while (src.hasRemaining()) {
                    int b1 = src.get();     // sign-extended; masked below
                    int b2, b3, b4;
                    switch ((b1 >> 4) & 0x0f) {

                    case 0: case 1: case 2: case 3:
                    case 4: case 5: case 6: case 7:
                        // 1 byte, 7 bits: 0xxxxxxx
                        if (dst.remaining() < 1)
                            return CoderResult.OVERFLOW;
                        dst.put((char)b1);
                        mark++;
                        continue;

                    case 12: case 13:
                        // 2 bytes, 11 bits: 110xxxxx 10xxxxxx
                        // C0 and C1 can never start a legal sequence, so
                        // reject them without waiting for more input.
                        if (isOverlongLead2(b1))
                            return CoderResult.malformedForLength(1);
                        if (src.remaining() < 1)
                            return CoderResult.UNDERFLOW;
                        if (dst.remaining() < 1)
                            return CoderResult.OVERFLOW;
                        if (!isContinuation(b2 = src.get()))
                            return CoderResult.malformedForLength(1);
                        dst.put((char)(((b1 & 0x1f) << 6) |
                                       ((b2 & 0x3f) << 0)));
                        mark += 2;
                        continue;

                    case 14:
                        // 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
                        if (src.remaining() < 2)
                            return CoderResult.UNDERFLOW;
                        if (dst.remaining() < 1)
                            return CoderResult.OVERFLOW;
                        // Overlong forms and surrogates are decided by the
                        // first two bytes, so only the lead byte is reported.
                        if (!isContinuation(b2 = src.get())
                            || isIllegal3(b1, b2))
                            return CoderResult.malformedForLength(1);
                        if (!isContinuation(b3 = src.get()))
                            return CoderResult.malformedForLength(2);
                        dst.put((char)(((b1 & 0x0f) << 12) |
                                       ((b2 & 0x3f) << 6) |
                                       ((b3 & 0x3f) << 0)));
                        mark += 3;
                        continue;

                    case 15: {
                        // 4 bytes, 21 bits:
                        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                        // Leads F5..FF either exceed U+10FFFF or belong to
                        // the obsolete 5/6-byte forms.
                        if ((b1 & 0x0f) > 4)
                            return CoderResult.malformedForLength(1);
                        if (src.remaining() < 3)
                            return CoderResult.UNDERFLOW;
                        if (!isContinuation(b2 = src.get())
                            || isIllegal4(b1, b2))
                            return CoderResult.malformedForLength(1);
                        if (!isContinuation(b3 = src.get()))
                            return CoderResult.malformedForLength(2);
                        if (!isContinuation(b4 = src.get()))
                            return CoderResult.malformedForLength(3);
                        int uc = (((b1 & 0x07) << 18) |
                                  ((b2 & 0x3f) << 12) |
                                  ((b3 & 0x3f) << 6) |
                                  ((b4 & 0x3f) << 0));

                        // uc is now within U+10000..U+10FFFF; the generator
                        // performs the destination-space check itself.
                        if (sgg.generate(uc, 4, dst) < 0)
                            return sgg.error();
                        mark += 4;
                        continue;
                    }

                    default:
                        // 10xxxxxx: continuation byte in lead position
                        return CoderResult.malformedForLength(1);

                    }

                }
                return CoderResult.UNDERFLOW;
            } finally {
                src.position(mark);
            }
        }

        /**
         * Dispatches to the array-based loop when both buffers expose a
         * backing array, and to the get/put based loop otherwise.
         */
        protected CoderResult decodeLoop(ByteBuffer src,
                                         CharBuffer dst)
        {
            if (src.hasArray() && dst.hasArray())
                return decodeArrayLoop(src, dst);
            else
                return decodeBufferLoop(src, dst);
        }

    }


    /**
     * Encoder from UTF-16 chars to UTF-8 bytes.
     *
     * Chars below 0x80 become one byte, chars below 0x800 two bytes, all
     * other non-surrogate chars three bytes.  Surrogates are handed to the
     * surrogate parser and a successfully parsed code point is written as
     * four bytes.
     */
    private static class Encoder extends CharsetEncoder {

        /**
         * @param cs the charset this encoder belongs to; declared with an
         *           average of 1.1 and a maximum of 4.0 bytes per char
         */
        private Encoder(Charset cs) {
            super(cs, 1.1f, 4.0f);
        }

        /** A single char is encodable exactly when it is not a surrogate. */
        public boolean canEncode(char c) {
            if (Surrogate.is(c))
                return false;
            return true;
        }

        // Turns surrogate input into a code point; a negative result from
        // parse() signals failure, with the result held by error().
        private final Surrogate.Parser sgp = new Surrogate.Parser();

        /**
         * Encodes directly on the backing arrays of both buffers.  The buffer
         * positions are written back in the finally clause on every exit.
         *
         * @param src array-backed source of chars
         * @param dst array-backed destination for bytes
         * @return UNDERFLOW once all input is consumed, OVERFLOW when dst
         *         cannot hold the next encoded character, or the surrogate
         *         parser's error result
         */
        private CoderResult encodeArrayLoop(CharBuffer src,
                                            ByteBuffer dst)
        {
            final char[] in = src.array();
            final int inBase = src.arrayOffset();
            int inPos = inBase + src.position();
            final int inEnd = inBase + src.limit();
            assert (inPos <= inEnd);
            if (inPos > inEnd)
                inPos = inEnd;

            final byte[] out = dst.array();
            final int outBase = dst.arrayOffset();
            int outPos = outBase + dst.position();
            final int outEnd = outBase + dst.limit();
            assert (outPos <= outEnd);
            if (outPos > outEnd)
                outPos = outEnd;

            try {
                while (inPos < inEnd) {
                    final char c = in[inPos];

                    if (c < 0x80) {
                        // 1 byte, 7 bits
                        if (outPos >= outEnd)
                            return CoderResult.OVERFLOW;
                        out[outPos++] = (byte)c;
                        inPos++;
                    } else if (Surrogate.is(c)) {
                        // Surrogate: let the parser combine the pair
                        final int uc = sgp.parse(c, in, inPos, inEnd);
                        if (uc < 0)
                            return sgp.error();
                        if (uc >= 0x200000) {
                            // Does not fit in 21 bits; not expected
                            assert false;
                            continue;
                        }
                        // 4 bytes, 21 bits
                        if (outEnd - outPos < 4)
                            return CoderResult.OVERFLOW;
                        out[outPos]     = (byte)(0xf0 | (uc >> 18));
                        out[outPos + 1] = (byte)(0x80 | ((uc >> 12) & 0x3f));
                        out[outPos + 2] = (byte)(0x80 | ((uc >> 6) & 0x3f));
                        out[outPos + 3] = (byte)(0x80 | (uc & 0x3f));
                        outPos += 4;
                        inPos += sgp.increment();
                    } else if (c < 0x800) {
                        // 2 bytes, 11 bits
                        if (outEnd - outPos < 2)
                            return CoderResult.OVERFLOW;
                        out[outPos]     = (byte)(0xc0 | (c >> 6));
                        out[outPos + 1] = (byte)(0x80 | (c & 0x3f));
                        outPos += 2;
                        inPos++;
                    } else {
                        // 3 bytes, 16 bits
                        if (outEnd - outPos < 3)
                            return CoderResult.OVERFLOW;
                        out[outPos]     = (byte)(0xe0 | (c >> 12));
                        out[outPos + 1] = (byte)(0x80 | ((c >> 6) & 0x3f));
                        out[outPos + 2] = (byte)(0x80 | (c & 0x3f));
                        outPos += 3;
                        inPos++;
                    }
                }
                return CoderResult.UNDERFLOW;
            } finally {
                src.position(inPos - inBase);
                dst.position(outPos - outBase);
            }
        }

        /**
         * Encodes through the relative get/put methods, for buffers without
         * an accessible backing array.
         *
         * consumed tracks the position just past the last completely encoded
         * character; the finally clause rewinds src to it, so chars read but
         * not encoded are not consumed.
         *
         * @param src source of chars
         * @param dst destination for bytes
         * @return UNDERFLOW once all input is consumed, OVERFLOW when dst
         *         cannot hold the next encoded character, or the surrogate
         *         parser's error result
         */
        private CoderResult encodeBufferLoop(CharBuffer src,
                                             ByteBuffer dst)
        {
            int consumed = src.position();
            try {
                while (src.hasRemaining()) {
                    final char c = src.get();

                    if (c < 0x80) {
                        // 1 byte, 7 bits
                        if (dst.remaining() < 1)
                            return CoderResult.OVERFLOW;
                        dst.put((byte)c);
                        consumed++;
                    } else if (Surrogate.is(c)) {
                        // Surrogate: let the parser combine the pair
                        final int uc = sgp.parse(c, src);
                        if (uc < 0)
                            return sgp.error();
                        if (uc >= 0x200000) {
                            // Does not fit in 21 bits; not expected
                            assert false;
                            continue;
                        }
                        // 4 bytes, 21 bits
                        if (dst.remaining() < 4)
                            return CoderResult.OVERFLOW;
                        dst.put((byte)(0xf0 | (uc >> 18)))
                           .put((byte)(0x80 | ((uc >> 12) & 0x3f)))
                           .put((byte)(0x80 | ((uc >> 6) & 0x3f)))
                           .put((byte)(0x80 | (uc & 0x3f)));
                        consumed += sgp.increment();
                    } else if (c < 0x800) {
                        // 2 bytes, 11 bits
                        if (dst.remaining() < 2)
                            return CoderResult.OVERFLOW;
                        dst.put((byte)(0xc0 | (c >> 6)))
                           .put((byte)(0x80 | (c & 0x3f)));
                        consumed++;
                    } else {
                        // 3 bytes, 16 bits
                        if (dst.remaining() < 3)
                            return CoderResult.OVERFLOW;
                        dst.put((byte)(0xe0 | (c >> 12)))
                           .put((byte)(0x80 | ((c >> 6) & 0x3f)))
                           .put((byte)(0x80 | (c & 0x3f)));
                        consumed++;
                    }
                }
                return CoderResult.UNDERFLOW;
            } finally {
                src.position(consumed);
            }
        }

        /**
         * Dispatches to the array-based loop when both buffers expose a
         * backing array, and to the get/put based loop otherwise.
         */
        protected final CoderResult encodeLoop(CharBuffer src,
                                               ByteBuffer dst)
        {
            return (src.hasArray() && dst.hasArray())
                ? encodeArrayLoop(src, dst)
                : encodeBufferLoop(src, dst);
        }

    }

}