jdk-sandbox: src/jdk.charsets/share/classes/sun/nio/cs/ext/JISAutoDetect.java@5d15fd7e9bb1


/*
 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package sun.nio.cs.ext;

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.MalformedInputException;
import sun.nio.cs.DelegatableDecoder;
import sun.nio.cs.HistoricallyNamedCharset;
import java.security.AccessController;
import java.security.PrivilegedAction;
import sun.nio.cs.*;
import static java.lang.Character.UnicodeBlock;


/**
 * Decode-only charset named {@code x-JISAutoDetect}.
 *
 * <p>Its decoder inspects the incoming bytes and hands the actual decoding
 * to an ISO-2022-JP, EUC-JP or Shift_JIS style decoder.  Encoding is not
 * supported: {@link #canEncode()} returns {@code false} and
 * {@link #newEncoder()} throws {@link UnsupportedOperationException}.
 */
public class JISAutoDetect
    extends Charset
    implements HistoricallyNamedCharset
{

    // Bit masks; none of them is referenced by the code in this class.
    private static final int EUCJP_MASK       = 0x01;
    private static final int SJIS2B_MASK      = 0x02;
    private static final int SJIS1B_MASK      = 0x04;
    private static final int EUCJP_KANA1_MASK = 0x08;
    private static final int EUCJP_KANA2_MASK = 0x10;

    /**
     * Registers the charset under its canonical name together with the
     * aliases supplied by {@code ExtendedCharsets}.
     */
    public JISAutoDetect() {
        super("x-JISAutoDetect", ExtendedCharsets.aliasesFor("x-JISAutoDetect"));
    }

    /**
     * Reports whether {@code cs} is one of the charsets this one claims to
     * contain: US-ASCII (matched by name) or an instance of {@code SJIS},
     * {@code EUC_JP} or {@code ISO2022_JP}.
     *
     * @param cs the charset to test
     * @return {@code true} if {@code cs} is contained in this charset
     */
    public boolean contains(Charset cs) {
        if (cs.name().equals("US-ASCII")) {
            return true;
        }
        return cs instanceof SJIS
            || cs instanceof EUC_JP
            || cs instanceof ISO2022_JP;
    }

    /**
     * @return always {@code false}; this charset only decodes
     */
    public boolean canEncode() {
        return false;
    }

    /**
     * @return a fresh auto-detecting decoder bound to this charset
     */
    public CharsetDecoder newDecoder() {
        return new Decoder(this);
    }

    /**
     * @return the historical name, {@code "JISAutoDetect"}
     */
    public String historicalName() {
        return "JISAutoDetect";
    }

    /**
     * Always fails, because this charset cannot encode.
     *
     * @throws UnsupportedOperationException on every call
     */
    public CharsetEncoder newEncoder() {
        throw new UnsupportedOperationException();
    }

    /**
     * Heuristic used to guess whether EUC-decoded text really might be
     * Japanese: answers {@code true} as soon as a second character from
     * U+3040..U+309F (hiragana) or a second character from U+FF65..U+FF9F
     * (halfwidth katakana) has been read.  Better heuristics are possible.
     *
     * <p>The buffer is consumed up to the point where the answer is known.
     *
     * @param cb decoded characters to examine, read from its current position
     * @return {@code true} if the text looks like Japanese
     */
    private static boolean looksLikeJapanese(CharBuffer cb) {
        int hiraganaSeen = 0;
        int halfwidthKatakanaSeen = 0;
        while (cb.hasRemaining()) {
            final char ch = cb.get();
            // The two ranges are disjoint, so at most one counter moves per char.
            if (ch >= 0x3040 && ch <= 0x309f) {
                hiraganaSeen++;
                if (hiraganaSeen > 1) {
                    return true;
                }
            } else if (ch >= 0xff65 && ch <= 0xff9f) {
                halfwidthKatakanaSeen++;
                if (halfwidthKatakanaSeen > 1) {
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * Decoder that copies leading plain ASCII itself, then picks a concrete
     * delegate decoder on the first non-trivial input and forwards all
     * decoding to it until the decoder is reset.
     */
    private static class Decoder extends CharsetDecoder {
        // Must be initialised before the two charset names below, which are
        // derived from it.
        private static final String osName = AccessController.doPrivileged(
            (PrivilegedAction<String>) () -> System.getProperty("os.name"));

        private static final String SJISName = getSJISName();
        private static final String EUCJPName = getEUCJPName();

        // The delegate selected by detection; null while nothing is detected.
        private DelegatableDecoder detectedDecoder = null;

        /**
         * @param cs the charset that created this decoder
         */
        public Decoder(Charset cs) {
            // averageCharsPerByte = 0.5, maxCharsPerByte = 1.0
            super(cs, 0.5f, 1.0f);
        }

        /**
         * A byte counts as plain ASCII when it is non-negative (0x00..0x7f)
         * and is not 0x1b (ESC) -- presumably excluded because ESC starts
         * ISO-2022-JP escape sequences.
         */
        private static boolean isPlainASCII(byte b) {
            return !(b < 0 || b == 0x1b);
        }

        /**
         * Transfers the run of plain ASCII bytes at the front of {@code src}
         * into {@code dst}, stopping at the first other byte or when either
         * buffer runs out, and advances {@code src} past the copied bytes.
         */
        private static void copyLeadingASCII(ByteBuffer src, CharBuffer dst) {
            final int first = src.position();
            final int end = first + Math.min(src.remaining(), dst.remaining());
            int idx = first;
            while (idx < end) {
                final byte cur = src.get(idx);      // absolute read; position untouched
                if (!isPlainASCII(cur)) {
                    break;
                }
                dst.put((char) (cur & 0xff));
                idx++;
            }
            src.position(idx);
        }

        /**
         * Commits to {@code decoder}: resets it (it may already have been run
         * on a trial copy of the input), records it as the detected decoder
         * and lets it decode the real buffers.
         */
        private CoderResult decodeLoop(DelegatableDecoder decoder,
                                       ByteBuffer src, CharBuffer dst) {
            ((CharsetDecoder) decoder).reset();
            detectedDecoder = decoder;
            return decoder.decodeLoop(src, dst);
        }

        /**
         * Decodes {@code src} into {@code dst}.  If a delegate has already
         * been detected the call is simply forwarded.  Otherwise leading
         * ASCII is copied, and the remaining input is trial-decoded on
         * read-only views of {@code src} to choose between ISO-2022-JP,
         * EUC-JP and Shift_JIS.
         */
        protected CoderResult decodeLoop(ByteBuffer src, CharBuffer dst) {
            if (detectedDecoder != null) {
                return detectedDecoder.decodeLoop(src, dst);
            }

            copyLeadingASCII(src, dst);

            // Everything was ASCII.
            if (!src.hasRemaining()) {
                return CoderResult.UNDERFLOW;
            }
            // Overflow only if there is still ASCII pending but no room for it.
            if (!dst.hasRemaining() && isPlainASCII(src.get(src.position()))) {
                return CoderResult.OVERFLOW;
            }

            // Double (not float) arithmetic: float would drop low-order bits
            // once src is larger than 2**24.
            final int sandboxSize =
                (int) (src.limit() * (double) maxCharsPerByte());
            final CharBuffer sandbox = CharBuffer.allocate(sandboxSize);

            // ISO-2022-JP goes first because it is unambiguous.
            final DelegatableDecoder iso2022Decoder =
                (DelegatableDecoder) Charset.forName("ISO-2022-JP").newDecoder();
            final ByteBuffer iso2022View = src.asReadOnlyBuffer();
            if (!iso2022Decoder.decodeLoop(iso2022View, sandbox).isError()) {
                return decodeLoop(iso2022Decoder, src, dst);
            }

            // Otherwise the choice is between EUC and SJIS.
            final Charset eucCharset = Charset.forName(EUCJPName);
            final Charset sjisCharset = Charset.forName(SJISName);
            final DelegatableDecoder eucDecoder =
                (DelegatableDecoder) eucCharset.newDecoder();
            final DelegatableDecoder sjisDecoder =
                (DelegatableDecoder) sjisCharset.newDecoder();

            // Trial-decode as EUC, reusing the sandbox; failure means SJIS.
            final ByteBuffer eucView = src.asReadOnlyBuffer();
            sandbox.clear();
            if (eucDecoder.decodeLoop(eucView, sandbox).isError()) {
                return decodeLoop(sjisDecoder, src, dst);
            }

            // Trial-decode as SJIS into its own sandbox; failure means EUC.
            final ByteBuffer sjisView = src.asReadOnlyBuffer();
            final CharBuffer sjisSandbox = CharBuffer.allocate(sandboxSize);
            if (sjisDecoder.decodeLoop(sjisView, sjisSandbox).isError()) {
                return decodeLoop(eucDecoder, src, dst);
            }

            // Both succeeded, so from here on we have to guess.

            // Prefer the interpretation that does not appear to stop
            // mid-character, i.e. the one that consumed more input.
            final int eucEnd = eucView.position();
            final int sjisEnd = sjisView.position();
            if (eucEnd != sjisEnd) {
                return decodeLoop(eucEnd > sjisEnd ? eucDecoder : sjisDecoder,
                                  src, dst);
            }

            // Neither trial consumed anything: input ends after the first
            // byte of the first character, so wait for more.
            if (src.position() == eucEnd) {
                return CoderResult.UNDERFLOW;
            }

            // Fall back on what typical Japanese text looks like, judged on
            // the EUC-decoded characters.
            sandbox.flip();
            final DelegatableDecoder guess =
                looksLikeJapanese(sandbox) ? eucDecoder : sjisDecoder;
            return decodeLoop(guess, src, dst);
        }

        /**
         * Forgets the detected delegate so that detection runs again.
         */
        protected void implReset() {
            detectedDecoder = null;
        }

        /**
         * Flushes through the detected delegate when there is one, otherwise
         * through the superclass implementation.
         */
        protected CoderResult implFlush(CharBuffer out) {
            if (detectedDecoder == null) {
                return super.implFlush(out);
            }
            return detectedDecoder.implFlush(out);
        }

        /**
         * @return always {@code true}
         */
        public boolean isAutoDetecting() {
            return true;
        }

        /**
         * @return {@code true} once a delegate decoder has been selected
         */
        public boolean isCharsetDetected() {
            return detectedDecoder != null;
        }

        /**
         * @return the charset of the selected delegate decoder
         * @throws IllegalStateException if no delegate has been selected yet
         */
        public Charset detectedCharset() {
            final DelegatableDecoder delegate = detectedDecoder;
            if (delegate != null) {
                return ((CharsetDecoder) delegate).charset();
            }
            throw new IllegalStateException("charset not yet detected");
        }


        /**
         * Returned Shift_JIS Charset name is OS dependent
         */
        private static String getSJISName() {
            final boolean solaris =
                osName.equals("Solaris") || osName.equals("SunOS");
            if (solaris) {
                return "PCK";
            }
            return osName.startsWith("Windows") ? "windows-31J" : "Shift_JIS";
        }

        /**
         * Returned EUC-JP Charset name is OS dependent
         */
        private static String getEUCJPName() {
            final boolean solaris =
                osName.equals("Solaris") || osName.equals("SunOS");
            return solaris ? "x-eucjp-open" : "EUC_JP";
        }

    }
}
author	sjohanss
	Thu, 09 Nov 2017 10:11:19 +0100
changeset 47817	5d15fd7e9bb1
parent 47216	71c04702a3d5
permissions	-rw-r--r--