jdk-sandbox: src/demo/share/jpackager/JNLPConverter/src/jnlp/converter/parser/xml/XMLEncoding.java@eaca4369b068


/*
 * Copyright (c) 2006, 2018, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package jnlp.converter.parser.xml;

import java.io.ByteArrayInputStream;
import java.io.EOFException;
import java.io.InputStreamReader;
import java.io.IOException;
import java.io.Reader;
import java.io.UnsupportedEncodingException;

public class XMLEncoding {
    /**
     * Decodes a byte array into a String by testing for a Byte Order Mark
     * (BOM) or an XML declaration.
     * <p>
     * Detection begins by examining the first four octets of the data for a
     * BOM or for the start of an XML declaration in a known encoding family.
     * If an XML declaration is found in an ASCII-compatible or EBCDIC
     * encoding, its encoding pseudo-attribute (if any) is used. If the
     * encoding still can not be determined at this point, then UTF-8 is
     * assumed.
     *
     * @param data  an array of bytes containing an encoded XML document.
     *
     * @return A string containing the decoded XML document.
     *
     * @throws EOFException if {@code data} holds fewer than four bytes.
     * @throws UnsupportedEncodingException if the first four bytes indicate
     *         an unusual octet order, or if the detected or declared
     *         encoding name is not supported by the runtime.
     * @throws IOException if reading the XML declaration fails.
     */
    public static String decodeXML(byte [] data) throws IOException {
        if (data.length < BOM_LENGTH) {
            throw (new EOFException("encoding.error.not.xml"));
        }

        // pack the first four octets, big-endian, into one int so that they
        // can be compared against the signature constants below
        int firstFour = (((data[0] & 0xff) << 24) |
                         ((data[1] & 0xff) << 16) |
                         ((data[2] & 0xff) <<  8) |
                          (data[3] & 0xff));

        int    start = 0;
        String encoding;

        switch (firstFour) {
            case EBCDIC:
                // '<?xm' in EBCDIC; examine the encoding declaration
                encoding = examineEncodingDeclaration(data, IBM037_ENC);
                break;

            case XML_DECLARATION:
                // '<?xm' in an ASCII-compatible encoding; assume UTF-8, but
                // examine the encoding declaration
                encoding = examineEncodingDeclaration(data, UTF_8_ENC);
                break;

            case UTF_16BE:
                // '<?' as two big-endian 16-bit units, no BOM
                encoding = UTF_16BE_ENC;
                break;

            case UTF_16LE:
                // '<?' as two little-endian 16-bit units, no BOM
                encoding = UTF_16LE_ENC;
                break;

            case UNUSUAL_OCTET_1:
            case UNUSUAL_OCTET_2:
                throw (new UnsupportedEncodingException("encoding.error.unusual.octet"));

            case UTF_32_BE_BOM:
            case UTF_32_LE_BOM:
                // the BOM is left in place; the decoder is given the generic
                // name and sees the BOM itself
                encoding = UTF_32_ENC;
                break;

            default:
                if ((firstFour & 0xffffff00) == UTF_8_BOM) {
                    // the BOM is not wanted in the decoded text, so skip
                    // those 3 bytes instead of handing them to the decoder
                    start    = 3;
                    encoding = UTF_8_ENC;
                } else {
                    int firstTwo = firstFour & 0xffff0000;

                    if ((firstTwo == UTF_16_BE_BOM) ||
                        (firstTwo == UTF_16_LE_BOM)) {
                        // the BOM is left in place for the decoder
                        encoding = UTF_16_ENC;
                    } else {
                        // this is probably UTF-8 without the encoding
                        // declaration
                        encoding = UTF_8_ENC;
                    }
                }
                break;
        }

        return (new String(data, start, data.length - start, encoding));
    }

    /**
     * Scans the XML declaration at the start of {@code data} for an encoding
     * pseudo-attribute and returns its value. The data is read using the
     * supplied initial encoding guess; that guess is returned whenever the
     * data does not begin with a well-formed enough declaration or the
     * declaration ends before an encoding name is found.
     * <pre>
     * [3]  S            ::= ( #x20 | #x09 | #x0d | #x0a )
     * [23] XMLDecl      ::= '&lt;?xml' VersionInfo EncodingDecl? SDDecl? S? '?&gt;'
     * [24] VersionInfo  ::= S 'version' Eq ( '"' VersionNum '"' |
     *                                        "'" VersionNum "'" )
     * [25] Eq           ::= S? '=' S?
     * [26] VersionNum   ::= ([a-zA-Z0-9_.:] | '-')+
     * [80] EncodingDecl ::= S 'encoding' Eq ( '"' EncName '"' |
     *                                         "'" EncName "'" )
     * [81] EncName      ::= [a-zA-Z] ([a-zA-Z0-9_.] | '-')*
     * </pre>
     *
     * @param data      the encoded XML document.
     * @param encoding  the initial encoding guess used to read the
     *                  declaration; UTF-8 is used when this is null.
     *
     * @return the declared encoding name, or the initial guess if none was
     *         found.
     *
     * @throws IOException if the initial encoding is not supported or the
     *         data can not be read.
     */
    private static String examineEncodingDeclaration(byte [] data,
                          String    encoding) throws IOException {
        String fallback = ((encoding != null) ? encoding : UTF_8_ENC);

        try (Reader reader = new InputStreamReader(
                                 new ByteArrayInputStream(data), fallback)) {
            int ch = reader.read();

            // if this is an XML declaration, it will start with the text
            // '<?xml'
            for (int i = 0; i < XML_DECL_START.length(); i++) {
                if (ch != XML_DECL_START.charAt(i)) {
                    return (fallback);
                }
                ch = reader.read();
            }

            // there must be at least one whitespace character next;
            // otherwise this is some other processing instruction whose
            // target merely starts with 'xml' (e.g. '<?xml-stylesheet') and
            // nothing in it may be taken as an encoding declaration
            if (!isWhitespace(ch)) {
                return (fallback);
            }
            do {
                ch = reader.read();
            } while (isWhitespace(ch));

            // now look for the text 'encoding', but if the end of the XML
            // declaration (signified by the text '?>') comes first, then
            // keep the initial guess
            boolean recognized = false;   // matched the text 'encoding'
            boolean almost     = false;   // ... and the '=' that follows it
            boolean question   = false;   // a '?' has been seen
            int     pos        = 0;       // characters of 'encoding' matched

            while (ch != -1) {
                if (recognized) {
                    // this is the encoding declaration as long as the next
                    // few characters are whitespace and/or the equals sign
                    switch (ch) {
                        case SPACE:
                        case TAB:
                        case LINEFEED:
                        case RETURN:
                            // don't need to do anything
                            break;

                        case EQUAL:
                            if (almost) {
                                // a second '='; not valid XML, so punt
                                return (fallback);
                            }
                            almost = true;
                            break;

                        case DOUBLE_QUOTE:
                        case SINGLE_QUOTE:
                            if (!almost) {
                                // a quote before the equal; not valid XML,
                                // so punt
                                return (fallback);
                            }
                            return (readEncodingName(reader, fallback));

                        default:
                            if (almost) {
                                // junk between '=' and the quote; punt
                                return (fallback);
                            }
                            // this wasn't the encoding declaration, so go
                            // back to searching without reading the next
                            // character
                            recognized = false;
                            pos        = 0;
                            continue;
                    }
                } else if (ch == ENCODING_DECL.charAt(pos)) {
                    pos++;
                    if (pos == ENCODING_DECL.length()) {
                        // this looks like the encoding declaration
                        recognized = true;
                    }
                } else if (ch == '?') {
                    question = true;
                    pos      = 0;
                } else if ((ch == '>') && question) {
                    // there is no encoding declaration, so assume that the
                    // initial encoding guess was correct
                    return (fallback);
                } else {
                    // still searching for the encoding declaration
                    pos = 0;
                }

                ch = reader.read();
            }

            // unexpected EOF
            return (fallback);
        }
    }

    /**
     * Reads an EncName (production [81]) from the reader, which must be
     * positioned just after the opening quote, up to the closing quote.
     *
     * @return the encoding name, or {@code fallback} if the characters read
     *         do not form a quoted encoding name.
     */
    private static String readEncodingName(Reader reader, String fallback)
            throws IOException {
        int ch = reader.read();

        // the first character must be a letter
        if (!isAsciiLetter(ch)) {
            return (fallback);
        }

        StringBuilder buffer = new StringBuilder(MAX_ENC_NAME);
        buffer.append((char) ch);

        while (true) {
            ch = reader.read();

            if (isAsciiLetter(ch) ||
                ((ch >= '0') && (ch <= '9')) ||
                (ch == '_') || (ch == '.') || (ch == '-')) {
                buffer.append((char) ch);
            } else if ((ch == DOUBLE_QUOTE) || (ch == SINGLE_QUOTE)) {
                // finished!
                return (buffer.toString());
            } else {
                // EOF or a character that can not appear in an encoding
                // name, so punt
                return (fallback);
            }
        }
    }

    /** Tests for the whitespace characters of production [3]. */
    private static boolean isWhitespace(int ch) {
        return ((ch == SPACE) || (ch == TAB) ||
                (ch == LINEFEED) || (ch == RETURN));
    }

    /** Tests for [a-zA-Z]. */
    private static boolean isAsciiLetter(int ch) {
        return (((ch >= 'a') && (ch <= 'z')) ||
                ((ch >= 'A') && (ch <= 'Z')));
    }

    // number of leading bytes inspected for a signature
    private static final int BOM_LENGTH   = 4;
    // initial capacity of the buffer collecting an encoding name
    private static final int MAX_ENC_NAME = 512;

    private static final int SPACE        = 0x00000020;
    private static final int TAB          = 0x00000009;
    private static final int LINEFEED     = 0x0000000a;
    private static final int RETURN       = 0x0000000d;
    private static final int EQUAL        = '=';
    private static final int DOUBLE_QUOTE = '\"';
    private static final int SINGLE_QUOTE = '\'';

    // signatures compared against the first four bytes (big-endian packed);
    // the UTF-16 and UTF-8 BOM values are compared after masking off the
    // trailing bytes
    private static final int UTF_32_BE_BOM   = 0x0000feff;
    private static final int UTF_32_LE_BOM   = 0xfffe0000;
    private static final int UTF_16_BE_BOM   = 0xfeff0000;
    private static final int UTF_16_LE_BOM   = 0xfffe0000;
    private static final int UTF_8_BOM       = 0xefbbbf00;
    private static final int UNUSUAL_OCTET_1 = 0x00003c00;
    private static final int UNUSUAL_OCTET_2 = 0x003c0000;
    private static final int UTF_16BE        = 0x003c003f;
    private static final int UTF_16LE        = 0x3c003f00;
    private static final int EBCDIC          = 0x4c6fa794;
    private static final int XML_DECLARATION = 0x3c3f786d;

    private static final String UTF_32_ENC   = "UTF-32";
    private static final String UTF_16_ENC   = "UTF-16";
    private static final String UTF_16BE_ENC = "UTF-16BE";
    private static final String UTF_16LE_ENC = "UTF-16LE";
    private static final String UTF_8_ENC    = "UTF-8";
    private static final String IBM037_ENC   = "IBM037";

    private static final String XML_DECL_START = "<?xml";
    private static final String ENCODING_DECL  = "encoding";
}
author	herrick
	Fri, 12 Oct 2018 19:00:51 -0400
branch	JDK-8200758-branch
changeset 56963	eaca4369b068
permissions	-rw-r--r--