src/demo/share/jpackager/JNLPConverter/src/jnlp/converter/parser/xml/XMLEncoding.java
branchJDK-8200758-branch
changeset 56963 eaca4369b068
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/demo/share/jpackager/JNLPConverter/src/jnlp/converter/parser/xml/XMLEncoding.java	Fri Oct 12 19:00:51 2018 -0400
@@ -0,0 +1,342 @@
+/*
+ * Copyright (c) 2006, 2018, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+package jnlp.converter.parser.xml;
+
+import java.io.ByteArrayInputStream;
+import java.io.EOFException;
+import java.io.InputStreamReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.io.UnsupportedEncodingException;
+
+public class XMLEncoding {
+    /**
+     * Decodes a byte array holding an XML document into a String, picking
+     * the character encoding from a Byte Order Mark (BOM), from the byte
+     * pattern of the document's first characters, or from the encoding
+     * declaration.
+     * <p>
+     * Detection examines the first four octets of the array, in this order:
+     * <ol>
+     *   <li>{@code "<?xm"} in EBCDIC or in an ASCII-compatible encoding: the
+     *       encoding declaration is read, falling back to IBM037 or UTF-8
+     *       respectively when no encoding name is declared;</li>
+     *   <li>{@code "<?"} encoded as UTF-16 (big or little endian) without a
+     *       BOM;</li>
+     *   <li>{@code "<"} encoded as UTF-32 (big or little endian) without a
+     *       BOM;</li>
+     *   <li>{@code "<"} as a four-octet unit in an unusual octet order, which
+     *       is rejected;</li>
+     *   <li>a UTF-32, UTF-8 or UTF-16 BOM.</li>
+     * </ol>
+     * If none of these match, UTF-8 is assumed.
+     *
+     * @param data  an array of bytes containing an encoded XML document.
+     *
+     * @return A string containing the decoded XML document.
+     *
+     * @throws EOFException if {@code data} holds fewer than four bytes.
+     * @throws UnsupportedEncodingException if the document uses an unusual
+     *         octet order, or if the detected or declared encoding is not
+     *         supported by the running JVM.
+     * @throws IOException if the encoding declaration can not be read.
+     * @throws NullPointerException if {@code data} is {@code null}.
+     */
+    public static String decodeXML(byte [] data) throws IOException {
+        int start = 0;
+        String encoding;
+
+        if (data.length < BOM_LENGTH) {
+            throw (new EOFException("encoding.error.not.xml"));
+        }
+
+        // pack the first four octets into one int, most significant first,
+        // so they can be compared against the signature constants
+        int firstFour = ((0xff000000 & ((int) data[0] << 24)) |
+                         (0x00ff0000 & ((int) data[1] << 16)) |
+                         (0x0000ff00 & ((int) data[2] <<  8)) |
+                         (0x000000ff &  (int) data[3]));
+
+        // start with the signatures that need all four octets
+        switch (firstFour) {
+            case EBCDIC:
+                // assume IBM037, but examine the encoding declaration
+                encoding = examineEncodingDeclaration(data, IBM037_ENC);
+                break;
+
+            case XML_DECLARATION:
+                // assume UTF-8, but examine the encoding declaration
+                encoding = examineEncodingDeclaration(data, UTF_8_ENC);
+                break;
+
+            case UTF_16BE:
+                encoding = UTF_16BE_ENC;
+                break;
+
+            case UTF_16LE:
+                encoding = UTF_16LE_ENC;
+                break;
+
+            case UTF_32BE:
+                encoding = UTF_32BE_ENC;
+                break;
+
+            case UTF_32LE:
+                encoding = UTF_32LE_ENC;
+                break;
+
+            case UNUSUAL_OCTET_1:
+            case UNUSUAL_OCTET_2:
+                throw (new UnsupportedEncodingException("encoding.error.unusual.octet"));
+
+            case UTF_32_BE_BOM:
+            case UTF_32_LE_BOM:
+                // the UTF-32 decoder picks the byte order from the BOM
+                encoding = UTF_32_ENC;
+                break;
+
+            default:
+                if ((firstFour & 0xffffff00) == UTF_8_BOM) {
+                    // Java's UTF-8 decoder doesn't strip the Byte Order Mark
+                    // (BOM), so skip those 3 bytes when decoding.
+                    start    = 3;
+                    encoding = UTF_8_ENC;
+                } else {
+                    int firstTwo = firstFour & 0xffff0000;
+
+                    if ((firstTwo == UTF_16_BE_BOM) ||
+                        (firstTwo == UTF_16_LE_BOM)) {
+                        // the UTF-16 decoder picks the byte order from the BOM
+                        encoding = UTF_16_ENC;
+                    } else {
+                        // this is probably UTF-8 without the encoding
+                        // declaration
+                        encoding = UTF_8_ENC;
+                    }
+                }
+                break;
+        }
+
+        return (new String(data, start, data.length - start, encoding));
+    }
+
+    /**
+     * Scans the XML declaration at the start of {@code data} for an encoding
+     * name. The bytes are read as characters using {@code encoding} (or
+     * UTF-8 when {@code encoding} is {@code null}); that same name is
+     * returned whenever the scan does not end in a complete, quoted encoding
+     * name.
+     * <p>
+     * Relevant productions from the XML specification:
+     * <pre>
+     * [3]  S            ::= ( #x20 | #x09 | #x0d | #x0a )
+     * [23] XMLDecl      ::= '&lt;?xml' VersionInfo EncodingDecl? SDDecl? S? '?&gt;'
+     * [24] VersionInfo  ::= S 'version' Eq ( '"' VersionNum '"' |
+     *                                        "'" VersionNum "'" )
+     * [25] Eq           ::= S? '=' S?
+     * [26] VersionNum   ::= ([a-zA-Z0-9_.:] | '-')+
+     * [80] EncodingDecl ::= S 'encoding' Eq ( '"' EncName '"' |
+     *                                         "'" EncName "'" )
+     * [81] EncName      ::= [a-zA-Z] ([a-zA-Z0-9_.] | '-')*
+     * </pre>
+     *
+     * @param data      the bytes of the XML document.
+     * @param encoding  the encoding used to read the declaration and returned
+     *                  when no encoding name is found; may be {@code null}.
+     *
+     * @return the declared encoding name, or the fallback encoding.
+     *
+     * @throws IOException if the fallback encoding is not supported or the
+     *         bytes can not be read.
+     */
+    private static String examineEncodingDeclaration(byte [] data,
+                          String    encoding) throws IOException {
+        boolean loop;
+        boolean recognized = false;   // the text 'encoding' has been matched
+        boolean almost     = false;   // the '=' after 'encoding' has been seen
+        boolean question   = false;   // a '?' has been seen while searching
+        boolean done       = false;   // scanning has stopped
+        int     pos        = 0;       // match position within ENCODING_DECL
+        String  result     = ((encoding != null) ? encoding : UTF_8_ENC);
+
+        try (Reader reader =
+                 new InputStreamReader(new ByteArrayInputStream(data), result)) {
+            int ch = reader.read();
+
+            // if this is an XML declaration, it will start with the text
+            // '<?xml'
+            for (int i = 0; i < XML_DECL_START.length(); i++) {
+                if (ch != XML_DECL_START.charAt(i)) {
+                    // This doesn't look like an XML declaration.  This method
+                    // should only be called if the stream contains an XML
+                    // declaration in the encoding that is passed into the
+                    // method.
+                    done = true;
+                    break;
+                }
+                // no else required; still matches
+                ch = reader.read();
+            }
+
+            // skip the whitespace that follows '<?xml'
+            loop = true;
+            while (loop && !done) {
+                switch (ch) {
+                    case SPACE:
+                    case TAB:         // intentional
+                    case LINEFEED:    // fall
+                    case RETURN:      // through
+                        ch = reader.read();
+                        break;
+
+                    case -1:
+                        // unexpected EOF
+                        done = true;
+                        break;
+
+                    default:
+                        // non-whitespace
+                        loop = false;
+                        break;
+                }
+            }
+
+            // now look for the text 'encoding', but if the end of the XML
+            // declaration (signified by the text '?>') comes first, then
+            // keep the initial encoding guess
+            loop = true;
+            while (loop && !done) {
+                if (ch == -1) {
+                    // unexpected EOF
+                    done = true;
+                    break;
+                } else if (recognized) {
+                    // this is the encoding declaration as long as the next
+                    // few characters are whitespace and/or the equals ('=')
+                    // sign
+                    switch (ch) {
+                        case SPACE:       // intentional
+                        case TAB:         // fall
+                        case LINEFEED:    // through
+                        case RETURN:
+                            // don't need to do anything
+                            break;
+
+                        case EQUAL:
+                            if (!almost) {
+                                // got the equal, now find a quote
+                                almost = true;
+                            } else {
+                                // this is not valid XML, so punt
+                                recognized = false;
+                                done       = true;
+                            }
+                            break;
+
+                        case DOUBLE_QUOTE:    // intentional
+                        case SINGLE_QUOTE:    // fall through
+                            if (almost) {
+                                // got the quote, so move on to get the value
+                                loop = false;
+                            } else {
+                                // got a quote before the equal; this is not
+                                // valid XML, so punt
+                                recognized = false;
+                                done       = true;
+                            }
+                            break;
+
+                        default:
+                            // non-whitespace
+                            recognized = false;
+                            if (almost) {
+                                // this is not valid XML, so punt
+                                done = true;
+                            }
+                            // no else required; this wasn't the encoding
+                            // declaration
+                            break;
+                    }
+
+                    if (!recognized) {
+                        // this isn't the encoding declaration, so go back to
+                        // the top without reading the next character
+                        pos = 0;
+                        continue;
+                    }
+                    // no else required; still looking good
+                } else if (ch == ENCODING_DECL.charAt(pos++)) {
+                    if (ENCODING_DECL.length() == pos) {
+                        // this looks like the encoding declaration
+                        recognized = true;
+                    }
+                    // no else required; this might be the encoding
+                    // declaration
+                } else if (ch == '?') {
+                    question = true;
+                    pos      = 0;
+                } else if ((ch == '>') && question) {
+                    // there is no encoding declaration, so assume that the
+                    // initial encoding guess was correct
+                    done = true;
+                    continue;
+                } else {
+                    // still searching for the encoding declaration
+                    pos = 0;
+                }
+
+                ch = reader.read();
+            }
+
+            if (!done) {
+                // ch is the first character after the opening quote; an
+                // encoding name must start with a letter
+                StringBuilder buffer = new StringBuilder(MAX_ENC_NAME);
+
+                if (((ch >= 'a') && (ch <= 'z')) ||
+                    ((ch >= 'A') && (ch <= 'Z'))) {
+                    // add the character to the result
+                    buffer.append((char) ch);
+
+                    while (!done) {
+                        ch = reader.read();
+
+                        if (((ch >= 'a') && (ch <= 'z')) ||
+                            ((ch >= 'A') && (ch <= 'Z')) ||
+                            ((ch >= '0') && (ch <= '9')) ||
+                            (ch == '_') || (ch == '.') || (ch == '-')) {
+                            // add the character to the result
+                            buffer.append((char) ch);
+                        } else if ((ch == DOUBLE_QUOTE) ||
+                                   (ch == SINGLE_QUOTE)) {
+                            // finished!
+                            done   = true;
+                            result = buffer.toString();
+                        } else {
+                            // this is not a valid encoding name, so punt
+                            done = true;
+                        }
+                    }
+                }
+                // no else required; this is not a valid encoding name, so
+                // the initial encoding guess is kept
+            }
+            // no else required; already failed to find the encoding
+            // somewhere else
+        }
+
+        return (result);
+    }
+
+    // minimum number of octets needed for encoding detection
+    private static final int BOM_LENGTH   = 4;
+    // initial capacity of the buffer that collects the encoding name
+    private static final int MAX_ENC_NAME = 512;
+
+    // characters that are significant inside the XML declaration
+    private static final int SPACE        = 0x00000020;
+    private static final int TAB          = 0x00000009;
+    private static final int LINEFEED     = 0x0000000a;
+    private static final int RETURN       = 0x0000000d;
+    private static final int EQUAL        = '=';
+    private static final int DOUBLE_QUOTE = '\"';
+    private static final int SINGLE_QUOTE = '\'';
+
+    // Byte Order Marks, left-aligned in four octets; the UTF-8 and UTF-16
+    // marks are compared after masking off the trailing octets
+    private static final int UTF_32_BE_BOM   = 0x0000feff;
+    private static final int UTF_32_LE_BOM   = 0xfffe0000;
+    private static final int UTF_16_BE_BOM   = 0xfeff0000;
+    private static final int UTF_16_LE_BOM   = 0xfffe0000;
+    private static final int UTF_8_BOM       = 0xefbbbf00;
+
+    // '<' as a four-octet unit in the unusual 2143 and 3412 octet orders
+    private static final int UNUSUAL_OCTET_1 = 0x00003c00;
+    private static final int UNUSUAL_OCTET_2 = 0x003c0000;
+    // '<' as a four-octet unit, big and little endian, without a BOM
+    private static final int UTF_32BE        = 0x0000003c;
+    private static final int UTF_32LE        = 0x3c000000;
+    // '<?' as two-octet units, big and little endian, without a BOM
+    private static final int UTF_16BE        = 0x003c003f;
+    private static final int UTF_16LE        = 0x3c003f00;
+    // '<?xm' in EBCDIC and in ASCII-compatible encodings
+    private static final int EBCDIC          = 0x4c6fa794;
+    private static final int XML_DECLARATION = 0x3c3f786d;
+
+    // charset names handed to the decoder
+    private static final String UTF_32_ENC   = "UTF-32";
+    private static final String UTF_32BE_ENC = "UTF-32BE";
+    private static final String UTF_32LE_ENC = "UTF-32LE";
+    private static final String UTF_16_ENC   = "UTF-16";
+    private static final String UTF_16BE_ENC = "UTF-16BE";
+    private static final String UTF_16LE_ENC = "UTF-16LE";
+    private static final String UTF_8_ENC    = "UTF-8";
+    private static final String IBM037_ENC   = "IBM037";
+
+    private static final String XML_DECL_START = "<?xml";
+    private static final String ENCODING_DECL  = "encoding";
+}