src/demo/share/jpackager/JNLPConverter/src/jnlp/converter/parser/xml/XMLEncoding.java
branchJDK-8200758-branch
changeset 56963 eaca4369b068
equal deleted inserted replaced
56962:a769ad2d40d6 56963:eaca4369b068
       
     1 /*
       
     2  * Copyright (c) 2006, 2018, Oracle and/or its affiliates. All rights reserved.
       
     3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
       
     4  *
       
     5  * This code is free software; you can redistribute it and/or modify it
       
     6  * under the terms of the GNU General Public License version 2 only, as
       
     7  * published by the Free Software Foundation.
       
     8  *
       
     9  * This code is distributed in the hope that it will be useful, but WITHOUT
       
    10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
       
    11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
       
    12  * version 2 for more details (a copy is included in the LICENSE file that
       
    13  * accompanied this code).
       
    14  *
       
    15  * You should have received a copy of the GNU General Public License version
       
    16  * 2 along with this work; if not, write to the Free Software Foundation,
       
    17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
       
    18  *
       
    19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
       
    20  * or visit www.oracle.com if you need additional information or have any
       
    21  * questions.
       
    22  */
       
    23 
       
    24 package jnlp.converter.parser.xml;
       
    25 
       
    26 import java.io.ByteArrayInputStream;
       
    27 import java.io.EOFException;
       
    28 import java.io.InputStreamReader;
       
    29 import java.io.IOException;
       
    30 import java.io.Reader;
       
    31 import java.io.UnsupportedEncodingException;
       
    32 
       
    33 public class XMLEncoding {
       
    34     /**
       
    35      * Decodes a byte stream into a String by testing for a Byte Order Mark
       
    36      * (BOM) or an XML declaration.
       
    37      * <br />
       
    38      * Detection begins by examining the first four octets of the stream for a
       
    39      * BOM. If a BOM is not found, then an encoding declaration is looked for
       
    40      * at the beginning of the stream. If the encoding still can not be
       
    41      * determined at this point, then UTF-8 is assumed.
       
    42      *
       
    43      * @param data  an array of bytes containing an encoded XML document.
       
    44      *
       
    45      * @return A string containing the decoded XML document.
       
    46      */
       
    47     public static String decodeXML(byte [] data) throws IOException {
       
    48         int start = 0;
       
    49         String encoding;
       
    50 
       
    51         if (data.length < BOM_LENGTH) {
       
    52             throw (new EOFException("encoding.error.not.xml"));
       
    53         }
       
    54         // no else required; successfully read stream
       
    55         int firstFour = ((0xff000000 & ((int) data[0] << 24)) |
       
    56                          (0x00ff0000 & ((int) data[1] << 16)) |
       
    57                          (0x0000ff00 & ((int) data[2] <<  8)) |
       
    58                          (0x000000ff &  (int) data[3]));
       
    59 
       
    60         // start by examining the first four bytes for a BOM
       
    61         switch (firstFour) {
       
    62             case EBCDIC:
       
    63                 // examine the encoding declaration
       
    64                 encoding = examineEncodingDeclaration(data, IBM037_ENC);
       
    65                 break;
       
    66 
       
    67             case XML_DECLARATION:
       
    68                 // assume UTF-8, but examine the encoding declaration
       
    69                 encoding = examineEncodingDeclaration(data, UTF_8_ENC);
       
    70                 break;
       
    71 
       
    72             case UTF_16BE:
       
    73                 encoding = UTF_16BE_ENC;
       
    74                 break;
       
    75 
       
    76             case UTF_16LE:
       
    77                 encoding = UTF_16LE_ENC;
       
    78                 break;
       
    79 
       
    80             case UNUSUAL_OCTET_1:
       
    81             case UNUSUAL_OCTET_2:
       
    82                 throw (new UnsupportedEncodingException("encoding.error.unusual.octet"));
       
    83 
       
    84             case UTF_32_BE_BOM:
       
    85             case UTF_32_LE_BOM:
       
    86                 encoding = UTF_32_ENC;
       
    87                 break;
       
    88 
       
    89             default:
       
    90                 int firstThree = firstFour & 0xffffff00;
       
    91 
       
    92                 switch (firstThree) {
       
    93                     case UTF_8_BOM:
       
    94                         // the InputStreamReader class doen't properly handle
       
    95                         // the Byte Order Mark (BOM) in UTF-8 streams, so don't
       
    96                         // putback those 3 bytes.
       
    97                         start    = 3;
       
    98                         encoding = UTF_8_ENC;
       
    99                         break;
       
   100 
       
   101                     default:
       
   102                         int firstTwo = firstFour & 0xffff0000;
       
   103 
       
   104                         switch (firstTwo) {
       
   105                             case UTF_16_BE_BOM:
       
   106                             case UTF_16_LE_BOM:
       
   107                                 encoding = UTF_16_ENC;
       
   108                                 break;
       
   109 
       
   110                             default:
       
   111                                 // this is probably UTF-8 without the encoding
       
   112                                 // declaration
       
   113                                 encoding = UTF_8_ENC;
       
   114                                 break;
       
   115                         }
       
   116                         break;
       
   117                 }
       
   118                 break;
       
   119         }
       
   120 
       
   121         return (new String(data, start, data.length - start, encoding));
       
   122     }
       
   123 
       
   124     /**
       
   125      * [3]  S            ::= ( #x20 | #x09 | #x0d | #x0a )
       
   126      * [23] XMLDecl      ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
       
   127      * [24] VersionInfo  ::= S 'version' Eq ( '"' VersionNum '"' |
       
   128      *                                        "'" VersionNum "'" )
       
   129      * [25] Eq           ::= S? '=' S?
       
   130      * [26] VersionNum   ::= ([a-zA-Z0-9_.:] | '-')+
       
   131      * [80] EncodingDecl ::= S 'encoding' Eq ( '"' EncName '"' |
       
   132      *                                         "'" EncName "'" )
       
   133      * [81] EncName      ::= [a-zA-Z] ([a-zA-Z0-9_.] | '-')*
       
   134      */
       
   135     private static String examineEncodingDeclaration(byte [] data,
       
   136                           String    encoding) throws IOException {
       
   137         boolean loop       = false;
       
   138         boolean recognized = false;
       
   139         boolean almost     = false;
       
   140         boolean question   = false;
       
   141         boolean done       = false;
       
   142         boolean found      = false;
       
   143         int     pos        = 0;
       
   144         int     ch         = -1;
       
   145         Reader  reader     = null;
       
   146         String  result     = ((encoding != null) ? encoding : UTF_8_ENC);
       
   147 
       
   148         reader = new InputStreamReader(new ByteArrayInputStream(data), result);
       
   149         ch     = reader.read();
       
   150 
       
   151         // if this is an XML declaration, it will start with the text '<?xml'
       
   152         for (int i = 0; ((i < XML_DECL_START.length()) && (done == false)); i++) {
       
   153             if (ch != XML_DECL_START.charAt(i)) {
       
   154                 // This doesn't look like an XML declaration.  This method
       
   155                 // should only be called if the stream contains an XML
       
   156                 // declaration in the encoding that is passed into the method.
       
   157                 done = true;
       
   158                 break;
       
   159             }
       
   160             // no else required; still matches
       
   161             ch = reader.read();
       
   162         }
       
   163 
       
   164         // there must be at least one whitespace character next.
       
   165         loop = true;
       
   166         while ((loop == true) && (done == false)) {
       
   167             switch (ch) {
       
   168                 case SPACE:
       
   169                 case TAB:         // intentional
       
   170                 case LINEFEED:    // fall
       
   171                 case RETURN:      // through
       
   172                     ch = reader.read();
       
   173                     break;
       
   174 
       
   175                 case -1:
       
   176                     // unexpected EOF
       
   177                     done = true;
       
   178                     break;
       
   179 
       
   180                 default:
       
   181                     // non-whitespace
       
   182                     loop = false;
       
   183                     break;
       
   184             }
       
   185         }
       
   186 
       
   187         // now look for the text 'encoding', but if the end of the XML
       
   188         // declaration (signified by the text '?>') comes first, then
       
   189         // assume the encoding is UTF-8
       
   190         loop = true;
       
   191         while ((loop == true) && (done == false)) {
       
   192             if (ch == -1) {
       
   193                 // unexpected EOF
       
   194                 done = true;
       
   195                 break;
       
   196             } else if (recognized == true) {
       
   197                 // this is the encoding declaration as long as the next few
       
   198                 // characters are whitespace and/or the equals ('=') sign
       
   199                 switch (ch) {
       
   200                     case SPACE:       // intentional
       
   201                     case TAB:         // fall
       
   202                     case LINEFEED:    // through
       
   203                     case RETURN:
       
   204                         // don't need to do anything
       
   205                         break;
       
   206 
       
   207                     case EQUAL:
       
   208                         if (almost == false) {
       
   209                             // got the equal, now find a quote
       
   210                             almost = true;
       
   211                         } else {
       
   212                             // this is not valid XML, so punt
       
   213                             recognized = false;
       
   214                             done       = true;
       
   215                         }
       
   216                         break;
       
   217 
       
   218                     case DOUBLE_QUOTE:    // intentional
       
   219                     case SINGLE_QUOTE:    // fall through
       
   220                         if (almost == true) {
       
   221                             // got the quote, so move on to get the value
       
   222                             loop = false;
       
   223                         } else {
       
   224                             // got a quote before the equal; this is not valid
       
   225                             // XML, so punt
       
   226                             recognized = false;
       
   227                             done       = true;
       
   228                         }
       
   229                         break;
       
   230 
       
   231                     default:
       
   232                         // non-whitespace
       
   233                         recognized = false;
       
   234                         if (almost == true) {
       
   235                             // this is not valid XML, so punt
       
   236                             done = true;
       
   237                         }
       
   238                         // no else required; this wasn't the encoding
       
   239                         // declaration
       
   240                         break;
       
   241                 }
       
   242 
       
   243                 if (recognized == false) {
       
   244                     // this isn't the encoding declaration, so go back to the
       
   245                     // top without reading the next character
       
   246                     pos = 0;
       
   247                     continue;
       
   248                 }
       
   249                 // no else required; still looking good
       
   250             } else if (ch == ENCODING_DECL.charAt(pos++)) {
       
   251                 if (ENCODING_DECL.length() == pos) {
       
   252                     // this looks like the encoding declaration
       
   253                     recognized = true;
       
   254                 }
       
   255                 // no else required; this might be the encoding declaration
       
   256             } else if (ch == '?') {
       
   257                 question = true;
       
   258                 pos      = 0;
       
   259             } else if ((ch == '>') && (question == true)) {
       
   260                 // there is no encoding declaration, so assume that the initial
       
   261                 // encoding guess was correct
       
   262                 done   = true;
       
   263                 continue;
       
   264             } else {
       
   265                 // still searching for the encoding declaration
       
   266                 pos = 0;
       
   267             }
       
   268 
       
   269             ch = reader.read();
       
   270         }
       
   271 
       
   272         if (done == false) {
       
   273             StringBuilder buffer = new StringBuilder(MAX_ENC_NAME);
       
   274 
       
   275             if (((ch >= 'a') && (ch <= 'z')) |
       
   276                 ((ch >= 'A') && (ch <= 'Z'))) {
       
   277                 // add the character to the result
       
   278                 buffer.append((char) ch);
       
   279 
       
   280                 loop = true;
       
   281                 while ((loop == true) && (done == false)) {
       
   282                     ch = reader.read();
       
   283 
       
   284                     if (((ch >= 'a') && (ch <= 'z')) ||
       
   285                         ((ch >= 'A') && (ch <= 'Z')) ||
       
   286                         ((ch >= '0') && (ch <= '9')) ||
       
   287                         (ch == '_') || (ch == '.') || (ch == '-')) {
       
   288                         // add the character to the result
       
   289                         buffer.append((char) ch);
       
   290                     } else if ((ch == DOUBLE_QUOTE) || (ch == SINGLE_QUOTE)) {
       
   291                         // finished!
       
   292                         found  = true;
       
   293                         done   = true;
       
   294                         result = buffer.toString();
       
   295                     } else {
       
   296                         // this is not a valid encoding name, so punt
       
   297                         done = true;
       
   298                     }
       
   299                 }
       
   300             } else {
       
   301                 // this is not a valid encoding name, so punt
       
   302                 done = true;
       
   303             }
       
   304         }
       
   305         // no else required; already failed to find the encoding somewhere else
       
   306 
       
   307         return (result);
       
   308     }
       
   309 
       
   310     private static final int BOM_LENGTH   = 4;
       
   311     private static final int MAX_ENC_NAME = 512;
       
   312 
       
   313     private static final int SPACE        = 0x00000020;
       
   314     private static final int TAB          = 0x00000009;
       
   315     private static final int LINEFEED     = 0x0000000a;
       
   316     private static final int RETURN       = 0x0000000d;
       
   317     private static final int EQUAL        = '=';
       
   318     private static final int DOUBLE_QUOTE = '\"';
       
   319     private static final int SINGLE_QUOTE = '\'';
       
   320 
       
   321     private static final int UTF_32_BE_BOM   = 0x0000feff;
       
   322     private static final int UTF_32_LE_BOM   = 0xfffe0000;
       
   323     private static final int UTF_16_BE_BOM   = 0xfeff0000;
       
   324     private static final int UTF_16_LE_BOM   = 0xfffe0000;
       
   325     private static final int UTF_8_BOM       = 0xefbbbf00;
       
   326     private static final int UNUSUAL_OCTET_1 = 0x00003c00;
       
   327     private static final int UNUSUAL_OCTET_2 = 0x003c0000;
       
   328     private static final int UTF_16BE        = 0x003c003f;
       
   329     private static final int UTF_16LE        = 0x3c003f00;
       
   330     private static final int EBCDIC          = 0x4c6fa794;
       
   331     private static final int XML_DECLARATION = 0x3c3f786d;
       
   332 
       
   333     private static final String UTF_32_ENC   = "UTF-32";
       
   334     private static final String UTF_16_ENC   = "UTF-16";
       
   335     private static final String UTF_16BE_ENC = "UTF-16BE";
       
   336     private static final String UTF_16LE_ENC = "UTF-16LE";
       
   337     private static final String UTF_8_ENC    = "UTF-8";
       
   338     private static final String IBM037_ENC   = "IBM037";
       
   339 
       
   340     private static final String XML_DECL_START = "<?xml";
       
   341     private static final String ENCODING_DECL  = "encoding";
       
   342 }