8043592: The basic XML parser based on UKit fails to read XML files encoded in UTF-16BE or LE
authorjoehw
Tue, 27 May 2014 17:26:52 -0700
changeset 24621 ab51c36f66db
parent 24615 74eb0778e4f2
child 24622 0b2ad6558dc6
8043592: The basic XML parser based on UKit fails to read XML files encoded in UTF-16BE or LE Reviewed-by: sherman, lancea
jdk/src/share/classes/jdk/internal/util/xml/impl/Parser.java
jdk/test/java/util/Properties/LoadAndStoreXML.java
--- a/jdk/src/share/classes/jdk/internal/util/xml/impl/Parser.java	Wed Jul 05 19:42:32 2017 +0200
+++ b/jdk/src/share/classes/jdk/internal/util/xml/impl/Parser.java	Tue May 27 17:26:52 2014 -0700
@@ -2860,14 +2860,25 @@
             } else {
                 //              Get encoding from BOM or the xml text decl.
                 reader = bom(is.getByteStream(), ' ');
+                /**
+                 * [#4.3.3] requires BOM for UTF-16, however, it's not uncommon
+                 * that it may be missing. A mature technique exists in Xerces
+                 * to further check for possible UTF-16 encoding
+                 */
+                if (reader == null) {
+                    reader = utf16(is.getByteStream());
+                }
+
                 if (reader == null) {
                     //          Encoding is defined by the xml text decl.
                     reader = enc("UTF-8", is.getByteStream());
                     expenc = xml(reader);
-                    if (expenc.startsWith("UTF-16")) {
-                        panic(FAULT);  // UTF-16 must have BOM [#4.3.3]
+                    if (!expenc.equals("UTF-8")) {
+                        if (expenc.startsWith("UTF-16")) {
+                            panic(FAULT);  // UTF-16 must have BOM [#4.3.3]
+                        }
+                        reader = enc(expenc, is.getByteStream());
                     }
-                    reader = enc(expenc, is.getByteStream());
                 } else {
                     //          Encoding is defined by the BOM.
                     xml(reader);
@@ -2956,6 +2967,49 @@
         }
     }
 
+
+    /**
+     * Using a mature technique from Xerces, this method checks further after
+     * the bom method above to see if the encoding is UTF-16 without a BOM.
+     *
+     * @param is A byte stream of the entity.
+     * @return a UTF-16 reader, or null if UTF-16 is not detected
+     * @exception Exception is parser specific exception from panic method.
+     * @exception IOException if reading from the byte stream fails
+     */
+    private Reader utf16(InputStream is)
+            throws Exception {
+        if (mChIdx != 0) {
+            // The bom method has already read ONE byte into mChars[0]; reuse it as b0.
+            byte b0 = (byte)mChars[0];
+            if (b0 == 0x00 || b0 == 0x3C) { // 0x3C is '<'
+                int b1 = is.read(); // -1 if the stream has ended
+                int b2 = is.read();
+                int b3 = is.read();
+                if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) {
+                    // Bytes 00 3C 00 3F: "<?" in UTF-16 big-endian, no BOM.
+                    mChars[0] = (char)(b1); // '<'
+                    mChars[mChIdx++] = (char)(b3); // '?'
+                    return new ReaderUTF16(is, 'b');
+                } else if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) {
+                    // Bytes 3C 00 3F 00: "<?" in UTF-16 little-endian, no BOM.
+                    mChars[0] = (char)(b0); // '<'
+                    mChars[mChIdx++] = (char)(b2); // '?'
+                    return new ReaderUTF16(is, 'l');
+                } else {
+                    /* Not every InputStream supports reset(), so the bytes consumed
+                     * here are kept in mChars (one char each) for further parsing.
+                     */
+                    mChars[0] = (char)(b0);
+                    mChars[mChIdx++] = (char)(b1);
+                    mChars[mChIdx++] = (char)(b2);
+                    mChars[mChIdx++] = (char)(b3);
+                }
+
+            }
+        }
+        return null; // no BOM-less UTF-16 signature was recognized
+    }
     /**
      * Parses the xml text declaration.
      *
@@ -2974,17 +3028,17 @@
         String enc = "UTF-8";
         char ch;
         int val;
-        short st;
-        //              Read the xml text declaration into the buffer
-        if (mChIdx != 0) {
-            //          The bom method have read ONE char into the buffer.
-            st = (short) ((mChars[0] == '<') ? 1 : -1);
-        } else {
-            st = 0;
-        }
+        short st = 0;
+        int byteRead =  mChIdx; //number of bytes read prior to entering this method
+
         while (st >= 0 && mChIdx < mChars.length) {
-            ch = ((val = reader.read()) >= 0) ? (char) val : EOS;
-            mChars[mChIdx++] = ch;
+            if (st < byteRead) {
+                ch = mChars[st];
+            } else {
+                ch = ((val = reader.read()) >= 0) ? (char) val : EOS;
+                mChars[mChIdx++] = ch;
+            }
+
             switch (st) {
                 case 0:     // read '<' of xml declaration
                     switch (ch) {
--- a/jdk/test/java/util/Properties/LoadAndStoreXML.java	Wed Jul 05 19:42:32 2017 +0200
+++ b/jdk/test/java/util/Properties/LoadAndStoreXML.java	Tue May 27 17:26:52 2014 -0700
@@ -32,6 +32,7 @@
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.UnsupportedEncodingException;
+import java.nio.charset.Charset;
 import java.nio.file.DirectoryStream;
 import java.nio.file.Files;
 import java.nio.file.Path;
@@ -47,6 +48,7 @@
 import java.util.PropertyPermission;
 
 public class LoadAndStoreXML {
+    static final String bomChar = "\uFEFF";
 
     /**
      * Simple policy implementation that grants a set of permissions to
@@ -125,13 +127,14 @@
      * Sanity test that properties saved with Properties#storeToXML can be
      * read with Properties#loadFromXML.
      */
-    static void testLoadAndStore(String encoding) throws IOException {
+    static void testLoadAndStore(String encoding, boolean appendBOM) throws IOException {
         System.out.println("testLoadAndStore, encoding=" + encoding);
 
         Properties props = new Properties();
+        props.put("k0", "\u6C34");
         props.put("k1", "foo");
         props.put("k2", "bar");
-        props.put("k3", "\\u0020\\u0391\\u0392\\u0393\\u0394\\u0395\\u0396\\u0397");
+        props.put("k3", "\u0020\u0391\u0392\u0393\u0394\u0395\u0396\u0397");
         props.put("k4", "\u7532\u9aa8\u6587");
         props.put("k5", "<java.home>/lib/jaxp.properties");
 
@@ -141,7 +144,17 @@
             throw new RuntimeException("OutputStream closed by storeToXML");
 
         Properties p = new Properties();
-        TestInputStream in = new TestInputStream(out.toByteArray());
+        TestInputStream in;
+        if (appendBOM) {
+            byte[] byteOrderMark = bomChar.getBytes(Charset.forName(encoding));
+            byte[] outArray = out.toByteArray();
+            byte[] inputArray = new byte[byteOrderMark.length + outArray.length];
+            System.arraycopy(byteOrderMark, 0, inputArray, 0, byteOrderMark.length);
+            System.arraycopy(outArray, 0, inputArray, byteOrderMark.length, outArray.length);
+            in = new TestInputStream(inputArray);
+        } else {
+            in = new TestInputStream(out.toByteArray());
+        }
         p.loadFromXML(in);
         if (in.isOpen())
             throw new RuntimeException("InputStream not closed by loadFromXML");
@@ -231,8 +244,12 @@
 
     public static void main(String[] args) throws IOException {
 
-        testLoadAndStore("UTF-8");
-        testLoadAndStore("UTF-16");
+        testLoadAndStore("UTF-8", false);
+        testLoadAndStore("UTF-16", false);
+        testLoadAndStore("UTF-16BE", false);
+        testLoadAndStore("UTF-16LE", false);
+        testLoadAndStore("UTF-16BE", true);
+        testLoadAndStore("UTF-16LE", true);
         testLoadWithoutEncoding();
         testLoadWithBadEncoding();
         testStoreWithBadEncoding();
@@ -250,7 +267,7 @@
         Policy.setPolicy(p);
         System.setSecurityManager(new SecurityManager());
         try {
-            testLoadAndStore("UTF-8");
+            testLoadAndStore("UTF-8", false);
         } finally {
             // turn off security manager and restore policy
             System.setSecurityManager(null);