8043592: The basic XML parser based on UKit fails to read XML files encoded in UTF-16BE or LE
Reviewed-by: sherman, lancea
--- a/jdk/src/share/classes/jdk/internal/util/xml/impl/Parser.java Wed Jul 05 19:42:32 2017 +0200
+++ b/jdk/src/share/classes/jdk/internal/util/xml/impl/Parser.java Tue May 27 17:26:52 2014 -0700
@@ -2860,14 +2860,25 @@
} else {
// Get encoding from BOM or the xml text decl.
reader = bom(is.getByteStream(), ' ');
+ /**
+ * [#4.3.3] requires BOM for UTF-16, however, it's not uncommon
+ * that it may be missing. A mature technique exists in Xerces
+ * to further check for possible UTF-16 encoding
+ */
+ if (reader == null) {
+ reader = utf16(is.getByteStream());
+ }
+
if (reader == null) {
// Encoding is defined by the xml text decl.
reader = enc("UTF-8", is.getByteStream());
expenc = xml(reader);
- if (expenc.startsWith("UTF-16")) {
- panic(FAULT); // UTF-16 must have BOM [#4.3.3]
+ if (!expenc.equals("UTF-8")) {
+ if (expenc.startsWith("UTF-16")) {
+ panic(FAULT); // UTF-16 must have BOM [#4.3.3]
+ }
+ reader = enc(expenc, is.getByteStream());
}
- reader = enc(expenc, is.getByteStream());
} else {
// Encoding is defined by the BOM.
xml(reader);
@@ -2956,6 +2967,49 @@
}
}
+
+ /**
+ * Using a mature technique from Xerces, this method checks further after
+ * the bom method above to see if the encoding is UTF-16 without a BOM.
+ *
+ * The check looks at the first four bytes of the entity: the one already
+ * held in mChars[0] plus three more read from the stream. If they spell
+ * the characters '<' (0x3C) and '?' (0x3F) as two 16-bit code units, the
+ * stream is treated as UTF-16: 00 3C 00 3F selects big-endian and
+ * 3C 00 3F 00 selects little-endian. In both cases the two decoded
+ * characters are left in mChars so that later parsing can still see
+ * them, and a ReaderUTF16 over the remaining bytes is returned.
+ *
+ * If the bytes do not match either pattern, every byte that was read is
+ * kept in mChars (one byte per char slot) and null is returned. If
+ * mChIdx is 0, or the first byte is neither 0x00 nor 0x3C, nothing is
+ * read from the stream and null is returned.
+ *
+ * @param is A byte stream of the entity.
+ * @return a UTF-16 reader when the no-BOM "<?" pattern is detected,
+ * otherwise null
+ * @exception Exception is parser specific exception from panic method.
+ * NOTE(review): no panic call is made directly in this method.
+ * @exception IOException if reading from the byte stream fails
+ */
+ private Reader utf16(InputStream is)
+ throws Exception {
+ if (mChIdx != 0) {
+ // The bom method has read ONE byte into the buffer; it is inspected here as a raw byte.
+ byte b0 = (byte)mChars[0];
+ if (b0 == 0x00 || b0 == 0x3C) {
+ int b1 = is.read();
+ int b2 = is.read();
+ int b3 = is.read();
+ if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) {
+ // UTF-16, big-endian, no BOM: bytes 00 3C 00 3F; buffer the low bytes '<' and '?'
+ mChars[0] = (char)(b1);
+ mChars[mChIdx++] = (char)(b3);
+ return new ReaderUTF16(is, 'b');
+ } else if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) {
+ // UTF-16, little-endian, no BOM: bytes 3C 00 3F 00; buffer the low bytes '<' and '?'
+ mChars[0] = (char)(b0);
+ mChars[mChIdx++] = (char)(b2);
+ return new ReaderUTF16(is, 'l');
+ } else {
+ /** Not every InputStream supports reset, so we have to remember
+ * the state for further parsing: all four bytes are stored in
+ * mChars, one byte per char slot, in the order they were read.
+ * NOTE(review): InputStream.read() returns -1 at end of stream,
+ * which is stored here as (char)-1; confirm that this matches
+ * the parser's end-of-stream sentinel.
+ **/
+ mChars[0] = (char)(b0);
+ mChars[mChIdx++] = (char)(b1);
+ mChars[mChIdx++] = (char)(b2);
+ mChars[mChIdx++] = (char)(b3);
+ }
+
+ }
+ }
+ return null;
+ }
/**
* Parses the xml text declaration.
*
@@ -2974,17 +3028,17 @@
String enc = "UTF-8";
char ch;
int val;
- short st;
- // Read the xml text declaration into the buffer
- if (mChIdx != 0) {
- // The bom method have read ONE char into the buffer.
- st = (short) ((mChars[0] == '<') ? 1 : -1);
- } else {
- st = 0;
- }
+ short st = 0;
+ int byteRead = mChIdx; //number of bytes read prior to entering this method
+
while (st >= 0 && mChIdx < mChars.length) {
- ch = ((val = reader.read()) >= 0) ? (char) val : EOS;
- mChars[mChIdx++] = ch;
+ if (st < byteRead) {
+ ch = mChars[st];
+ } else {
+ ch = ((val = reader.read()) >= 0) ? (char) val : EOS;
+ mChars[mChIdx++] = ch;
+ }
+
switch (st) {
case 0: // read '<' of xml declaration
switch (ch) {
--- a/jdk/test/java/util/Properties/LoadAndStoreXML.java Wed Jul 05 19:42:32 2017 +0200
+++ b/jdk/test/java/util/Properties/LoadAndStoreXML.java Tue May 27 17:26:52 2014 -0700
@@ -32,6 +32,7 @@
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
+import java.nio.charset.Charset;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
@@ -47,6 +48,7 @@
import java.util.PropertyPermission;
public class LoadAndStoreXML {
+ static final String bomChar = "\uFEFF";
/**
* Simple policy implementation that grants a set of permissions to
@@ -125,13 +127,14 @@
* Sanity test that properties saved with Properties#storeToXML can be
* read with Properties#loadFromXML.
*/
- static void testLoadAndStore(String encoding) throws IOException {
+ static void testLoadAndStore(String encoding, boolean appendBOM) throws IOException {
System.out.println("testLoadAndStore, encoding=" + encoding);
Properties props = new Properties();
+ props.put("k0", "\u6C34");
props.put("k1", "foo");
props.put("k2", "bar");
- props.put("k3", "\\u0020\\u0391\\u0392\\u0393\\u0394\\u0395\\u0396\\u0397");
+ props.put("k3", "\u0020\u0391\u0392\u0393\u0394\u0395\u0396\u0397");
props.put("k4", "\u7532\u9aa8\u6587");
props.put("k5", "<java.home>/lib/jaxp.properties");
@@ -141,7 +144,17 @@
throw new RuntimeException("OutputStream closed by storeToXML");
Properties p = new Properties();
- TestInputStream in = new TestInputStream(out.toByteArray());
+ TestInputStream in;
+ if (appendBOM) {
+ byte[] byteOrderMark = bomChar.getBytes(Charset.forName(encoding));
+ byte[] outArray = out.toByteArray();
+ byte[] inputArray = new byte[byteOrderMark.length + outArray.length];
+ System.arraycopy(byteOrderMark, 0, inputArray, 0, byteOrderMark.length);
+ System.arraycopy(outArray, 0, inputArray, byteOrderMark.length, outArray.length);
+ in = new TestInputStream(inputArray);
+ } else {
+ in = new TestInputStream(out.toByteArray());
+ }
p.loadFromXML(in);
if (in.isOpen())
throw new RuntimeException("InputStream not closed by loadFromXML");
@@ -231,8 +244,12 @@
public static void main(String[] args) throws IOException {
- testLoadAndStore("UTF-8");
- testLoadAndStore("UTF-16");
+ testLoadAndStore("UTF-8", false);
+ testLoadAndStore("UTF-16", false);
+ testLoadAndStore("UTF-16BE", false);
+ testLoadAndStore("UTF-16LE", false);
+ testLoadAndStore("UTF-16BE", true);
+ testLoadAndStore("UTF-16LE", true);
testLoadWithoutEncoding();
testLoadWithBadEncoding();
testStoreWithBadEncoding();
@@ -250,7 +267,7 @@
Policy.setPolicy(p);
System.setSecurityManager(new SecurityManager());
try {
- testLoadAndStore("UTF-8");
+ testLoadAndStore("UTF-8", false);
} finally {
// turn off security manager and restore policy
System.setSecurityManager(null);