8145974: XMLStreamWriter produces invalid XML for surrogate pairs on OutputStreamWriter
author aefimov
Fri, 13 May 2016 01:19:41 +0300
changeset 37933 dc320206b458
parent 37932 71238492c828
child 37934 4c38661774bf
8145974: XMLStreamWriter produces invalid XML for surrogate pairs on OutputStreamWriter Reviewed-by: joehw
jaxp/src/java.xml/share/classes/com/sun/xml/internal/stream/writers/XMLStreamWriterImpl.java
jaxp/test/javax/xml/jaxp/unittest/stream/XMLStreamWriterTest/SurrogatesTest.java
--- a/jaxp/src/java.xml/share/classes/com/sun/xml/internal/stream/writers/XMLStreamWriterImpl.java	Thu May 12 18:46:32 2016 +0000
+++ b/jaxp/src/java.xml/share/classes/com/sun/xml/internal/stream/writers/XMLStreamWriterImpl.java	Fri May 13 01:19:41 2016 +0300
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2005, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -1346,6 +1346,15 @@
     }
 
     /**
+     * Writes character reference in hex format.
+     */
+    private void writeCharRef(int codePoint) throws IOException {
+        fWriter.write( "&#x" );
+        fWriter.write( Integer.toHexString(codePoint) );
+        fWriter.write( ';' );
+    }
+
+    /**
      * Writes XML content to underlying writer. Escapes characters unless
      * escaping character feature is turned off.
      */
@@ -1368,10 +1377,14 @@
             if (fEncoder != null && !fEncoder.canEncode(ch)){
                 fWriter.write(content, startWritePos, index - startWritePos );
 
-                // Escape this char as underlying encoder cannot handle it
-                fWriter.write( "&#x" );
-                fWriter.write(Integer.toHexString(ch));
-                fWriter.write( ';' );
+                // Check if current and next characters form a surrogate pair
+                // and escape it to avoid generation of invalid xml content
+                if ( index != end - 1 && Character.isSurrogatePair(ch, content[index+1])) {
+                    writeCharRef(Character.toCodePoint(ch, content[index+1]));
+                    index++;
+                } else {
+                    writeCharRef(ch);
+                }
                 startWritePos = index + 1;
                 continue;
             }
@@ -1439,10 +1452,15 @@
             if (fEncoder != null && !fEncoder.canEncode(ch)){
                 fWriter.write(content, startWritePos, index - startWritePos );
 
-                // Escape this char as underlying encoder cannot handle it
-                fWriter.write( "&#x" );
-                fWriter.write(Integer.toHexString(ch));
-                fWriter.write( ';' );
+                // Check if current and next characters form a surrogate pair
+                // and escape it to avoid generation of invalid xml content
+                if ( index != end - 1 && Character.isSurrogatePair(ch, content.charAt(index+1))) {
+                    writeCharRef(Character.toCodePoint(ch, content.charAt(index+1)));
+                    index++;
+                } else {
+                    writeCharRef(ch);
+                }
+
                 startWritePos = index + 1;
                 continue;
             }
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/jaxp/test/javax/xml/jaxp/unittest/stream/XMLStreamWriterTest/SurrogatesTest.java	Fri May 13 01:19:41 2016 +0300
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.  Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+package stream.XMLStreamWriterTest;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.InputStream;
+import java.io.OutputStreamWriter;
+
+import javax.xml.stream.XMLInputFactory;
+import javax.xml.stream.XMLOutputFactory;
+import javax.xml.stream.XMLStreamConstants;
+import javax.xml.stream.XMLStreamException;
+import javax.xml.stream.XMLStreamReader;
+import javax.xml.stream.XMLStreamWriter;
+
+import org.testng.Assert;
+import org.testng.annotations.Test;
+import org.testng.annotations.DataProvider;
+
+/*
+ * @test
+ * @bug 8145974
+ * @modules javax.xml
+ * @summary Check that XMLStreamWriter generates valid xml with surrogate pair
+ *  used within element text
+ */
+
+public class SurrogatesTest {
+
+    // Test that valid surrogate characters can be written/read by xml stream
+    // reader/writer
+    @Test(dataProvider = "validData")
+    public void xmlWithValidSurrogatesTest(String content)
+            throws Exception {
+        generateAndReadXml(content);
+    }
+
+    // Test that an unbalanced surrogate character results in XMLStreamException
+    @Test(dataProvider = "invalidData",
+            expectedExceptions = XMLStreamException.class)
+    public void xmlWithUnbalancedSurrogatesTest(String content)
+            throws Exception {
+        generateAndReadXml(content);
+    }
+
+    // Generates xml content with XMLStreamWriter and reads it back to check
+    // for correctness of xml and generated data
+    void generateAndReadXml(String content) throws Exception {
+        ByteArrayOutputStream stream = new ByteArrayOutputStream();
+        XMLOutputFactory factory = XMLOutputFactory.newInstance();
+        OutputStreamWriter streamWriter = new OutputStreamWriter(stream);
+        XMLStreamWriter writer = factory.createXMLStreamWriter(streamWriter);
+
+        // Generate xml with selected stream writer type
+        generateXML(writer, content);
+        String output = stream.toString();
+        System.out.println("Generated xml: " + output);
+        // Read generated xml with StAX parser
+        readXML(output.getBytes(), content);
+    }
+
+    // Generates XML with provided xml stream writer. Provided string
+    // is inserted into xml twice: with usage of writeCharacters( String )
+    // and writeCharacters( char [], int , int )
+    private void generateXML(XMLStreamWriter writer, String sequence)
+            throws XMLStreamException {
+        char[] seqArr = sequence.toCharArray();
+        writer.writeStartDocument();
+        writer.writeStartElement("root");
+
+        // Use writeCharacters( String ) to write characters
+        writer.writeStartElement("writeCharactersWithString");
+        writer.writeCharacters(sequence);
+        writer.writeEndElement();
+
+        // Use writeCharacters( char [], int , int ) to write characters
+        writer.writeStartElement("writeCharactersWithArray");
+        writer.writeCharacters(seqArr, 0, seqArr.length);
+        writer.writeEndElement();
+
+        // Close root element and document
+        writer.writeEndElement();
+        writer.writeEndDocument();
+        writer.flush();
+        writer.close();
+    }
+
+    // Reads generated XML data and checks that it contains expected
+    // text in writeCharactersWithString and writeCharactersWithArray
+    // elements
+    private void readXML(byte[] xmlData, String expectedContent)
+            throws Exception {
+        InputStream stream = new ByteArrayInputStream(xmlData);
+        XMLInputFactory factory = XMLInputFactory.newInstance();
+        XMLStreamReader xmlReader
+                = factory.createXMLStreamReader(stream);
+        boolean inTestElement = false;
+        StringBuilder sb = new StringBuilder();
+        while (xmlReader.hasNext()) {
+            String ename;
+            switch (xmlReader.getEventType()) {
+                case XMLStreamConstants.START_ELEMENT:
+                    ename = xmlReader.getLocalName();
+                    if (ename.equals("writeCharactersWithString")
+                            || ename.equals("writeCharactersWithArray")) {
+                        inTestElement = true;
+                    }
+                    break;
+                case XMLStreamConstants.END_ELEMENT:
+                    ename = xmlReader.getLocalName();
+                    if (ename.equals("writeCharactersWithString")
+                            || ename.equals("writeCharactersWithArray")) {
+                        inTestElement = false;
+                        String content = sb.toString();
+                        System.out.println(ename + " text:'" + content + "' expected:'" + expectedContent+"'");
+                        Assert.assertEquals(content, expectedContent);
+                        sb.setLength(0);
+                    }
+                    break;
+                case XMLStreamConstants.CHARACTERS:
+                    if (inTestElement) {
+                        sb.append(xmlReader.getText());
+                    }
+                    break;
+            }
+            xmlReader.next();
+        }
+    }
+
+    @DataProvider(name = "validData")
+    Object[][] getValidData() {
+        return new Object[][] {
+            {"Don't Worry Be \uD83D\uDE0A"},
+            {"BMP characters \uE000\uFFFD"},
+            {"Simple text"},
+        };
+    }
+
+    @DataProvider(name = "invalidData")
+    Object[][] getInvalidData() {
+        return new Object[][] {
+            {"Unbalanced surrogate \uD83D"},
+            {"Unbalanced surrogate \uD83Dis here"},
+            {"Surrogate with followup BMP\uD83D\uFFF9"},
+        };
+    }
+}