8072081: Supplementary characters are rejected in comments
authorjoehw
Wed, 09 Dec 2015 21:40:44 -0800
changeset 34554 20d95817d066
parent 34463 c269a10aca11
child 34555 65254ce59909
8072081: Supplementary characters are rejected in comments Reviewed-by: lancea
jaxp/src/java.xml/share/classes/com/sun/org/apache/xerces/internal/impl/XMLDocumentFragmentScannerImpl.java
jaxp/src/java.xml/share/classes/com/sun/org/apache/xerces/internal/impl/XMLDocumentScannerImpl.java
jaxp/src/java.xml/share/classes/com/sun/org/apache/xerces/internal/impl/XMLScanner.java
jaxp/test/javax/xml/jaxp/unittest/parsers/SupplementaryChars.java
--- a/jaxp/src/java.xml/share/classes/com/sun/org/apache/xerces/internal/impl/XMLDocumentFragmentScannerImpl.java	Thu Dec 03 11:42:25 2015 -0800
+++ b/jaxp/src/java.xml/share/classes/com/sun/org/apache/xerces/internal/impl/XMLDocumentFragmentScannerImpl.java	Wed Dec 09 21:40:44 2015 -0800
@@ -1394,7 +1394,12 @@
             fEmptyElement = true;
             return true;
         } else if (!isValidNameStartChar(c) || !sawSpace) {
-            reportFatalError("ElementUnterminated", new Object[]{fElementQName.rawname});
+            // Second chance. Check if this character is a high
+            // surrogate of a valid name start character.
+            if (!isValidNameStartHighSurrogate(c) || !sawSpace) {
+                reportFatalError("ElementUnterminated",
+                        new Object[]{fElementQName.rawname});
+            }
         }
 
         return false;
@@ -2606,40 +2611,38 @@
         private void startOfMarkup() throws IOException {
             fMarkupDepth++;
             final int ch = fEntityScanner.peekChar();
-
-            switch(ch){
-                case '?' :{
-                    setScannerState(SCANNER_STATE_PI);
-                    fEntityScanner.skipChar(ch);
-                    break;
-                }
-                case '!' :{
-                    fEntityScanner.skipChar(ch);
-                    if (fEntityScanner.skipChar('-')) {
-                        if (!fEntityScanner.skipChar('-')) {
-                            reportFatalError("InvalidCommentStart",
+            if (isValidNameStartChar(ch) || isValidNameStartHighSurrogate(ch)) {
+                setScannerState(SCANNER_STATE_START_ELEMENT_TAG);
+            } else {
+                switch(ch){
+                    case '?' :{
+                        setScannerState(SCANNER_STATE_PI);
+                        fEntityScanner.skipChar(ch);
+                        break;
+                    }
+                    case '!' :{
+                        fEntityScanner.skipChar(ch);
+                        if (fEntityScanner.skipChar('-')) {
+                            if (!fEntityScanner.skipChar('-')) {
+                                reportFatalError("InvalidCommentStart",
+                                        null);
+                            }
+                            setScannerState(SCANNER_STATE_COMMENT);
+                        } else if (fEntityScanner.skipString(cdata)) {
+                            setScannerState(SCANNER_STATE_CDATA );
+                        } else if (!scanForDoctypeHook()) {
+                            reportFatalError("MarkupNotRecognizedInContent",
                                     null);
                         }
-                        setScannerState(SCANNER_STATE_COMMENT);
-                    } else if (fEntityScanner.skipString(cdata)) {
-                        setScannerState(SCANNER_STATE_CDATA );
-                    } else if (!scanForDoctypeHook()) {
-                        reportFatalError("MarkupNotRecognizedInContent",
-                                null);
+                        break;
                     }
-                    break;
-                }
-                case '/' :{
-                    setScannerState(SCANNER_STATE_END_ELEMENT_TAG);
-                    fEntityScanner.skipChar(ch);
-                    break;
-                }
-                default :{
-                    if (isValidNameStartChar(ch)) {
-                        setScannerState(SCANNER_STATE_START_ELEMENT_TAG);
-                    } else {
-                        reportFatalError("MarkupNotRecognizedInContent",
-                                null);
+                    case '/' :{
+                        setScannerState(SCANNER_STATE_END_ELEMENT_TAG);
+                        fEntityScanner.skipChar(ch);
+                        break;
+                    }
+                    default :{
+                        reportFatalError("MarkupNotRecognizedInContent", null);
                     }
                 }
             }
--- a/jaxp/src/java.xml/share/classes/com/sun/org/apache/xerces/internal/impl/XMLDocumentScannerImpl.java	Thu Dec 03 11:42:25 2015 -0800
+++ b/jaxp/src/java.xml/share/classes/com/sun/org/apache/xerces/internal/impl/XMLDocumentScannerImpl.java	Wed Dec 09 21:40:44 2015 -0800
@@ -847,9 +847,12 @@
 
                         case SCANNER_STATE_START_OF_MARKUP: {
                             fMarkupDepth++;
-
-                            if (fEntityScanner.skipChar('?')) {
-                                setScannerState(SCANNER_STATE_PI);
+                            if (isValidNameStartChar(fEntityScanner.peekChar()) ||
+                                    isValidNameStartHighSurrogate(fEntityScanner.peekChar())) {
+                                setScannerState(SCANNER_STATE_ROOT_ELEMENT);
+                                setDriver(fContentDriver);
+                                //from now onwards this would be handled by fContentDriver,in the same next() call
+                                return fContentDriver.next();
                             } else if (fEntityScanner.skipChar('!')) {
                                 if (fEntityScanner.skipChar('-')) {
                                     if (!fEntityScanner.skipChar('-')) {
@@ -872,12 +875,8 @@
                                     reportFatalError("MarkupNotRecognizedInProlog",
                                             null);
                                 }
-                            } else if (XMLChar.isNameStart(fEntityScanner.peekChar())) {
-                                setScannerState(SCANNER_STATE_ROOT_ELEMENT);
-                                setDriver(fContentDriver);
-                                //from now onwards this would be handled by fContentDriver,in the same next() call
-                                return fContentDriver.next();
-
+                            } else if (fEntityScanner.skipChar('?')) {
+                                setScannerState(SCANNER_STATE_PI);
                             } else {
                                 reportFatalError("MarkupNotRecognizedInProlog",
                                         null);
@@ -1395,7 +1394,8 @@
                             } else if (fEntityScanner.skipChar('/')) {
                                 reportFatalError("MarkupNotRecognizedInMisc",
                                         null);
-                            } else if (XMLChar.isNameStart(fEntityScanner.peekChar())) {
+                            } else if (isValidNameStartChar(fEntityScanner.peekChar()) ||
+                                    isValidNameStartHighSurrogate(fEntityScanner.peekChar())) {
                                 reportFatalError("MarkupNotRecognizedInMisc",
                                         null);
                                 scanStartElement();
--- a/jaxp/src/java.xml/share/classes/com/sun/org/apache/xerces/internal/impl/XMLScanner.java	Thu Dec 03 11:42:25 2015 -0800
+++ b/jaxp/src/java.xml/share/classes/com/sun/org/apache/xerces/internal/impl/XMLScanner.java	Wed Dec 09 21:40:44 2015 -0800
@@ -784,7 +784,7 @@
                 if (XMLChar.isHighSurrogate(c)) {
                     scanSurrogates(text);
                 }
-                if (isInvalidLiteral(c)) {
+                else if (isInvalidLiteral(c)) {
                     reportFatalError("InvalidCharInComment",
                             new Object[] { Integer.toHexString(c) });
                             fEntityScanner.scanChar();
@@ -1385,6 +1385,14 @@
         return (XMLChar.isNameStart(value));
     } // isValidNameStartChar(int):  boolean
 
+    // returns true if the given character is
+    // a valid high surrogate for a nameStartChar
+    // with respect to the version of XML understood
+    // by this scanner.
+    protected boolean isValidNameStartHighSurrogate(int value) {
+        return false;
+    } // isValidNameStartHighSurrogate(int):  boolean
+
     protected boolean versionSupported(String version ) {
         return version.equals("1.0") || version.equals("1.1");
     } // version Supported
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/jaxp/test/javax/xml/jaxp/unittest/parsers/SupplementaryChars.java	Wed Dec 09 21:40:44 2015 -0800
@@ -0,0 +1,67 @@
+package parsers;
+
+import java.io.ByteArrayInputStream;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+import org.testng.annotations.DataProvider;
+import org.testng.annotations.Test;
+import org.xml.sax.SAXParseException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * @bug 8072081
+ * @summary verifies that supplementary characters are supported as character
+ * data in xml 1.0, and also names in xml 1.1.
+ *
+ * Joe Wang (huizhe.wang@oracle.com)
+ */
+
+public class SupplementaryChars {
+
+    @Test(dataProvider = "supported")
+    public void test(String xml) throws Exception {
+        ByteArrayInputStream stream = new ByteArrayInputStream(xml.getBytes("UTF-8"));
+        getParser().parse(stream, new DefaultHandler());
+        stream.close();
+    }
+
+    @Test(dataProvider = "unsupported", expectedExceptions = SAXParseException.class)
+    public void testInvalid(String xml) throws Exception {
+        ByteArrayInputStream stream = new ByteArrayInputStream(xml.getBytes("UTF-8"));
+        getParser().parse(stream, new DefaultHandler());
+        stream.close();
+    }
+
+    @DataProvider(name = "supported")
+    private Object[][] supported() {
+
+        return new Object[][] {
+            {"<?xml version=\"1.0\"?><tag>\uD840\uDC0B</tag>"},
+            {"<?xml version=\"1.0\"?><!-- \uD840\uDC0B --><tag/>"},
+            {"<?xml version=\"1.1\"?><tag\uD840\uDC0B>in tag name</tag\uD840\uDC0B>"},
+            {"<?xml version=\"1.1\"?><tag attr\uD840\uDC0B=\"in attribute\">in attribute name</tag>"},
+            {"<?xml version=\"1.1\"?><tag>\uD840\uDC0B</tag>"},
+            {"<?xml version=\"1.1\"?><!-- \uD840\uDC0B --><dontCare/>"}
+        };
+    }
+
+    @DataProvider(name = "unsupported")
+    private Object[][] unsupported() {
+        return new Object[][] {
+            {"<?xml version=\"1.0\"?><tag\uD840\uDC0B>in tag name</tag\uD840\uDC0B>"},
+            {"<?xml version=\"1.0\"?><tag attr\uD840\uDC0B=\"in attribute\">in attribute name</tag>"}
+        };
+    }
+
+    private SAXParser getParser() {
+        SAXParser parser = null;
+        try {
+            SAXParserFactory factory = SAXParserFactory.newInstance();
+            parser = factory.newSAXParser();
+        } catch (Exception e) {
+            throw new RuntimeException(e.getMessage());
+        }
+        return parser;
+    }
+}