src/java.xml/share/classes/com/sun/org/apache/xml/internal/serializer/ToStream.java
changeset 51786 c93f14a4ae29
parent 48947 b4dd09b5d6d8
child 55575 25165403c62e
child 58678 9cf78a70fa4f
--- a/src/java.xml/share/classes/com/sun/org/apache/xml/internal/serializer/ToStream.java	Tue Sep 18 11:55:33 2018 -0400
+++ b/src/java.xml/share/classes/com/sun/org/apache/xml/internal/serializer/ToStream.java	Tue Sep 18 09:44:20 2018 -0700
@@ -51,7 +51,7 @@
  * serializers (xml, html, text ...) that write output to a stream.
  *
  * @xsl.usage internal
- * @LastModified: Feb 2018
+ * @LastModified: Sept 2018
  */
 abstract public class ToStream extends SerializerBase {
 
@@ -193,6 +193,8 @@
      */
     private boolean m_expandDTDEntities = true;
 
+    private char m_highSurrogate = 0;
+
     /**
      * Default constructor
      */
@@ -953,45 +955,46 @@
      * @param ch Character array.
      * @param i position Where the surrogate was detected.
      * @param end The end index of the significant characters.
-     * @return 0 if the pair of characters was written out as-is,
-     * the unicode code point of the character represented by
-     * the surrogate pair if an entity reference with that value
-     * was written out.
+     * @return the status of writing a surrogate pair.
+     *        -1 -- nothing is written
+     *         0 -- the pair is written as-is
+     *         code point -- the pair is written as an entity reference
      *
      * @throws IOException
      * @throws org.xml.sax.SAXException if invalid UTF-16 surrogate detected.
      */
     protected int writeUTF16Surrogate(char c, char ch[], int i, int end)
-        throws IOException
+        throws IOException, SAXException
     {
-        int codePoint = 0;
+        int status = -1;
         if (i + 1 >= end)
         {
-            throw new IOException(
-                Utils.messages.createMessage(
-                    MsgKey.ER_INVALID_UTF16_SURROGATE,
-                    new Object[] { Integer.toHexString((int) c)}));
+            m_highSurrogate = c;
+            return status;
         }
 
-        final char high = c;
-        final char low = ch[i+1];
+        char high, low;
+        if (m_highSurrogate == 0) {
+            high = c;
+            low = ch[i+1];
+            status = 0;
+        } else {
+            high = m_highSurrogate;
+            low = c;
+            m_highSurrogate = 0;
+        }
+
         if (!Encodings.isLowUTF16Surrogate(low)) {
-            throw new IOException(
-                Utils.messages.createMessage(
-                    MsgKey.ER_INVALID_UTF16_SURROGATE,
-                    new Object[] {
-                        Integer.toHexString((int) c)
-                            + " "
-                            + Integer.toHexString(low)}));
+            throwIOE(high, low);
         }
 
         final Writer writer = m_writer;
 
         // If we make it to here we have a valid high, low surrogate pair
-        if (m_encodingInfo.isInEncoding(c,low)) {
+        if (m_encodingInfo.isInEncoding(high,low)) {
             // If the character formed by the surrogate pair
             // is in the encoding, so just write it out
-            writer.write(ch,i,2);
+            writer.write(new char[]{high, low}, 0, 2);
         }
         else {
             // Don't know what to do with this char, it is
@@ -999,24 +1002,16 @@
             // a surrogate pair, so write out as an entity ref
             final String encoding = getEncoding();
             if (encoding != null) {
-                /* The output encoding is known,
-                 * so somthing is wrong.
-                  */
-                codePoint = Encodings.toCodePoint(high, low);
-                // not in the encoding, so write out a character reference
-                writer.write('&');
-                writer.write('#');
-                writer.write(Integer.toString(codePoint));
-                writer.write(';');
+                status = writeCharRef(writer, high, low);
             } else {
                 /* The output encoding is not known,
                  * so just write it out as-is.
                  */
-                writer.write(ch, i, 2);
+                writer.write(new char[]{high, low}, 0, 2);
             }
         }
         // non-zero only if character reference was written out.
-        return codePoint;
+        return status;
     }
 
     /**
@@ -1106,32 +1101,7 @@
             }
             else if (isCData && (!escapingNotNeeded(c)))
             {
-                //                if (i != 0)
-                if (m_cdataTagOpen)
-                    closeCDATA();
-
-                // This needs to go into a function...
-                if (Encodings.isHighUTF16Surrogate(c))
-                {
-                    writeUTF16Surrogate(c, ch, i, end);
-                    i++ ; // process two input characters
-                }
-                else
-                {
-                    writer.write("&#");
-
-                    String intStr = Integer.toString((int) c);
-
-                    writer.write(intStr);
-                    writer.write(';');
-                }
-
-                //                if ((i != 0) && (i < (end - 1)))
-                //                if (!m_cdataTagOpen && (i < (end - 1)))
-                //                {
-                //                    writer.write(CDATA_DELIMITER_OPEN);
-                //                    m_cdataTagOpen = true;
-                //                }
+                i = handleEscaping(writer, c, ch, i, end);
             }
             else if (
                 isCData
@@ -1155,25 +1125,8 @@
                     }
                     writer.write(c);
                 }
-
-                // This needs to go into a function...
-                else if (Encodings.isHighUTF16Surrogate(c))
-                {
-                    if (m_cdataTagOpen)
-                        closeCDATA();
-                    writeUTF16Surrogate(c, ch, i, end);
-                    i++; // process two input characters
-                }
-                else
-                {
-                    if (m_cdataTagOpen)
-                        closeCDATA();
-                    writer.write("&#");
-
-                    String intStr = Integer.toString((int) c);
-
-                    writer.write(intStr);
-                    writer.write(';');
+                else {
+                    i = handleEscaping(writer, c, ch, i, end);
                 }
             }
         }
@@ -1181,6 +1134,38 @@
     }
 
     /**
+     * Handles escaping, writes either with a surrogate pair or a character
+     * reference.
+     *
+     * @param c the current char
+     * @param ch the character array
+     * @param i the current position
+     * @param end the end index of the array
+     * @return the next index
+     *
+     * @throws IOException
+     * @throws org.xml.sax.SAXException if invalid UTF-16 surrogate detected.
+     */
+    private int handleEscaping(Writer writer, char c, char ch[], int i, int end)
+            throws IOException, SAXException {
+        if (Encodings.isHighUTF16Surrogate(c) || Encodings.isLowUTF16Surrogate(c))
+        {
+            if (writeUTF16Surrogate(c, ch, i, end) >= 0) {
+                // move the index if the low surrogate is consumed
+                // as writeUTF16Surrogate has written the pair
+                if (Encodings.isHighUTF16Surrogate(c)) {
+                    i++ ;
+                }
+            }
+        }
+        else
+        {
+            writeCharRef(writer, c);
+        }
+        return i;
+    }
+
+    /**
      * Ends an un-escaping section.
      *
      * @see #startNonEscaping
@@ -1246,7 +1231,7 @@
                 m_elemContext.m_startTagOpen = false;
             }
 
-            if (shouldIndent())
+            if (!m_cdataTagOpen && shouldIndent())
                 indent();
 
             boolean writeCDataBrackets =
@@ -1644,7 +1629,7 @@
         int i,
         char ch,
         int lastDirty,
-        boolean fromTextNode) throws IOException
+        boolean fromTextNode) throws IOException, SAXException
     {
         int startClean = lastDirty + 1;
         // if we have some clean characters accumulated
@@ -1723,54 +1708,40 @@
         int len,
         boolean fromTextNode,
         boolean escLF)
-        throws IOException
+        throws IOException, SAXException
     {
 
         int pos = accumDefaultEntity(writer, ch, i, chars, len, fromTextNode, escLF);
 
         if (i == pos)
         {
+            if (m_highSurrogate != 0) {
+                if (!(Encodings.isLowUTF16Surrogate(ch))) {
+                    throwIOE(m_highSurrogate, ch);
+                }
+                writeCharRef(writer, m_highSurrogate, ch);
+                m_highSurrogate = 0;
+                return ++pos;
+            }
+
             if (Encodings.isHighUTF16Surrogate(ch))
             {
-
-                // Should be the UTF-16 low surrogate of the hig/low pair.
-                char next;
-                // Unicode code point formed from the high/low pair.
-                int codePoint = 0;
-
                 if (i + 1 >= len)
                 {
-                    throw new IOException(
-                        Utils.messages.createMessage(
-                            MsgKey.ER_INVALID_UTF16_SURROGATE,
-                            new Object[] { Integer.toHexString(ch)}));
-                    //"Invalid UTF-16 surrogate detected: "
-
-                    //+Integer.toHexString(ch)+ " ?");
+                    // save for the next read
+                    m_highSurrogate = ch;
+                    pos++;
                 }
                 else
                 {
-                    next = chars[++i];
-
+                    // the next should be the UTF-16 low surrogate of the hig/low pair.
+                    char next = chars[++i];
                     if (!(Encodings.isLowUTF16Surrogate(next)))
-                        throw new IOException(
-                            Utils.messages.createMessage(
-                                MsgKey
-                                    .ER_INVALID_UTF16_SURROGATE,
-                                new Object[] {
-                                    Integer.toHexString(ch)
-                                        + " "
-                                        + Integer.toHexString(next)}));
-                    //"Invalid UTF-16 surrogate detected: "
-
-                    //+Integer.toHexString(ch)+" "+Integer.toHexString(next));
-                    codePoint = Encodings.toCodePoint(ch,next);
+                        throwIOE(ch, next);
+
+                    writeCharRef(writer, ch, next);
+                    pos += 2; // count the two characters that went into writing out this entity
                 }
-
-                writer.write("&#");
-                writer.write(Integer.toString(codePoint));
-                writer.write(';');
-                pos += 2; // count the two characters that went into writing out this entity
             }
             else
             {
@@ -1782,18 +1753,14 @@
                 if (isCharacterInC0orC1Range(ch) ||
                         (XMLVERSION11.equals(getVersion()) && isNELorLSEPCharacter(ch)))
                 {
-                    writer.write("&#");
-                    writer.write(Integer.toString(ch));
-                    writer.write(';');
+                    writeCharRef(writer, ch);
                 }
                 else if ((!escapingNotNeeded(ch) ||
                     (  (fromTextNode && m_charInfo.isSpecialTextChar(ch))
                      || (!fromTextNode && m_charInfo.isSpecialAttrChar(ch))))
-                && m_elemContext.m_currentElemDepth > 0)
+                     && m_elemContext.m_currentElemDepth > 0)
                 {
-                    writer.write("&#");
-                    writer.write(Integer.toString(ch));
-                    writer.write(';');
+                    writeCharRef(writer, ch);
                 }
                 else
                 {
@@ -1807,6 +1774,45 @@
     }
 
     /**
+     * Writes out a character reference.
+     * @param writer the writer
+     * @param c the character
+     * @throws IOException
+     */
+    private void writeCharRef(Writer writer, char c) throws IOException, SAXException {
+        if (m_cdataTagOpen)
+            closeCDATA();
+        writer.write("&#");
+        writer.write(Integer.toString(c));
+        writer.write(';');
+    }
+
+    /**
+     * Writes out a pair of surrogates as a character reference
+     * @param writer the writer
+     * @param high the high surrogate
+     * @param low the low surrogate
+     * @throws IOException
+     */
+    private int writeCharRef(Writer writer, char high, char low) throws IOException, SAXException {
+        if (m_cdataTagOpen)
+            closeCDATA();
+        // Unicode code point formed from the high/low pair.
+        int codePoint = Encodings.toCodePoint(high, low);
+        writer.write("&#");
+        writer.write(Integer.toString(codePoint));
+        writer.write(';');
+        return codePoint;
+    }
+
+    private void throwIOE(char ch, char next) throws IOException {
+        throw new IOException(Utils.messages.createMessage(
+                MsgKey.ER_INVALID_UTF16_SURROGATE,
+                new Object[] {Integer.toHexString(ch) + " "
+                        + Integer.toHexString(next)}));
+    }
+
+    /**
      * Receive notification of the beginning of an element, although this is a
      * SAX method additional namespace or attribute information can occur before
      * or after this call, that is associated with this element.
@@ -2053,7 +2059,7 @@
         Writer writer,
         String string,
         String encoding)
-        throws IOException
+        throws IOException, SAXException
     {
         final int len = string.length();
         if (len > m_attrBuff.length)