jaxp/src/com/sun/org/apache/xml/internal/serializer/ToStream.java
changeset 12458 d601e4bba306
parent 12457 c348e06f0e82
child 12902 0a840d92fa30
--- a/jaxp/src/com/sun/org/apache/xml/internal/serializer/ToStream.java	Thu Apr 12 08:38:26 2012 -0700
+++ b/jaxp/src/com/sun/org/apache/xml/internal/serializer/ToStream.java	Tue Apr 17 11:17:59 2012 -0700
@@ -919,7 +919,8 @@
         {
             // This is the old/fast code here, but is this
             // correct for all encodings?
-            if (ch >= 0x20 || (0x0A == ch || 0x0D == ch || 0x09 == ch))
+            if (ch >= CharInfo.S_SPACE || (CharInfo.S_LINEFEED == ch ||
+                    CharInfo.S_CARRIAGERETURN == ch || CharInfo.S_HORIZONAL_TAB == ch))
                 ret= true;
             else
                 ret = false;
@@ -1028,7 +1029,7 @@
      *
      * @throws java.io.IOException
      */
-    protected int accumDefaultEntity(
+    int accumDefaultEntity(
         java.io.Writer writer,
         char ch,
         int i,
@@ -1047,7 +1048,7 @@
         {
             // if this is text node character and a special one of those,
             // or if this is a character from attribute value and a special one of those
-            if ((fromTextNode && m_charInfo.isSpecialTextChar(ch)) || (!fromTextNode && m_charInfo.isSpecialAttrChar(ch)))
+            if ((fromTextNode && m_charInfo.shouldMapTextChar(ch)) || (!fromTextNode && m_charInfo.shouldMapAttrChar(ch)))
             {
                 String outputStringForChar = m_charInfo.getOutputStringForChar(ch);
 
@@ -1398,7 +1399,6 @@
 
         if (m_cdataTagOpen)
             closeCDATA();
-        // the check with _escaping is a bit of a hack for XLSTC
 
         if (m_disableOutputEscapingStates.peekOrFalse() || (!m_escaping))
         {
@@ -1421,82 +1421,173 @@
         try
         {
             int i;
-            char ch1;
             int startClean;
 
             // skip any leading whitspace
             // don't go off the end and use a hand inlined version
             // of isWhitespace(ch)
             final int end = start + length;
-            int lastDirty = start - 1; // last character that needed processing
-            for (i = start;
-                ((i < end)
-                    && ((ch1 = chars[i]) == 0x20
-                        || (ch1 == 0xA && m_lineSepUse)
-                        || ch1 == 0xD
-                        || ch1 == 0x09));
-                i++)
-            {
-                /*
-                 * We are processing leading whitespace, but are doing the same
-                 * processing for dirty characters here as for non-whitespace.
-                 *
-                 */
-                if (!m_charInfo.isTextASCIIClean(ch1))
-                {
-                    lastDirty = processDirty(chars,end, i,ch1, lastDirty, true);
-                    i = lastDirty;
+            int lastDirtyCharProcessed = start - 1; // last non-clean character that was processed
+                                                                                                        // that was processed
+            final Writer writer = m_writer;
+            boolean isAllWhitespace = true;
+
+            // process any leading whitespace
+            i = start;
+            while (i < end && isAllWhitespace) {
+                char ch1 = chars[i];
+
+                if (m_charInfo.shouldMapTextChar(ch1)) {
+                    // The character is supposed to be replaced by a String
+                    // so write out the clean whitespace characters accumulated
+                    // so far
+                    // then the String.
+                    writeOutCleanChars(chars, i, lastDirtyCharProcessed);
+                    String outputStringForChar = m_charInfo
+                            .getOutputStringForChar(ch1);
+                    writer.write(outputStringForChar);
+                    // We can't say that everything we are writing out is
+                    // all whitespace, we just wrote out a String.
+                    isAllWhitespace = false;
+                    lastDirtyCharProcessed = i; // mark the last non-clean
+                    // character processed
+                    i++;
+                } else {
+                    // The character is clean, but is it a whitespace ?
+                    switch (ch1) {
+                    // TODO: Any other whitespace to consider?
+                    case CharInfo.S_SPACE:
+                        // Just accumulate the clean whitespace
+                        i++;
+                        break;
+                    case CharInfo.S_LINEFEED:
+                        lastDirtyCharProcessed = processLineFeed(chars, i,
+                                lastDirtyCharProcessed, writer);
+                        i++;
+                        break;
+                    case CharInfo.S_CARRIAGERETURN:
+                        writeOutCleanChars(chars, i, lastDirtyCharProcessed);
+                        writer.write("&#13;");
+                        lastDirtyCharProcessed = i;
+                        i++;
+                        break;
+                    case CharInfo.S_HORIZONAL_TAB:
+                        // Just accumulate the clean whitespace
+                        i++;
+                        break;
+                    default:
+                        // The character was clean, but not a whitespace
+                        // so break the loop to continue with this character
+                        // (we don't increment index i !!)
+                        isAllWhitespace = false;
+                        break;
                 }
             }
+            }
             /* If there is some non-whitespace, mark that we may need
              * to preserve this. This is only important if we have indentation on.
              */
-            if (i < end)
+            if (i < end || !isAllWhitespace)
                 m_ispreserve = true;
 
-
-//            int lengthClean;    // number of clean characters in a row
-//            final boolean[] isAsciiClean = m_charInfo.getASCIIClean();
-
-            final boolean isXML10 = XMLVERSION10.equals(getVersion());
-            // we've skipped the leading whitespace, now deal with the rest
             for (; i < end; i++)
             {
-                {
-                    // A tight loop to skip over common clean chars
-                    // This tight loop makes it easier for the JIT
-                    // to optimize.
-                    char ch2;
-                    while (i<end
-                            && ((ch2 = chars[i])<127)
-                            && m_charInfo.isTextASCIIClean(ch2))
-                            i++;
-                    if (i == end)
-                        break;
+                char ch = chars[i];
+
+                if (m_charInfo.shouldMapTextChar(ch)) {
+                    // The character is supposed to be replaced by a String
+                    // e.g.   '&'  -->  "&amp;"
+                    // e.g.   '<'  -->  "&lt;"
+                    writeOutCleanChars(chars, i, lastDirtyCharProcessed);
+                    String outputStringForChar = m_charInfo.getOutputStringForChar(ch);
+                    writer.write(outputStringForChar);
+                    lastDirtyCharProcessed = i;
                 }
-
-                final char ch = chars[i];
-                /*  The check for isCharacterInC0orC1Ranger and
-                 *  isNELorLSEPCharacter has been added
-                 *  to support Control Characters in XML 1.1
-                 */
-                if (!isCharacterInC0orC1Range(ch) &&
-                    (isXML10 || !isNELorLSEPCharacter(ch)) &&
-                    (escapingNotNeeded(ch) && (!m_charInfo.isSpecialTextChar(ch)))
-                        || ('"' == ch))
-                {
-                    ; // a character needing no special processing
+                else {
+                    if (ch <= 0x1F) {
+                        // Range 0x00 through 0x1F inclusive
+                        //
+                        // This covers the non-whitespace control characters
+                        // in the range 0x1 to 0x1F inclusive.
+                        // It also covers the whitespace control characters in the same way:
+                        // 0x9   TAB
+                        // 0xA   NEW LINE
+                        // 0xD   CARRIAGE RETURN
+                        //
+                        // We also cover 0x0 ... It isn't valid
+                        // but we will output "&#0;"
+
+                        // The default will handle this just fine, but this
+                        // is a little performance boost to handle the more
+                        // common TAB, NEW-LINE, CARRIAGE-RETURN
+                        switch (ch) {
+
+                        case CharInfo.S_HORIZONAL_TAB:
+                            // Leave whitespace TAB as a real character
+                        break;
+                        case CharInfo.S_LINEFEED:
+                            lastDirtyCharProcessed = processLineFeed(chars, i, lastDirtyCharProcessed, writer);
+                            break;
+                        case CharInfo.S_CARRIAGERETURN:
+                                writeOutCleanChars(chars, i, lastDirtyCharProcessed);
+                                writer.write("&#13;");
+                                lastDirtyCharProcessed = i;
+                            // Carriage return is written as the character reference &#13;
+                            break;
+                        default:
+                            writeOutCleanChars(chars, i, lastDirtyCharProcessed);
+                            writer.write("&#");
+                            writer.write(Integer.toString(ch));
+                            writer.write(';');
+                            lastDirtyCharProcessed = i;
+                            break;
+
                 }
-                else
-                {
-                    lastDirty = processDirty(chars,end, i, ch, lastDirty, true);
-                    i = lastDirty;
+                    }
+                    else if (ch < 0x7F) {
+                        // Range 0x20 through 0x7E inclusive
+                        // Normal ASCII chars, do nothing, just add it to
+                        // the clean characters
+
+                }
+                    else if (ch <= 0x9F){
+                        // Range 0x7F through 0x9F inclusive
+                        // More control characters, including NEL (0x85)
+                        writeOutCleanChars(chars, i, lastDirtyCharProcessed);
+                        writer.write("&#");
+                        writer.write(Integer.toString(ch));
+                        writer.write(';');
+                        lastDirtyCharProcessed = i;
+                }
+                    else if (ch == CharInfo.S_LINE_SEPARATOR) {
+                        // LINE SEPARATOR
+                        writeOutCleanChars(chars, i, lastDirtyCharProcessed);
+                        writer.write("&#8232;");
+                        lastDirtyCharProcessed = i;
+            }
+                    else if (m_encodingInfo.isInEncoding(ch)) {
+                        // If the character is in the encoding, and
+                        // not in the normal ASCII range, we also
+                        // just leave it get added on to the clean characters
+
+                    }
+                    else {
+                        // This is a fallback plan, we should never get here
+                        // but if the character wasn't previously handled
+                        // (i.e. isn't in the encoding, etc.) then what
+                        // should we do?  We choose to write out an entity
+                        writeOutCleanChars(chars, i, lastDirtyCharProcessed);
+                        writer.write("&#");
+                        writer.write(Integer.toString(ch));
+                        writer.write(';');
+                        lastDirtyCharProcessed = i;
+                    }
                 }
             }
 
             // we've reached the end. Any clean characters at the
             // end of the array than need to be written out?
-            startClean = lastDirty + 1;
+            startClean = lastDirtyCharProcessed + 1;
             if (i > startClean)
             {
                 int lengthClean = i - startClean;
@@ -1515,6 +1606,32 @@
         if (m_tracer != null)
             super.fireCharEvent(chars, start, length);
     }
+        /** Handles the line feed at chars[i]: leaves it pending as a clean character, or writes m_lineSep in its place; returns the updated last-processed index. */
+        private int processLineFeed(final char[] chars, int i, int lastProcessed, final Writer writer) throws IOException {
+                if (!m_lineSepUse
+                || (m_lineSepLen ==1 && m_lineSep[0] == CharInfo.S_LINEFEED)){
+                    // Nothing is written here: the line feed stays in the
+                    // run of 'clean' characters that a later flush copies
+                        // verbatim, so lastProcessed is returned unchanged.
+                }
+                else {
+                    writeOutCleanChars(chars, i, lastProcessed); // flush the clean characters that precede index i
+                    writer.write(m_lineSep, 0, m_lineSepLen); // write the configured separator in place of the line feed
+                    lastProcessed = i; // the character at index i has now been handled
+                }
+                return lastProcessed;
+        }
+    /** Writes the not-yet-written characters chars[lastProcessed + 1] up to, but excluding, chars[i] to m_writer unchanged; writes nothing when that range is empty. */
+    private void writeOutCleanChars(final char[] chars, int i, int lastProcessed) throws IOException {
+        int startClean;
+        startClean = lastProcessed + 1; // index of the first character that has not been written yet
+        if (startClean < i) // only write when at least one character lies before index i
+        {
+            int lengthClean = i - startClean;
+            m_writer.write(chars, startClean, lengthClean); // note: always targets the m_writer field, not a caller-supplied writer
+        }
+     }
+
     /**
      * This method checks if a given character is between C0 or C1 range
      * of Control characters.
@@ -1634,7 +1751,7 @@
      *
      * @throws org.xml.sax.SAXException
      */
-    protected int accumDefaultEscape(
+    private int accumDefaultEscape(
         Writer writer,
         char ch,
         int i,
@@ -1698,16 +1815,15 @@
                  *  to write it out as Numeric Character Reference(NCR) regardless of XML Version
                  *  being used for output document.
                  */
-                if (isCharacterInC0orC1Range(ch) ||
-                        (XMLVERSION11.equals(getVersion()) && isNELorLSEPCharacter(ch)))
+                if (isCharacterInC0orC1Range(ch) || isNELorLSEPCharacter(ch))
                 {
                     writer.write("&#");
                     writer.write(Integer.toString(ch));
                     writer.write(';');
                 }
                 else if ((!escapingNotNeeded(ch) ||
-                    (  (fromTextNode && m_charInfo.isSpecialTextChar(ch))
-                     || (!fromTextNode && m_charInfo.isSpecialAttrChar(ch))))
+                    (  (fromTextNode && m_charInfo.shouldMapTextChar(ch))
+                     || (!fromTextNode && m_charInfo.shouldMapAttrChar(ch))))
                 && m_elemContext.m_currentElemDepth > 0)
                 {
                     writer.write("&#");
@@ -1971,28 +2087,86 @@
         string.getChars(0,len, m_attrBuff, 0);
         final char[] stringChars = m_attrBuff;
 
-        for (int i = 0; i < len; )
+        for (int i = 0; i < len;)
         {
             char ch = stringChars[i];
-            if (escapingNotNeeded(ch) && (!m_charInfo.isSpecialAttrChar(ch)))
-            {
-                writer.write(ch);
-                i++;
-            }
-            else
-            { // I guess the parser doesn't normalize cr/lf in attributes. -sb
-//                if ((CharInfo.S_CARRIAGERETURN == ch)
-//                    && ((i + 1) < len)
-//                    && (CharInfo.S_LINEFEED == stringChars[i + 1]))
-//                {
-//                    i++;
-//                    ch = CharInfo.S_LINEFEED;
-//                }
-
+
+            if (m_charInfo.shouldMapAttrChar(ch) || !(escapingNotNeeded(ch))) {
+                // The character is supposed to be replaced by a String
+                // e.g.   '&'  -->  "&amp;"
+                // e.g.   '<'  -->  "&lt;"
                 i = accumDefaultEscape(writer, ch, i, stringChars, len, false, true);
             }
+            else {
+                i++;
+                if (0x0 <= ch && ch <= 0x1F) {
+                    // Range 0x00 through 0x1F inclusive
+                    // This covers the non-whitespace control characters
+                    // in the range 0x1 to 0x1F inclusive.
+                    // It also covers the whitespace control characters in the same way:
+                    // 0x9   TAB
+                    // 0xA   NEW LINE
+                    // 0xD   CARRIAGE RETURN
+                    //
+                    // We also cover 0x0 ... It isn't valid
+                    // but we will output "&#0;"
+
+                    // The default will handle this just fine, but this
+                    // is a little performance boost to handle the more
+                    // common TAB, NEW-LINE, CARRIAGE-RETURN
+                    switch (ch) {
+
+                    case CharInfo.S_HORIZONAL_TAB:
+                        writer.write("&#9;");
+                        break;
+                    case CharInfo.S_LINEFEED:
+                        writer.write("&#10;");
+                        break;
+                    case CharInfo.S_CARRIAGERETURN:
+                        writer.write("&#13;");
+                        break;
+                    default:
+                        writer.write("&#");
+                        writer.write(Integer.toString(ch));
+                        writer.write(';');
+                        break;
+
         }
-
+                }
+                else if (ch < 0x7F) {
+                    // Range 0x20 through 0x7E inclusive
+                    // Normal ASCII chars
+                        writer.write(ch);
+                }
+                else if (ch <= 0x9F){
+                    // Range 0x7F through 0x9F inclusive
+                    // More control characters
+                    writer.write("&#");
+                    writer.write(Integer.toString(ch));
+                    writer.write(';');
+                }
+                else if (ch == CharInfo.S_LINE_SEPARATOR) {
+                    // LINE SEPARATOR
+                    writer.write("&#8232;");
+                }
+                else if (m_encodingInfo.isInEncoding(ch)) {
+                    // If the character is in the encoding, and
+                    // not in the normal ASCII range, we also
+                    // just write it out
+                    writer.write(ch);
+                }
+                else {
+                    // This is a fallback plan, we should never get here
+                    // but if the character wasn't previously handled
+                    // (i.e. isn't in the encoding, etc.) then what
+                    // should we do?  We choose to write out a character ref
+                    writer.write("&#");
+                    writer.write(Integer.toString(ch));
+                    writer.write(';');
+                }
+
+    }
+        }
     }
 
     /**
@@ -2219,13 +2393,17 @@
 
         try
         {
-            if (shouldIndent())
+            if (shouldIndent() && m_isStandalone)
                 indent();
 
             final int limit = start + length;
             boolean wasDash = false;
             if (m_cdataTagOpen)
                 closeCDATA();
+
+            if (shouldIndent() && !m_isStandalone)
+                indent();
+
             final java.io.Writer writer = m_writer;
             writer.write(COMMENT_BEGIN);
             // Detect occurrences of two consecutive dashes, handle as necessary.
@@ -2258,6 +2436,15 @@
             throw new SAXException(e);
         }
 
+        /*
+         * Don't write out any indentation whitespace now,
+         * because there may be non-whitespace text after this.
+         *
+         * Simply mark that at this point if we do decide
+         * to indent that we should
+         * add a newline on the end of the current line before
+         * the indentation at the start of the next line.
+         */
         m_startNewLine = true;
         // time to generate comment event
         if (m_tracer != null)
@@ -2506,7 +2693,7 @@
      */
     protected boolean shouldIndent()
     {
-        return m_doIndent && (!m_ispreserve && !m_isprevtext);
+        return m_doIndent && (!m_ispreserve && !m_isprevtext) && (m_elemContext.m_currentElemDepth > 0 || m_isStandalone); // added guard: element depth must be > 0 unless m_isStandalone is set -- presumably suppresses indentation outside the root element; confirm
     }
 
     /**
@@ -2749,6 +2936,14 @@
                 closeCDATA();
                 m_cdataTagOpen = false;
             }
+            if (m_writer != null) {
+                try {
+                    m_writer.flush();
+    }
+                catch(IOException e) {
+                    // what? me worry?
+                }
+            }
     }
 
     public void setContentHandler(ContentHandler ch)