jaxws/src/java.xml.soap/share/classes/com/sun/xml/internal/messaging/saaj/packaging/mime/internet/MimeUtility.java
changeset 28977 d7609b65606b
parent 28976 8c912c147654
parent 28344 722378bc599e
child 28978 8431abc709c0
equal deleted inserted replaced
28976:8c912c147654 28977:d7609b65606b
     1 /*
       
     2  * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
       
     3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
       
     4  *
       
     5  * This code is free software; you can redistribute it and/or modify it
       
     6  * under the terms of the GNU General Public License version 2 only, as
       
     7  * published by the Free Software Foundation.  Oracle designates this
       
     8  * particular file as subject to the "Classpath" exception as provided
       
     9  * by Oracle in the LICENSE file that accompanied this code.
       
    10  *
       
    11  * This code is distributed in the hope that it will be useful, but WITHOUT
       
    12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
       
    13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
       
    14  * version 2 for more details (a copy is included in the LICENSE file that
       
    15  * accompanied this code).
       
    16  *
       
    17  * You should have received a copy of the GNU General Public License version
       
    18  * 2 along with this work; if not, write to the Free Software Foundation,
       
    19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
       
    20  *
       
    21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
       
    22  * or visit www.oracle.com if you need additional information or have any
       
    23  * questions.
       
    24  */
       
    25 
       
    26 /*
       
    27  * @(#)MimeUtility.java       1.45 03/03/10
       
    28  */
       
    29 
       
    30 
       
    31 
       
    32 package com.sun.xml.internal.messaging.saaj.packaging.mime.internet;
       
    33 
       
    34 import java.io.*;
       
    35 import java.util.*;
       
    36 
       
    37 import javax.activation.DataHandler;
       
    38 import javax.activation.DataSource;
       
    39 
       
    40 import com.sun.xml.internal.messaging.saaj.packaging.mime.MessagingException;
       
    41 import com.sun.xml.internal.messaging.saaj.packaging.mime.util.*;
       
    42 import com.sun.xml.internal.messaging.saaj.util.SAAJUtil;
       
    43 
       
    44 /**
       
    45  * This is a utility class that provides various MIME related
       
    46  * functionality. <p>
       
    47  *
       
    48  * There are a set of methods to encode and decode MIME headers as
       
    49  * per RFC 2047. A brief description on handling such headers is
       
    50  * given below: <p>
       
    51  *
       
    52  * RFC 822 mail headers <strong>must</strong> contain only US-ASCII
       
    53  * characters. Headers that contain non US-ASCII characters must be
       
    54  * encoded so that they contain only US-ASCII characters. Basically,
       
    55  * this process involves using either BASE64 or QP to encode certain
       
    56  * characters. RFC 2047 describes this in detail. <p>
       
    57  *
       
    58  * In Java, Strings contain (16 bit) Unicode characters. ASCII is a
       
    59  * subset of Unicode (and occupies the range 0 - 127). A String
       
    60  * that contains only ASCII characters is already mail-safe. If the
       
    61  * String contains non US-ASCII characters, it must be encoded. An
       
    62  * additional complexity in this step is that since Unicode is not
       
    63  * yet a widely used charset, one might want to first charset-encode
       
    64  * the String into another charset and then do the transfer-encoding.
       
    65  * <p>
       
    66  * Note that to get the actual bytes of a mail-safe String (say,
       
    67  * for sending over SMTP), one must do
       
    68  * <p><blockquote><pre>
       
    69  *
       
    70  *      byte[] bytes = string.getBytes("iso-8859-1");
       
    71  *
       
    72  * </pre></blockquote><p>
       
    73  *
       
    74  * The <code>setHeader</code> and <code>addHeader</code> methods
       
    75  * on MimeMessage and MimeBodyPart assume that the given header values
       
    76  * are Unicode strings that contain only US-ASCII characters. Hence
       
    77  * the callers of those methods must insure that the values they pass
       
    78  * do not contain non US-ASCII characters. The methods in this class
       
    79  * help do this. <p>
       
    80  *
       
    81  * The <code>getHeader</code> family of methods on MimeMessage and
       
    82  * MimeBodyPart return the raw header value. These might be encoded
       
    83  * as per RFC 2047, and if so, must be decoded into Unicode Strings.
       
    84  * The methods in this class help to do this. <p>
       
    85  *
       
    86  * Several System properties control strict conformance to the MIME
       
    87  * spec.  Note that these are not session properties but must be set
       
    88  * globally as System properties. <p>
       
    89  *
       
    90  * The <code>mail.mime.decodetext.strict</code> property controls
       
    91  * decoding of MIME encoded words.  The MIME spec requires that encoded
       
    92  * words start at the beginning of a whitespace separated word.  Some
       
    93  * mailers incorrectly include encoded words in the middle of a word.
       
    94  * If the <code>mail.mime.decodetext.strict</code> System property is
       
    95  * set to <code>"false"</code>, an attempt will be made to decode these
       
    96  * illegal encoded words. The default is true. <p>
       
    97  *
       
    98  * The <code>mail.mime.encodeeol.strict</code> property controls the
       
    99  * choice of Content-Transfer-Encoding for MIME parts that are not of
       
   100  * type "text".  Often such parts will contain textual data for which
       
   101  * an encoding that allows normal end of line conventions is appropriate.
       
   102  * In rare cases, such a part will appear to contain entirely textual
       
   103  * data, but will require an encoding that preserves CR and LF characters
       
    104  * without change.  If the <code>mail.mime.encodeeol.strict</code>
       
   105  * System property is set to <code>"true"</code>, such an encoding will
       
   106  * be used when necessary.  The default is false. <p>
       
   107  *
       
   108  * In addition, the <code>mail.mime.charset</code> System property can
       
   109  * be used to specify the default MIME charset to use for encoded words
       
   110  * and text parts that don't otherwise specify a charset.  Normally, the
       
   111  * default MIME charset is derived from the default Java charset, as
       
   112  * specified in the <code>file.encoding</code> System property.  Most
       
   113  * applications will have no need to explicitly set the default MIME
       
   114  * charset.  In cases where the default MIME charset to be used for
       
   115  * mail messages is different than the charset used for files stored on
       
   116  * the system, this property should be set.
       
   117  *
       
   118  * @version 1.45, 03/03/10
       
   119  * @author  John Mani
       
   120  * @author  Bill Shannon
       
   121  */
       
   122 
       
   123 public class MimeUtility {
       
   124 
       
   125     // This class cannot be instantiated
       
   126     private MimeUtility() { }
       
   127 
       
   128     public static final int ALL = -1;
       
   129 
       
   130     private static final int BUFFER_SIZE = 1024;
       
   131     private static boolean decodeStrict = true;
       
   132     private static boolean encodeEolStrict = false;
       
   133     private static boolean foldEncodedWords = false;
       
   134     private static boolean foldText = true;
       
   135 
       
   136     static {
       
   137         try {
       
   138             String s = SAAJUtil.getSystemProperty("mail.mime.decodetext.strict");
       
   139             // default to true
       
   140             decodeStrict = s == null || !s.equalsIgnoreCase("false");
       
   141             s = SAAJUtil.getSystemProperty("mail.mime.encodeeol.strict");
       
   142             // default to false
       
   143             encodeEolStrict = s != null && s.equalsIgnoreCase("true");
       
   144             s = SAAJUtil.getSystemProperty("mail.mime.foldencodedwords");
       
   145             // default to false
       
   146             foldEncodedWords = s != null && s.equalsIgnoreCase("true");
       
   147             s = SAAJUtil.getSystemProperty("mail.mime.foldtext");
       
   148             // default to true
       
   149             foldText = s == null || !s.equalsIgnoreCase("false");
       
   150         } catch (SecurityException sex) {
       
   151             // ignore it
       
   152         }
       
   153     }
       
   154 
       
   155 
       
   156     /**
       
   157      * Get the content-transfer-encoding that should be applied
       
   158      * to the input stream of this datasource, to make it mailsafe. <p>
       
   159      *
       
   160      * The algorithm used here is: <br>
       
   161      * <ul>
       
   162      * <li>
       
   163      * If the primary type of this datasource is "text" and if all
       
   164      * the bytes in its input stream are US-ASCII, then the encoding
       
   165      * is "7bit". If more than half of the bytes are non-US-ASCII, then
       
   166      * the encoding is "base64". If less than half of the bytes are
       
   167      * non-US-ASCII, then the encoding is "quoted-printable".
       
   168      * <li>
       
   169      * If the primary type of this datasource is not "text", then if
       
   170      * all the bytes of its input stream are US-ASCII, the encoding
       
   171      * is "7bit". If there is even one non-US-ASCII character, the
       
   172      * encoding is "base64".
       
   173      * </ul>
       
   174      *
       
   175      * @param   ds      DataSource
       
   176      * @return          the encoding. This is either "7bit",
       
   177      *                  "quoted-printable" or "base64"
       
   178      */
       
   179     public static String getEncoding(DataSource ds) {
       
   180         ContentType cType = null;
       
   181         InputStream is = null;
       
   182         String encoding = null;
       
   183 
       
   184         try {
       
   185             cType = new ContentType(ds.getContentType());
       
   186             is = ds.getInputStream();
       
   187         } catch (Exception ex) {
       
   188             return "base64"; // what else ?!
       
   189         }
       
   190 
       
   191         boolean isText = cType.match("text/*");
       
   192         // if not text, stop processing when we see non-ASCII
       
   193         int i = checkAscii(is, ALL, !isText);
       
   194         switch (i) {
       
   195         case ALL_ASCII:
       
   196             encoding = "7bit"; // all ascii
       
   197             break;
       
   198         case MOSTLY_ASCII:
       
   199             encoding = "quoted-printable"; // mostly ascii
       
   200             break;
       
   201         default:
       
   202             encoding = "base64"; // mostly binary
       
   203             break;
       
   204         }
       
   205 
       
   206         // Close the input stream
       
   207         try {
       
   208             is.close();
       
   209         } catch (IOException ioex) { }
       
   210 
       
   211         return encoding;
       
   212     }
       
   213 
       
   214     /**
       
   215      * Same as <code>getEncoding(DataSource)</code> except that instead
       
   216      * of reading the data from an <code>InputStream</code> it uses the
       
   217      * <code>writeTo</code> method to examine the data.  This is more
       
   218      * efficient in the common case of a <code>DataHandler</code>
       
   219      * created with an object and a MIME type (for example, a
       
   220      * "text/plain" String) because all the I/O is done in this
       
   221      * thread.  In the case requiring an <code>InputStream</code> the
       
   222      * <code>DataHandler</code> uses a thread, a pair of pipe streams,
       
   223      * and the <code>writeTo</code> method to produce the data. <p>
       
   224      *
       
   225      * @since   JavaMail 1.2
       
   226      */
       
   227     public static String getEncoding(DataHandler dh) {
       
   228         ContentType cType = null;
       
   229         String encoding = null;
       
   230 
       
   231         /*
       
   232          * Try to pick the most efficient means of determining the
       
   233          * encoding.  If this DataHandler was created using a DataSource,
       
   234          * the getEncoding(DataSource) method is typically faster.  If
       
   235          * the DataHandler was created with an object, this method is
       
   236          * much faster.  To distinguish the two cases, we use a heuristic.
       
   237          * A DataHandler created with an object will always have a null name.
       
   238          * A DataHandler created with a DataSource will usually have a
       
   239          * non-null name.
       
   240          *
       
   241          * XXX - This is actually quite a disgusting hack, but it makes
       
   242          *       a common case run over twice as fast.
       
   243          */
       
   244         if (dh.getName() != null)
       
   245             return getEncoding(dh.getDataSource());
       
   246 
       
   247         try {
       
   248             cType = new ContentType(dh.getContentType());
       
   249         } catch (Exception ex) {
       
   250             return "base64"; // what else ?!
       
   251         }
       
   252 
       
   253         if (cType.match("text/*")) {
       
   254             // Check all of the available bytes
       
   255             AsciiOutputStream aos = new AsciiOutputStream(false, false);
       
   256             try {
       
   257                 dh.writeTo(aos);
       
   258             } catch (IOException ex) { }        // ignore it
       
   259             switch (aos.getAscii()) {
       
   260             case ALL_ASCII:
       
   261                 encoding = "7bit"; // all ascii
       
   262                 break;
       
   263             case MOSTLY_ASCII:
       
   264                 encoding = "quoted-printable"; // mostly ascii
       
   265                 break;
       
   266             default:
       
   267                 encoding = "base64"; // mostly binary
       
   268                 break;
       
   269             }
       
   270         } else { // not "text"
       
   271             // Check all of available bytes, break out if we find
       
   272             // at least one non-US-ASCII character
       
   273             AsciiOutputStream aos =
       
   274                         new AsciiOutputStream(true, encodeEolStrict);
       
   275             try {
       
   276                 dh.writeTo(aos);
       
   277             } catch (IOException ex) { }        // ignore it
       
   278             if (aos.getAscii() == ALL_ASCII) // all ascii
       
   279                 encoding = "7bit";
       
   280             else // found atleast one non-ascii character, use b64
       
   281                 encoding = "base64";
       
   282         }
       
   283 
       
   284         return encoding;
       
   285     }
       
   286 
       
   287     /**
       
   288      * Decode the given input stream. The Input stream returned is
       
   289      * the decoded input stream. All the encodings defined in RFC 2045
       
   290      * are supported here. They include "base64", "quoted-printable",
       
   291      * "7bit", "8bit", and "binary". In addition, "uuencode" is also
       
   292      * supported.
       
   293      *
       
   294      * @param   is              input stream
       
   295      * @param   encoding        the encoding of the stream.
       
   296      * @return                  decoded input stream.
       
   297      */
       
   298     public static InputStream decode(InputStream is, String encoding)
       
   299                 throws MessagingException {
       
   300         if (encoding.equalsIgnoreCase("base64"))
       
   301             return new BASE64DecoderStream(is);
       
   302         else if (encoding.equalsIgnoreCase("quoted-printable"))
       
   303             return new QPDecoderStream(is);
       
   304         else if (encoding.equalsIgnoreCase("uuencode") ||
       
   305                  encoding.equalsIgnoreCase("x-uuencode") ||
       
   306                  encoding.equalsIgnoreCase("x-uue"))
       
   307             return new UUDecoderStream(is);
       
   308         else if (encoding.equalsIgnoreCase("binary") ||
       
   309                  encoding.equalsIgnoreCase("7bit") ||
       
   310                  encoding.equalsIgnoreCase("8bit"))
       
   311             return is;
       
   312         else
       
   313             throw new MessagingException("Unknown encoding: " + encoding);
       
   314     }
       
   315 
       
   316     /**
       
   317      * Wrap an encoder around the given output stream.
       
   318      * All the encodings defined in RFC 2045 are supported here.
       
   319      * They include "base64", "quoted-printable", "7bit", "8bit" and
       
   320      * "binary". In addition, "uuencode" is also supported.
       
   321      *
       
   322      * @param   os              output stream
       
   323      * @param   encoding        the encoding of the stream.
       
   324      * @return                  output stream that applies the
       
   325      *                          specified encoding.
       
   326      */
       
   327     public static OutputStream encode(OutputStream os, String encoding)
       
   328                 throws MessagingException {
       
   329         if (encoding == null)
       
   330             return os;
       
   331         else if (encoding.equalsIgnoreCase("base64"))
       
   332             return new BASE64EncoderStream(os);
       
   333         else if (encoding.equalsIgnoreCase("quoted-printable"))
       
   334             return new QPEncoderStream(os);
       
   335         else if (encoding.equalsIgnoreCase("uuencode") ||
       
   336                  encoding.equalsIgnoreCase("x-uuencode") ||
       
   337                  encoding.equalsIgnoreCase("x-uue"))
       
   338             return new UUEncoderStream(os);
       
   339         else if (encoding.equalsIgnoreCase("binary") ||
       
   340                  encoding.equalsIgnoreCase("7bit") ||
       
   341                  encoding.equalsIgnoreCase("8bit"))
       
   342             return os;
       
   343         else
       
   344             throw new MessagingException("Unknown encoding: " +encoding);
       
   345     }
       
   346 
       
   347     /**
       
   348      * Wrap an encoder around the given output stream.
       
   349      * All the encodings defined in RFC 2045 are supported here.
       
   350      * They include "base64", "quoted-printable", "7bit", "8bit" and
       
   351      * "binary". In addition, "uuencode" is also supported.
       
   352      * The <code>filename</code> parameter is used with the "uuencode"
       
   353      * encoding and is included in the encoded output.
       
   354      *
       
   355      * @param   os              output stream
       
   356      * @param   encoding        the encoding of the stream.
       
   357      * @param   filename        name for the file being encoded (only used
       
   358      *                          with uuencode)
       
   359      * @return                  output stream that applies the
       
   360      *                          specified encoding.
       
   361      * @since                   JavaMail 1.2
       
   362      */
       
   363     public static OutputStream encode(OutputStream os, String encoding,
       
   364                                       String filename)
       
   365                 throws MessagingException {
       
   366         if (encoding == null)
       
   367             return os;
       
   368         else if (encoding.equalsIgnoreCase("base64"))
       
   369             return new BASE64EncoderStream(os);
       
   370         else if (encoding.equalsIgnoreCase("quoted-printable"))
       
   371             return new QPEncoderStream(os);
       
   372         else if (encoding.equalsIgnoreCase("uuencode") ||
       
   373                  encoding.equalsIgnoreCase("x-uuencode") ||
       
   374                  encoding.equalsIgnoreCase("x-uue"))
       
   375             return new UUEncoderStream(os, filename);
       
   376         else if (encoding.equalsIgnoreCase("binary") ||
       
   377                  encoding.equalsIgnoreCase("7bit") ||
       
   378                  encoding.equalsIgnoreCase("8bit"))
       
   379             return os;
       
   380         else
       
   381             throw new MessagingException("Unknown encoding: " +encoding);
       
   382     }
       
   383 
       
   384     /**
       
   385      * Encode a RFC 822 "text" token into mail-safe form as per
       
   386      * RFC 2047. <p>
       
   387      *
       
   388      * The given Unicode string is examined for non US-ASCII
       
   389      * characters. If the string contains only US-ASCII characters,
       
   390      * it is returned as-is.  If the string contains non US-ASCII
       
   391      * characters, it is first character-encoded using the platform's
       
   392      * default charset, then transfer-encoded using either the B or
       
   393      * Q encoding. The resulting bytes are then returned as a Unicode
       
   394      * string containing only ASCII  characters. <p>
       
   395      *
       
   396      * Note that this method should be used to encode only
       
   397      * "unstructured" RFC 822 headers. <p>
       
   398      *
       
   399      * Example of usage:
       
   400      * <p><blockquote><pre>
       
   401      *
       
   402      *  MimeBodyPart part = ...
       
   403      *  String rawvalue = "FooBar Mailer, Japanese version 1.1"
       
   404      *  try {
       
   405      *    // If we know for sure that rawvalue contains only US-ASCII
       
   406      *    // characters, we can skip the encoding part
       
   407      *    part.setHeader("X-mailer", MimeUtility.encodeText(rawvalue));
       
   408      *  } catch (UnsupportedEncodingException e) {
       
   409      *    // encoding failure
       
   410      *  } catch (MessagingException me) {
       
   411      *   // setHeader() failure
       
   412      *  }
       
   413      *
       
   414      * </pre></blockquote><p>
       
   415      *
       
   416      * @param   text    unicode string
       
   417      * @return  Unicode string containing only US-ASCII characters
       
   418      * @exception UnsupportedEncodingException if the encoding fails
       
   419      */
       
   420     public static String encodeText(String text)
       
   421                         throws UnsupportedEncodingException {
       
   422         return encodeText(text, null, null);
       
   423     }
       
   424 
       
   425     /**
       
   426      * Encode a RFC 822 "text" token into mail-safe form as per
       
   427      * RFC 2047. <p>
       
   428      *
       
   429      * The given Unicode string is examined for non US-ASCII
       
   430      * characters. If the string contains only US-ASCII characters,
       
   431      * it is returned as-is.  If the string contains non US-ASCII
       
   432      * characters, it is first character-encoded using the specified
       
   433      * charset, then transfer-encoded using either the B or Q encoding.
       
   434      * The resulting bytes are then returned as a Unicode string
       
   435      * containing only ASCII characters. <p>
       
   436      *
       
   437      * Note that this method should be used to encode only
       
   438      * "unstructured" RFC 822 headers.
       
   439      *
       
   440      * @param   text    the header value
       
   441      * @param   charset the charset. If this parameter is null, the
       
   442      *          platform's default chatset is used.
       
   443      * @param   encoding the encoding to be used. Currently supported
       
   444      *          values are "B" and "Q". If this parameter is null, then
       
   445      *          the "Q" encoding is used if most of characters to be
       
   446      *          encoded are in the ASCII charset, otherwise "B" encoding
       
   447      *          is used.
       
   448      * @return  Unicode string containing only US-ASCII characters
       
   449      */
       
   450     public static String encodeText(String text, String charset,
       
   451                                     String encoding)
       
   452                         throws UnsupportedEncodingException {
       
   453         return encodeWord(text, charset, encoding, false);
       
   454     }
       
   455 
       
   456     /**
       
   457      * Decode "unstructured" headers, that is, headers that are defined
       
   458      * as '*text' as per RFC 822. <p>
       
   459      *
       
   460      * The string is decoded using the algorithm specified in
       
   461      * RFC 2047, Section 6.1.1. If the charset-conversion fails
       
   462      * for any sequence, an UnsupportedEncodingException is thrown.
       
   463      * If the String is not an RFC 2047 style encoded header, it is
       
   464      * returned as-is <p>
       
   465      *
       
   466      * Example of usage:
       
   467      * <p><blockquote><pre>
       
   468      *
       
   469      *  MimeBodyPart part = ...
       
   470      *  String rawvalue = null;
       
   471      *  String  value = null;
       
   472      *  try {
       
   473      *    if ((rawvalue = part.getHeader("X-mailer")[0]) != null)
       
   474      *      value = MimeUtility.decodeText(rawvalue);
       
   475      *  } catch (UnsupportedEncodingException e) {
       
   476      *      // Don't care
       
   477      *      value = rawvalue;
       
   478      *  } catch (MessagingException me) { }
       
   479      *
       
   480      *  return value;
       
   481      *
       
   482      * </pre></blockquote><p>
       
   483      *
       
   484      * @param   etext   the possibly encoded value
       
   485      * @exception       UnsupportedEncodingException if the charset
       
   486      *                  conversion failed.
       
   487      */
       
   488     public static String decodeText(String etext)
       
   489                 throws UnsupportedEncodingException {
       
   490         /*
       
   491          * We look for sequences separated by "linear-white-space".
       
   492          * (as per RFC 2047, Section 6.1.1)
       
   493          * RFC 822 defines "linear-white-space" as SPACE | HT | CR | NL.
       
   494          */
       
   495         String lwsp = " \t\n\r";
       
   496         StringTokenizer st;
       
   497 
       
   498         /*
       
   499          * First, lets do a quick run thru the string and check
       
   500          * whether the sequence "=?"  exists at all. If none exists,
       
   501          * we know there are no encoded-words in here and we can just
       
   502          * return the string as-is, without suffering thru the later
       
   503          * decoding logic.
       
   504          * This handles the most common case of unencoded headers
       
   505          * efficiently.
       
   506          */
       
   507         if (etext.indexOf("=?") == -1)
       
   508             return etext;
       
   509 
       
   510         // Encoded words found. Start decoding ...
       
   511 
       
   512         st = new StringTokenizer(etext, lwsp, true);
       
   513         StringBuffer sb = new StringBuffer();  // decode buffer
       
   514         StringBuffer wsb = new StringBuffer(); // white space buffer
       
   515         boolean prevWasEncoded = false;
       
   516 
       
   517         while (st.hasMoreTokens()) {
       
   518             char c;
       
   519             String s = st.nextToken();
       
   520             // If whitespace, append it to the whitespace buffer
       
   521             if (((c = s.charAt(0)) == ' ') || (c == '\t') ||
       
   522                 (c == '\r') || (c == '\n'))
       
   523                 wsb.append(c);
       
   524             else {
       
   525                 // Check if token is an 'encoded-word' ..
       
   526                 String word;
       
   527                 try {
       
   528                     word = decodeWord(s);
       
   529                     // Yes, this IS an 'encoded-word'.
       
   530                     if (!prevWasEncoded && wsb.length() > 0) {
       
   531                         // if the previous word was also encoded, we
       
   532                         // should ignore the collected whitespace. Else
       
   533                         // we include the whitespace as well.
       
   534                         sb.append(wsb);
       
   535                     }
       
   536                     prevWasEncoded = true;
       
   537                 } catch (ParseException pex) {
       
   538                     // This is NOT an 'encoded-word'.
       
   539                     word = s;
       
   540                     // possibly decode inner encoded words
       
   541                     if (!decodeStrict)
       
   542                         word = decodeInnerWords(word);
       
   543                     // include colleced whitespace ..
       
   544                     if (wsb.length() > 0)
       
   545                         sb.append(wsb);
       
   546                     prevWasEncoded = false;
       
   547                 }
       
   548                 sb.append(word); // append the actual word
       
   549                 wsb.setLength(0); // reset wsb for reuse
       
   550             }
       
   551         }
       
   552         return sb.toString();
       
   553     }
       
   554 
       
   555     /**
       
   556      * Encode a RFC 822 "word" token into mail-safe form as per
       
   557      * RFC 2047. <p>
       
   558      *
       
   559      * The given Unicode string is examined for non US-ASCII
       
   560      * characters. If the string contains only US-ASCII characters,
       
   561      * it is returned as-is.  If the string contains non US-ASCII
       
   562      * characters, it is first character-encoded using the platform's
       
   563      * default charset, then transfer-encoded using either the B or
       
   564      * Q encoding. The resulting bytes are then returned as a Unicode
       
   565      * string containing only ASCII  characters. <p>
       
   566      *
       
   567      * This method is meant to be used when creating RFC 822 "phrases".
       
   568      * The InternetAddress class, for example, uses this to encode
       
   569      * it's 'phrase' component.
       
   570      *
       
   571      * @param   text    unicode string
       
   572      * @return  Array of Unicode strings containing only US-ASCII
       
   573      *          characters.
       
   574      * @exception UnsupportedEncodingException if the encoding fails
       
   575      */
       
   576     public static String encodeWord(String word)
       
   577                         throws UnsupportedEncodingException {
       
   578         return encodeWord(word, null, null);
       
   579     }
       
   580 
       
   581     /**
       
   582      * Encode a RFC 822 "word" token into mail-safe form as per
       
   583      * RFC 2047. <p>
       
   584      *
       
   585      * The given Unicode string is examined for non US-ASCII
       
   586      * characters. If the string contains only US-ASCII characters,
       
   587      * it is returned as-is.  If the string contains non US-ASCII
       
   588      * characters, it is first character-encoded using the specified
       
   589      * charset, then transfer-encoded using either the B or Q encoding.
       
   590      * The resulting bytes are then returned as a Unicode string
       
   591      * containing only ASCII characters. <p>
       
   592      *
       
   593      * @param   text    unicode string
       
   594      * @param   charset the MIME charset
       
   595      * @param   encoding the encoding to be used. Currently supported
       
   596      *          values are "B" and "Q". If this parameter is null, then
       
   597      *          the "Q" encoding is used if most of characters to be
       
   598      *          encoded are in the ASCII charset, otherwise "B" encoding
       
   599      *          is used.
       
   600      * @return  Unicode string containing only US-ASCII characters
       
   601      * @exception UnsupportedEncodingException if the encoding fails
       
   602      */
       
   603     public static String encodeWord(String word, String charset,
       
   604                                     String encoding)
       
   605                         throws UnsupportedEncodingException {
       
   606         return encodeWord(word, charset, encoding, true);
       
   607     }
       
   608 
       
   609     /*
       
   610      * Encode the given string. The parameter 'encodingWord' should
       
   611      * be true if a RFC 822 "word" token is being encoded and false if a
       
   612      * RFC 822 "text" token is being encoded. This is because the
       
   613      * "Q" encoding defined in RFC 2047 has more restrictions when
       
   614      * encoding "word" tokens. (Sigh)
       
   615      */
       
   616     private static String encodeWord(String string, String charset,
       
   617                                      String encoding, boolean encodingWord)
       
   618                         throws UnsupportedEncodingException {
       
   619 
       
   620         // If 'string' contains only US-ASCII characters, just
       
   621         // return it.
       
   622         int ascii = checkAscii(string);
       
   623         if (ascii == ALL_ASCII)
       
   624             return string;
       
   625 
       
   626         // Else, apply the specified charset conversion.
       
   627         String jcharset;
       
   628         if (charset == null) { // use default charset
       
   629             jcharset = getDefaultJavaCharset(); // the java charset
       
   630             charset = getDefaultMIMECharset(); // the MIME equivalent
       
   631         } else // MIME charset -> java charset
       
   632             jcharset = javaCharset(charset);
       
   633 
       
   634         // If no transfer-encoding is specified, figure one out.
       
   635         if (encoding == null) {
       
   636             if (ascii != MOSTLY_NONASCII)
       
   637                 encoding = "Q";
       
   638             else
       
   639                 encoding = "B";
       
   640         }
       
   641 
       
   642         boolean b64;
       
   643         if (encoding.equalsIgnoreCase("B"))
       
   644             b64 = true;
       
   645         else if (encoding.equalsIgnoreCase("Q"))
       
   646             b64 = false;
       
   647         else
       
   648             throw new UnsupportedEncodingException(
       
   649                         "Unknown transfer encoding: " + encoding);
       
   650 
       
   651         StringBuffer outb = new StringBuffer(); // the output buffer
       
   652         doEncode(string, b64, jcharset,
       
   653                  // As per RFC 2047, size of an encoded string should not
       
   654                  // exceed 75 bytes.
       
   655                  // 7 = size of "=?", '?', 'B'/'Q', '?', "?="
       
   656                  75 - 7 - charset.length(), // the available space
       
   657                  "=?" + charset + "?" + encoding + "?", // prefix
       
   658                  true, encodingWord, outb);
       
   659 
       
   660         return outb.toString();
       
   661     }
       
   662 
       
   663     private static void doEncode(String string, boolean b64,
       
   664                 String jcharset, int avail, String prefix,
       
   665                 boolean first, boolean encodingWord, StringBuffer buf)
       
   666                         throws UnsupportedEncodingException {
       
   667 
       
   668         // First find out what the length of the encoded version of
       
   669         // 'string' would be.
       
   670         byte[] bytes = string.getBytes(jcharset);
       
   671         int len;
       
   672         if (b64) // "B" encoding
       
   673             len = BEncoderStream.encodedLength(bytes);
       
   674         else // "Q"
       
   675             len = QEncoderStream.encodedLength(bytes, encodingWord);
       
   676 
       
   677         int size;
       
   678         if ((len > avail) && ((size = string.length()) > 1)) {
       
   679             // If the length is greater than 'avail', split 'string'
       
   680             // into two and recurse.
       
   681             doEncode(string.substring(0, size/2), b64, jcharset,
       
   682                      avail, prefix, first, encodingWord, buf);
       
   683             doEncode(string.substring(size/2, size), b64, jcharset,
       
   684                      avail, prefix, false, encodingWord, buf);
       
   685         } else {
       
   686             // length <= than 'avail'. Encode the given string
       
   687             ByteArrayOutputStream os = new ByteArrayOutputStream(BUFFER_SIZE);
       
   688             OutputStream eos; // the encoder
       
   689             if (b64) // "B" encoding
       
   690                 eos = new BEncoderStream(os);
       
   691             else // "Q" encoding
       
   692                 eos = new QEncoderStream(os, encodingWord);
       
   693 
       
   694             try { // do the encoding
       
   695                 eos.write(bytes);
       
   696                 eos.close();
       
   697             } catch (IOException ioex) { }
       
   698 
       
   699             byte[] encodedBytes = os.toByteArray(); // the encoded stuff
       
   700             // Now write out the encoded (all ASCII) bytes into our
       
   701             // StringBuffer
       
   702             if (!first) // not the first line of this sequence
       
   703                 if (foldEncodedWords)
       
   704                     buf.append("\r\n "); // start a continuation line
       
   705                 else
       
   706                     buf.append(" "); // line will be folded later
       
   707 
       
   708             buf.append(prefix);
       
   709             for (int i = 0; i < encodedBytes.length; i++)
       
   710                 buf.append((char)encodedBytes[i]);
       
   711             buf.append("?="); // terminate the current sequence
       
   712         }
       
   713     }
       
   714 
       
   715     /**
       
   716      * The string is parsed using the rules in RFC 2047 for parsing
       
   717      * an "encoded-word". If the parse fails, a ParseException is
       
   718      * thrown. Otherwise, it is transfer-decoded, and then
       
   719      * charset-converted into Unicode. If the charset-conversion
       
   720      * fails, an UnsupportedEncodingException is thrown.<p>
       
   721      *
       
   722      * @param   eword   the possibly encoded value
       
   723      * @exception       ParseException if the string is not an
       
   724      *                  encoded-word as per RFC 2047.
       
   725      * @exception       UnsupportedEncodingException if the charset
       
   726      *                  conversion failed.
       
   727      */
       
   728     public static String decodeWord(String eword)
       
   729                 throws ParseException, UnsupportedEncodingException {
       
   730 
       
   731         if (!eword.startsWith("=?")) // not an encoded word
       
   732             throw new ParseException();
       
   733 
       
   734         // get charset
       
   735         int start = 2; int pos;
       
   736         if ((pos = eword.indexOf('?', start)) == -1)
       
   737             throw new ParseException();
       
   738         String charset = javaCharset(eword.substring(start, pos));
       
   739 
       
   740         // get encoding
       
   741         start = pos+1;
       
   742         if ((pos = eword.indexOf('?', start)) == -1)
       
   743             throw new ParseException();
       
   744         String encoding = eword.substring(start, pos);
       
   745 
       
   746         // get encoded-sequence
       
   747         start = pos+1;
       
   748         if ((pos = eword.indexOf("?=", start)) == -1)
       
   749             throw new ParseException();
       
   750         String word = eword.substring(start, pos);
       
   751 
       
   752         try {
       
   753             // Extract the bytes from word
       
   754             ByteArrayInputStream bis =
       
   755                 new ByteArrayInputStream(ASCIIUtility.getBytes(word));
       
   756 
       
   757             // Get the appropriate decoder
       
   758             InputStream is;
       
   759             if (encoding.equalsIgnoreCase("B"))
       
   760                 is = new BASE64DecoderStream(bis);
       
   761             else if (encoding.equalsIgnoreCase("Q"))
       
   762                 is = new QDecoderStream(bis);
       
   763             else
       
   764                 throw new UnsupportedEncodingException(
       
   765                                 "unknown encoding: " + encoding);
       
   766 
       
   767             // For b64 & q, size of decoded word <= size of word. So
       
   768             // the decoded bytes must fit into the 'bytes' array. This
       
   769             // is certainly more efficient than writing bytes into a
       
   770             // ByteArrayOutputStream and then pulling out the byte[]
       
   771             // from it.
       
   772             int count = bis.available();
       
   773             byte[] bytes = new byte[count];
       
   774             // count is set to the actual number of decoded bytes
       
   775             count = is.read(bytes, 0, count);
       
   776 
       
   777             // Finally, convert the decoded bytes into a String using
       
   778             // the specified charset
       
   779             String s = new String(bytes, 0, count, charset);
       
   780             if (pos + 2 < eword.length()) {
       
   781                 // there's still more text in the string
       
   782                 String rest = eword.substring(pos + 2);
       
   783                 if (!decodeStrict)
       
   784                     rest = decodeInnerWords(rest);
       
   785                 s += rest;
       
   786             }
       
   787             return s;
       
   788         } catch (UnsupportedEncodingException uex) {
       
   789             // explicitly catch and rethrow this exception, otherwise
       
   790             // the below IOException catch will swallow this up!
       
   791             throw uex;
       
   792         } catch (IOException ioex) {
       
   793             // Shouldn't happen.
       
   794             throw new ParseException();
       
   795         } catch (IllegalArgumentException iex) {
       
   796             /* An unknown charset of the form ISO-XXX-XXX, will cause
       
   797              * the JDK to throw an IllegalArgumentException ... Since the
       
   798              * JDK will attempt to create a classname using this string,
       
   799              * but valid classnames must not contain the character '-',
       
   800              * and this results in an IllegalArgumentException, rather than
       
   801              * the expected UnsupportedEncodingException. Yikes
       
   802              */
       
   803             throw new UnsupportedEncodingException();
       
   804         }
       
   805     }
       
   806 
       
   807     /**
       
   808      * Look for encoded words within a word.  The MIME spec doesn't
       
   809      * allow this, but many broken mailers, especially Japanese mailers,
       
   810      * produce such incorrect encodings.
       
   811      */
       
   812     private static String decodeInnerWords(String word)
       
   813                                 throws UnsupportedEncodingException {
       
   814         int start = 0, i;
       
   815         StringBuffer buf = new StringBuffer();
       
   816         while ((i = word.indexOf("=?", start)) >= 0) {
       
   817             buf.append(word.substring(start, i));
       
   818             int end = word.indexOf("?=", i);
       
   819             if (end < 0)
       
   820                 break;
       
   821             String s = word.substring(i, end + 2);
       
   822             try {
       
   823                 s = decodeWord(s);
       
   824             } catch (ParseException pex) {
       
   825                 // ignore it, just use the original string
       
   826             }
       
   827             buf.append(s);
       
   828             start = end + 2;
       
   829         }
       
   830         if (start == 0)
       
   831             return word;
       
   832         if (start < word.length())
       
   833             buf.append(word.substring(start));
       
   834         return buf.toString();
       
   835     }
       
   836 
       
   837     /**
       
   838      * A utility method to quote a word, if the word contains any
       
   839      * characters from the specified 'specials' list.<p>
       
   840      *
       
   841      * The <code>HeaderTokenizer</code> class defines two special
       
   842      * sets of delimiters - MIME and RFC 822. <p>
       
   843      *
       
   844      * This method is typically used during the generation of
       
   845      * RFC 822 and MIME header fields.
       
   846      *
       
   847      * @param   word    word to be quoted
       
   848      * @param   specials the set of special characters
       
   849      * @return          the possibly quoted word
       
   850      * @see     javax.mail.internet.HeaderTokenizer#MIME
       
   851      * @see     javax.mail.internet.HeaderTokenizer#RFC822
       
   852      */
       
   853     public static String quote(String word, String specials) {
       
   854         int len = word.length();
       
   855 
       
   856         /*
       
   857          * Look for any "bad" characters, Escape and
       
   858          *  quote the entire string if necessary.
       
   859          */
       
   860         boolean needQuoting = false;
       
   861         for (int i = 0; i < len; i++) {
       
   862             char c = word.charAt(i);
       
   863             if (c == '"' || c == '\\' || c == '\r' || c == '\n') {
       
   864                 // need to escape them and then quote the whole string
       
   865                 StringBuffer sb = new StringBuffer(len + 3);
       
   866                 sb.append('"');
       
   867                 sb.append(word.substring(0, i));
       
   868                 int lastc = 0;
       
   869                 for (int j = i; j < len; j++) {
       
   870                     char cc = word.charAt(j);
       
   871                     if ((cc == '"') || (cc == '\\') ||
       
   872                         (cc == '\r') || (cc == '\n'))
       
   873                         if (cc == '\n' && lastc == '\r')
       
   874                             ;   // do nothing, CR was already escaped
       
   875                         else
       
   876                             sb.append('\\');    // Escape the character
       
   877                     sb.append(cc);
       
   878                     lastc = cc;
       
   879                 }
       
   880                 sb.append('"');
       
   881                 return sb.toString();
       
   882             } else if (c < 040 || c >= 0177 || specials.indexOf(c) >= 0)
       
   883                 // These characters cause the string to be quoted
       
   884                 needQuoting = true;
       
   885         }
       
   886 
       
   887         if (needQuoting) {
       
   888             StringBuffer sb = new StringBuffer(len + 2);
       
   889             sb.append('"').append(word).append('"');
       
   890             return sb.toString();
       
   891         } else
       
   892             return word;
       
   893     }
       
   894 
       
   895     /**
       
   896      * Fold a string at linear whitespace so that each line is no longer
       
   897      * than 76 characters, if possible.  If there are more than 76
       
   898      * non-whitespace characters consecutively, the string is folded at
       
   899      * the first whitespace after that sequence.  The parameter
       
   900      * <code>used</code> indicates how many characters have been used in
       
   901      * the current line; it is usually the length of the header name. <p>
       
   902      *
       
   903      * Note that line breaks in the string aren't escaped; they probably
       
   904      * should be.
       
   905      *
       
   906      * @param   used    characters used in line so far
       
   907      * @param   s       the string to fold
       
   908      * @return          the folded string
       
   909      */
       
   910     /*public*/ static String fold(int used, String s) {
       
   911         if (!foldText)
       
   912             return s;
       
   913 
       
   914         int end;
       
   915         char c;
       
   916         // Strip trailing spaces
       
   917         for (end = s.length() - 1; end >= 0; end--) {
       
   918             c = s.charAt(end);
       
   919             if (c != ' ' && c != '\t')
       
   920                 break;
       
   921         }
       
   922         if (end != s.length() - 1)
       
   923             s = s.substring(0, end + 1);
       
   924 
       
   925         // if the string fits now, just return it
       
   926         if (used + s.length() <= 76)
       
   927             return s;
       
   928 
       
   929         // have to actually fold the string
       
   930         StringBuffer sb = new StringBuffer(s.length() + 4);
       
   931         char lastc = 0;
       
   932         while (used + s.length() > 76) {
       
   933             int lastspace = -1;
       
   934             for (int i = 0; i < s.length(); i++) {
       
   935                 if (lastspace != -1 && used + i > 76)
       
   936                     break;
       
   937                 c = s.charAt(i);
       
   938                 if (c == ' ' || c == '\t')
       
   939                     if (!(lastc == ' ' || lastc == '\t'))
       
   940                         lastspace = i;
       
   941                 lastc = c;
       
   942             }
       
   943             if (lastspace == -1) {
       
   944                 // no space, use the whole thing
       
   945                 sb.append(s);
       
   946                 s = "";
       
   947                 used = 0;
       
   948                 break;
       
   949             }
       
   950             sb.append(s.substring(0, lastspace));
       
   951             sb.append("\r\n");
       
   952             lastc = s.charAt(lastspace);
       
   953             sb.append(lastc);
       
   954             s = s.substring(lastspace + 1);
       
   955             used = 1;
       
   956         }
       
   957         sb.append(s);
       
   958         return sb.toString();
       
   959     }
       
   960 
       
   961     /**
       
   962      * Unfold a folded header.  Any line breaks that aren't escaped and
       
   963      * are followed by whitespace are removed.
       
   964      *
       
   965      * @param   s       the string to unfold
       
   966      * @return          the unfolded string
       
   967      */
       
   968     /*public*/ static String unfold(String s) {
       
   969         if (!foldText)
       
   970             return s;
       
   971 
       
   972         StringBuffer sb = null;
       
   973         int i;
       
   974         while ((i = indexOfAny(s, "\r\n")) >= 0) {
       
   975             int start = i;
       
   976             int l = s.length();
       
   977             i++;                // skip CR or NL
       
   978             if (i < l && s.charAt(i - 1) == '\r' && s.charAt(i) == '\n')
       
   979                 i++;    // skip LF
       
   980             if (start == 0 || s.charAt(start - 1) != '\\') {
       
   981                 char c;
       
   982                 // if next line starts with whitespace, skip all of it
       
   983                 // XXX - always has to be true?
       
   984                 if (i < l && ((c = s.charAt(i)) == ' ' || c == '\t')) {
       
   985                     i++;        // skip whitespace
       
   986                     while (i < l && ((c = s.charAt(i)) == ' ' || c == '\t'))
       
   987                         i++;
       
   988                     if (sb == null)
       
   989                         sb = new StringBuffer(s.length());
       
   990                     if (start != 0) {
       
   991                         sb.append(s.substring(0, start));
       
   992                         sb.append(' ');
       
   993                     }
       
   994                     s = s.substring(i);
       
   995                     continue;
       
   996                 }
       
   997                 // it's not a continuation line, just leave it in
       
   998                 if (sb == null)
       
   999                     sb = new StringBuffer(s.length());
       
  1000                 sb.append(s.substring(0, i));
       
  1001                 s = s.substring(i);
       
  1002             } else {
       
  1003                 // there's a backslash at "start - 1"
       
  1004                 // strip it out, but leave in the line break
       
  1005                 if (sb == null)
       
  1006                     sb = new StringBuffer(s.length());
       
  1007                 sb.append(s.substring(0, start - 1));
       
  1008                 sb.append(s.substring(start, i));
       
  1009                 s = s.substring(i);
       
  1010             }
       
  1011         }
       
  1012         if (sb != null) {
       
  1013             sb.append(s);
       
  1014             return sb.toString();
       
  1015         } else
       
  1016             return s;
       
  1017     }
       
  1018 
       
  1019     /**
       
  1020      * Return the first index of any of the characters in "any" in "s",
       
  1021      * or -1 if none are found.
       
  1022      *
       
  1023      * This should be a method on String.
       
  1024      */
       
  1025     private static int indexOfAny(String s, String any) {
       
  1026         return indexOfAny(s, any, 0);
       
  1027     }
       
  1028 
       
  1029     private static int indexOfAny(String s, String any, int start) {
       
  1030         try {
       
  1031             int len = s.length();
       
  1032             for (int i = start; i < len; i++) {
       
  1033                 if (any.indexOf(s.charAt(i)) >= 0)
       
  1034                     return i;
       
  1035             }
       
  1036             return -1;
       
  1037         } catch (StringIndexOutOfBoundsException e) {
       
  1038             return -1;
       
  1039         }
       
  1040     }
       
  1041 
       
  1042     /**
       
  1043      * Convert a MIME charset name into a valid Java charset name. <p>
       
  1044      *
       
  1045      * @param charset   the MIME charset name
       
  1046      * @return  the Java charset equivalent. If a suitable mapping is
       
  1047      *          not available, the passed in charset is itself returned.
       
  1048      */
       
  1049     public static String javaCharset(String charset) {
       
  1050         if (mime2java == null || charset == null)
       
  1051             // no mapping table, or charset parameter is null
       
  1052             return charset;
       
  1053 
       
  1054         String alias = (String)mime2java.get(charset.toLowerCase());
       
  1055         return alias == null ? charset : alias;
       
  1056     }
       
  1057 
       
  1058     /**
       
  1059      * Convert a java charset into its MIME charset name. <p>
       
  1060      *
       
  1061      * Note that a future version of JDK (post 1.2) might provide
       
  1062      * this functionality, in which case, we may deprecate this
       
  1063      * method then.
       
  1064      *
       
  1065      * @param   charset    the JDK charset
       
  1066      * @return          the MIME/IANA equivalent. If a mapping
       
  1067      *                  is not possible, the passed in charset itself
       
  1068      *                  is returned.
       
  1069      * @since           JavaMail 1.1
       
  1070      */
       
  1071     public static String mimeCharset(String charset) {
       
  1072         if (java2mime == null || charset == null)
       
  1073             // no mapping table or charset param is null
       
  1074             return charset;
       
  1075 
       
  1076         String alias = (String)java2mime.get(charset.toLowerCase());
       
  1077         return alias == null ? charset : alias;
       
  1078     }
       
  1079 
       
  // Cached default charset names (Java name and MIME name). Both stay null
  // until getDefaultJavaCharset() / getDefaultMIMECharset() first compute them.
  1080     private static String defaultJavaCharset;
       
  1081     private static String defaultMIMECharset;
       
  1082 
       
  1083     /**
       
  1084      * Get the default charset corresponding to the system's current
       
  1085      * default locale.  If the System property <code>mail.mime.charset</code>
       
  1086      * is set, a system charset corresponding to this MIME charset will be
       
  1087      * returned. <p>
       
  1088      *
       
  1089      * @return  the default charset of the system's default locale,
       
  1090      *          as a Java charset. (NOT a MIME charset)
       
  1091      * @since   JavaMail 1.1
       
  1092      */
       
  1093     public static String getDefaultJavaCharset() {
       
  1094         if (defaultJavaCharset == null) {
       
  1095             /*
       
  1096              * If mail.mime.charset is set, it controls the default
       
  1097              * Java charset as well.
       
  1098              */
       
  1099             String mimecs = null;
       
  1100 
       
  1101             mimecs = SAAJUtil.getSystemProperty("mail.mime.charset");
       
  1102 
       
  1103             if (mimecs != null && mimecs.length() > 0) {
       
  1104                 defaultJavaCharset = javaCharset(mimecs);
       
  1105                 return defaultJavaCharset;
       
  1106             }
       
  1107 
       
  1108             try {
       
  1109                 defaultJavaCharset = System.getProperty("file.encoding",
       
  1110                                                         "8859_1");
       
  1111             } catch (SecurityException sex) {
       
  1112 
       
  1113                 class NullInputStream extends InputStream {
       
  1114                     public int read() {
       
  1115                         return 0;
       
  1116                     }
       
  1117                 }
       
  1118                 InputStreamReader reader =
       
  1119                         new InputStreamReader(new NullInputStream());
       
  1120                 defaultJavaCharset = reader.getEncoding();
       
  1121                 if (defaultJavaCharset == null)
       
  1122                     defaultJavaCharset = "8859_1";
       
  1123             }
       
  1124         }
       
  1125 
       
  1126         return defaultJavaCharset;
       
  1127     }
       
  1128 
       
  1129     /*
       
  1130      * Get the default MIME charset for this locale.
       
  1131      */
       
  1132     static String getDefaultMIMECharset() {
       
  1133         if (defaultMIMECharset == null) {
       
  1134                 defaultMIMECharset = SAAJUtil.getSystemProperty("mail.mime.charset");
       
  1135         }
       
  1136         if (defaultMIMECharset == null)
       
  1137             defaultMIMECharset = mimeCharset(getDefaultJavaCharset());
       
  1138         return defaultMIMECharset;
       
  1139     }
       
  1140 
       
  1141     // Tables to map MIME charset names to Java names and vice versa.
       
  1142     // XXX - Should eventually use J2SE 1.4 java.nio.charset.Charset
       
  // javaCharset() and mimeCharset() lower-case the requested name before
  // calling get(), so only lower-case keys in these tables can be matched.
  1143     private static Hashtable mime2java;
       
  1144     private static Hashtable java2mime;
       
  1145 
       
  1146     static {
       
  1147         java2mime = new Hashtable(40);
       
  1148         mime2java = new Hashtable(10);
       
  1149 
       
  1150         try {
       
  1151             // Use this class's classloader to load the mapping file
       
  1152             // XXX - we should use SecuritySupport, but it's in another package
       
  1153             InputStream is =
       
  1154                     com.sun.xml.internal.messaging.saaj.packaging.mime.internet.MimeUtility.class.getResourceAsStream(
       
  1155                     "/META-INF/javamail.charset.map");
       
  1156 
       
  1157             if (is != null) {
       
  1158                 is = new LineInputStream(is);
       
  1159 
       
  1160                 // Load the JDK-to-MIME charset mapping table
       
  1161                 loadMappings((LineInputStream)is, java2mime);
       
  1162 
       
  1163                 // Load the MIME-to-JDK charset mapping table
       
  1164                 loadMappings((LineInputStream)is, mime2java);
       
  1165             }
       
  1166         } catch (Exception ex) { }
       
  1167 
       
  1168         // If we didn't load the tables, e.g., because we didn't have
       
  1169         // permission, load them manually.  The entries here should be
       
  1170         // the same as the default javamail.charset.map.
       
  1171         if (java2mime.isEmpty()) {
       
  1172             java2mime.put("8859_1", "ISO-8859-1");
       
  1173             java2mime.put("iso8859_1", "ISO-8859-1");
       
  1174             java2mime.put("ISO8859-1", "ISO-8859-1");
       
  1175 
       
  1176             java2mime.put("8859_2", "ISO-8859-2");
       
  1177             java2mime.put("iso8859_2", "ISO-8859-2");
       
  1178             java2mime.put("ISO8859-2", "ISO-8859-2");
       
  1179 
       
  1180             java2mime.put("8859_3", "ISO-8859-3");
       
  1181             java2mime.put("iso8859_3", "ISO-8859-3");
       
  1182             java2mime.put("ISO8859-3", "ISO-8859-3");
       
  1183 
       
  1184             java2mime.put("8859_4", "ISO-8859-4");
       
  1185             java2mime.put("iso8859_4", "ISO-8859-4");
       
  1186             java2mime.put("ISO8859-4", "ISO-8859-4");
       
  1187 
       
  1188             java2mime.put("8859_5", "ISO-8859-5");
       
  1189             java2mime.put("iso8859_5", "ISO-8859-5");
       
  1190             java2mime.put("ISO8859-5", "ISO-8859-5");
       
  1191 
       
  1192             java2mime.put("8859_6", "ISO-8859-6");
       
  1193             java2mime.put("iso8859_6", "ISO-8859-6");
       
  1194             java2mime.put("ISO8859-6", "ISO-8859-6");
       
  1195 
       
  1196             java2mime.put("8859_7", "ISO-8859-7");
       
  1197             java2mime.put("iso8859_7", "ISO-8859-7");
       
  1198             java2mime.put("ISO8859-7", "ISO-8859-7");
       
  1199 
       
  1200             java2mime.put("8859_8", "ISO-8859-8");
       
  1201             java2mime.put("iso8859_8", "ISO-8859-8");
       
  1202             java2mime.put("ISO8859-8", "ISO-8859-8");
       
  1203 
       
  1204             java2mime.put("8859_9", "ISO-8859-9");
       
  1205             java2mime.put("iso8859_9", "ISO-8859-9");
       
  1206             java2mime.put("ISO8859-9", "ISO-8859-9");
       
  1207 
       
  1208             java2mime.put("SJIS", "Shift_JIS");
       
  1209             java2mime.put("MS932", "Shift_JIS");
       
  1210             java2mime.put("JIS", "ISO-2022-JP");
       
  1211             java2mime.put("ISO2022JP", "ISO-2022-JP");
       
  1212             java2mime.put("EUC_JP", "euc-jp");
       
  1213             java2mime.put("KOI8_R", "koi8-r");
       
  1214             java2mime.put("EUC_CN", "euc-cn");
       
  1215             java2mime.put("EUC_TW", "euc-tw");
       
  1216             java2mime.put("EUC_KR", "euc-kr");
       
  1217         }
       
  1218         if (mime2java.isEmpty()) {
       
  1219             mime2java.put("iso-2022-cn", "ISO2022CN");
       
  1220             mime2java.put("iso-2022-kr", "ISO2022KR");
       
  1221             mime2java.put("utf-8", "UTF8");
       
  1222             mime2java.put("utf8", "UTF8");
       
  1223             mime2java.put("ja_jp.iso2022-7", "ISO2022JP");
       
  1224             mime2java.put("ja_jp.eucjp", "EUCJIS");
       
  1225             mime2java.put("euc-kr", "KSC5601");
       
  1226             mime2java.put("euckr", "KSC5601");
       
  1227             mime2java.put("us-ascii", "ISO-8859-1");
       
  1228             mime2java.put("x-us-ascii", "ISO-8859-1");
       
  1229         }
       
  1230     }
       
  1231 
       
  1232     private static void loadMappings(LineInputStream is, Hashtable table) {
       
  1233         String currLine;
       
  1234 
       
  1235         while (true) {
       
  1236             try {
       
  1237                 currLine = is.readLine();
       
  1238             } catch (IOException ioex) {
       
  1239                 break; // error in reading, stop
       
  1240             }
       
  1241 
       
  1242             if (currLine == null) // end of file, stop
       
  1243                 break;
       
  1244             if (currLine.startsWith("--") && currLine.endsWith("--"))
       
  1245                 // end of this table
       
  1246                 break;
       
  1247 
       
  1248             // ignore empty lines and comments
       
  1249             if (currLine.trim().length() == 0 || currLine.startsWith("#"))
       
  1250                 continue;
       
  1251 
       
  1252             // A valid entry is of the form <key><separator><value>
       
  1253             // where, <separator> := SPACE | HT. Parse this
       
  1254             StringTokenizer tk = new StringTokenizer(currLine, " \t");
       
  1255             try {
       
  1256                 String key = tk.nextToken();
       
  1257                 String value = tk.nextToken();
       
  1258                 table.put(key.toLowerCase(), value);
       
  1259             } catch (NoSuchElementException nex) { }
       
  1260         }
       
  1261     }
       
  1262 
       
  1263     static final int ALL_ASCII          = 1;
       
  1264     static final int MOSTLY_ASCII       = 2;
       
  1265     static final int MOSTLY_NONASCII    = 3;
       
  1266 
       
  1267     /**
       
  1268      * Check if the given string contains non US-ASCII characters.
       
  1269      * @param   s       string
       
  1270      * @return          ALL_ASCII if all characters in the string
       
  1271      *                  belong to the US-ASCII charset. MOSTLY_ASCII
       
  1272      *                  if more than half of the available characters
       
  1273      *                  are US-ASCII characters. Else MOSTLY_NONASCII.
       
  1274      */
       
  1275     static int checkAscii(String s) {
       
  1276         int ascii = 0, non_ascii = 0;
       
  1277         int l = s.length();
       
  1278 
       
  1279         for (int i = 0; i < l; i++) {
       
  1280             if (nonascii((int)s.charAt(i))) // non-ascii
       
  1281                 non_ascii++;
       
  1282             else
       
  1283                 ascii++;
       
  1284         }
       
  1285 
       
  1286         if (non_ascii == 0)
       
  1287             return ALL_ASCII;
       
  1288         if (ascii > non_ascii)
       
  1289             return MOSTLY_ASCII;
       
  1290 
       
  1291         return MOSTLY_NONASCII;
       
  1292     }
       
  1293 
       
  1294     /**
       
  1295      * Check if the given byte array contains non US-ASCII characters.
       
  1296      * @param   b       byte array
       
  1297      * @return          ALL_ASCII if all characters in the string
       
  1298      *                  belong to the US-ASCII charset. MOSTLY_ASCII
       
  1299      *                  if more than half of the available characters
       
  1300      *                  are US-ASCII characters. Else MOSTLY_NONASCII.
       
  1301      *
       
  1302      * XXX - this method is no longer used
       
  1303      */
       
  1304     static int checkAscii(byte[] b) {
       
  1305         int ascii = 0, non_ascii = 0;
       
  1306 
       
  1307         for (int i=0; i < b.length; i++) {
       
  1308             // The '&' operator automatically causes b[i] to be promoted
       
  1309             // to an int, and we mask out the higher bytes in the int
       
  1310             // so that the resulting value is not a negative integer.
       
  1311             if (nonascii(b[i] & 0xff)) // non-ascii
       
  1312                 non_ascii++;
       
  1313             else
       
  1314                 ascii++;
       
  1315         }
       
  1316 
       
  1317         if (non_ascii == 0)
       
  1318             return ALL_ASCII;
       
  1319         if (ascii > non_ascii)
       
  1320             return MOSTLY_ASCII;
       
  1321 
       
  1322         return MOSTLY_NONASCII;
       
  1323     }
       
  1324 
       
  1325     /**
       
  1326      * Check if the given input stream contains non US-ASCII characters.
       
  1327      * Upto <code>max</code> bytes are checked. If <code>max</code> is
       
  1328      * set to <code>ALL</code>, then all the bytes available in this
       
  1329      * input stream are checked. If <code>breakOnNonAscii</code> is true
       
  1330      * the check terminates when the first non-US-ASCII character is
       
  1331      * found and MOSTLY_NONASCII is returned. Else, the check continues
       
  1332      * till <code>max</code> bytes or till the end of stream.
       
  1333      *
       
  1334      * @param   is      the input stream
       
  1335      * @param   max     maximum bytes to check for. The special value
       
  1336      *                  ALL indicates that all the bytes in this input
       
  1337      *                  stream must be checked.
       
  1338      * @param   breakOnNonAscii if <code>true</code>, then terminate the
       
  1339      *                  the check when the first non-US-ASCII character
       
  1340      *                  is found.
       
  1341      * @return          ALL_ASCII if all characters in the string
       
  1342      *                  belong to the US-ASCII charset. MOSTLY_ASCII
       
  1343      *                  if more than half of the available characters
       
  1344      *                  are US-ASCII characters. Else MOSTLY_NONASCII.
       
  1345      */
       
  1346     static int checkAscii(InputStream is, int max, boolean breakOnNonAscii) {
       
  1347         int ascii = 0, non_ascii = 0;
       
  1348         int len;
       
  1349         int block = 4096;
       
  1350         int linelen = 0;
       
  1351         boolean longLine = false, badEOL = false;
       
  1352         boolean checkEOL = encodeEolStrict && breakOnNonAscii;
       
  1353         byte buf[] = null;
       
  1354         if (max != 0) {
       
  1355             block = (max == ALL) ? 4096 : Math.min(max, 4096);
       
  1356             buf = new byte[block];
       
  1357         }
       
  1358         while (max != 0) {
       
  1359             try {
       
  1360                 if ((len = is.read(buf, 0, block)) == -1)
       
  1361                     break;
       
  1362                 int lastb = 0;
       
  1363                 for (int i = 0; i < len; i++) {
       
  1364                     // The '&' operator automatically causes b[i] to
       
  1365                     // be promoted to an int, and we mask out the higher
       
  1366                     // bytes in the int so that the resulting value is
       
  1367                     // not a negative integer.
       
  1368                     int b = buf[i] & 0xff;
       
  1369                     if (checkEOL &&
       
  1370                             ((lastb == '\r' && b != '\n') ||
       
  1371                             (lastb != '\r' && b == '\n')))
       
  1372                         badEOL = true;
       
  1373                     if (b == '\r' || b == '\n')
       
  1374                         linelen = 0;
       
  1375                     else {
       
  1376                         linelen++;
       
  1377                         if (linelen > 998)      // 1000 - CRLF
       
  1378                             longLine = true;
       
  1379                     }
       
  1380                     if (nonascii(b)) {  // non-ascii
       
  1381                         if (breakOnNonAscii) // we are done
       
  1382                             return MOSTLY_NONASCII;
       
  1383                         else
       
  1384                             non_ascii++;
       
  1385                     } else
       
  1386                         ascii++;
       
  1387                     lastb = b;
       
  1388                 }
       
  1389             } catch (IOException ioex) {
       
  1390                 break;
       
  1391             }
       
  1392             if (max != ALL)
       
  1393                 max -= len;
       
  1394         }
       
  1395 
       
  1396         if (max == 0 && breakOnNonAscii)
       
  1397             // We have been told to break on the first non-ascii character.
       
  1398             // We haven't got any non-ascii character yet, but then we
       
  1399             // have not checked all of the available bytes either. So we
       
  1400             // cannot say for sure that this input stream is ALL_ASCII,
       
  1401             // and hence we must play safe and return MOSTLY_NONASCII
       
  1402 
       
  1403             return MOSTLY_NONASCII;
       
  1404 
       
  1405         if (non_ascii == 0) { // no non-us-ascii characters so far
       
  1406             // If we're looking at non-text data, and we saw CR without LF
       
  1407             // or vice versa, consider this mostly non-ASCII so that it
       
  1408             // will be base64 encoded (since the quoted-printable encoder
       
  1409             // doesn't encode this case properly).
       
  1410             if (badEOL)
       
  1411                 return MOSTLY_NONASCII;
       
  1412             // if we've seen a long line, we degrade to mostly ascii
       
  1413             else if (longLine)
       
  1414                 return MOSTLY_ASCII;
       
  1415             else
       
  1416                 return ALL_ASCII;
       
  1417         }
       
  1418         if (ascii > non_ascii) // mostly ascii
       
  1419             return MOSTLY_ASCII;
       
  1420         return MOSTLY_NONASCII;
       
  1421     }
       
  1422 
       
  1423     static final boolean nonascii(int b) {
       
  1424         return b >= 0177 || (b < 040 && b != '\r' && b != '\n' && b != '\t');
       
  1425     }
       
  1426 }
       
  1427 
       
  1428 /**
       
  1429  * An OutputStream that determines whether the data written to
       
  1430  * it is all ASCII, mostly ASCII, or mostly non-ASCII.
       
  1431  */
       
  1432 class AsciiOutputStream extends OutputStream {
       
  1433     private boolean breakOnNonAscii;
       
  1434     private int ascii = 0, non_ascii = 0;
       
  1435     private int linelen = 0;
       
  1436     private boolean longLine = false;
       
  1437     private boolean badEOL = false;
       
  1438     private boolean checkEOL = false;
       
  1439     private int lastb = 0;
       
  1440     private int ret = 0;
       
  1441 
       
  1442     public AsciiOutputStream(boolean breakOnNonAscii, boolean encodeEolStrict) {
       
  1443         this.breakOnNonAscii = breakOnNonAscii;
       
  1444         checkEOL = encodeEolStrict && breakOnNonAscii;
       
  1445     }
       
  1446 
       
  1447     public void write(int b) throws IOException {
       
  1448         check(b);
       
  1449     }
       
  1450 
       
  1451     public void write(byte b[]) throws IOException {
       
  1452         write(b, 0, b.length);
       
  1453     }
       
  1454 
       
  1455     public void write(byte b[], int off, int len) throws IOException {
       
  1456         len += off;
       
  1457         for (int i = off; i < len ; i++)
       
  1458             check(b[i]);
       
  1459     }
       
  1460 
       
  1461     private final void check(int b) throws IOException {
       
  1462         b &= 0xff;
       
  1463         if (checkEOL &&
       
  1464                 ((lastb == '\r' && b != '\n') || (lastb != '\r' && b == '\n')))
       
  1465             badEOL = true;
       
  1466         if (b == '\r' || b == '\n')
       
  1467             linelen = 0;
       
  1468         else {
       
  1469             linelen++;
       
  1470             if (linelen > 998)  // 1000 - CRLF
       
  1471                 longLine = true;
       
  1472         }
       
  1473         if (MimeUtility.nonascii(b)) { // non-ascii
       
  1474             non_ascii++;
       
  1475             if (breakOnNonAscii) {      // we are done
       
  1476                 ret = MimeUtility.MOSTLY_NONASCII;
       
  1477                 throw new EOFException();
       
  1478             }
       
  1479         } else
       
  1480             ascii++;
       
  1481         lastb = b;
       
  1482     }
       
  1483 
       
  1484     /**
       
  1485      * Return ASCII-ness of data stream.
       
  1486      */
       
  1487     public int getAscii() {
       
  1488         if (ret != 0)
       
  1489             return ret;
       
  1490         // If we're looking at non-text data, and we saw CR without LF
       
  1491         // or vice versa, consider this mostly non-ASCII so that it
       
  1492         // will be base64 encoded (since the quoted-printable encoder
       
  1493         // doesn't encode this case properly).
       
  1494         if (badEOL)
       
  1495             return MimeUtility.MOSTLY_NONASCII;
       
  1496         else if (non_ascii == 0) { // no non-us-ascii characters so far
       
  1497             // if we've seen a long line, we degrade to mostly ascii
       
  1498             if (longLine)
       
  1499                 return MimeUtility.MOSTLY_ASCII;
       
  1500             else
       
  1501                 return MimeUtility.ALL_ASCII;
       
  1502         }
       
  1503         if (ascii > non_ascii) // mostly ascii
       
  1504             return MimeUtility.MOSTLY_ASCII;
       
  1505         return MimeUtility.MOSTLY_NONASCII;
       
  1506     }
       
  1507 }