1 /* |
|
2 * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved. |
|
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
|
4 * |
|
5 * This code is free software; you can redistribute it and/or modify it |
|
6 * under the terms of the GNU General Public License version 2 only, as |
|
7 * published by the Free Software Foundation. Oracle designates this |
|
8 * particular file as subject to the "Classpath" exception as provided |
|
9 * by Oracle in the LICENSE file that accompanied this code. |
|
10 * |
|
11 * This code is distributed in the hope that it will be useful, but WITHOUT |
|
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
|
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
|
14 * version 2 for more details (a copy is included in the LICENSE file that |
|
15 * accompanied this code). |
|
16 * |
|
17 * You should have received a copy of the GNU General Public License version |
|
18 * 2 along with this work; if not, write to the Free Software Foundation, |
|
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
|
20 * |
|
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
|
22 * or visit www.oracle.com if you need additional information or have any |
|
23 * questions. |
|
24 */ |
|
25 |
|
26 /* |
|
27 * @(#)MimeUtility.java 1.45 03/03/10 |
|
28 */ |
|
29 |
|
30 |
|
31 |
|
32 package com.sun.xml.internal.messaging.saaj.packaging.mime.internet; |
|
33 |
|
34 import java.io.*; |
|
35 import java.util.*; |
|
36 |
|
37 import javax.activation.DataHandler; |
|
38 import javax.activation.DataSource; |
|
39 |
|
40 import com.sun.xml.internal.messaging.saaj.packaging.mime.MessagingException; |
|
41 import com.sun.xml.internal.messaging.saaj.packaging.mime.util.*; |
|
42 import com.sun.xml.internal.messaging.saaj.util.SAAJUtil; |
|
43 |
|
44 /** |
|
45 * This is a utility class that provides various MIME related |
|
46 * functionality. <p> |
|
47 * |
|
48 * There are a set of methods to encode and decode MIME headers as |
|
49 * per RFC 2047. A brief description on handling such headers is |
|
50 * given below: <p> |
|
51 * |
|
52 * RFC 822 mail headers <strong>must</strong> contain only US-ASCII |
|
53 * characters. Headers that contain non US-ASCII characters must be |
|
54 * encoded so that they contain only US-ASCII characters. Basically, |
|
55 * this process involves using either BASE64 or QP to encode certain |
|
56 * characters. RFC 2047 describes this in detail. <p> |
|
57 * |
|
58 * In Java, Strings contain (16 bit) Unicode characters. ASCII is a |
|
59 * subset of Unicode (and occupies the range 0 - 127). A String |
|
60 * that contains only ASCII characters is already mail-safe. If the |
|
61 * String contains non US-ASCII characters, it must be encoded. An |
|
62 * additional complexity in this step is that since Unicode is not |
|
63 * yet a widely used charset, one might want to first charset-encode |
|
64 * the String into another charset and then do the transfer-encoding. |
|
65 * <p> |
|
66 * Note that to get the actual bytes of a mail-safe String (say, |
|
67 * for sending over SMTP), one must do |
|
68 * <p><blockquote><pre> |
|
69 * |
|
70 * byte[] bytes = string.getBytes("iso-8859-1"); |
|
71 * |
|
72 * </pre></blockquote><p> |
|
73 * |
|
74 * The <code>setHeader</code> and <code>addHeader</code> methods |
|
75 * on MimeMessage and MimeBodyPart assume that the given header values |
|
76 * are Unicode strings that contain only US-ASCII characters. Hence |
|
77 * the callers of those methods must ensure that the values they pass |
|
78 * do not contain non US-ASCII characters. The methods in this class |
|
79 * help do this. <p> |
|
80 * |
|
81 * The <code>getHeader</code> family of methods on MimeMessage and |
|
82 * MimeBodyPart return the raw header value. These might be encoded |
|
83 * as per RFC 2047, and if so, must be decoded into Unicode Strings. |
|
84 * The methods in this class help to do this. <p> |
|
85 * |
|
86 * Several System properties control strict conformance to the MIME |
|
87 * spec. Note that these are not session properties but must be set |
|
88 * globally as System properties. <p> |
|
89 * |
|
90 * The <code>mail.mime.decodetext.strict</code> property controls |
|
91 * decoding of MIME encoded words. The MIME spec requires that encoded |
|
92 * words start at the beginning of a whitespace separated word. Some |
|
93 * mailers incorrectly include encoded words in the middle of a word. |
|
94 * If the <code>mail.mime.decodetext.strict</code> System property is |
|
95 * set to <code>"false"</code>, an attempt will be made to decode these |
|
96 * illegal encoded words. The default is true. <p> |
|
97 * |
|
98 * The <code>mail.mime.encodeeol.strict</code> property controls the |
|
99 * choice of Content-Transfer-Encoding for MIME parts that are not of |
|
100 * type "text". Often such parts will contain textual data for which |
|
101 * an encoding that allows normal end of line conventions is appropriate. |
|
102 * In rare cases, such a part will appear to contain entirely textual |
|
103 * data, but will require an encoding that preserves CR and LF characters |
|
104 * without change. If the <code>mail.mime.encodeeol.strict</code> |
|
105 * System property is set to <code>"true"</code>, such an encoding will |
|
106 * be used when necessary. The default is false. <p> |
|
107 * |
|
108 * In addition, the <code>mail.mime.charset</code> System property can |
|
109 * be used to specify the default MIME charset to use for encoded words |
|
110 * and text parts that don't otherwise specify a charset. Normally, the |
|
111 * default MIME charset is derived from the default Java charset, as |
|
112 * specified in the <code>file.encoding</code> System property. Most |
|
113 * applications will have no need to explicitly set the default MIME |
|
114 * charset. In cases where the default MIME charset to be used for |
|
115 * mail messages is different than the charset used for files stored on |
|
116 * the system, this property should be set. |
|
117 * |
|
118 * @version 1.45, 03/03/10 |
|
119 * @author John Mani |
|
120 * @author Bill Shannon |
|
121 */ |
|
122 |
|
123 public class MimeUtility { |
|
124 |
|
// Pure static utility holder: the private constructor blocks instantiation.
private MimeUtility() { }

// Passed by getEncoding(DataSource) as the limit argument of checkAscii;
// presumably means "examine every byte" -- confirm against checkAscii.
public static final int ALL = -1;

// Initial capacity of the in-memory buffer doEncode writes encoded bytes to.
private static final int BUFFER_SIZE = 1024;

// Behaviour switches. Each one may be overridden by the System property
// named beside it; the properties are read once, in the initializer below.
private static boolean decodeStrict = true;       // mail.mime.decodetext.strict
private static boolean encodeEolStrict = false;   // mail.mime.encodeeol.strict
private static boolean foldEncodedWords = false;  // mail.mime.foldencodedwords
private static boolean foldText = true;           // mail.mime.foldtext

static {
    try {
        String value;

        // Defaults to true: only an explicit "false" switches it off.
        value = SAAJUtil.getSystemProperty("mail.mime.decodetext.strict");
        decodeStrict = !"false".equalsIgnoreCase(value);

        // Defaults to false: only an explicit "true" switches it on.
        value = SAAJUtil.getSystemProperty("mail.mime.encodeeol.strict");
        encodeEolStrict = "true".equalsIgnoreCase(value);

        // Defaults to false.
        value = SAAJUtil.getSystemProperty("mail.mime.foldencodedwords");
        foldEncodedWords = "true".equalsIgnoreCase(value);

        // Defaults to true.
        value = SAAJUtil.getSystemProperty("mail.mime.foldtext");
        foldText = !"false".equalsIgnoreCase(value);
    } catch (SecurityException ignored) {
        // Reading properties is not permitted: every switch that has not
        // been assigned yet keeps the default from its declaration.
    }
}
|
154 |
|
155 |
|
156 /** |
|
157 * Get the content-transfer-encoding that should be applied |
|
158 * to the input stream of this datasource, to make it mailsafe. <p> |
|
159 * |
|
160 * The algorithm used here is: <br> |
|
161 * <ul> |
|
162 * <li> |
|
163 * If the primary type of this datasource is "text" and if all |
|
164 * the bytes in its input stream are US-ASCII, then the encoding |
|
165 * is "7bit". If more than half of the bytes are non-US-ASCII, then |
|
166 * the encoding is "base64". If less than half of the bytes are |
|
167 * non-US-ASCII, then the encoding is "quoted-printable". |
|
168 * <li> |
|
169 * If the primary type of this datasource is not "text", then if |
|
170 * all the bytes of its input stream are US-ASCII, the encoding |
|
171 * is "7bit". If there is even one non-US-ASCII character, the |
|
172 * encoding is "base64". |
|
173 * </ul> |
|
174 * |
|
175 * @param ds DataSource |
|
176 * @return the encoding. This is either "7bit", |
|
177 * "quoted-printable" or "base64" |
|
178 */ |
|
179 public static String getEncoding(DataSource ds) { |
|
180 ContentType cType = null; |
|
181 InputStream is = null; |
|
182 String encoding = null; |
|
183 |
|
184 try { |
|
185 cType = new ContentType(ds.getContentType()); |
|
186 is = ds.getInputStream(); |
|
187 } catch (Exception ex) { |
|
188 return "base64"; // what else ?! |
|
189 } |
|
190 |
|
191 boolean isText = cType.match("text/*"); |
|
192 // if not text, stop processing when we see non-ASCII |
|
193 int i = checkAscii(is, ALL, !isText); |
|
194 switch (i) { |
|
195 case ALL_ASCII: |
|
196 encoding = "7bit"; // all ascii |
|
197 break; |
|
198 case MOSTLY_ASCII: |
|
199 encoding = "quoted-printable"; // mostly ascii |
|
200 break; |
|
201 default: |
|
202 encoding = "base64"; // mostly binary |
|
203 break; |
|
204 } |
|
205 |
|
206 // Close the input stream |
|
207 try { |
|
208 is.close(); |
|
209 } catch (IOException ioex) { } |
|
210 |
|
211 return encoding; |
|
212 } |
|
213 |
|
214 /** |
|
215 * Same as <code>getEncoding(DataSource)</code> except that instead |
|
216 * of reading the data from an <code>InputStream</code> it uses the |
|
217 * <code>writeTo</code> method to examine the data. This is more |
|
218 * efficient in the common case of a <code>DataHandler</code> |
|
219 * created with an object and a MIME type (for example, a |
|
220 * "text/plain" String) because all the I/O is done in this |
|
221 * thread. In the case requiring an <code>InputStream</code> the |
|
222 * <code>DataHandler</code> uses a thread, a pair of pipe streams, |
|
223 * and the <code>writeTo</code> method to produce the data. <p> |
|
224 * |
|
225 * @since JavaMail 1.2 |
|
226 */ |
|
227 public static String getEncoding(DataHandler dh) { |
|
228 ContentType cType = null; |
|
229 String encoding = null; |
|
230 |
|
231 /* |
|
232 * Try to pick the most efficient means of determining the |
|
233 * encoding. If this DataHandler was created using a DataSource, |
|
234 * the getEncoding(DataSource) method is typically faster. If |
|
235 * the DataHandler was created with an object, this method is |
|
236 * much faster. To distinguish the two cases, we use a heuristic. |
|
237 * A DataHandler created with an object will always have a null name. |
|
238 * A DataHandler created with a DataSource will usually have a |
|
239 * non-null name. |
|
240 * |
|
241 * XXX - This is actually quite a disgusting hack, but it makes |
|
242 * a common case run over twice as fast. |
|
243 */ |
|
244 if (dh.getName() != null) |
|
245 return getEncoding(dh.getDataSource()); |
|
246 |
|
247 try { |
|
248 cType = new ContentType(dh.getContentType()); |
|
249 } catch (Exception ex) { |
|
250 return "base64"; // what else ?! |
|
251 } |
|
252 |
|
253 if (cType.match("text/*")) { |
|
254 // Check all of the available bytes |
|
255 AsciiOutputStream aos = new AsciiOutputStream(false, false); |
|
256 try { |
|
257 dh.writeTo(aos); |
|
258 } catch (IOException ex) { } // ignore it |
|
259 switch (aos.getAscii()) { |
|
260 case ALL_ASCII: |
|
261 encoding = "7bit"; // all ascii |
|
262 break; |
|
263 case MOSTLY_ASCII: |
|
264 encoding = "quoted-printable"; // mostly ascii |
|
265 break; |
|
266 default: |
|
267 encoding = "base64"; // mostly binary |
|
268 break; |
|
269 } |
|
270 } else { // not "text" |
|
271 // Check all of available bytes, break out if we find |
|
272 // at least one non-US-ASCII character |
|
273 AsciiOutputStream aos = |
|
274 new AsciiOutputStream(true, encodeEolStrict); |
|
275 try { |
|
276 dh.writeTo(aos); |
|
277 } catch (IOException ex) { } // ignore it |
|
278 if (aos.getAscii() == ALL_ASCII) // all ascii |
|
279 encoding = "7bit"; |
|
280 else // found atleast one non-ascii character, use b64 |
|
281 encoding = "base64"; |
|
282 } |
|
283 |
|
284 return encoding; |
|
285 } |
|
286 |
|
287 /** |
|
288 * Decode the given input stream. The Input stream returned is |
|
289 * the decoded input stream. All the encodings defined in RFC 2045 |
|
290 * are supported here. They include "base64", "quoted-printable", |
|
291 * "7bit", "8bit", and "binary". In addition, "uuencode" is also |
|
292 * supported. |
|
293 * |
|
294 * @param is input stream |
|
295 * @param encoding the encoding of the stream. |
|
296 * @return decoded input stream. |
|
297 */ |
|
298 public static InputStream decode(InputStream is, String encoding) |
|
299 throws MessagingException { |
|
300 if (encoding.equalsIgnoreCase("base64")) |
|
301 return new BASE64DecoderStream(is); |
|
302 else if (encoding.equalsIgnoreCase("quoted-printable")) |
|
303 return new QPDecoderStream(is); |
|
304 else if (encoding.equalsIgnoreCase("uuencode") || |
|
305 encoding.equalsIgnoreCase("x-uuencode") || |
|
306 encoding.equalsIgnoreCase("x-uue")) |
|
307 return new UUDecoderStream(is); |
|
308 else if (encoding.equalsIgnoreCase("binary") || |
|
309 encoding.equalsIgnoreCase("7bit") || |
|
310 encoding.equalsIgnoreCase("8bit")) |
|
311 return is; |
|
312 else |
|
313 throw new MessagingException("Unknown encoding: " + encoding); |
|
314 } |
|
315 |
|
316 /** |
|
317 * Wrap an encoder around the given output stream. |
|
318 * All the encodings defined in RFC 2045 are supported here. |
|
319 * They include "base64", "quoted-printable", "7bit", "8bit" and |
|
320 * "binary". In addition, "uuencode" is also supported. |
|
321 * |
|
322 * @param os output stream |
|
323 * @param encoding the encoding of the stream. |
|
324 * @return output stream that applies the |
|
325 * specified encoding. |
|
326 */ |
|
327 public static OutputStream encode(OutputStream os, String encoding) |
|
328 throws MessagingException { |
|
329 if (encoding == null) |
|
330 return os; |
|
331 else if (encoding.equalsIgnoreCase("base64")) |
|
332 return new BASE64EncoderStream(os); |
|
333 else if (encoding.equalsIgnoreCase("quoted-printable")) |
|
334 return new QPEncoderStream(os); |
|
335 else if (encoding.equalsIgnoreCase("uuencode") || |
|
336 encoding.equalsIgnoreCase("x-uuencode") || |
|
337 encoding.equalsIgnoreCase("x-uue")) |
|
338 return new UUEncoderStream(os); |
|
339 else if (encoding.equalsIgnoreCase("binary") || |
|
340 encoding.equalsIgnoreCase("7bit") || |
|
341 encoding.equalsIgnoreCase("8bit")) |
|
342 return os; |
|
343 else |
|
344 throw new MessagingException("Unknown encoding: " +encoding); |
|
345 } |
|
346 |
|
347 /** |
|
348 * Wrap an encoder around the given output stream. |
|
349 * All the encodings defined in RFC 2045 are supported here. |
|
350 * They include "base64", "quoted-printable", "7bit", "8bit" and |
|
351 * "binary". In addition, "uuencode" is also supported. |
|
352 * The <code>filename</code> parameter is used with the "uuencode" |
|
353 * encoding and is included in the encoded output. |
|
354 * |
|
355 * @param os output stream |
|
356 * @param encoding the encoding of the stream. |
|
357 * @param filename name for the file being encoded (only used |
|
358 * with uuencode) |
|
359 * @return output stream that applies the |
|
360 * specified encoding. |
|
361 * @since JavaMail 1.2 |
|
362 */ |
|
363 public static OutputStream encode(OutputStream os, String encoding, |
|
364 String filename) |
|
365 throws MessagingException { |
|
366 if (encoding == null) |
|
367 return os; |
|
368 else if (encoding.equalsIgnoreCase("base64")) |
|
369 return new BASE64EncoderStream(os); |
|
370 else if (encoding.equalsIgnoreCase("quoted-printable")) |
|
371 return new QPEncoderStream(os); |
|
372 else if (encoding.equalsIgnoreCase("uuencode") || |
|
373 encoding.equalsIgnoreCase("x-uuencode") || |
|
374 encoding.equalsIgnoreCase("x-uue")) |
|
375 return new UUEncoderStream(os, filename); |
|
376 else if (encoding.equalsIgnoreCase("binary") || |
|
377 encoding.equalsIgnoreCase("7bit") || |
|
378 encoding.equalsIgnoreCase("8bit")) |
|
379 return os; |
|
380 else |
|
381 throw new MessagingException("Unknown encoding: " +encoding); |
|
382 } |
|
383 |
|
384 /** |
|
385 * Encode a RFC 822 "text" token into mail-safe form as per |
|
386 * RFC 2047. <p> |
|
387 * |
|
388 * The given Unicode string is examined for non US-ASCII |
|
389 * characters. If the string contains only US-ASCII characters, |
|
390 * it is returned as-is. If the string contains non US-ASCII |
|
391 * characters, it is first character-encoded using the platform's |
|
392 * default charset, then transfer-encoded using either the B or |
|
393 * Q encoding. The resulting bytes are then returned as a Unicode |
|
394 * string containing only ASCII characters. <p> |
|
395 * |
|
396 * Note that this method should be used to encode only |
|
397 * "unstructured" RFC 822 headers. <p> |
|
398 * |
|
399 * Example of usage: |
|
400 * <p><blockquote><pre> |
|
401 * |
|
402 * MimeBodyPart part = ... |
|
403 * String rawvalue = "FooBar Mailer, Japanese version 1.1" |
|
404 * try { |
|
405 * // If we know for sure that rawvalue contains only US-ASCII |
|
406 * // characters, we can skip the encoding part |
|
407 * part.setHeader("X-mailer", MimeUtility.encodeText(rawvalue)); |
|
408 * } catch (UnsupportedEncodingException e) { |
|
409 * // encoding failure |
|
410 * } catch (MessagingException me) { |
|
411 * // setHeader() failure |
|
412 * } |
|
413 * |
|
414 * </pre></blockquote><p> |
|
415 * |
|
416 * @param text unicode string |
|
417 * @return Unicode string containing only US-ASCII characters |
|
418 * @exception UnsupportedEncodingException if the encoding fails |
|
419 */ |
|
420 public static String encodeText(String text) |
|
421 throws UnsupportedEncodingException { |
|
422 return encodeText(text, null, null); |
|
423 } |
|
424 |
|
425 /** |
|
426 * Encode a RFC 822 "text" token into mail-safe form as per |
|
427 * RFC 2047. <p> |
|
428 * |
|
429 * The given Unicode string is examined for non US-ASCII |
|
430 * characters. If the string contains only US-ASCII characters, |
|
431 * it is returned as-is. If the string contains non US-ASCII |
|
432 * characters, it is first character-encoded using the specified |
|
433 * charset, then transfer-encoded using either the B or Q encoding. |
|
434 * The resulting bytes are then returned as a Unicode string |
|
435 * containing only ASCII characters. <p> |
|
436 * |
|
437 * Note that this method should be used to encode only |
|
438 * "unstructured" RFC 822 headers. |
|
439 * |
|
440 * @param text the header value |
|
441 * @param charset the charset. If this parameter is null, the |
|
442 * platform's default charset is used. |
|
443 * @param encoding the encoding to be used. Currently supported |
|
444 * values are "B" and "Q". If this parameter is null, then |
|
445 * the "Q" encoding is used if most of characters to be |
|
446 * encoded are in the ASCII charset, otherwise "B" encoding |
|
447 * is used. |
|
448 * @return Unicode string containing only US-ASCII characters |
|
449 */ |
|
450 public static String encodeText(String text, String charset, |
|
451 String encoding) |
|
452 throws UnsupportedEncodingException { |
|
453 return encodeWord(text, charset, encoding, false); |
|
454 } |
|
455 |
|
456 /** |
|
457 * Decode "unstructured" headers, that is, headers that are defined |
|
458 * as '*text' as per RFC 822. <p> |
|
459 * |
|
460 * The string is decoded using the algorithm specified in |
|
461 * RFC 2047, Section 6.1.1. If the charset-conversion fails |
|
462 * for any sequence, an UnsupportedEncodingException is thrown. |
|
463 * If the String is not an RFC 2047 style encoded header, it is |
|
464 * returned as-is <p> |
|
465 * |
|
466 * Example of usage: |
|
467 * <p><blockquote><pre> |
|
468 * |
|
469 * MimeBodyPart part = ... |
|
470 * String rawvalue = null; |
|
471 * String value = null; |
|
472 * try { |
|
473 * if ((rawvalue = part.getHeader("X-mailer")[0]) != null) |
|
474 * value = MimeUtility.decodeText(rawvalue); |
|
475 * } catch (UnsupportedEncodingException e) { |
|
476 * // Don't care |
|
477 * value = rawvalue; |
|
478 * } catch (MessagingException me) { } |
|
479 * |
|
480 * return value; |
|
481 * |
|
482 * </pre></blockquote><p> |
|
483 * |
|
484 * @param etext the possibly encoded value |
|
485 * @exception UnsupportedEncodingException if the charset |
|
486 * conversion failed. |
|
487 */ |
|
488 public static String decodeText(String etext) |
|
489 throws UnsupportedEncodingException { |
|
490 /* |
|
491 * We look for sequences separated by "linear-white-space". |
|
492 * (as per RFC 2047, Section 6.1.1) |
|
493 * RFC 822 defines "linear-white-space" as SPACE | HT | CR | NL. |
|
494 */ |
|
495 String lwsp = " \t\n\r"; |
|
496 StringTokenizer st; |
|
497 |
|
498 /* |
|
499 * First, lets do a quick run thru the string and check |
|
500 * whether the sequence "=?" exists at all. If none exists, |
|
501 * we know there are no encoded-words in here and we can just |
|
502 * return the string as-is, without suffering thru the later |
|
503 * decoding logic. |
|
504 * This handles the most common case of unencoded headers |
|
505 * efficiently. |
|
506 */ |
|
507 if (etext.indexOf("=?") == -1) |
|
508 return etext; |
|
509 |
|
510 // Encoded words found. Start decoding ... |
|
511 |
|
512 st = new StringTokenizer(etext, lwsp, true); |
|
513 StringBuffer sb = new StringBuffer(); // decode buffer |
|
514 StringBuffer wsb = new StringBuffer(); // white space buffer |
|
515 boolean prevWasEncoded = false; |
|
516 |
|
517 while (st.hasMoreTokens()) { |
|
518 char c; |
|
519 String s = st.nextToken(); |
|
520 // If whitespace, append it to the whitespace buffer |
|
521 if (((c = s.charAt(0)) == ' ') || (c == '\t') || |
|
522 (c == '\r') || (c == '\n')) |
|
523 wsb.append(c); |
|
524 else { |
|
525 // Check if token is an 'encoded-word' .. |
|
526 String word; |
|
527 try { |
|
528 word = decodeWord(s); |
|
529 // Yes, this IS an 'encoded-word'. |
|
530 if (!prevWasEncoded && wsb.length() > 0) { |
|
531 // if the previous word was also encoded, we |
|
532 // should ignore the collected whitespace. Else |
|
533 // we include the whitespace as well. |
|
534 sb.append(wsb); |
|
535 } |
|
536 prevWasEncoded = true; |
|
537 } catch (ParseException pex) { |
|
538 // This is NOT an 'encoded-word'. |
|
539 word = s; |
|
540 // possibly decode inner encoded words |
|
541 if (!decodeStrict) |
|
542 word = decodeInnerWords(word); |
|
543 // include colleced whitespace .. |
|
544 if (wsb.length() > 0) |
|
545 sb.append(wsb); |
|
546 prevWasEncoded = false; |
|
547 } |
|
548 sb.append(word); // append the actual word |
|
549 wsb.setLength(0); // reset wsb for reuse |
|
550 } |
|
551 } |
|
552 return sb.toString(); |
|
553 } |
|
554 |
|
555 /** |
|
556 * Encode a RFC 822 "word" token into mail-safe form as per |
|
557 * RFC 2047. <p> |
|
558 * |
|
559 * The given Unicode string is examined for non US-ASCII |
|
560 * characters. If the string contains only US-ASCII characters, |
|
561 * it is returned as-is. If the string contains non US-ASCII |
|
562 * characters, it is first character-encoded using the platform's |
|
563 * default charset, then transfer-encoded using either the B or |
|
564 * Q encoding. The resulting bytes are then returned as a Unicode |
|
565 * string containing only ASCII characters. <p> |
|
566 * |
|
567 * This method is meant to be used when creating RFC 822 "phrases". |
|
568 * The InternetAddress class, for example, uses this to encode |
|
569 * its 'phrase' component. |
|
570 * |
|
571 * @param word unicode string |
|
572 * @return Unicode string containing only US-ASCII |
|
573 * characters. |
|
574 * @exception UnsupportedEncodingException if the encoding fails |
|
575 */ |
|
576 public static String encodeWord(String word) |
|
577 throws UnsupportedEncodingException { |
|
578 return encodeWord(word, null, null); |
|
579 } |
|
580 |
|
581 /** |
|
582 * Encode a RFC 822 "word" token into mail-safe form as per |
|
583 * RFC 2047. <p> |
|
584 * |
|
585 * The given Unicode string is examined for non US-ASCII |
|
586 * characters. If the string contains only US-ASCII characters, |
|
587 * it is returned as-is. If the string contains non US-ASCII |
|
588 * characters, it is first character-encoded using the specified |
|
589 * charset, then transfer-encoded using either the B or Q encoding. |
|
590 * The resulting bytes are then returned as a Unicode string |
|
591 * containing only ASCII characters. <p> |
|
592 * |
|
593 * @param word unicode string |
|
594 * @param charset the MIME charset |
|
595 * @param encoding the encoding to be used. Currently supported |
|
596 * values are "B" and "Q". If this parameter is null, then |
|
597 * the "Q" encoding is used if most of characters to be |
|
598 * encoded are in the ASCII charset, otherwise "B" encoding |
|
599 * is used. |
|
600 * @return Unicode string containing only US-ASCII characters |
|
601 * @exception UnsupportedEncodingException if the encoding fails |
|
602 */ |
|
603 public static String encodeWord(String word, String charset, |
|
604 String encoding) |
|
605 throws UnsupportedEncodingException { |
|
606 return encodeWord(word, charset, encoding, true); |
|
607 } |
|
608 |
|
609 /* |
|
610 * Encode the given string. The parameter 'encodingWord' should |
|
611 * be true if a RFC 822 "word" token is being encoded and false if a |
|
612 * RFC 822 "text" token is being encoded. This is because the |
|
613 * "Q" encoding defined in RFC 2047 has more restrictions when |
|
614 * encoding "word" tokens. (Sigh) |
|
615 */ |
|
616 private static String encodeWord(String string, String charset, |
|
617 String encoding, boolean encodingWord) |
|
618 throws UnsupportedEncodingException { |
|
619 |
|
620 // If 'string' contains only US-ASCII characters, just |
|
621 // return it. |
|
622 int ascii = checkAscii(string); |
|
623 if (ascii == ALL_ASCII) |
|
624 return string; |
|
625 |
|
626 // Else, apply the specified charset conversion. |
|
627 String jcharset; |
|
628 if (charset == null) { // use default charset |
|
629 jcharset = getDefaultJavaCharset(); // the java charset |
|
630 charset = getDefaultMIMECharset(); // the MIME equivalent |
|
631 } else // MIME charset -> java charset |
|
632 jcharset = javaCharset(charset); |
|
633 |
|
634 // If no transfer-encoding is specified, figure one out. |
|
635 if (encoding == null) { |
|
636 if (ascii != MOSTLY_NONASCII) |
|
637 encoding = "Q"; |
|
638 else |
|
639 encoding = "B"; |
|
640 } |
|
641 |
|
642 boolean b64; |
|
643 if (encoding.equalsIgnoreCase("B")) |
|
644 b64 = true; |
|
645 else if (encoding.equalsIgnoreCase("Q")) |
|
646 b64 = false; |
|
647 else |
|
648 throw new UnsupportedEncodingException( |
|
649 "Unknown transfer encoding: " + encoding); |
|
650 |
|
651 StringBuffer outb = new StringBuffer(); // the output buffer |
|
652 doEncode(string, b64, jcharset, |
|
653 // As per RFC 2047, size of an encoded string should not |
|
654 // exceed 75 bytes. |
|
655 // 7 = size of "=?", '?', 'B'/'Q', '?', "?=" |
|
656 75 - 7 - charset.length(), // the available space |
|
657 "=?" + charset + "?" + encoding + "?", // prefix |
|
658 true, encodingWord, outb); |
|
659 |
|
660 return outb.toString(); |
|
661 } |
|
662 |
|
663 private static void doEncode(String string, boolean b64, |
|
664 String jcharset, int avail, String prefix, |
|
665 boolean first, boolean encodingWord, StringBuffer buf) |
|
666 throws UnsupportedEncodingException { |
|
667 |
|
668 // First find out what the length of the encoded version of |
|
669 // 'string' would be. |
|
670 byte[] bytes = string.getBytes(jcharset); |
|
671 int len; |
|
672 if (b64) // "B" encoding |
|
673 len = BEncoderStream.encodedLength(bytes); |
|
674 else // "Q" |
|
675 len = QEncoderStream.encodedLength(bytes, encodingWord); |
|
676 |
|
677 int size; |
|
678 if ((len > avail) && ((size = string.length()) > 1)) { |
|
679 // If the length is greater than 'avail', split 'string' |
|
680 // into two and recurse. |
|
681 doEncode(string.substring(0, size/2), b64, jcharset, |
|
682 avail, prefix, first, encodingWord, buf); |
|
683 doEncode(string.substring(size/2, size), b64, jcharset, |
|
684 avail, prefix, false, encodingWord, buf); |
|
685 } else { |
|
686 // length <= than 'avail'. Encode the given string |
|
687 ByteArrayOutputStream os = new ByteArrayOutputStream(BUFFER_SIZE); |
|
688 OutputStream eos; // the encoder |
|
689 if (b64) // "B" encoding |
|
690 eos = new BEncoderStream(os); |
|
691 else // "Q" encoding |
|
692 eos = new QEncoderStream(os, encodingWord); |
|
693 |
|
694 try { // do the encoding |
|
695 eos.write(bytes); |
|
696 eos.close(); |
|
697 } catch (IOException ioex) { } |
|
698 |
|
699 byte[] encodedBytes = os.toByteArray(); // the encoded stuff |
|
700 // Now write out the encoded (all ASCII) bytes into our |
|
701 // StringBuffer |
|
702 if (!first) // not the first line of this sequence |
|
703 if (foldEncodedWords) |
|
704 buf.append("\r\n "); // start a continuation line |
|
705 else |
|
706 buf.append(" "); // line will be folded later |
|
707 |
|
708 buf.append(prefix); |
|
709 for (int i = 0; i < encodedBytes.length; i++) |
|
710 buf.append((char)encodedBytes[i]); |
|
711 buf.append("?="); // terminate the current sequence |
|
712 } |
|
713 } |
|
714 |
|
715 /** |
|
716 * The string is parsed using the rules in RFC 2047 for parsing |
|
717 * an "encoded-word". If the parse fails, a ParseException is |
|
718 * thrown. Otherwise, it is transfer-decoded, and then |
|
719 * charset-converted into Unicode. If the charset-conversion |
|
720 * fails, an UnsupportedEncodingException is thrown.<p> |
|
721 * |
|
722 * @param eword the possibly encoded value |
|
723 * @exception ParseException if the string is not an |
|
724 * encoded-word as per RFC 2047. |
|
725 * @exception UnsupportedEncodingException if the charset |
|
726 * conversion failed. |
|
727 */ |
|
728 public static String decodeWord(String eword) |
|
729 throws ParseException, UnsupportedEncodingException { |
|
730 |
|
731 if (!eword.startsWith("=?")) // not an encoded word |
|
732 throw new ParseException(); |
|
733 |
|
734 // get charset |
|
735 int start = 2; int pos; |
|
736 if ((pos = eword.indexOf('?', start)) == -1) |
|
737 throw new ParseException(); |
|
738 String charset = javaCharset(eword.substring(start, pos)); |
|
739 |
|
740 // get encoding |
|
741 start = pos+1; |
|
742 if ((pos = eword.indexOf('?', start)) == -1) |
|
743 throw new ParseException(); |
|
744 String encoding = eword.substring(start, pos); |
|
745 |
|
746 // get encoded-sequence |
|
747 start = pos+1; |
|
748 if ((pos = eword.indexOf("?=", start)) == -1) |
|
749 throw new ParseException(); |
|
750 String word = eword.substring(start, pos); |
|
751 |
|
752 try { |
|
753 // Extract the bytes from word |
|
754 ByteArrayInputStream bis = |
|
755 new ByteArrayInputStream(ASCIIUtility.getBytes(word)); |
|
756 |
|
757 // Get the appropriate decoder |
|
758 InputStream is; |
|
759 if (encoding.equalsIgnoreCase("B")) |
|
760 is = new BASE64DecoderStream(bis); |
|
761 else if (encoding.equalsIgnoreCase("Q")) |
|
762 is = new QDecoderStream(bis); |
|
763 else |
|
764 throw new UnsupportedEncodingException( |
|
765 "unknown encoding: " + encoding); |
|
766 |
|
767 // For b64 & q, size of decoded word <= size of word. So |
|
768 // the decoded bytes must fit into the 'bytes' array. This |
|
769 // is certainly more efficient than writing bytes into a |
|
770 // ByteArrayOutputStream and then pulling out the byte[] |
|
771 // from it. |
|
772 int count = bis.available(); |
|
773 byte[] bytes = new byte[count]; |
|
774 // count is set to the actual number of decoded bytes |
|
775 count = is.read(bytes, 0, count); |
|
776 |
|
777 // Finally, convert the decoded bytes into a String using |
|
778 // the specified charset |
|
779 String s = new String(bytes, 0, count, charset); |
|
780 if (pos + 2 < eword.length()) { |
|
781 // there's still more text in the string |
|
782 String rest = eword.substring(pos + 2); |
|
783 if (!decodeStrict) |
|
784 rest = decodeInnerWords(rest); |
|
785 s += rest; |
|
786 } |
|
787 return s; |
|
788 } catch (UnsupportedEncodingException uex) { |
|
789 // explicitly catch and rethrow this exception, otherwise |
|
790 // the below IOException catch will swallow this up! |
|
791 throw uex; |
|
792 } catch (IOException ioex) { |
|
793 // Shouldn't happen. |
|
794 throw new ParseException(); |
|
795 } catch (IllegalArgumentException iex) { |
|
796 /* An unknown charset of the form ISO-XXX-XXX, will cause |
|
797 * the JDK to throw an IllegalArgumentException ... Since the |
|
798 * JDK will attempt to create a classname using this string, |
|
799 * but valid classnames must not contain the character '-', |
|
800 * and this results in an IllegalArgumentException, rather than |
|
801 * the expected UnsupportedEncodingException. Yikes |
|
802 */ |
|
803 throw new UnsupportedEncodingException(); |
|
804 } |
|
805 } |
|
806 |
|
807 /** |
|
808 * Look for encoded words within a word. The MIME spec doesn't |
|
809 * allow this, but many broken mailers, especially Japanese mailers, |
|
810 * produce such incorrect encodings. |
|
811 */ |
|
812 private static String decodeInnerWords(String word) |
|
813 throws UnsupportedEncodingException { |
|
814 int start = 0, i; |
|
815 StringBuffer buf = new StringBuffer(); |
|
816 while ((i = word.indexOf("=?", start)) >= 0) { |
|
817 buf.append(word.substring(start, i)); |
|
818 int end = word.indexOf("?=", i); |
|
819 if (end < 0) |
|
820 break; |
|
821 String s = word.substring(i, end + 2); |
|
822 try { |
|
823 s = decodeWord(s); |
|
824 } catch (ParseException pex) { |
|
825 // ignore it, just use the original string |
|
826 } |
|
827 buf.append(s); |
|
828 start = end + 2; |
|
829 } |
|
830 if (start == 0) |
|
831 return word; |
|
832 if (start < word.length()) |
|
833 buf.append(word.substring(start)); |
|
834 return buf.toString(); |
|
835 } |
|
836 |
|
837 /** |
|
838 * A utility method to quote a word, if the word contains any |
|
839 * characters from the specified 'specials' list.<p> |
|
840 * |
|
841 * The <code>HeaderTokenizer</code> class defines two special |
|
842 * sets of delimiters - MIME and RFC 822. <p> |
|
843 * |
|
844 * This method is typically used during the generation of |
|
845 * RFC 822 and MIME header fields. |
|
846 * |
|
847 * @param word word to be quoted |
|
848 * @param specials the set of special characters |
|
849 * @return the possibly quoted word |
|
850 * @see javax.mail.internet.HeaderTokenizer#MIME |
|
851 * @see javax.mail.internet.HeaderTokenizer#RFC822 |
|
852 */ |
|
853 public static String quote(String word, String specials) { |
|
854 int len = word.length(); |
|
855 |
|
856 /* |
|
857 * Look for any "bad" characters, Escape and |
|
858 * quote the entire string if necessary. |
|
859 */ |
|
860 boolean needQuoting = false; |
|
861 for (int i = 0; i < len; i++) { |
|
862 char c = word.charAt(i); |
|
863 if (c == '"' || c == '\\' || c == '\r' || c == '\n') { |
|
864 // need to escape them and then quote the whole string |
|
865 StringBuffer sb = new StringBuffer(len + 3); |
|
866 sb.append('"'); |
|
867 sb.append(word.substring(0, i)); |
|
868 int lastc = 0; |
|
869 for (int j = i; j < len; j++) { |
|
870 char cc = word.charAt(j); |
|
871 if ((cc == '"') || (cc == '\\') || |
|
872 (cc == '\r') || (cc == '\n')) |
|
873 if (cc == '\n' && lastc == '\r') |
|
874 ; // do nothing, CR was already escaped |
|
875 else |
|
876 sb.append('\\'); // Escape the character |
|
877 sb.append(cc); |
|
878 lastc = cc; |
|
879 } |
|
880 sb.append('"'); |
|
881 return sb.toString(); |
|
882 } else if (c < 040 || c >= 0177 || specials.indexOf(c) >= 0) |
|
883 // These characters cause the string to be quoted |
|
884 needQuoting = true; |
|
885 } |
|
886 |
|
887 if (needQuoting) { |
|
888 StringBuffer sb = new StringBuffer(len + 2); |
|
889 sb.append('"').append(word).append('"'); |
|
890 return sb.toString(); |
|
891 } else |
|
892 return word; |
|
893 } |
|
894 |
|
895 /** |
|
896 * Fold a string at linear whitespace so that each line is no longer |
|
897 * than 76 characters, if possible. If there are more than 76 |
|
898 * non-whitespace characters consecutively, the string is folded at |
|
899 * the first whitespace after that sequence. The parameter |
|
900 * <code>used</code> indicates how many characters have been used in |
|
901 * the current line; it is usually the length of the header name. <p> |
|
902 * |
|
903 * Note that line breaks in the string aren't escaped; they probably |
|
904 * should be. |
|
905 * |
|
906 * @param used characters used in line so far |
|
907 * @param s the string to fold |
|
908 * @return the folded string |
|
909 */ |
|
910 /*public*/ static String fold(int used, String s) { |
|
911 if (!foldText) |
|
912 return s; |
|
913 |
|
914 int end; |
|
915 char c; |
|
916 // Strip trailing spaces |
|
917 for (end = s.length() - 1; end >= 0; end--) { |
|
918 c = s.charAt(end); |
|
919 if (c != ' ' && c != '\t') |
|
920 break; |
|
921 } |
|
922 if (end != s.length() - 1) |
|
923 s = s.substring(0, end + 1); |
|
924 |
|
925 // if the string fits now, just return it |
|
926 if (used + s.length() <= 76) |
|
927 return s; |
|
928 |
|
929 // have to actually fold the string |
|
930 StringBuffer sb = new StringBuffer(s.length() + 4); |
|
931 char lastc = 0; |
|
932 while (used + s.length() > 76) { |
|
933 int lastspace = -1; |
|
934 for (int i = 0; i < s.length(); i++) { |
|
935 if (lastspace != -1 && used + i > 76) |
|
936 break; |
|
937 c = s.charAt(i); |
|
938 if (c == ' ' || c == '\t') |
|
939 if (!(lastc == ' ' || lastc == '\t')) |
|
940 lastspace = i; |
|
941 lastc = c; |
|
942 } |
|
943 if (lastspace == -1) { |
|
944 // no space, use the whole thing |
|
945 sb.append(s); |
|
946 s = ""; |
|
947 used = 0; |
|
948 break; |
|
949 } |
|
950 sb.append(s.substring(0, lastspace)); |
|
951 sb.append("\r\n"); |
|
952 lastc = s.charAt(lastspace); |
|
953 sb.append(lastc); |
|
954 s = s.substring(lastspace + 1); |
|
955 used = 1; |
|
956 } |
|
957 sb.append(s); |
|
958 return sb.toString(); |
|
959 } |
|
960 |
|
961 /** |
|
962 * Unfold a folded header. Any line breaks that aren't escaped and |
|
963 * are followed by whitespace are removed. |
|
964 * |
|
965 * @param s the string to unfold |
|
966 * @return the unfolded string |
|
967 */ |
|
968 /*public*/ static String unfold(String s) { |
|
969 if (!foldText) |
|
970 return s; |
|
971 |
|
972 StringBuffer sb = null; |
|
973 int i; |
|
974 while ((i = indexOfAny(s, "\r\n")) >= 0) { |
|
975 int start = i; |
|
976 int l = s.length(); |
|
977 i++; // skip CR or NL |
|
978 if (i < l && s.charAt(i - 1) == '\r' && s.charAt(i) == '\n') |
|
979 i++; // skip LF |
|
980 if (start == 0 || s.charAt(start - 1) != '\\') { |
|
981 char c; |
|
982 // if next line starts with whitespace, skip all of it |
|
983 // XXX - always has to be true? |
|
984 if (i < l && ((c = s.charAt(i)) == ' ' || c == '\t')) { |
|
985 i++; // skip whitespace |
|
986 while (i < l && ((c = s.charAt(i)) == ' ' || c == '\t')) |
|
987 i++; |
|
988 if (sb == null) |
|
989 sb = new StringBuffer(s.length()); |
|
990 if (start != 0) { |
|
991 sb.append(s.substring(0, start)); |
|
992 sb.append(' '); |
|
993 } |
|
994 s = s.substring(i); |
|
995 continue; |
|
996 } |
|
997 // it's not a continuation line, just leave it in |
|
998 if (sb == null) |
|
999 sb = new StringBuffer(s.length()); |
|
1000 sb.append(s.substring(0, i)); |
|
1001 s = s.substring(i); |
|
1002 } else { |
|
1003 // there's a backslash at "start - 1" |
|
1004 // strip it out, but leave in the line break |
|
1005 if (sb == null) |
|
1006 sb = new StringBuffer(s.length()); |
|
1007 sb.append(s.substring(0, start - 1)); |
|
1008 sb.append(s.substring(start, i)); |
|
1009 s = s.substring(i); |
|
1010 } |
|
1011 } |
|
1012 if (sb != null) { |
|
1013 sb.append(s); |
|
1014 return sb.toString(); |
|
1015 } else |
|
1016 return s; |
|
1017 } |
|
1018 |
|
1019 /** |
|
1020 * Return the first index of any of the characters in "any" in "s", |
|
1021 * or -1 if none are found. |
|
1022 * |
|
1023 * This should be a method on String. |
|
1024 */ |
|
1025 private static int indexOfAny(String s, String any) { |
|
1026 return indexOfAny(s, any, 0); |
|
1027 } |
|
1028 |
|
1029 private static int indexOfAny(String s, String any, int start) { |
|
1030 try { |
|
1031 int len = s.length(); |
|
1032 for (int i = start; i < len; i++) { |
|
1033 if (any.indexOf(s.charAt(i)) >= 0) |
|
1034 return i; |
|
1035 } |
|
1036 return -1; |
|
1037 } catch (StringIndexOutOfBoundsException e) { |
|
1038 return -1; |
|
1039 } |
|
1040 } |
|
1041 |
|
1042 /** |
|
1043 * Convert a MIME charset name into a valid Java charset name. <p> |
|
1044 * |
|
1045 * @param charset the MIME charset name |
|
1046 * @return the Java charset equivalent. If a suitable mapping is |
|
1047 * not available, the passed in charset is itself returned. |
|
1048 */ |
|
1049 public static String javaCharset(String charset) { |
|
1050 if (mime2java == null || charset == null) |
|
1051 // no mapping table, or charset parameter is null |
|
1052 return charset; |
|
1053 |
|
1054 String alias = (String)mime2java.get(charset.toLowerCase()); |
|
1055 return alias == null ? charset : alias; |
|
1056 } |
|
1057 |
|
1058 /** |
|
1059 * Convert a java charset into its MIME charset name. <p> |
|
1060 * |
|
1061 * Note that a future version of JDK (post 1.2) might provide |
|
1062 * this functionality, in which case, we may deprecate this |
|
1063 * method then. |
|
1064 * |
|
1065 * @param charset the JDK charset |
|
1066 * @return the MIME/IANA equivalent. If a mapping |
|
1067 * is not possible, the passed in charset itself |
|
1068 * is returned. |
|
1069 * @since JavaMail 1.1 |
|
1070 */ |
|
1071 public static String mimeCharset(String charset) { |
|
1072 if (java2mime == null || charset == null) |
|
1073 // no mapping table or charset param is null |
|
1074 return charset; |
|
1075 |
|
1076 String alias = (String)java2mime.get(charset.toLowerCase()); |
|
1077 return alias == null ? charset : alias; |
|
1078 } |
|
1079 |
|
1080 private static String defaultJavaCharset; |
|
1081 private static String defaultMIMECharset; |
|
1082 |
|
1083 /** |
|
1084 * Get the default charset corresponding to the system's current |
|
1085 * default locale. If the System property <code>mail.mime.charset</code> |
|
1086 * is set, a system charset corresponding to this MIME charset will be |
|
1087 * returned. <p> |
|
1088 * |
|
1089 * @return the default charset of the system's default locale, |
|
1090 * as a Java charset. (NOT a MIME charset) |
|
1091 * @since JavaMail 1.1 |
|
1092 */ |
|
1093 public static String getDefaultJavaCharset() { |
|
1094 if (defaultJavaCharset == null) { |
|
1095 /* |
|
1096 * If mail.mime.charset is set, it controls the default |
|
1097 * Java charset as well. |
|
1098 */ |
|
1099 String mimecs = null; |
|
1100 |
|
1101 mimecs = SAAJUtil.getSystemProperty("mail.mime.charset"); |
|
1102 |
|
1103 if (mimecs != null && mimecs.length() > 0) { |
|
1104 defaultJavaCharset = javaCharset(mimecs); |
|
1105 return defaultJavaCharset; |
|
1106 } |
|
1107 |
|
1108 try { |
|
1109 defaultJavaCharset = System.getProperty("file.encoding", |
|
1110 "8859_1"); |
|
1111 } catch (SecurityException sex) { |
|
1112 |
|
1113 class NullInputStream extends InputStream { |
|
1114 public int read() { |
|
1115 return 0; |
|
1116 } |
|
1117 } |
|
1118 InputStreamReader reader = |
|
1119 new InputStreamReader(new NullInputStream()); |
|
1120 defaultJavaCharset = reader.getEncoding(); |
|
1121 if (defaultJavaCharset == null) |
|
1122 defaultJavaCharset = "8859_1"; |
|
1123 } |
|
1124 } |
|
1125 |
|
1126 return defaultJavaCharset; |
|
1127 } |
|
1128 |
|
1129 /* |
|
1130 * Get the default MIME charset for this locale. |
|
1131 */ |
|
1132 static String getDefaultMIMECharset() { |
|
1133 if (defaultMIMECharset == null) { |
|
1134 defaultMIMECharset = SAAJUtil.getSystemProperty("mail.mime.charset"); |
|
1135 } |
|
1136 if (defaultMIMECharset == null) |
|
1137 defaultMIMECharset = mimeCharset(getDefaultJavaCharset()); |
|
1138 return defaultMIMECharset; |
|
1139 } |
|
1140 |
|
1141 // Tables to map MIME charset names to Java names and vice versa. |
|
1142 // XXX - Should eventually use J2SE 1.4 java.nio.charset.Charset |
|
1143 private static Hashtable mime2java; |
|
1144 private static Hashtable java2mime; |
|
1145 |
|
1146 static { |
|
1147 java2mime = new Hashtable(40); |
|
1148 mime2java = new Hashtable(10); |
|
1149 |
|
1150 try { |
|
1151 // Use this class's classloader to load the mapping file |
|
1152 // XXX - we should use SecuritySupport, but it's in another package |
|
1153 InputStream is = |
|
1154 com.sun.xml.internal.messaging.saaj.packaging.mime.internet.MimeUtility.class.getResourceAsStream( |
|
1155 "/META-INF/javamail.charset.map"); |
|
1156 |
|
1157 if (is != null) { |
|
1158 is = new LineInputStream(is); |
|
1159 |
|
1160 // Load the JDK-to-MIME charset mapping table |
|
1161 loadMappings((LineInputStream)is, java2mime); |
|
1162 |
|
1163 // Load the MIME-to-JDK charset mapping table |
|
1164 loadMappings((LineInputStream)is, mime2java); |
|
1165 } |
|
1166 } catch (Exception ex) { } |
|
1167 |
|
1168 // If we didn't load the tables, e.g., because we didn't have |
|
1169 // permission, load them manually. The entries here should be |
|
1170 // the same as the default javamail.charset.map. |
|
1171 if (java2mime.isEmpty()) { |
|
1172 java2mime.put("8859_1", "ISO-8859-1"); |
|
1173 java2mime.put("iso8859_1", "ISO-8859-1"); |
|
1174 java2mime.put("ISO8859-1", "ISO-8859-1"); |
|
1175 |
|
1176 java2mime.put("8859_2", "ISO-8859-2"); |
|
1177 java2mime.put("iso8859_2", "ISO-8859-2"); |
|
1178 java2mime.put("ISO8859-2", "ISO-8859-2"); |
|
1179 |
|
1180 java2mime.put("8859_3", "ISO-8859-3"); |
|
1181 java2mime.put("iso8859_3", "ISO-8859-3"); |
|
1182 java2mime.put("ISO8859-3", "ISO-8859-3"); |
|
1183 |
|
1184 java2mime.put("8859_4", "ISO-8859-4"); |
|
1185 java2mime.put("iso8859_4", "ISO-8859-4"); |
|
1186 java2mime.put("ISO8859-4", "ISO-8859-4"); |
|
1187 |
|
1188 java2mime.put("8859_5", "ISO-8859-5"); |
|
1189 java2mime.put("iso8859_5", "ISO-8859-5"); |
|
1190 java2mime.put("ISO8859-5", "ISO-8859-5"); |
|
1191 |
|
1192 java2mime.put("8859_6", "ISO-8859-6"); |
|
1193 java2mime.put("iso8859_6", "ISO-8859-6"); |
|
1194 java2mime.put("ISO8859-6", "ISO-8859-6"); |
|
1195 |
|
1196 java2mime.put("8859_7", "ISO-8859-7"); |
|
1197 java2mime.put("iso8859_7", "ISO-8859-7"); |
|
1198 java2mime.put("ISO8859-7", "ISO-8859-7"); |
|
1199 |
|
1200 java2mime.put("8859_8", "ISO-8859-8"); |
|
1201 java2mime.put("iso8859_8", "ISO-8859-8"); |
|
1202 java2mime.put("ISO8859-8", "ISO-8859-8"); |
|
1203 |
|
1204 java2mime.put("8859_9", "ISO-8859-9"); |
|
1205 java2mime.put("iso8859_9", "ISO-8859-9"); |
|
1206 java2mime.put("ISO8859-9", "ISO-8859-9"); |
|
1207 |
|
1208 java2mime.put("SJIS", "Shift_JIS"); |
|
1209 java2mime.put("MS932", "Shift_JIS"); |
|
1210 java2mime.put("JIS", "ISO-2022-JP"); |
|
1211 java2mime.put("ISO2022JP", "ISO-2022-JP"); |
|
1212 java2mime.put("EUC_JP", "euc-jp"); |
|
1213 java2mime.put("KOI8_R", "koi8-r"); |
|
1214 java2mime.put("EUC_CN", "euc-cn"); |
|
1215 java2mime.put("EUC_TW", "euc-tw"); |
|
1216 java2mime.put("EUC_KR", "euc-kr"); |
|
1217 } |
|
1218 if (mime2java.isEmpty()) { |
|
1219 mime2java.put("iso-2022-cn", "ISO2022CN"); |
|
1220 mime2java.put("iso-2022-kr", "ISO2022KR"); |
|
1221 mime2java.put("utf-8", "UTF8"); |
|
1222 mime2java.put("utf8", "UTF8"); |
|
1223 mime2java.put("ja_jp.iso2022-7", "ISO2022JP"); |
|
1224 mime2java.put("ja_jp.eucjp", "EUCJIS"); |
|
1225 mime2java.put("euc-kr", "KSC5601"); |
|
1226 mime2java.put("euckr", "KSC5601"); |
|
1227 mime2java.put("us-ascii", "ISO-8859-1"); |
|
1228 mime2java.put("x-us-ascii", "ISO-8859-1"); |
|
1229 } |
|
1230 } |
|
1231 |
|
1232 private static void loadMappings(LineInputStream is, Hashtable table) { |
|
1233 String currLine; |
|
1234 |
|
1235 while (true) { |
|
1236 try { |
|
1237 currLine = is.readLine(); |
|
1238 } catch (IOException ioex) { |
|
1239 break; // error in reading, stop |
|
1240 } |
|
1241 |
|
1242 if (currLine == null) // end of file, stop |
|
1243 break; |
|
1244 if (currLine.startsWith("--") && currLine.endsWith("--")) |
|
1245 // end of this table |
|
1246 break; |
|
1247 |
|
1248 // ignore empty lines and comments |
|
1249 if (currLine.trim().length() == 0 || currLine.startsWith("#")) |
|
1250 continue; |
|
1251 |
|
1252 // A valid entry is of the form <key><separator><value> |
|
1253 // where, <separator> := SPACE | HT. Parse this |
|
1254 StringTokenizer tk = new StringTokenizer(currLine, " \t"); |
|
1255 try { |
|
1256 String key = tk.nextToken(); |
|
1257 String value = tk.nextToken(); |
|
1258 table.put(key.toLowerCase(), value); |
|
1259 } catch (NoSuchElementException nex) { } |
|
1260 } |
|
1261 } |
|
1262 |
|
1263 static final int ALL_ASCII = 1; |
|
1264 static final int MOSTLY_ASCII = 2; |
|
1265 static final int MOSTLY_NONASCII = 3; |
|
1266 |
|
1267 /** |
|
1268 * Check if the given string contains non US-ASCII characters. |
|
1269 * @param s string |
|
1270 * @return ALL_ASCII if all characters in the string |
|
1271 * belong to the US-ASCII charset. MOSTLY_ASCII |
|
1272 * if more than half of the available characters |
|
1273 * are US-ASCII characters. Else MOSTLY_NONASCII. |
|
1274 */ |
|
1275 static int checkAscii(String s) { |
|
1276 int ascii = 0, non_ascii = 0; |
|
1277 int l = s.length(); |
|
1278 |
|
1279 for (int i = 0; i < l; i++) { |
|
1280 if (nonascii((int)s.charAt(i))) // non-ascii |
|
1281 non_ascii++; |
|
1282 else |
|
1283 ascii++; |
|
1284 } |
|
1285 |
|
1286 if (non_ascii == 0) |
|
1287 return ALL_ASCII; |
|
1288 if (ascii > non_ascii) |
|
1289 return MOSTLY_ASCII; |
|
1290 |
|
1291 return MOSTLY_NONASCII; |
|
1292 } |
|
1293 |
|
1294 /** |
|
1295 * Check if the given byte array contains non US-ASCII characters. |
|
1296 * @param b byte array |
|
1297 * @return ALL_ASCII if all characters in the string |
|
1298 * belong to the US-ASCII charset. MOSTLY_ASCII |
|
1299 * if more than half of the available characters |
|
1300 * are US-ASCII characters. Else MOSTLY_NONASCII. |
|
1301 * |
|
1302 * XXX - this method is no longer used |
|
1303 */ |
|
1304 static int checkAscii(byte[] b) { |
|
1305 int ascii = 0, non_ascii = 0; |
|
1306 |
|
1307 for (int i=0; i < b.length; i++) { |
|
1308 // The '&' operator automatically causes b[i] to be promoted |
|
1309 // to an int, and we mask out the higher bytes in the int |
|
1310 // so that the resulting value is not a negative integer. |
|
1311 if (nonascii(b[i] & 0xff)) // non-ascii |
|
1312 non_ascii++; |
|
1313 else |
|
1314 ascii++; |
|
1315 } |
|
1316 |
|
1317 if (non_ascii == 0) |
|
1318 return ALL_ASCII; |
|
1319 if (ascii > non_ascii) |
|
1320 return MOSTLY_ASCII; |
|
1321 |
|
1322 return MOSTLY_NONASCII; |
|
1323 } |
|
1324 |
|
1325 /** |
|
1326 * Check if the given input stream contains non US-ASCII characters. |
|
1327 * Upto <code>max</code> bytes are checked. If <code>max</code> is |
|
1328 * set to <code>ALL</code>, then all the bytes available in this |
|
1329 * input stream are checked. If <code>breakOnNonAscii</code> is true |
|
1330 * the check terminates when the first non-US-ASCII character is |
|
1331 * found and MOSTLY_NONASCII is returned. Else, the check continues |
|
1332 * till <code>max</code> bytes or till the end of stream. |
|
1333 * |
|
1334 * @param is the input stream |
|
1335 * @param max maximum bytes to check for. The special value |
|
1336 * ALL indicates that all the bytes in this input |
|
1337 * stream must be checked. |
|
1338 * @param breakOnNonAscii if <code>true</code>, then terminate the |
|
1339 * the check when the first non-US-ASCII character |
|
1340 * is found. |
|
1341 * @return ALL_ASCII if all characters in the string |
|
1342 * belong to the US-ASCII charset. MOSTLY_ASCII |
|
1343 * if more than half of the available characters |
|
1344 * are US-ASCII characters. Else MOSTLY_NONASCII. |
|
1345 */ |
|
1346 static int checkAscii(InputStream is, int max, boolean breakOnNonAscii) { |
|
1347 int ascii = 0, non_ascii = 0; |
|
1348 int len; |
|
1349 int block = 4096; |
|
1350 int linelen = 0; |
|
1351 boolean longLine = false, badEOL = false; |
|
1352 boolean checkEOL = encodeEolStrict && breakOnNonAscii; |
|
1353 byte buf[] = null; |
|
1354 if (max != 0) { |
|
1355 block = (max == ALL) ? 4096 : Math.min(max, 4096); |
|
1356 buf = new byte[block]; |
|
1357 } |
|
1358 while (max != 0) { |
|
1359 try { |
|
1360 if ((len = is.read(buf, 0, block)) == -1) |
|
1361 break; |
|
1362 int lastb = 0; |
|
1363 for (int i = 0; i < len; i++) { |
|
1364 // The '&' operator automatically causes b[i] to |
|
1365 // be promoted to an int, and we mask out the higher |
|
1366 // bytes in the int so that the resulting value is |
|
1367 // not a negative integer. |
|
1368 int b = buf[i] & 0xff; |
|
1369 if (checkEOL && |
|
1370 ((lastb == '\r' && b != '\n') || |
|
1371 (lastb != '\r' && b == '\n'))) |
|
1372 badEOL = true; |
|
1373 if (b == '\r' || b == '\n') |
|
1374 linelen = 0; |
|
1375 else { |
|
1376 linelen++; |
|
1377 if (linelen > 998) // 1000 - CRLF |
|
1378 longLine = true; |
|
1379 } |
|
1380 if (nonascii(b)) { // non-ascii |
|
1381 if (breakOnNonAscii) // we are done |
|
1382 return MOSTLY_NONASCII; |
|
1383 else |
|
1384 non_ascii++; |
|
1385 } else |
|
1386 ascii++; |
|
1387 lastb = b; |
|
1388 } |
|
1389 } catch (IOException ioex) { |
|
1390 break; |
|
1391 } |
|
1392 if (max != ALL) |
|
1393 max -= len; |
|
1394 } |
|
1395 |
|
1396 if (max == 0 && breakOnNonAscii) |
|
1397 // We have been told to break on the first non-ascii character. |
|
1398 // We haven't got any non-ascii character yet, but then we |
|
1399 // have not checked all of the available bytes either. So we |
|
1400 // cannot say for sure that this input stream is ALL_ASCII, |
|
1401 // and hence we must play safe and return MOSTLY_NONASCII |
|
1402 |
|
1403 return MOSTLY_NONASCII; |
|
1404 |
|
1405 if (non_ascii == 0) { // no non-us-ascii characters so far |
|
1406 // If we're looking at non-text data, and we saw CR without LF |
|
1407 // or vice versa, consider this mostly non-ASCII so that it |
|
1408 // will be base64 encoded (since the quoted-printable encoder |
|
1409 // doesn't encode this case properly). |
|
1410 if (badEOL) |
|
1411 return MOSTLY_NONASCII; |
|
1412 // if we've seen a long line, we degrade to mostly ascii |
|
1413 else if (longLine) |
|
1414 return MOSTLY_ASCII; |
|
1415 else |
|
1416 return ALL_ASCII; |
|
1417 } |
|
1418 if (ascii > non_ascii) // mostly ascii |
|
1419 return MOSTLY_ASCII; |
|
1420 return MOSTLY_NONASCII; |
|
1421 } |
|
1422 |
|
1423 static final boolean nonascii(int b) { |
|
1424 return b >= 0177 || (b < 040 && b != '\r' && b != '\n' && b != '\t'); |
|
1425 } |
|
1426 } |
|
1427 |
|
1428 /** |
|
1429 * An OutputStream that determines whether the data written to |
|
1430 * it is all ASCII, mostly ASCII, or mostly non-ASCII. |
|
1431 */ |
|
1432 class AsciiOutputStream extends OutputStream { |
|
1433 private boolean breakOnNonAscii; |
|
1434 private int ascii = 0, non_ascii = 0; |
|
1435 private int linelen = 0; |
|
1436 private boolean longLine = false; |
|
1437 private boolean badEOL = false; |
|
1438 private boolean checkEOL = false; |
|
1439 private int lastb = 0; |
|
1440 private int ret = 0; |
|
1441 |
|
1442 public AsciiOutputStream(boolean breakOnNonAscii, boolean encodeEolStrict) { |
|
1443 this.breakOnNonAscii = breakOnNonAscii; |
|
1444 checkEOL = encodeEolStrict && breakOnNonAscii; |
|
1445 } |
|
1446 |
|
1447 public void write(int b) throws IOException { |
|
1448 check(b); |
|
1449 } |
|
1450 |
|
1451 public void write(byte b[]) throws IOException { |
|
1452 write(b, 0, b.length); |
|
1453 } |
|
1454 |
|
1455 public void write(byte b[], int off, int len) throws IOException { |
|
1456 len += off; |
|
1457 for (int i = off; i < len ; i++) |
|
1458 check(b[i]); |
|
1459 } |
|
1460 |
|
1461 private final void check(int b) throws IOException { |
|
1462 b &= 0xff; |
|
1463 if (checkEOL && |
|
1464 ((lastb == '\r' && b != '\n') || (lastb != '\r' && b == '\n'))) |
|
1465 badEOL = true; |
|
1466 if (b == '\r' || b == '\n') |
|
1467 linelen = 0; |
|
1468 else { |
|
1469 linelen++; |
|
1470 if (linelen > 998) // 1000 - CRLF |
|
1471 longLine = true; |
|
1472 } |
|
1473 if (MimeUtility.nonascii(b)) { // non-ascii |
|
1474 non_ascii++; |
|
1475 if (breakOnNonAscii) { // we are done |
|
1476 ret = MimeUtility.MOSTLY_NONASCII; |
|
1477 throw new EOFException(); |
|
1478 } |
|
1479 } else |
|
1480 ascii++; |
|
1481 lastb = b; |
|
1482 } |
|
1483 |
|
1484 /** |
|
1485 * Return ASCII-ness of data stream. |
|
1486 */ |
|
1487 public int getAscii() { |
|
1488 if (ret != 0) |
|
1489 return ret; |
|
1490 // If we're looking at non-text data, and we saw CR without LF |
|
1491 // or vice versa, consider this mostly non-ASCII so that it |
|
1492 // will be base64 encoded (since the quoted-printable encoder |
|
1493 // doesn't encode this case properly). |
|
1494 if (badEOL) |
|
1495 return MimeUtility.MOSTLY_NONASCII; |
|
1496 else if (non_ascii == 0) { // no non-us-ascii characters so far |
|
1497 // if we've seen a long line, we degrade to mostly ascii |
|
1498 if (longLine) |
|
1499 return MimeUtility.MOSTLY_ASCII; |
|
1500 else |
|
1501 return MimeUtility.ALL_ASCII; |
|
1502 } |
|
1503 if (ascii > non_ascii) // mostly ascii |
|
1504 return MimeUtility.MOSTLY_ASCII; |
|
1505 return MimeUtility.MOSTLY_NONASCII; |
|
1506 } |
|
1507 } |
|