|
1 /* |
|
2 * reserved comment block |
|
3 * DO NOT REMOVE OR ALTER! |
|
4 */ |
|
5 /* |
|
6 * Copyright 1999-2004 The Apache Software Foundation. |
|
7 * |
|
8 * Licensed under the Apache License, Version 2.0 (the "License"); |
|
9 * you may not use this file except in compliance with the License. |
|
10 * You may obtain a copy of the License at |
|
11 * |
|
12 * http://www.apache.org/licenses/LICENSE-2.0 |
|
13 * |
|
14 * Unless required by applicable law or agreed to in writing, software |
|
15 * distributed under the License is distributed on an "AS IS" BASIS, |
|
16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
17 * See the License for the specific language governing permissions and |
|
18 * limitations under the License. |
|
19 */ |
|
20 /* |
|
21 * $Id: Encodings.java,v 1.3 2005/09/28 13:49:04 pvedula Exp $ |
|
22 */ |
|
23 package com.sun.org.apache.xml.internal.serializer; |
|
24 |
|
25 import java.io.InputStream; |
|
26 import java.io.OutputStream; |
|
27 import java.io.OutputStreamWriter; |
|
28 import java.io.UnsupportedEncodingException; |
|
29 import java.io.Writer; |
|
30 import java.io.BufferedWriter; |
|
31 import java.net.URL; |
|
32 import java.util.Enumeration; |
|
33 import java.util.HashMap; |
|
34 import java.util.Properties; |
|
35 import java.util.StringTokenizer; |
|
36 |
|
37 |
|
38 /** |
|
39 * Provides information about encodings. Depends on the Java runtime |
|
40 * to provide writers for the different encodings, but can be used |
|
41 * to override encoding names and provide the last printable character |
|
42 * for each encoding. |
|
43 * |
|
44 * @version $Revision: 1.9 $ $Date: 2009/12/01 22:17:31 $ |
|
45 * @author <a href="mailto:arkin@intalio.com">Assaf Arkin</a> |
|
46 */ |
|
47 |
|
48 public final class Encodings extends Object |
|
49 { |
|
50 |
|
51 /** |
|
52 * The last printable character for unknown encodings. |
|
53 */ |
|
54 private static final int m_defaultLastPrintable = 0x7F; |
|
55 |
|
56 /** |
|
57 * Standard filename for properties file with encodings data. |
|
58 */ |
|
59 private static final String ENCODINGS_FILE = "com/sun/org/apache/xml/internal/serializer/Encodings.properties"; |
|
60 |
|
61 /** |
|
62 * Name of the system property that may hold the URL of an external encodings properties file. |
|
63 */ |
|
64 private static final String ENCODINGS_PROP = "com.sun.org.apache.xalan.internal.serialize.encodings"; |
|
65 |
|
66 |
|
67 /** |
|
68 * Returns a writer for the specified encoding based on |
|
69 * an output stream. |
|
70 * |
|
71 * @param output The output stream |
|
72 * @param encoding The encoding |
|
73 * @return A suitable writer |
|
74 * @throws UnsupportedEncodingException There is no convertor |
|
75 * to support this encoding |
|
76 */ |
|
77 static Writer getWriter(OutputStream output, String encoding) |
|
78 throws UnsupportedEncodingException |
|
79 { |
|
80 |
|
81 for (int i = 0; i < _encodings.length; ++i) |
|
82 { |
|
83 if (_encodings[i].name.equalsIgnoreCase(encoding)) |
|
84 { |
|
85 try |
|
86 { |
|
87 return new BufferedWriter(new OutputStreamWriter( |
|
88 output, |
|
89 _encodings[i].javaName)); |
|
90 } |
|
91 catch (java.lang.IllegalArgumentException iae) // java 1.1.8 |
|
92 { |
|
93 // keep trying |
|
94 } |
|
95 catch (UnsupportedEncodingException usee) |
|
96 { |
|
97 |
|
98 // keep trying |
|
99 } |
|
100 } |
|
101 } |
|
102 |
|
103 try |
|
104 { |
|
105 return new BufferedWriter(new OutputStreamWriter(output, encoding)); |
|
106 } |
|
107 catch (java.lang.IllegalArgumentException iae) // java 1.1.8 |
|
108 { |
|
109 throw new UnsupportedEncodingException(encoding); |
|
110 } |
|
111 } |
|
112 |
|
113 |
|
114 /** |
|
115 * Returns the last printable character for an unspecified |
|
116 * encoding. |
|
117 * |
|
118 * @return the default size |
|
119 */ |
|
120 public static int getLastPrintable() |
|
121 { |
|
122 return m_defaultLastPrintable; |
|
123 } |
|
124 |
|
125 |
|
126 |
|
127 /** |
|
128 * Returns the EncodingInfo object for the specified |
|
129 * encoding. |
|
130 * <p> |
|
131 * This is not a public API. |
|
132 * |
|
133 * @param encoding The encoding |
|
134 * @return The object that is used to determine if |
|
135 * characters are in the given encoding. |
|
136 * @xsl.usage internal |
|
137 */ |
|
138 static EncodingInfo getEncodingInfo(String encoding) |
|
139 { |
|
140 EncodingInfo ei; |
|
141 |
|
142 String normalizedEncoding = toUpperCaseFast(encoding); |
|
143 ei = (EncodingInfo) _encodingTableKeyJava.get(normalizedEncoding); |
|
144 if (ei == null) |
|
145 ei = (EncodingInfo) _encodingTableKeyMime.get(normalizedEncoding); |
|
146 if (ei == null) { |
|
147 // We shouldn't have to do this, but just in case. |
|
148 ei = new EncodingInfo(null,null); |
|
149 } |
|
150 |
|
151 return ei; |
|
152 } |
|
153 |
|
154 /** |
|
155 * A fast and cheap way to uppercase a String that is |
|
156 * only made of printable ASCII characters. |
|
157 * <p> |
|
158 * This is not a public API. |
|
159 * @param s a String of ASCII characters |
|
160 * @return an uppercased version of the input String, |
|
161 * possibly the same String. |
|
162 * @xsl.usage internal |
|
163 */ |
|
164 static private String toUpperCaseFast(final String s) { |
|
165 |
|
166 boolean different = false; |
|
167 final int mx = s.length(); |
|
168 char[] chars = new char[mx]; |
|
169 for (int i=0; i < mx; i++) { |
|
170 char ch = s.charAt(i); |
|
171 // is the character a lower case ASCII one? |
|
172 if ('a' <= ch && ch <= 'z') { |
|
173 // a cheap and fast way to uppercase that is good enough |
|
174 ch = (char) (ch + ('A' - 'a')); |
|
175 different = true; // the uppercased String is different |
|
176 } |
|
177 chars[i] = ch; |
|
178 } |
|
179 |
|
180 // A little optimization, don't call String.valueOf() if |
|
181 // the uppercased string is the same as the input string. |
|
182 final String upper; |
|
183 if (different) |
|
184 upper = String.valueOf(chars); |
|
185 else |
|
186 upper = s; |
|
187 |
|
188 return upper; |
|
189 } |
|
190 |
|
191 /** The default encoding, as an ISO-style (MIME) name. */ |
|
192 static final String DEFAULT_MIME_ENCODING = "UTF-8"; |
|
193 |
|
194 /** |
|
195 * Get the proper mime encoding. From the XSLT recommendation: "The encoding |
|
196 * attribute specifies the preferred encoding to use for outputting the result |
|
197 * tree. XSLT processors are required to respect values of UTF-8 and UTF-16. |
|
198 * For other values, if the XSLT processor does not support the specified |
|
199 * encoding it may signal an error; if it does not signal an error it should |
|
200 * use UTF-8 or UTF-16 instead. The XSLT processor must not use an encoding |
|
201 * whose name does not match the EncName production of the XML Recommendation |
|
202 * [XML]. If no encoding attribute is specified, then the XSLT processor should |
|
203 * use either UTF-8 or UTF-16." |
|
204 * |
|
205 * @param encoding Reference to java-style encoding string, which may be null, |
|
206 * in which case a default will be found. |
|
207 * |
|
208 * @return The ISO-style encoding string, or null if failure. |
|
209 */ |
|
210 static String getMimeEncoding(String encoding) |
|
211 { |
|
212 |
|
213 if (null == encoding) |
|
214 { |
|
215 try |
|
216 { |
|
217 |
|
218 // Get the default system character encoding. This may be |
|
219 // incorrect if they passed in a writer, but right now there |
|
220 // seems to be no way to get the encoding from a writer. |
|
221 encoding = System.getProperty("file.encoding", "UTF8"); |
|
222 |
|
223 if (null != encoding) |
|
224 { |
|
225 |
|
226 /* |
|
227 * See if the mime type is equal to UTF8. If you don't |
|
228 * do that, then convertJava2MimeEncoding will convert |
|
229 * 8859_1 to "ISO-8859-1", which is not what we want, |
|
230 * I think, and I don't think I want to alter the tables |
|
231 * to convert everything to UTF-8. |
|
232 */ |
|
233 String jencoding = |
|
234 (encoding.equalsIgnoreCase("Cp1252") |
|
235 || encoding.equalsIgnoreCase("ISO8859_1") |
|
236 || encoding.equalsIgnoreCase("8859_1") |
|
237 || encoding.equalsIgnoreCase("UTF8")) |
|
238 ? DEFAULT_MIME_ENCODING |
|
239 : convertJava2MimeEncoding(encoding); |
|
240 |
|
241 encoding = |
|
242 (null != jencoding) ? jencoding : DEFAULT_MIME_ENCODING; |
|
243 } |
|
244 else |
|
245 { |
|
246 encoding = DEFAULT_MIME_ENCODING; |
|
247 } |
|
248 } |
|
249 catch (SecurityException se) |
|
250 { |
|
251 encoding = DEFAULT_MIME_ENCODING; |
|
252 } |
|
253 } |
|
254 else |
|
255 { |
|
256 encoding = convertJava2MimeEncoding(encoding); |
|
257 } |
|
258 |
|
259 return encoding; |
|
260 } |
|
261 |
|
262 /** |
|
263 * Try the best we can to convert a Java encoding to a XML-style encoding. |
|
264 * |
|
265 * @param encoding non-null reference to encoding string, java style. |
|
266 * |
|
267 * @return ISO-style encoding string. |
|
268 */ |
|
269 private static String convertJava2MimeEncoding(String encoding) |
|
270 { |
|
271 EncodingInfo enc = |
|
272 (EncodingInfo) _encodingTableKeyJava.get(encoding.toUpperCase()); |
|
273 if (null != enc) |
|
274 return enc.name; |
|
275 return encoding; |
|
276 } |
|
277 |
|
278 /** |
|
279 * Try the best we can to convert a Java encoding to a XML-style encoding. |
|
280 * |
|
281 * @param encoding non-null reference to encoding string, java style. |
|
282 * |
|
283 * @return ISO-style encoding string. |
|
284 */ |
|
285 public static String convertMime2JavaEncoding(String encoding) |
|
286 { |
|
287 |
|
288 for (int i = 0; i < _encodings.length; ++i) |
|
289 { |
|
290 if (_encodings[i].name.equalsIgnoreCase(encoding)) |
|
291 { |
|
292 return _encodings[i].javaName; |
|
293 } |
|
294 } |
|
295 |
|
296 return encoding; |
|
297 } |
|
298 |
|
299 /** |
|
300 * Load a list of all the supported encodings. |
|
301 * |
|
302 * System property "encodings" formatted using URL syntax may define an |
|
303 * external encodings list. Thanks to Sergey Ushakov for the code |
|
304 * contribution! |
|
305 */ |
|
306 private static EncodingInfo[] loadEncodingInfo() |
|
307 { |
|
308 try |
|
309 { |
|
310 String urlString = null; |
|
311 InputStream is = null; |
|
312 |
|
313 try |
|
314 { |
|
315 urlString = System.getProperty(ENCODINGS_PROP, ""); |
|
316 } |
|
317 catch (SecurityException e) |
|
318 { |
|
319 } |
|
320 |
|
321 if (urlString != null && urlString.length() > 0) { |
|
322 URL url = new URL(urlString); |
|
323 is = url.openStream(); |
|
324 } |
|
325 |
|
326 if (is == null) { |
|
327 SecuritySupport ss = SecuritySupport.getInstance(); |
|
328 is = ss.getResourceAsStream(ObjectFactory.findClassLoader(), |
|
329 ENCODINGS_FILE); |
|
330 } |
|
331 |
|
332 Properties props = new Properties(); |
|
333 if (is != null) { |
|
334 props.load(is); |
|
335 is.close(); |
|
336 } else { |
|
337 // Seems to be no real need to force failure here, let the |
|
338 // system do its best... The issue is not really very critical, |
|
339 // and the output will be in any case _correct_ though maybe not |
|
340 // always human-friendly... :) |
|
341 // But maybe report/log the resource problem? |
|
342 // Any standard ways to report/log errors (in static context)? |
|
343 } |
|
344 |
|
345 int totalEntries = props.size(); |
|
346 int totalMimeNames = 0; |
|
347 Enumeration keys = props.keys(); |
|
348 for (int i = 0; i < totalEntries; ++i) |
|
349 { |
|
350 String javaName = (String) keys.nextElement(); |
|
351 String val = props.getProperty(javaName); |
|
352 totalMimeNames++; |
|
353 int pos = val.indexOf(' '); |
|
354 for (int j = 0; j < pos; ++j) |
|
355 if (val.charAt(j) == ',') |
|
356 totalMimeNames++; |
|
357 } |
|
358 EncodingInfo[] ret = new EncodingInfo[totalMimeNames]; |
|
359 int j = 0; |
|
360 keys = props.keys(); |
|
361 for (int i = 0; i < totalEntries; ++i) |
|
362 { |
|
363 String javaName = (String) keys.nextElement(); |
|
364 String val = props.getProperty(javaName); |
|
365 int pos = val.indexOf(' '); |
|
366 String mimeName; |
|
367 //int lastPrintable; |
|
368 if (pos < 0) |
|
369 { |
|
370 // Maybe report/log this problem? |
|
371 // "Last printable character not defined for encoding " + |
|
372 // mimeName + " (" + val + ")" ... |
|
373 mimeName = val; |
|
374 //lastPrintable = 0x00FF; |
|
375 } |
|
376 else |
|
377 { |
|
378 //lastPrintable = |
|
379 // Integer.decode(val.substring(pos).trim()).intValue(); |
|
380 StringTokenizer st = |
|
381 new StringTokenizer(val.substring(0, pos), ","); |
|
382 for (boolean first = true; |
|
383 st.hasMoreTokens(); |
|
384 first = false) |
|
385 { |
|
386 mimeName = st.nextToken(); |
|
387 ret[j] = |
|
388 new EncodingInfo(mimeName, javaName); |
|
389 _encodingTableKeyMime.put( |
|
390 mimeName.toUpperCase(), |
|
391 ret[j]); |
|
392 if (first) |
|
393 _encodingTableKeyJava.put( |
|
394 javaName.toUpperCase(), |
|
395 ret[j]); |
|
396 j++; |
|
397 } |
|
398 } |
|
399 } |
|
400 return ret; |
|
401 } |
|
402 catch (java.net.MalformedURLException mue) |
|
403 { |
|
404 throw new com.sun.org.apache.xml.internal.serializer.utils.WrappedRuntimeException(mue); |
|
405 } |
|
406 catch (java.io.IOException ioe) |
|
407 { |
|
408 throw new com.sun.org.apache.xml.internal.serializer.utils.WrappedRuntimeException(ioe); |
|
409 } |
|
410 } |
|
411 |
|
412 /** |
|
413 * Return true if the character is the high member of a surrogate pair. |
|
414 * <p> |
|
415 * This is not a public API. |
|
416 * @param ch the character to test |
|
417 * @xsl.usage internal |
|
418 */ |
|
419 static boolean isHighUTF16Surrogate(char ch) { |
|
420 return ('\uD800' <= ch && ch <= '\uDBFF'); |
|
421 } |
|
422 /** |
|
423 * Return true if the character is the low member of a surrogate pair. |
|
424 * <p> |
|
425 * This is not a public API. |
|
426 * @param ch the character to test |
|
427 * @xsl.usage internal |
|
428 */ |
|
429 static boolean isLowUTF16Surrogate(char ch) { |
|
430 return ('\uDC00' <= ch && ch <= '\uDFFF'); |
|
431 } |
|
432 /** |
|
433 * Return the unicode code point represented by the high/low surrogate pair. |
|
434 * <p> |
|
435 * This is not a public API. |
|
436 * @param highSurrogate the high char of the high/low pair |
|
437 * @param lowSurrogate the low char of the high/low pair |
|
438 * @xsl.usage internal |
|
439 */ |
|
440 static int toCodePoint(char highSurrogate, char lowSurrogate) { |
|
441 int codePoint = |
|
442 ((highSurrogate - 0xd800) << 10) |
|
443 + (lowSurrogate - 0xdc00) |
|
444 + 0x10000; |
|
445 return codePoint; |
|
446 } |
|
447 /** |
|
448 * Return the unicode code point represented by the char. |
|
449 * A bit of a dummy method, since all it does is return the char, |
|
450 * but as an int value. |
|
451 * <p> |
|
452 * This is not a public API. |
|
453 * @param ch the char. |
|
454 * @xsl.usage internal |
|
455 */ |
|
456 static int toCodePoint(char ch) { |
|
457 int codePoint = ch; |
|
458 return codePoint; |
|
459 } |
|
460 |
|
461 private static final HashMap _encodingTableKeyJava = new HashMap(); |
|
462 private static final HashMap _encodingTableKeyMime = new HashMap(); |
|
463 private static final EncodingInfo[] _encodings = loadEncodingInfo(); |
|
464 } |