|
1 /* |
|
2 * Copyright (c) 2006, 2018, Oracle and/or its affiliates. All rights reserved. |
|
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
|
4 * |
|
5 * This code is free software; you can redistribute it and/or modify it |
|
6 * under the terms of the GNU General Public License version 2 only, as |
|
7 * published by the Free Software Foundation. |
|
8 * |
|
9 * This code is distributed in the hope that it will be useful, but WITHOUT |
|
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
|
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
|
12 * version 2 for more details (a copy is included in the LICENSE file that |
|
13 * accompanied this code). |
|
14 * |
|
15 * You should have received a copy of the GNU General Public License version |
|
16 * 2 along with this work; if not, write to the Free Software Foundation, |
|
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
|
18 * |
|
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
|
20 * or visit www.oracle.com if you need additional information or have any |
|
21 * questions. |
|
22 */ |
|
23 |
|
24 package jnlp.converter.parser.xml; |
|
25 |
|
26 import java.io.ByteArrayInputStream; |
|
27 import java.io.EOFException; |
|
28 import java.io.InputStreamReader; |
|
29 import java.io.IOException; |
|
30 import java.io.Reader; |
|
31 import java.io.UnsupportedEncodingException; |
|
32 |
|
/**
 * Decodes the raw bytes of an XML document into a {@code String} by
 * sniffing the character encoding from the leading octets and, where
 * applicable, from the encoding pseudo-attribute of the XML declaration.
 */
public class XMLEncoding {
    /**
     * Decodes a byte stream into a String by testing for a Byte Order Mark
     * (BOM) or an XML declaration.
     * <p>
     * Detection begins by examining the first four octets of the stream for a
     * BOM.  If a BOM is not found, then an encoding declaration is looked for
     * at the beginning of the stream.  If the encoding still can not be
     * determined at this point, then UTF-8 is assumed.
     *
     * @param data  an array of bytes containing an encoded XML document.
     *
     * @return A string containing the decoded XML document.
     *
     * @throws EOFException if {@code data} holds fewer than four bytes.
     * @throws UnsupportedEncodingException if the leading octets match one
     *         of the unsupported "unusual octet order" patterns, or if the
     *         detected or declared encoding name is not supported.
     * @throws IOException if reading the XML declaration fails.
     */
    public static String decodeXML(byte [] data) throws IOException {
        if (data.length < BOM_LENGTH) {
            throw new EOFException("encoding.error.not.xml");
        }

        // pack the first four octets, most significant first, into one int
        int firstFour = ((data[0] & 0xff) << 24)
                      | ((data[1] & 0xff) << 16)
                      | ((data[2] & 0xff) << 8)
                      |  (data[3] & 0xff);

        int    start = 0;
        String encoding;

        // start by examining all four bytes; shorter BOMs are handled in the
        // default branch by masking off the trailing octets
        switch (firstFour) {
            case EBCDIC:
                // assume IBM037, but examine the encoding declaration
                encoding = examineEncodingDeclaration(data, IBM037_ENC);
                break;

            case XML_DECLARATION:
                // assume UTF-8, but examine the encoding declaration
                encoding = examineEncodingDeclaration(data, UTF_8_ENC);
                break;

            case UTF_16BE:
                encoding = UTF_16BE_ENC;
                break;

            case UTF_16LE:
                encoding = UTF_16LE_ENC;
                break;

            case UNUSUAL_OCTET_1:
            case UNUSUAL_OCTET_2:
                throw new UnsupportedEncodingException("encoding.error.unusual.octet");

            case UTF_32_BE_BOM:
            case UTF_32_LE_BOM:
                encoding = UTF_32_ENC;
                break;

            default:
                if ((firstFour & 0xffffff00) == UTF_8_BOM) {
                    // the BOM is not handed to the decoder for UTF-8
                    // streams, so skip those 3 bytes.
                    start = 3;
                    encoding = UTF_8_ENC;
                } else {
                    int firstTwo = firstFour & 0xffff0000;

                    if ((firstTwo == UTF_16_BE_BOM) || (firstTwo == UTF_16_LE_BOM)) {
                        // the BOM stays in the data passed to the decoder
                        encoding = UTF_16_ENC;
                    } else {
                        // this is probably UTF-8 without the encoding
                        // declaration
                        encoding = UTF_8_ENC;
                    }
                }
                break;
        }

        return new String(data, start, data.length - start, encoding);
    }

    /**
     * Scans the start of {@code data}, decoded with the supplied encoding
     * guess, for an XML declaration and returns the encoding name it
     * declares.  The guess itself (or UTF-8 when the guess is {@code null})
     * is returned whenever the data does not start with {@code <?xml}, the
     * declaration ends or the data runs out before an encoding
     * pseudo-attribute is found, or the pseudo-attribute is malformed.
     * <p>
     * Note that the quote closing the encoding name is not required to be
     * the same character as the quote that opened it.
     * <pre>
     * [3]  S ::= ( #x20 | #x09 | #x0d | #x0a )
     * [23] XMLDecl ::= '&lt;?xml' VersionInfo EncodingDecl? SDDecl? S? '?&gt;'
     * [24] VersionInfo ::= S 'version' Eq ( '"' VersionNum '"' |
     *                      "'" VersionNum "'" )
     * [25] Eq ::= S? '=' S?
     * [26] VersionNum ::= ([a-zA-Z0-9_.:] | '-')+
     * [80] EncodingDecl ::= S 'encoding' Eq ( '"' EncName '"' |
     *                       "'" EncName "'" )
     * [81] EncName ::= [a-zA-Z] ([a-zA-Z0-9_.] | '-')*
     * </pre>
     *
     * @param data      the encoded XML document.
     * @param encoding  the encoding used to read the declaration; also the
     *                  fallback result.
     *
     * @return the declared encoding name, or the fallback encoding.
     *
     * @throws IOException if the fallback encoding is not supported or the
     *         data can not be read.
     */
    private static String examineEncodingDeclaration(byte [] data,
            String encoding) throws IOException {
        String fallback = (encoding != null) ? encoding : UTF_8_ENC;

        try (Reader reader = new InputStreamReader(
                new ByteArrayInputStream(data), fallback)) {
            int ch = reader.read();

            // if this is an XML declaration, it will start with the text
            // '<?xml'; anything else means the caller's guess stands
            for (int i = 0; i < XML_DECL_START.length(); i++) {
                if (ch != XML_DECL_START.charAt(i)) {
                    return fallback;
                }
                ch = reader.read();
            }

            // skip the whitespace that follows '<?xml'
            while (isWhitespace(ch)) {
                ch = reader.read();
            }

            // now look for the text 'encoding' followed by Eq and an opening
            // quote, but if the end of the XML declaration (signified by a
            // '?' and a later '>') comes first, keep the initial guess
            boolean recognized  = false;   // the text 'encoding' was matched
            boolean sawEquals   = false;   // '=' seen after 'encoding'
            boolean sawQuestion = false;   // a '?' was seen while searching
            int     pos         = 0;       // match position in ENCODING_DECL

            while (true) {
                if (ch == -1) {
                    // unexpected EOF
                    return fallback;
                }

                if (recognized) {
                    if (isWhitespace(ch)) {
                        // whitespace around '=' is allowed; nothing to do
                    } else if (ch == EQUAL) {
                        if (sawEquals) {
                            // a second '=' is not valid XML, so punt
                            return fallback;
                        }
                        sawEquals = true;
                    } else if ((ch == DOUBLE_QUOTE) || (ch == SINGLE_QUOTE)) {
                        if (!sawEquals) {
                            // a quote before the equal is not valid XML
                            return fallback;
                        }
                        // step onto the first character of the value
                        ch = reader.read();
                        break;
                    } else {
                        if (sawEquals) {
                            // '=' must be followed by a quote, so punt
                            return fallback;
                        }
                        // this wasn't the encoding declaration after all;
                        // resume the search WITHOUT reading another
                        // character so the current one is re-examined
                        recognized = false;
                        pos = 0;
                        continue;
                    }
                } else if (ch == ENCODING_DECL.charAt(pos++)) {
                    if (pos == ENCODING_DECL.length()) {
                        // this looks like the encoding declaration
                        recognized = true;
                    }
                } else if (ch == '?') {
                    sawQuestion = true;
                    pos = 0;
                } else if ((ch == '>') && sawQuestion) {
                    // there is no encoding declaration, so assume that the
                    // initial encoding guess was correct
                    return fallback;
                } else {
                    // still searching for the encoding declaration
                    pos = 0;
                }

                ch = reader.read();
            }

            // an encoding name must begin with an ASCII letter
            if (!isAsciiLetter(ch)) {
                return fallback;
            }

            StringBuilder name = new StringBuilder(MAX_ENC_NAME);
            name.append((char) ch);

            for (ch = reader.read(); isEncNameChar(ch); ch = reader.read()) {
                name.append((char) ch);
            }

            // a quote terminates the name; anything else (including EOF)
            // means this is not a valid encoding name, so punt
            if ((ch == DOUBLE_QUOTE) || (ch == SINGLE_QUOTE)) {
                return name.toString();
            }
            return fallback;
        }
    }

    /** Tests for the whitespace characters of production [3]. */
    private static boolean isWhitespace(int ch) {
        return (ch == SPACE) || (ch == TAB) || (ch == LINEFEED) || (ch == RETURN);
    }

    /** Tests for {@code [a-zA-Z]}, the first character of an EncName. */
    private static boolean isAsciiLetter(int ch) {
        return ((ch >= 'a') && (ch <= 'z')) || ((ch >= 'A') && (ch <= 'Z'));
    }

    /** Tests for {@code [a-zA-Z0-9_.] | '-'}, the rest of an EncName. */
    private static boolean isEncNameChar(int ch) {
        return isAsciiLetter(ch)
                || ((ch >= '0') && (ch <= '9'))
                || (ch == '_') || (ch == '.') || (ch == '-');
    }

    // minimum number of bytes needed for detection
    private static final int BOM_LENGTH = 4;
    // initial capacity of the buffer that collects the encoding name
    private static final int MAX_ENC_NAME = 512;

    private static final int SPACE = 0x00000020;
    private static final int TAB = 0x00000009;
    private static final int LINEFEED = 0x0000000a;
    private static final int RETURN = 0x0000000d;
    private static final int EQUAL = '=';
    private static final int DOUBLE_QUOTE = '\"';
    private static final int SINGLE_QUOTE = '\'';

    // Patterns compared against the first four octets of the document.
    // The three-octet and two-octet BOM patterns are compared after the
    // trailing octet(s) have been masked to zero.
    private static final int UTF_32_BE_BOM = 0x0000feff;
    private static final int UTF_32_LE_BOM = 0xfffe0000;
    private static final int UTF_16_BE_BOM = 0xfeff0000;
    private static final int UTF_16_LE_BOM = 0xfffe0000;
    private static final int UTF_8_BOM = 0xefbbbf00;
    private static final int UNUSUAL_OCTET_1 = 0x00003c00;
    private static final int UNUSUAL_OCTET_2 = 0x003c0000;
    // '<' '?' as two 16-bit code units, big- and little-endian, no BOM
    private static final int UTF_16BE = 0x003c003f;
    private static final int UTF_16LE = 0x3c003f00;
    // NOTE(review): presumably '<?xm' in EBCDIC (see XML 1.0 Appendix F)
    private static final int EBCDIC = 0x4c6fa794;
    // '<' '?' 'x' 'm' as single ASCII-compatible octets
    private static final int XML_DECLARATION = 0x3c3f786d;

    private static final String UTF_32_ENC = "UTF-32";
    private static final String UTF_16_ENC = "UTF-16";
    private static final String UTF_16BE_ENC = "UTF-16BE";
    private static final String UTF_16LE_ENC = "UTF-16LE";
    private static final String UTF_8_ENC = "UTF-8";
    private static final String IBM037_ENC = "IBM037";

    private static final String XML_DECL_START = "<?xml";
    private static final String ENCODING_DECL = "encoding";
}