/*
 * Copyright (c) 2005, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
|
|
25 |
package java.net;
|
|
26 |
|
|
27 |
import java.io.InputStream;
|
|
28 |
import java.io.IOException;
|
|
29 |
import java.security.AccessController;
|
|
30 |
import java.security.PrivilegedAction;
|
|
31 |
|
|
32 |
import sun.net.idn.StringPrep;
|
|
33 |
import sun.net.idn.Punycode;
|
|
34 |
import sun.text.normalizer.UCharacterIterator;
|
|
35 |
|
|
36 |
/**
|
|
37 |
* Provides methods to convert internationalized domain names (IDNs) between
|
|
38 |
* a normal Unicode representation and an ASCII Compatible Encoding (ACE) representation.
|
|
39 |
* Internationalized domain names can use characters from the entire range of
|
|
40 |
* Unicode, while traditional domain names are restricted to ASCII characters.
|
|
41 |
* ACE is an encoding of Unicode strings that uses only ASCII characters and
|
|
42 |
* can be used with software (such as the Domain Name System) that only
|
|
43 |
* understands traditional domain names.
|
|
44 |
*
|
|
45 |
* <p>Internationalized domain names are defined in <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
|
|
46 |
* RFC 3490 defines two operations: ToASCII and ToUnicode. These 2 operations employ
|
|
47 |
* <a href="http://www.ietf.org/rfc/rfc3491.txt">Nameprep</a> algorithm, which is a
|
|
48 |
* profile of <a href="http://www.ietf.org/rfc/rfc3454.txt">Stringprep</a>, and
|
|
49 |
* <a href="http://www.ietf.org/rfc/rfc3492.txt">Punycode</a> algorithm to convert
|
|
50 |
* domain name string back and forth.
|
|
51 |
*
|
|
52 |
* <p>The behavior of aforementioned conversion process can be adjusted by various flags:
|
|
53 |
* <ul>
|
|
54 |
* <li>If the ALLOW_UNASSIGNED flag is used, the domain name string to be converted
|
|
55 |
* can contain code points that are unassigned in Unicode 3.2, which is the
|
|
56 |
* Unicode version on which IDN conversion is based. If the flag is not used,
|
|
57 |
* the presence of such unassigned code points is treated as an error.
|
|
58 |
* <li>If the USE_STD3_ASCII_RULES flag is used, ASCII strings are checked against <a href="http://www.ietf.org/rfc/rfc1122.txt">RFC 1122</a> and <a href="http://www.ietf.org/rfc/rfc1123.txt">RFC 1123</a>.
|
|
59 |
* It is an error if they don't meet the requirements.
|
|
60 |
* </ul>
|
|
61 |
* These flags can be logically OR'ed together.
|
|
62 |
*
|
|
63 |
* <p>The security consideration is important with respect to internationalization
|
|
64 |
* domain name support. For example, English domain names may be <i>homographed</i>
|
|
65 |
* - maliciously misspelled by substitution of non-Latin letters.
|
|
66 |
* <a href="http://www.unicode.org/reports/tr36/">Unicode Technical Report #36</a>
|
|
67 |
* discusses security issues of IDN support as well as possible solutions.
|
|
68 |
* Applications are responsible for taking adequate security measures when using
|
|
69 |
* international domain names.
|
|
70 |
*
|
|
71 |
* @author Edward Wang
|
|
72 |
* @since 1.6
|
|
73 |
*
|
|
74 |
*/
|
|
75 |
public final class IDN {
|
|
76 |
/**
|
|
77 |
* Flag to allow processing of unassigned code points
|
|
78 |
*/
|
|
79 |
public static final int ALLOW_UNASSIGNED = 0x01;
|
|
80 |
|
|
81 |
/**
|
|
82 |
* Flag to turn on the check against STD-3 ASCII rules
|
|
83 |
*/
|
|
84 |
public static final int USE_STD3_ASCII_RULES = 0x02;
|
|
85 |
|
|
86 |
|
|
87 |
/**
|
|
88 |
* Translates a string from Unicode to ASCII Compatible Encoding (ACE),
|
|
89 |
* as defined by the ToASCII operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
|
|
90 |
*
|
|
91 |
* <p>ToASCII operation can fail. ToASCII fails if any step of it fails.
|
|
92 |
* If ToASCII operation fails, an IllegalArgumentException will be thrown.
|
|
93 |
* In this case, the input string should not be used in an internationalized domain name.
|
|
94 |
*
|
|
95 |
* <p> A label is an individual part of a domain name. The original ToASCII operation,
|
|
96 |
* as defined in RFC 3490, only operates on a single label. This method can handle
|
|
97 |
* both label and entire domain name, by assuming that labels in a domain name are
|
|
98 |
* always separated by dots. The following characters are recognized as dots:
|
|
99 |
* \u002E (full stop), \u3002 (ideographic full stop), \uFF0E (fullwidth full stop),
|
|
100 |
* and \uFF61 (halfwidth ideographic full stop). if dots are
|
|
101 |
* used as label separators, this method also changes all of them to \u002E (full stop)
|
|
102 |
* in output translated string.
|
|
103 |
*
|
|
104 |
* @param input the string to be processed
|
|
105 |
* @param flag process flag; can be 0 or any logical OR of possible flags
|
|
106 |
*
|
|
107 |
* @return the translated <tt>String</tt>
|
|
108 |
*
|
|
109 |
* @throws IllegalArgumentException if the input string doesn't conform to RFC 3490 specification
|
|
110 |
*/
|
|
111 |
public static String toASCII(String input, int flag)
|
|
112 |
{
|
|
113 |
int p = 0, q = 0;
|
|
114 |
StringBuffer out = new StringBuffer();
|
|
115 |
|
|
116 |
while (p < input.length()) {
|
|
117 |
q = searchDots(input, p);
|
|
118 |
out.append(toASCIIInternal(input.substring(p, q), flag));
|
|
119 |
p = q + 1;
|
|
120 |
if (p < input.length()) out.append('.');
|
|
121 |
}
|
|
122 |
|
|
123 |
return out.toString();
|
|
124 |
}
|
|
125 |
|
|
126 |
|
|
127 |
/**
|
|
128 |
* Translates a string from Unicode to ASCII Compatible Encoding (ACE),
|
|
129 |
* as defined by the ToASCII operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
|
|
130 |
*
|
|
131 |
* <p> This convenience method works as if by invoking the
|
|
132 |
* two-argument counterpart as follows:
|
|
133 |
* <blockquote><tt>
|
|
134 |
* {@link #toASCII(String, int) toASCII}(input, 0);
|
|
135 |
* </tt></blockquote>
|
|
136 |
*
|
|
137 |
* @param input the string to be processed
|
|
138 |
*
|
|
139 |
* @return the translated <tt>String</tt>
|
|
140 |
*
|
|
141 |
* @throws IllegalArgumentException if the input string doesn't conform to RFC 3490 specification
|
|
142 |
*/
|
|
143 |
public static String toASCII(String input) {
|
|
144 |
return toASCII(input, 0);
|
|
145 |
}
|
|
146 |
|
|
147 |
|
|
148 |
/**
|
|
149 |
* Translates a string from ASCII Compatible Encoding (ACE) to Unicode,
|
|
150 |
* as defined by the ToUnicode operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
|
|
151 |
*
|
|
152 |
* <p>ToUnicode never fails. In case of any error, the input string is returned unmodified.
|
|
153 |
*
|
|
154 |
* <p> A label is an individual part of a domain name. The original ToUnicode operation,
|
|
155 |
* as defined in RFC 3490, only operates on a single label. This method can handle
|
|
156 |
* both label and entire domain name, by assuming that labels in a domain name are
|
|
157 |
* always separated by dots. The following characters are recognized as dots:
|
|
158 |
* \u002E (full stop), \u3002 (ideographic full stop), \uFF0E (fullwidth full stop),
|
|
159 |
* and \uFF61 (halfwidth ideographic full stop).
|
|
160 |
*
|
|
161 |
* @param input the string to be processed
|
|
162 |
* @param flag process flag; can be 0 or any logical OR of possible flags
|
|
163 |
*
|
|
164 |
* @return the translated <tt>String</tt>
|
|
165 |
*/
|
|
166 |
public static String toUnicode(String input, int flag) {
|
|
167 |
int p = 0, q = 0;
|
|
168 |
StringBuffer out = new StringBuffer();
|
|
169 |
|
|
170 |
while (p < input.length()) {
|
|
171 |
q = searchDots(input, p);
|
|
172 |
out.append(toUnicodeInternal(input.substring(p, q), flag));
|
|
173 |
p = q + 1;
|
|
174 |
if (p < input.length()) out.append('.');
|
|
175 |
}
|
|
176 |
|
|
177 |
return out.toString();
|
|
178 |
}
|
|
179 |
|
|
180 |
|
|
181 |
/**
|
|
182 |
* Translates a string from ASCII Compatible Encoding (ACE) to Unicode,
|
|
183 |
* as defined by the ToUnicode operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
|
|
184 |
*
|
|
185 |
* <p> This convenience method works as if by invoking the
|
|
186 |
* two-argument counterpart as follows:
|
|
187 |
* <blockquote><tt>
|
|
188 |
* {@link #toUnicode(String, int) toUnicode}(input, 0);
|
|
189 |
* </tt></blockquote>
|
|
190 |
*
|
|
191 |
* @param input the string to be processed
|
|
192 |
*
|
|
193 |
* @return the translated <tt>String</tt>
|
|
194 |
*/
|
|
195 |
public static String toUnicode(String input) {
|
|
196 |
return toUnicode(input, 0);
|
|
197 |
}
|
|
198 |
|
|
199 |
|
|
200 |
/* ---------------- Private members -------------- */
|
|
201 |
|
|
202 |
// ACE Prefix is "xn--"
|
|
203 |
private static final String ACE_PREFIX = "xn--";
|
|
204 |
private static final int ACE_PREFIX_LENGTH = ACE_PREFIX.length();
|
|
205 |
|
|
206 |
private static final int MAX_LABEL_LENGTH = 63;
|
|
207 |
|
|
208 |
// single instance of nameprep
|
|
209 |
private static StringPrep namePrep = null;
|
|
210 |
|
|
211 |
static {
|
|
212 |
InputStream stream = null;
|
|
213 |
|
|
214 |
try {
|
|
215 |
final String IDN_PROFILE = "uidna.spp";
|
|
216 |
if (System.getSecurityManager() != null) {
|
|
217 |
stream = AccessController.doPrivileged(new PrivilegedAction<InputStream>() {
|
|
218 |
public InputStream run() {
|
|
219 |
return StringPrep.class.getResourceAsStream(IDN_PROFILE);
|
|
220 |
}
|
|
221 |
});
|
|
222 |
} else {
|
|
223 |
stream = StringPrep.class.getResourceAsStream(IDN_PROFILE);
|
|
224 |
}
|
|
225 |
|
|
226 |
namePrep = new StringPrep(stream);
|
|
227 |
stream.close();
|
|
228 |
} catch (IOException e) {
|
|
229 |
// should never reach here
|
|
230 |
assert false;
|
|
231 |
}
|
|
232 |
}
|
|
233 |
|
|
234 |
|
|
235 |
/* ---------------- Private operations -------------- */
|
|
236 |
|
|
237 |
|
|
238 |
//
|
|
239 |
// to suppress the default zero-argument constructor
|
|
240 |
//
|
|
241 |
private IDN() {}
|
|
242 |
|
|
243 |
//
|
|
244 |
// toASCII operation; should only apply to a single label
|
|
245 |
//
|
|
246 |
private static String toASCIIInternal(String label, int flag)
|
|
247 |
{
|
|
248 |
// step 1
|
|
249 |
// Check if the string contains code points outside the ASCII range 0..0x7c.
|
|
250 |
boolean isASCII = isAllASCII(label);
|
|
251 |
StringBuffer dest;
|
|
252 |
|
|
253 |
// step 2
|
|
254 |
// perform the nameprep operation; flag ALLOW_UNASSIGNED is used here
|
|
255 |
if (!isASCII) {
|
|
256 |
UCharacterIterator iter = UCharacterIterator.getInstance(label);
|
|
257 |
try {
|
|
258 |
dest = namePrep.prepare(iter, flag);
|
|
259 |
} catch (java.text.ParseException e) {
|
|
260 |
throw new IllegalArgumentException(e);
|
|
261 |
}
|
|
262 |
} else {
|
|
263 |
dest = new StringBuffer(label);
|
|
264 |
}
|
|
265 |
|
|
266 |
// step 3
|
|
267 |
// Verify the absence of non-LDH ASCII code points
|
|
268 |
// 0..0x2c, 0x2e..0x2f, 0x3a..0x40, 0x5b..0x60, 0x7b..0x7f
|
|
269 |
// Verify the absence of leading and trailing hyphen
|
|
270 |
boolean useSTD3ASCIIRules = ((flag & USE_STD3_ASCII_RULES) != 0);
|
|
271 |
if (useSTD3ASCIIRules) {
|
|
272 |
for (int i = 0; i < dest.length(); i++) {
|
|
273 |
int c = dest.charAt(i);
|
|
274 |
if (!isLDHChar(c)) {
|
|
275 |
throw new IllegalArgumentException("Contains non-LDH characters");
|
|
276 |
}
|
|
277 |
}
|
|
278 |
|
|
279 |
if (dest.charAt(0) == '-' || dest.charAt(dest.length() - 1) == '-') {
|
|
280 |
throw new IllegalArgumentException("Has leading or trailing hyphen");
|
|
281 |
}
|
|
282 |
}
|
|
283 |
|
|
284 |
if (!isASCII) {
|
|
285 |
// step 4
|
|
286 |
// If all code points are inside 0..0x7f, skip to step 8
|
|
287 |
if (!isAllASCII(dest.toString())) {
|
|
288 |
// step 5
|
|
289 |
// verify the sequence does not begin with ACE prefix
|
|
290 |
if(!startsWithACEPrefix(dest)){
|
|
291 |
|
|
292 |
// step 6
|
|
293 |
// encode the sequence with punycode
|
|
294 |
try {
|
|
295 |
dest = Punycode.encode(dest, null);
|
|
296 |
} catch (java.text.ParseException e) {
|
|
297 |
throw new IllegalArgumentException(e);
|
|
298 |
}
|
|
299 |
|
|
300 |
dest = toASCIILower(dest);
|
|
301 |
|
|
302 |
// step 7
|
|
303 |
// prepend the ACE prefix
|
|
304 |
dest.insert(0, ACE_PREFIX);
|
|
305 |
} else {
|
|
306 |
throw new IllegalArgumentException("The input starts with the ACE Prefix");
|
|
307 |
}
|
|
308 |
|
|
309 |
}
|
|
310 |
}
|
|
311 |
|
|
312 |
// step 8
|
|
313 |
// the length must be inside 1..63
|
|
314 |
if(dest.length() > MAX_LABEL_LENGTH){
|
|
315 |
throw new IllegalArgumentException("The label in the input is too long");
|
|
316 |
}
|
|
317 |
|
|
318 |
return dest.toString();
|
|
319 |
}
|
|
320 |
|
|
321 |
//
|
|
322 |
// toUnicode operation; should only apply to a single label
|
|
323 |
//
|
|
324 |
private static String toUnicodeInternal(String label, int flag) {
|
|
325 |
boolean[] caseFlags = null;
|
|
326 |
StringBuffer dest;
|
|
327 |
|
|
328 |
// step 1
|
|
329 |
// find out if all the codepoints in input are ASCII
|
|
330 |
boolean isASCII = isAllASCII(label);
|
|
331 |
|
|
332 |
if(!isASCII){
|
|
333 |
// step 2
|
|
334 |
// perform the nameprep operation; flag ALLOW_UNASSIGNED is used here
|
|
335 |
try {
|
|
336 |
UCharacterIterator iter = UCharacterIterator.getInstance(label);
|
|
337 |
dest = namePrep.prepare(iter, flag);
|
|
338 |
} catch (Exception e) {
|
|
339 |
// toUnicode never fails; if any step fails, return the input string
|
|
340 |
return label;
|
|
341 |
}
|
|
342 |
} else {
|
|
343 |
dest = new StringBuffer(label);
|
|
344 |
}
|
|
345 |
|
|
346 |
// step 3
|
|
347 |
// verify ACE Prefix
|
|
348 |
if(startsWithACEPrefix(dest)) {
|
|
349 |
|
|
350 |
// step 4
|
|
351 |
// Remove the ACE Prefix
|
|
352 |
String temp = dest.substring(ACE_PREFIX_LENGTH, dest.length());
|
|
353 |
|
|
354 |
try {
|
|
355 |
// step 5
|
|
356 |
// Decode using punycode
|
|
357 |
StringBuffer decodeOut = Punycode.decode(new StringBuffer(temp), null);
|
|
358 |
|
|
359 |
// step 6
|
|
360 |
// Apply toASCII
|
|
361 |
String toASCIIOut = toASCII(decodeOut.toString(), flag);
|
|
362 |
|
|
363 |
// step 7
|
|
364 |
// verify
|
|
365 |
if (toASCIIOut.equalsIgnoreCase(dest.toString())) {
|
|
366 |
// step 8
|
|
367 |
// return output of step 5
|
|
368 |
return decodeOut.toString();
|
|
369 |
}
|
|
370 |
} catch (Exception ignored) {
|
|
371 |
// no-op
|
|
372 |
}
|
|
373 |
}
|
|
374 |
|
|
375 |
// just return the input
|
|
376 |
return label;
|
|
377 |
}
|
|
378 |
|
|
379 |
|
|
380 |
//
|
|
381 |
// LDH stands for "letter/digit/hyphen", with characters restricted to the
|
|
382 |
// 26-letter Latin alphabet <A-Z a-z>, the digits <0-9>, and the hyphen
|
|
383 |
// <->
|
|
384 |
// non-LDH = 0..0x2C, 0x2E..0x2F, 0x3A..0x40, 0x56..0x60, 0x7B..0x7F
|
|
385 |
//
|
|
386 |
private static boolean isLDHChar(int ch){
|
|
387 |
// high runner case
|
|
388 |
if(ch > 0x007A){
|
|
389 |
return false;
|
|
390 |
}
|
|
391 |
//['-' '0'..'9' 'A'..'Z' 'a'..'z']
|
|
392 |
if((ch == 0x002D) ||
|
|
393 |
(0x0030 <= ch && ch <= 0x0039) ||
|
|
394 |
(0x0041 <= ch && ch <= 0x005A) ||
|
|
395 |
(0x0061 <= ch && ch <= 0x007A)
|
|
396 |
){
|
|
397 |
return true;
|
|
398 |
}
|
|
399 |
return false;
|
|
400 |
}
|
|
401 |
|
|
402 |
|
|
403 |
//
|
|
404 |
// search dots in a string and return the index of that character;
|
|
405 |
// or if there is no dots, return the length of input string
|
|
406 |
// dots might be: \u002E (full stop), \u3002 (ideographic full stop), \uFF0E (fullwidth full stop),
|
|
407 |
// and \uFF61 (halfwidth ideographic full stop).
|
|
408 |
//
|
|
409 |
private static int searchDots(String s, int start) {
|
|
410 |
int i;
|
|
411 |
for (i = start; i < s.length(); i++) {
|
|
412 |
char c = s.charAt(i);
|
|
413 |
if (c == '.' || c == '\u3002' || c == '\uFF0E' || c == '\uFF61') {
|
|
414 |
break;
|
|
415 |
}
|
|
416 |
}
|
|
417 |
|
|
418 |
return i;
|
|
419 |
}
|
|
420 |
|
|
421 |
|
|
422 |
//
|
|
423 |
// to check if a string only contains US-ASCII code point
|
|
424 |
//
|
|
425 |
private static boolean isAllASCII(String input) {
|
|
426 |
boolean isASCII = true;
|
|
427 |
for (int i = 0; i < input.length(); i++) {
|
|
428 |
int c = input.charAt(i);
|
|
429 |
if (c > 0x7F) {
|
|
430 |
isASCII = false;
|
|
431 |
break;
|
|
432 |
}
|
|
433 |
}
|
|
434 |
return isASCII;
|
|
435 |
}
|
|
436 |
|
|
437 |
//
|
|
438 |
// to check if a string starts with ACE-prefix
|
|
439 |
//
|
|
440 |
private static boolean startsWithACEPrefix(StringBuffer input){
|
|
441 |
boolean startsWithPrefix = true;
|
|
442 |
|
|
443 |
if(input.length() < ACE_PREFIX_LENGTH){
|
|
444 |
return false;
|
|
445 |
}
|
|
446 |
for(int i = 0; i < ACE_PREFIX_LENGTH; i++){
|
|
447 |
if(toASCIILower(input.charAt(i)) != ACE_PREFIX.charAt(i)){
|
|
448 |
startsWithPrefix = false;
|
|
449 |
}
|
|
450 |
}
|
|
451 |
return startsWithPrefix;
|
|
452 |
}
|
|
453 |
|
|
454 |
private static char toASCIILower(char ch){
|
|
455 |
if('A' <= ch && ch <= 'Z'){
|
|
456 |
return (char)(ch + 'a' - 'A');
|
|
457 |
}
|
|
458 |
return ch;
|
|
459 |
}
|
|
460 |
|
|
461 |
private static StringBuffer toASCIILower(StringBuffer input){
|
|
462 |
StringBuffer dest = new StringBuffer();
|
|
463 |
for(int i = 0; i < input.length();i++){
|
|
464 |
dest.append(toASCIILower(input.charAt(i)));
|
|
465 |
}
|
|
466 |
return dest;
|
|
467 |
}
|
|
468 |
}
|