author | redestad |
Thu, 21 Apr 2016 13:39:53 +0200 | |
changeset 37593 | 824750ada3d6 |
parent 25859 | 3317bb8137f4 |
child 37781 | 71ed5645f17c |
permissions | -rw-r--r-- |
2 | 1 |
/* |
19069 | 2 |
* Copyright (c) 1995, 2013, Oracle and/or its affiliates. All rights reserved. |
2 | 3 |
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
4 |
* |
|
5 |
* This code is free software; you can redistribute it and/or modify it |
|
6 |
* under the terms of the GNU General Public License version 2 only, as |
|
5506 | 7 |
* published by the Free Software Foundation. Oracle designates this |
2 | 8 |
* particular file as subject to the "Classpath" exception as provided |
5506 | 9 |
* by Oracle in the LICENSE file that accompanied this code. |
2 | 10 |
* |
11 |
* This code is distributed in the hope that it will be useful, but WITHOUT |
|
12 |
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
|
13 |
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
|
14 |
* version 2 for more details (a copy is included in the LICENSE file that |
|
15 |
* accompanied this code). |
|
16 |
* |
|
17 |
* You should have received a copy of the GNU General Public License version |
|
18 |
* 2 along with this work; if not, write to the Free Software Foundation, |
|
19 |
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
|
20 |
* |
|
5506 | 21 |
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
22 |
* or visit www.oracle.com if you need additional information or have any |
|
23 |
* questions. |
|
2 | 24 |
*/ |
25 |
||
26 |
package java.net; |
|
27 |
||
28 |
import java.io.UnsupportedEncodingException; |
|
29 |
import java.io.CharArrayWriter; |
|
30 |
import java.nio.charset.Charset; |
|
31 |
import java.nio.charset.IllegalCharsetNameException; |
|
32 |
import java.nio.charset.UnsupportedCharsetException ; |
|
33 |
import java.util.BitSet; |
|
34 |
import sun.security.action.GetPropertyAction; |
|
35 |
||
36 |
/** |
|
37 |
* Utility class for HTML form encoding. This class contains static methods |
|
38 |
* for converting a String to the <CODE>application/x-www-form-urlencoded</CODE> MIME |
|
39 |
* format. For more information about HTML form encoding, consult the HTML |
|
40 |
* <A HREF="http://www.w3.org/TR/html4/">specification</A>. |
|
41 |
* |
|
42 |
* <p> |
|
43 |
* When encoding a String, the following rules apply: |
|
44 |
* |
|
45 |
* <ul> |
|
19069 | 46 |
* <li>The alphanumeric characters "{@code a}" through |
47 |
* "{@code z}", "{@code A}" through |
|
48 |
* "{@code Z}" and "{@code 0}" |
|
49 |
* through "{@code 9}" remain the same. |
|
50 |
* <li>The special characters "{@code .}", |
|
51 |
* "{@code -}", "{@code *}", and |
|
52 |
* "{@code _}" remain the same. |
|
53 |
* <li>The space character " " is |
|
54 |
* converted into a plus sign "{@code +}". |
|
2 | 55 |
* <li>All other characters are unsafe and are first converted into |
56 |
* one or more bytes using some encoding scheme. Then each byte is |
|
57 |
* represented by the 3-character string |
|
19069 | 58 |
* "<i>{@code %xy}</i>", where <i>xy</i> is the |
2 | 59 |
* two-digit hexadecimal representation of the byte. |
60 |
* The recommended encoding scheme to use is UTF-8. However, |
|
61 |
* for compatibility reasons, if an encoding is not specified, |
|
62 |
* then the default encoding of the platform is used. |
|
63 |
* </ul> |
|
64 |
* |
|
65 |
* <p> |
|
66 |
* For example using UTF-8 as the encoding scheme the string "The |
|
67 |
* string ü@foo-bar" would get converted to |
|
68 |
* "The+string+%C3%BC%40foo-bar" because in UTF-8 the character |
|
69 |
* ü is encoded as two bytes C3 (hex) and BC (hex), and the |
|
70 |
* character @ is encoded as one byte 40 (hex). |
|
71 |
* |
|
72 |
* @author Herb Jellinek |
|
24865
09b1d992ca72
8044740: Convert all JDK versions used in @since tag to 1.n[.n] in jdk repo
henryjen
parents:
21428
diff
changeset
|
73 |
* @since 1.0 |
2 | 74 |
*/ |
75 |
public class URLEncoder { |
|
76 |
static BitSet dontNeedEncoding; |
|
77 |
static final int caseDiff = ('a' - 'A'); |
|
78 |
static String dfltEncName = null; |
|
79 |
||
80 |
static { |
|
81 |
||
82 |
/* The list of characters that are not encoded has been |
|
83 |
* determined as follows: |
|
84 |
* |
|
85 |
* RFC 2396 states: |
|
86 |
* ----- |
|
87 |
* Data characters that are allowed in a URI but do not have a |
|
88 |
* reserved purpose are called unreserved. These include upper |
|
89 |
* and lower case letters, decimal digits, and a limited set of |
|
90 |
* punctuation marks and symbols. |
|
91 |
* |
|
92 |
* unreserved = alphanum | mark |
|
93 |
* |
|
94 |
* mark = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")" |
|
95 |
* |
|
96 |
* Unreserved characters can be escaped without changing the |
|
97 |
* semantics of the URI, but this should not be done unless the |
|
98 |
* URI is being used in a context that does not allow the |
|
99 |
* unescaped character to appear. |
|
100 |
* ----- |
|
101 |
* |
|
102 |
* It appears that both Netscape and Internet Explorer escape |
|
103 |
* all special characters from this list with the exception |
|
104 |
* of "-", "_", ".", "*". While it is not clear why they are |
|
105 |
* escaping the other characters, perhaps it is safest to |
|
106 |
* assume that there might be contexts in which the others |
|
107 |
* are unsafe if not escaped. Therefore, we will use the same |
|
108 |
* list. It is also noteworthy that this is consistent with |
|
109 |
* O'Reilly's "HTML: The Definitive Guide" (page 164). |
|
110 |
* |
|
111 |
* As a last note, Intenet Explorer does not encode the "@" |
|
112 |
* character which is clearly not unreserved according to the |
|
113 |
* RFC. We are being consistent with the RFC in this matter, |
|
114 |
* as is Netscape. |
|
115 |
* |
|
116 |
*/ |
|
117 |
||
118 |
dontNeedEncoding = new BitSet(256); |
|
119 |
int i; |
|
120 |
for (i = 'a'; i <= 'z'; i++) { |
|
121 |
dontNeedEncoding.set(i); |
|
122 |
} |
|
123 |
for (i = 'A'; i <= 'Z'; i++) { |
|
124 |
dontNeedEncoding.set(i); |
|
125 |
} |
|
126 |
for (i = '0'; i <= '9'; i++) { |
|
127 |
dontNeedEncoding.set(i); |
|
128 |
} |
|
129 |
dontNeedEncoding.set(' '); /* encoding a space to a + is done |
|
130 |
* in the encode() method */ |
|
131 |
dontNeedEncoding.set('-'); |
|
132 |
dontNeedEncoding.set('_'); |
|
133 |
dontNeedEncoding.set('.'); |
|
134 |
dontNeedEncoding.set('*'); |
|
135 |
||
37593
824750ada3d6
8154231: Simplify access to System properties from JDK code
redestad
parents:
25859
diff
changeset
|
136 |
dfltEncName = GetPropertyAction.getProperty("file.encoding"); |
2 | 137 |
} |
138 |
||
139 |
/** |
|
140 |
* You can't call the constructor. |
|
141 |
*/ |
|
142 |
private URLEncoder() { } |
|
143 |
||
144 |
/** |
|
19069 | 145 |
* Translates a string into {@code x-www-form-urlencoded} |
2 | 146 |
* format. This method uses the platform's default encoding |
147 |
* as the encoding scheme to obtain the bytes for unsafe characters. |
|
148 |
* |
|
19069 | 149 |
* @param s {@code String} to be translated. |
2 | 150 |
* @deprecated The resulting string may vary depending on the platform's |
151 |
* default encoding. Instead, use the encode(String,String) |
|
152 |
* method to specify the encoding. |
|
19069 | 153 |
* @return the translated {@code String}. |
2 | 154 |
*/ |
155 |
@Deprecated |
|
156 |
public static String encode(String s) { |
|
157 |
||
158 |
String str = null; |
|
159 |
||
160 |
try { |
|
161 |
str = encode(s, dfltEncName); |
|
162 |
} catch (UnsupportedEncodingException e) { |
|
163 |
// The system should always have the platform default |
|
164 |
} |
|
165 |
||
166 |
return str; |
|
167 |
} |
|
168 |
||
169 |
/** |
|
19069 | 170 |
* Translates a string into {@code application/x-www-form-urlencoded} |
2 | 171 |
* format using a specific encoding scheme. This method uses the |
172 |
* supplied encoding scheme to obtain the bytes for unsafe |
|
173 |
* characters. |
|
174 |
* <p> |
|
175 |
* <em><strong>Note:</strong> The <a href= |
|
176 |
* "http://www.w3.org/TR/html40/appendix/notes.html#non-ascii-chars"> |
|
177 |
* World Wide Web Consortium Recommendation</a> states that |
|
178 |
* UTF-8 should be used. Not doing so may introduce |
|
21278 | 179 |
* incompatibilities.</em> |
2 | 180 |
* |
19069 | 181 |
* @param s {@code String} to be translated. |
2 | 182 |
* @param enc The name of a supported |
183 |
* <a href="../lang/package-summary.html#charenc">character |
|
184 |
* encoding</a>. |
|
19069 | 185 |
* @return the translated {@code String}. |
2 | 186 |
* @exception UnsupportedEncodingException |
187 |
* If the named encoding is not supported |
|
188 |
* @see URLDecoder#decode(java.lang.String, java.lang.String) |
|
189 |
* @since 1.4 |
|
190 |
*/ |
|
191 |
public static String encode(String s, String enc) |
|
192 |
throws UnsupportedEncodingException { |
|
193 |
||
194 |
boolean needToChange = false; |
|
24969
afa6934dd8e8
8041679: Replace uses of StringBuffer with StringBuilder within core library classes
psandoz
parents:
24865
diff
changeset
|
195 |
StringBuilder out = new StringBuilder(s.length()); |
2 | 196 |
Charset charset; |
197 |
CharArrayWriter charArrayWriter = new CharArrayWriter(); |
|
198 |
||
199 |
if (enc == null) |
|
200 |
throw new NullPointerException("charsetName"); |
|
201 |
||
202 |
try { |
|
203 |
charset = Charset.forName(enc); |
|
204 |
} catch (IllegalCharsetNameException e) { |
|
205 |
throw new UnsupportedEncodingException(enc); |
|
206 |
} catch (UnsupportedCharsetException e) { |
|
207 |
throw new UnsupportedEncodingException(enc); |
|
208 |
} |
|
209 |
||
210 |
for (int i = 0; i < s.length();) { |
|
211 |
int c = (int) s.charAt(i); |
|
212 |
//System.out.println("Examining character: " + c); |
|
213 |
if (dontNeedEncoding.get(c)) { |
|
214 |
if (c == ' ') { |
|
215 |
c = '+'; |
|
216 |
needToChange = true; |
|
217 |
} |
|
218 |
//System.out.println("Storing: " + c); |
|
219 |
out.append((char)c); |
|
220 |
i++; |
|
221 |
} else { |
|
222 |
// convert to external encoding before hex conversion |
|
223 |
do { |
|
224 |
charArrayWriter.write(c); |
|
225 |
/* |
|
226 |
* If this character represents the start of a Unicode |
|
227 |
* surrogate pair, then pass in two characters. It's not |
|
228 |
* clear what should be done if a bytes reserved in the |
|
229 |
* surrogate pairs range occurs outside of a legal |
|
230 |
* surrogate pair. For now, just treat it as if it were |
|
231 |
* any other character. |
|
232 |
*/ |
|
233 |
if (c >= 0xD800 && c <= 0xDBFF) { |
|
234 |
/* |
|
235 |
System.out.println(Integer.toHexString(c) |
|
236 |
+ " is high surrogate"); |
|
237 |
*/ |
|
238 |
if ( (i+1) < s.length()) { |
|
239 |
int d = (int) s.charAt(i+1); |
|
240 |
/* |
|
241 |
System.out.println("\tExamining " |
|
242 |
+ Integer.toHexString(d)); |
|
243 |
*/ |
|
244 |
if (d >= 0xDC00 && d <= 0xDFFF) { |
|
245 |
/* |
|
246 |
System.out.println("\t" |
|
247 |
+ Integer.toHexString(d) |
|
248 |
+ " is low surrogate"); |
|
249 |
*/ |
|
250 |
charArrayWriter.write(d); |
|
251 |
i++; |
|
252 |
} |
|
253 |
} |
|
254 |
} |
|
255 |
i++; |
|
256 |
} while (i < s.length() && !dontNeedEncoding.get((c = (int) s.charAt(i)))); |
|
257 |
||
258 |
charArrayWriter.flush(); |
|
259 |
String str = new String(charArrayWriter.toCharArray()); |
|
260 |
byte[] ba = str.getBytes(charset); |
|
261 |
for (int j = 0; j < ba.length; j++) { |
|
262 |
out.append('%'); |
|
263 |
char ch = Character.forDigit((ba[j] >> 4) & 0xF, 16); |
|
264 |
// converting to use uppercase letter as part of |
|
265 |
// the hex value if ch is a letter. |
|
266 |
if (Character.isLetter(ch)) { |
|
267 |
ch -= caseDiff; |
|
268 |
} |
|
269 |
out.append(ch); |
|
270 |
ch = Character.forDigit(ba[j] & 0xF, 16); |
|
271 |
if (Character.isLetter(ch)) { |
|
272 |
ch -= caseDiff; |
|
273 |
} |
|
274 |
out.append(ch); |
|
275 |
} |
|
276 |
charArrayWriter.reset(); |
|
277 |
needToChange = true; |
|
278 |
} |
|
279 |
} |
|
280 |
||
281 |
return (needToChange? out.toString() : s); |
|
282 |
} |
|
283 |
} |