|
1 /* |
|
2 * Copyright (c) 2007, 2011, Oracle and/or its affiliates. All rights reserved. |
|
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
|
4 * |
|
5 * This code is free software; you can redistribute it and/or modify it |
|
6 * under the terms of the GNU General Public License version 2 only, as |
|
7 * published by the Free Software Foundation. Oracle designates this |
|
8 * particular file as subject to the "Classpath" exception as provided |
|
9 * by Oracle in the LICENSE file that accompanied this code. |
|
10 * |
|
11 * This code is distributed in the hope that it will be useful, but WITHOUT |
|
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
|
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
|
14 * version 2 for more details (a copy is included in the LICENSE file that |
|
15 * accompanied this code). |
|
16 * |
|
17 * You should have received a copy of the GNU General Public License version |
|
18 * 2 along with this work; if not, write to the Free Software Foundation, |
|
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
|
20 * |
|
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
|
22 * or visit www.oracle.com if you need additional information or have any |
|
23 * questions. |
|
24 */ |
|
25 |
|
26 package sun.invoke.util; |
|
27 |
|
28 /** |
|
29 * Utility routines for dealing with bytecode-level names. |
|
30 * Includes universal mangling rules for the JVM. |
|
31 * |
|
32 * <h3>Avoiding Dangerous Characters </h3> |
|
33 * |
|
34 * <p> |
|
35 * The JVM defines a very small set of characters which are illegal |
|
36 * in name spellings. We will slightly extend and regularize this set |
|
37 * into a group of <cite>dangerous characters</cite>. |
|
38 * These characters will then be replaced, in mangled names, by escape sequences. |
|
39 * In addition, accidental escape sequences must be further escaped. |
|
40 * Finally, a special prefix will be applied if and only if |
|
41 * the mangling would otherwise fail to begin with the escape character. |
|
42 * This happens to cover the corner case of the null string, |
|
43 * and also clearly marks symbols which need demangling. |
|
44 * </p> |
|
45 * <p> |
|
46 * Dangerous characters are the union of all characters forbidden |
|
47 * or otherwise restricted by the JVM specification, |
|
48 * plus their mates, if they are brackets |
|
49 * (<code><big><b>[</b></big></code> and <code><big><b>]</b></big></code>, |
|
50 * <code><big><b>&lt;</b></big></code> and <code><big><b>&gt;</b></big></code>), |
|
51 * plus, arbitrarily, the colon character <code><big><b>:</b></big></code>. |
|
52 * There is no distinction between type, method, and field names. |
|
53 * This makes it easier to convert between mangled names of different |
|
54 * types, since they do not need to be decoded (demangled). |
|
55 * </p> |
|
56 * <p> |
|
57 * The escape character is backslash <code><big><b>\</b></big></code> |
|
58 * (also known as reverse solidus). |
|
59 * This character is, until now, unheard of in bytecode names, |
|
60 * but traditional in the proposed role. |
|
61 * |
|
62 * </p> |
|
63 * <h3> Replacement Characters </h3> |
|
64 * |
|
65 * |
|
66 * <p> |
|
67 * Every escape sequence is two characters |
|
68 * (in fact, two UTF8 bytes) beginning with |
|
69 * the escape character and followed by a |
|
70 * <cite>replacement character</cite>. |
|
71 * (Since the replacement character is never a backslash, |
|
72 * iterated manglings do not double in size.) |
|
73 * </p> |
|
74 * <p> |
|
75 * Each dangerous character has some rough visual similarity |
|
76 * to its corresponding replacement character. |
|
77 * This makes mangled symbols easier to recognize by sight. |
|
78 * </p> |
|
79 * <p> |
|
80 * The dangerous characters are |
|
81 * <code><big><b>/</b></big></code> (forward slash, used to delimit package components), |
|
82 * <code><big><b>.</b></big></code> (dot, also a package delimiter), |
|
83 * <code><big><b>;</b></big></code> (semicolon, used in signatures), |
|
84 * <code><big><b>$</b></big></code> (dollar, used in inner classes and synthetic members), |
|
85 * <code><big><b>&lt;</b></big></code> (left angle), |
|
86 * <code><big><b>&gt;</b></big></code> (right angle), |
|
87 * <code><big><b>[</b></big></code> (left square bracket, used in array types), |
|
88 * <code><big><b>]</b></big></code> (right square bracket, reserved in this scheme for language use), |
|
89 * and <code><big><b>:</b></big></code> (colon, reserved in this scheme for language use). |
|
90 * Their replacements are, respectively, |
|
91 * <code><big><b>|</b></big></code> (vertical bar), |
|
92 * <code><big><b>,</b></big></code> (comma), |
|
93 * <code><big><b>?</b></big></code> (question mark), |
|
94 * <code><big><b>%</b></big></code> (percent), |
|
95 * <code><big><b>^</b></big></code> (caret), |
|
96 * <code><big><b>_</b></big></code> (underscore), |
|
97 * <code><big><b>{</b></big></code> (left curly bracket), |
|
98 * <code><big><b>}</b></big></code> (right curly bracket), and |
|
99 * <code><big><b>!</b></big></code> (exclamation mark). |
|
100 * In addition, the replacement character for the escape character itself is |
|
101 * <code><big><b>-</b></big></code> (hyphen), |
|
102 * and the replacement character for the null prefix is |
|
103 * <code><big><b>=</b></big></code> (equal sign). |
|
104 * </p> |
|
105 * <p> |
|
106 * An escape character <code><big><b>\</b></big></code> |
|
107 * followed by any of these replacement characters |
|
108 * is an escape sequence, and there are no other escape sequences. |
|
109 * An equal sign is only part of an escape sequence |
|
110 * if it is the second character in the whole string, following a backslash. |
|
111 * Two consecutive backslashes do <em>not</em> form an escape sequence. |
|
112 * </p> |
|
113 * <p> |
|
114 * Each escape sequence replaces a so-called <cite>original character</cite> |
|
115 * which is either one of the dangerous characters or the escape character. |
|
116 * A null prefix replaces an initial null string, not a character. |
|
117 * </p> |
|
118 * <p> |
|
119 * All this implies that escape sequences cannot overlap and may be |
|
120 * determined all at once for a whole string. Note that a spelling |
|
121 * string can contain <cite>accidental escapes</cite>, apparent escape |
|
122 * sequences which must not be interpreted as manglings. |
|
123 * These are disabled by replacing their leading backslash with an |
|
124 * escape sequence (<code><big><b>\-</b></big></code>). To mangle a string, three logical steps |
|
125 * are required, though they may be carried out in one pass: |
|
126 * </p> |
|
127 * <ol> |
|
128 * <li>In each accidental escape, replace the backslash with an escape sequence |
|
129 * (<code><big><b>\-</b></big></code>).</li> |
|
130 * <li>Replace each dangerous character with an escape sequence |
|
131 * (<code><big><b>\|</b></big></code> for <code><big><b>/</b></big></code>, etc.).</li> |
|
132 * <li>If the first two steps introduced any change, <em>and</em> |
|
133 * if the string does not already begin with a backslash, prepend a null prefix (<code><big><b>\=</b></big></code>).</li> |
|
134 * </ol> |
|
135 * |
|
136 * To demangle a mangled string that begins with an escape, |
|
137 * remove any null prefix, and then replace (in parallel) |
|
138 * each escape sequence by its original character. |
|
139 * <p>Spelling strings which contain accidental |
|
140 * escapes <em>must</em> have them replaced, even if those |
|
141 * strings do not contain dangerous characters. |
|
142 * This restriction means that mangling a string always |
|
143 * requires a scan of the string for escapes. |
|
144 * But then, a scan would be required anyway, |
|
145 * to check for dangerous characters. |
|
146 * |
|
147 * </p> |
|
148 * <h3> Nice Properties </h3> |
|
149 * |
|
150 * <p> |
|
151 * If a bytecode name does not contain any escape sequence, |
|
152 * demangling is a no-op: The string demangles to itself. |
|
153 * Such a string is called <cite>self-mangling</cite>. |
|
154 * Almost all strings are self-mangling. |
|
155 * In practice, to demangle almost any name “found in nature”, |
|
156 * simply verify that it does not begin with a backslash. |
|
157 * </p> |
|
158 * <p> |
|
159 * Mangling is a one-to-one function, while demangling |
|
160 * is a many-to-one function. |
|
161 * A mangled string is defined as <cite>validly mangled</cite> if |
|
162 * it is in fact the unique mangling of its spelling string. |
|
163 * Three examples of invalidly mangled strings are <code><big><b>\=foo</b></big></code>, |
|
164 * <code><big><b>\-bar</b></big></code>, and <code><big><b>baz\!</b></big></code>, which demangle to <code><big><b>foo</b></big></code>, <code><big><b>\bar</b></big></code>, and |
|
165 * <code><big><b>baz\!</b></big></code>, but then remangle to <code><big><b>foo</b></big></code>, <code><big><b>\bar</b></big></code>, and <code><big><b>\=baz\-!</b></big></code>. |
|
166 * If a language back-end or runtime is using mangled names, |
|
167 * it should never present an invalidly mangled bytecode |
|
168 * name to the JVM. If the runtime encounters one, |
|
169 * it should also report an error, since such an occurrence |
|
170 * probably indicates a bug in name encoding which |
|
171 * will lead to errors in linkage. |
|
172 * However, this note does not propose that the JVM verifier |
|
173 * detect invalidly mangled names. |
|
174 * </p> |
|
175 * <p> |
|
176 * As a result of these rules, it is a simple matter to |
|
177 * compute validly mangled substrings and concatenations |
|
178 * of validly mangled strings, and (with a little care) |
|
179 * these correspond to corresponding operations on their |
|
180 * spelling strings. |
|
181 * </p> |
|
182 * <ul> |
|
183 * <li>Any prefix of a validly mangled string is also validly mangled, |
|
184 * although a null prefix may need to be removed.</li> |
|
185 * <li>Any suffix of a validly mangled string is also validly mangled, |
|
186 * although a null prefix may need to be added.</li> |
|
187 * <li>Two validly mangled strings, when concatenated, |
|
188 * are also validly mangled, although any null prefix |
|
189 * must be removed from the second string, |
|
190 * and a trailing backslash on the first string may need escaping, |
|
191 * if it would participate in an accidental escape when followed |
|
192 * by the first character of the second string.</li> |
|
193 * </ul> |
|
194 * <p>If languages that include non-Java symbol spellings use this |
|
195 * mangling convention, they will enjoy the following advantages: |
|
196 * </p> |
|
197 * <ul> |
|
198 * <li>They can interoperate via symbols they share in common.</li> |
|
199 * <li>Low-level tools, such as backtrace printers, will have readable displays.</li> |
|
200 * <li>Future JVM and language extensions can safely use the dangerous characters |
|
201 * for structuring symbols, but will never interfere with valid spellings.</li> |
|
202 * <li>Runtimes and compilers can use standard libraries for mangling and demangling.</li> |
|
203 * <li>Occasional transliterations and name composition will be simple and regular, |
|
204 * for classes, methods, and fields.</li> |
|
205 * <li>Bytecode names will continue to be compact. |
|
206 * When mangled, spellings will at most double in length, either in |
|
207 * UTF8 or UTF16 format, and most will not change at all.</li> |
|
208 * </ul> |
|
209 * |
|
210 * |
|
211 * <h3> Suggestions for Human Readable Presentations </h3> |
|
212 * |
|
213 * |
|
214 * <p> |
|
215 * For human readable displays of symbols, |
|
216 * it will be better to present a string-like quoted |
|
217 * representation of the spelling, because JVM users |
|
218 * are generally familiar with such tokens. |
|
219 * We suggest using single or double quotes before and after |
|
220 * mangled symbols which are not valid Java identifiers, |
|
221 * with quotes, backslashes, and non-printing characters |
|
222 * escaped as if for literals in the Java language. |
|
223 * </p> |
|
224 * <p> |
|
225 * For example, an HTML-like spelling |
|
226 * <code><big><b>&lt;pre&gt;</b></big></code> mangles to |
|
227 * <code><big><b>\^pre\_</b></big></code> and could |
|
228 * display more cleanly as |
|
229 * <code><big><b>'&lt;pre&gt;'</b></big></code>, |
|
230 * with the quotes included. |
|
231 * Such string-like conventions are <em>not</em> suitable |
|
232 * for mangled bytecode names, in part because |
|
233 * dangerous characters must be eliminated, rather |
|
234 * than just quoted. Otherwise internally structured |
|
235 * strings like package prefixes and method signatures |
|
236 * could not be reliably parsed. |
|
237 * </p> |
|
238 * <p> |
|
239 * In such human-readable displays, invalidly mangled |
|
240 * names should <em>not</em> be demangled and quoted, |
|
241 * for this would be misleading. Likewise, JVM symbols |
|
242 * which contain dangerous characters (like dots in field |
|
243 * names or brackets in method names) should not be |
|
244 * simply quoted. The bytecode names |
|
245 * <code><big><b>\=phase\,1</b></big></code> and |
|
246 * <code><big><b>phase.1</b></big></code> are distinct, |
|
247 * and in demangled displays they should be presented as |
|
248 * <code><big><b>'phase.1'</b></big></code> and something like |
|
249 * <code><big><b>'phase'.1</b></big></code>, respectively. |
|
250 * </p> |
|
251 * |
|
252 * @author John Rose |
|
253 * @version 1.2, 02/06/2008 |
|
254 * @see <a href="http://blogs.sun.com/jrose/entry/symbolic_freedom_in_the_vm">Symbolic Freedom in the VM</a> |
|
255 */ |
|
public class BytecodeName {
    private BytecodeName() { }  // static only class

    /**
     * Given a source name, produce the corresponding bytecode name.
     * The source name should not be qualified, because any syntactic
     * markers (dots, slashes, dollar signs, colons, etc.) will be mangled.
     * @param s the source name
     * @return a valid bytecode name which represents the source name;
     *         this is {@code s} itself when no mangling is required
     * @throws NullPointerException if {@code s} is null
     */
    public static String toBytecodeName(String s) {
        String bn = mangle(s);
        // Either nothing changed, or the result carries the leading backslash marker.
        assert((Object)bn == s || looksMangled(bn)) : bn;
        // Mangling must round-trip.
        assert(s.equals(toSourceName(bn))) : s;
        return bn;
    }

    /**
     * Given an unqualified bytecode name, produce the corresponding source name.
     * The bytecode name must not contain dangerous characters.
     * In particular, it must not be qualified or segmented by colon {@code ':'}.
     * Names that do not begin with a backslash are returned unchanged.
     * @param s the bytecode name
     * @return the source name, which may possibly have unsafe characters
     * @throws IllegalArgumentException if the bytecode name is not {@link #isSafeBytecodeName safe}
     * @see #isSafeBytecodeName(java.lang.String)
     */
    public static String toSourceName(String s) {
        checkSafeBytecodeName(s);
        String sn = s;
        if (looksMangled(s)) {
            sn = demangle(s);
            // A validly mangled name is the unique mangling of its spelling.
            assert(s.equals(mangle(sn))) : s+" => "+sn+" => "+mangle(sn);
        }
        return sn;
    }

    /**
     * Given a bytecode name from a classfile, separate it into
     * components delimited by dangerous characters.
     * Each resulting array element is either a {@link Character} holding a
     * dangerous character, or else a {@link String} holding the
     * <em>demangled source name</em> of a non-empty run of safe characters
     * (as produced by {@link #toSourceName}).
     * For example, the qualified class name {@code java/lang/String}
     * will be parsed into the array {@code {"java", '/', "lang", '/', "String"}}.
     * The name {@code <init>} will be parsed into {@code {'<', "init", '>'}}.
     * The name {@code foo/bar$:baz} will be parsed into
     * {@code {"foo", '/', "bar", '$', ':', "baz"}}.
     * The name {@code ::\=:foo:\=bar\!baz} will be parsed into
     * {@code {':', ':', "", ':', "foo", ':', "bar:baz"}}.
     * The empty string is parsed into an empty array.
     * @param s the (possibly qualified) bytecode name
     * @return a fresh array of {@code String} and {@code Character} components
     * @throws NullPointerException if {@code s} is null
     */
    public static Object[] parseBytecodeName(String s) {
        int slen = s.length();
        Object[] res = null;
        // Pass 0 only counts the components; pass 1 fills the array sized by pass 0.
        for (int pass = 0; pass <= 1; pass++) {
            boolean filling = (pass != 0);
            int fillp = 0;
            int lasti = 0;  // start of the current run of safe characters
            for (int i = 0; i <= slen; i++) {
                int whichDC = -1;
                if (i < slen) {
                    whichDC = DANGEROUS_CHARS.indexOf(s.charAt(i));
                    // Backslash (index 0) is not a delimiter; keep scanning.
                    if (whichDC < DANGEROUS_CHAR_FIRST_INDEX)  continue;
                }
                // got to end of string or next dangerous char
                if (lasti < i) {
                    // normal component
                    if (filling)
                        res[fillp] = toSourceName(s.substring(lasti, i));
                    fillp++;
                }
                if (whichDC >= DANGEROUS_CHAR_FIRST_INDEX) {
                    if (filling)
                        res[fillp] = DANGEROUS_CHARS_CA[whichDC];
                    fillp++;
                }
                lasti = i+1;
            }
            if (!filling) {
                // between passes, build the result array
                res = new Object[fillp];
            }
        }
        return res;
    }

    /**
     * Given a series of components, create a bytecode name for a classfile.
     * This is the inverse of {@link #parseBytecodeName(java.lang.String)}.
     * Each {@code String} component is treated as a source name and is
     * mangled with {@link #toBytecodeName}; every other component
     * (normally a {@code Character} holding a dangerous character used as
     * punctuation) is appended using its string form.
     * The caller's array is never modified.
     * @param components a series of name components
     * @return the concatenation of all components, after mangling the strings
     * @throws NullPointerException if the array or any component is null
     */
    public static String unparseBytecodeName(Object[] components) {
        Object[] components0 = components;
        for (int i = 0; i < components.length; i++) {
            Object c = components[i];
            if (c == null) {
                // Honor the documented contract instead of emitting the text "null".
                throw new NullPointerException("components[" + i + "]");
            }
            if (c instanceof String) {
                String mc = toBytecodeName((String) c);
                if (i == 0 && components.length == 1)
                    return mc;  // usual case
                if ((Object)mc != c) {
                    // Copy on first change so the caller's array stays intact.
                    if (components == components0)
                        components = components.clone();
                    components[i] = mc;
                }
            }
        }
        return appendAll(components);
    }

    /** Concatenate the string forms of all components into one string. */
    private static String appendAll(Object[] components) {
        if (components.length <= 1) {
            if (components.length == 1) {
                return String.valueOf(components[0]);
            }
            return "";
        }
        // Presize the buffer: strings by length, anything else counted as one char.
        int slen = 0;
        for (Object c : components) {
            if (c instanceof String)
                slen += ((String) c).length();
            else
                slen += 1;
        }
        StringBuilder sb = new StringBuilder(slen);
        for (Object c : components) {
            sb.append(c);
        }
        return sb.toString();
    }

    /**
     * Given a bytecode name, produce the corresponding display name.
     * This is the source name, plus quotes if needed.
     * If the bytecode name contains dangerous characters,
     * assume that they are being used as punctuation,
     * and pass them through unchanged.
     * Non-empty runs of non-dangerous characters are demangled
     * if necessary, and the resulting names are quoted if
     * they are not already valid Java identifiers, or if
     * they contain a dangerous character (i.e., dollar sign "$").
     * Single quotes are used when quoting.
     * Within quoted names, embedded single quotes and backslashes
     * are further escaped by prepended backslashes.
     *
     * @param s the original bytecode name (which may be qualified)
     * @return a human-readable presentation
     * @throws NullPointerException if {@code s} is null
     */
    public static String toDisplayName(String s) {
        Object[] components = parseBytecodeName(s);
        for (int i = 0; i < components.length; i++) {
            if (!(components[i] instanceof String))
                continue;
            // note that the name is already demangled by parseBytecodeName
            String sn = (String) components[i];
            if (!isJavaIdent(sn) || sn.indexOf('$') >= 0) {
                components[i] = quoteDisplay(sn);
            }
        }
        return appendAll(components);
    }

    /** Report whether the string is non-empty and spelled like a Java identifier. */
    private static boolean isJavaIdent(String s) {
        int slen = s.length();
        if (slen == 0)  return false;
        if (!Character.isJavaIdentifierStart(s.charAt(0)))
            return false;
        for (int i = 1; i < slen; i++) {
            if (!Character.isJavaIdentifierPart(s.charAt(i)))
                return false;
        }
        return true;
    }

    /**
     * Wrap the string in single quotes, prefixing each embedded single
     * quote or backslash with a backslash.
     */
    private static String quoteDisplay(String s) {
        // TO DO:  Replace weird characters in s by C-style escapes.
        int slen = s.length();
        StringBuilder sb = new StringBuilder(slen + 2);
        sb.append('\'');
        for (int i = 0; i < slen; i++) {
            char c = s.charAt(i);
            if (c == '\'' || c == '\\')
                sb.append('\\');
            sb.append(c);
        }
        sb.append('\'');
        return sb.toString();
    }

    /** Throw {@code IllegalArgumentException} unless the name is safe. */
    private static void checkSafeBytecodeName(String s)
            throws IllegalArgumentException {
        if (!isSafeBytecodeName(s)) {
            throw new IllegalArgumentException(s);
        }
    }

    /**
     * Report whether a simple name is safe as a bytecode name.
     * Such names are acceptable in class files as class, method, and field names.
     * Additionally, they are free of "dangerous" characters, even if those
     * characters are legal in some (or all) names in class files.
     * The backslash escape character is not considered dangerous here.
     * @param s the proposed bytecode name
     * @return true if the name is non-empty and all of its characters are safe
     * @throws NullPointerException if {@code s} is null
     */
    public static boolean isSafeBytecodeName(String s) {
        if (s.length() == 0)  return false;
        // One scan of the name; isDangerous already excludes the escape character.
        return indexOfDangerousChar(s, 0) < 0;
    }

    /**
     * Report whether a character is safe in a bytecode name.
     * This is true of any unicode character except the following
     * <em>dangerous characters</em>: {@code ".;:$[]<>/"}.
     * @param c the proposed character
     * @return true if the character is safe to use in classfiles
     */
    public static boolean isSafeBytecodeChar(char c) {
        return DANGEROUS_CHARS.indexOf(c) < DANGEROUS_CHAR_FIRST_INDEX;
    }

    /** Report whether the name begins with the escape character. */
    private static boolean looksMangled(String s) {
        return s.length() > 0 && s.charAt(0) == ESCAPE_C;
    }

    /**
     * Mangle a spelling string: escape accidental escapes and dangerous
     * characters, and prepend the null prefix if the changed result would
     * not otherwise begin with a backslash.
     * Returns {@code s} itself when no change is needed.
     */
    private static String mangle(String s) {
        if (s.length() == 0)
            return NULL_ESCAPE;

        // build this lazily, when we first need an escape:
        StringBuilder sb = null;

        for (int i = 0, slen = s.length(); i < slen; i++) {
            char c = s.charAt(i);

            boolean needEscape = false;
            if (c == ESCAPE_C) {
                if (i+1 < slen) {
                    char c1 = s.charAt(i+1);
                    // A backslash needs escaping only if, together with the next
                    // character, it would read as an escape sequence
                    // (the null prefix counts only at the very start).
                    if ((i == 0 && c1 == NULL_ESCAPE_C)
                        || c1 != originalOfReplacement(c1)) {
                        // an accidental escape
                        needEscape = true;
                    }
                }
            } else {
                needEscape = isDangerous(c);
            }

            if (!needEscape) {
                if (sb != null)  sb.append(c);
                continue;
            }

            // build sb if this is the first escape
            if (sb == null) {
                sb = new StringBuilder(s.length()+10);
                // mangled names must begin with a backslash:
                // (when i == 0 the escape appended below supplies it)
                if (s.charAt(0) != ESCAPE_C && i > 0)
                    sb.append(NULL_ESCAPE);
                // append the string so far, which is unremarkable:
                sb.append(s, 0, i);
            }

            // rewrite \ to \-, / to \|, etc.
            sb.append(ESCAPE_C);
            sb.append(replacementOf(c));
        }

        if (sb != null)   return sb.toString();

        return s;
    }

    /**
     * Demangle a name: drop a leading null prefix, then replace every
     * escape sequence by its original character.
     */
    private static String demangle(String s) {
        // build this lazily, when we first meet an escape:
        StringBuilder sb = null;

        int stringStart = 0;
        if (s.startsWith(NULL_ESCAPE))
            stringStart = 2;

        for (int i = stringStart, slen = s.length(); i < slen; i++) {
            char c = s.charAt(i);

            if (c == ESCAPE_C && i+1 < slen) {
                // might be an escape sequence
                char rc = s.charAt(i+1);
                char oc = originalOfReplacement(rc);
                if (oc != rc) {
                    // build sb if this is the first escape
                    if (sb == null) {
                        sb = new StringBuilder(s.length());
                        // append the string so far, which is unremarkable:
                        sb.append(s, stringStart, i);
                    }
                    ++i;  // skip both characters
                    c = oc;
                }
            }

            if (sb != null)
                sb.append(c);
        }

        if (sb != null)   return sb.toString();

        return s.substring(stringStart);
    }

    /** The escape character, backslash. */
    static final char ESCAPE_C = '\\';
    // empty escape sequence to avoid a null name or illegal prefix
    static final char NULL_ESCAPE_C = '=';
    static final String NULL_ESCAPE = ESCAPE_C+""+NULL_ESCAPE_C;

    // DANGEROUS_CHARS and REPLACEMENT_CHARS are parallel: same index, same pair.
    static final String DANGEROUS_CHARS   = "\\/.;:$[]<>"; // \\ must be first
    static final String REPLACEMENT_CHARS =  "-|,?!%{}^_";
    static final int DANGEROUS_CHAR_FIRST_INDEX = 1; // index after \\
    static final char[] DANGEROUS_CHARS_A   = DANGEROUS_CHARS.toCharArray();
    static final char[] REPLACEMENT_CHARS_A = REPLACEMENT_CHARS.toCharArray();
    // Boxed copies of DANGEROUS_CHARS, handed out by parseBytecodeName.
    static final Character[] DANGEROUS_CHARS_CA;
    static {
        Character[] dcca = new Character[DANGEROUS_CHARS.length()];
        for (int i = 0; i < dcca.length; i++)
            dcca[i] = Character.valueOf(DANGEROUS_CHARS.charAt(i));
        DANGEROUS_CHARS_CA = dcca;
    }

    // One bit per character code 0..127; set for every dangerous or replacement char.
    static final long[] SPECIAL_BITMAP = new long[2];  // 128 bits
    static {
        String SPECIAL = DANGEROUS_CHARS + REPLACEMENT_CHARS;
        for (char c : SPECIAL.toCharArray()) {
            // long shifts use only the low 6 bits of the count, i.e. c % 64
            SPECIAL_BITMAP[c >>> 6] |= 1L << c;
        }
    }

    /** Quick bitmap test: is c a dangerous character or a replacement character? */
    static boolean isSpecial(char c) {
        if ((c >>> 6) < SPECIAL_BITMAP.length)
            return ((SPECIAL_BITMAP[c >>> 6] >> c) & 1) != 0;
        else
            return false;
    }

    /** Return the replacement for a dangerous character (or backslash), else c itself. */
    static char replacementOf(char c) {
        if (!isSpecial(c))  return c;
        int i = DANGEROUS_CHARS.indexOf(c);
        if (i < 0)  return c;
        return REPLACEMENT_CHARS.charAt(i);
    }

    /** Return the original character for a replacement character, else c itself. */
    static char originalOfReplacement(char c) {
        if (!isSpecial(c))  return c;
        int i = REPLACEMENT_CHARS.indexOf(c);
        if (i < 0)  return c;
        return DANGEROUS_CHARS.charAt(i);
    }

    /** Report whether c is a dangerous character; the escape character does not count. */
    static boolean isDangerous(char c) {
        if (!isSpecial(c))  return false;
        return (DANGEROUS_CHARS.indexOf(c) >= DANGEROUS_CHAR_FIRST_INDEX);
    }

    /** Index of the first dangerous character at or after {@code from}, or -1. */
    static int indexOfDangerousChar(String s, int from) {
        for (int i = from, slen = s.length(); i < slen; i++) {
            if (isDangerous(s.charAt(i)))
                return i;
        }
        return -1;
    }

    /** Index of the last dangerous character at or before {@code from}, or -1. */
    static int lastIndexOfDangerousChar(String s, int from) {
        for (int i = Math.min(from, s.length()-1); i >= 0; i--) {
            if (isDangerous(s.charAt(i)))
                return i;
        }
        return -1;
    }
}