/*
 * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FilterReader;
import java.io.FilterWriter;
import java.io.IOException;
import java.io.Reader;
import java.io.Writer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
import static java.nio.charset.StandardCharsets.*;

/**
 * Simple utility to convert a file from a native encoding to ASCII, or the
 * reverse, where characters outside ASCII are represented by Unicode escapes
 * (a backslash, the letter 'u' and four hexadecimal digits).
 */
public class Native2Ascii {
    /** The native charset files are read from / written in. */
    final Charset cs;

    /** Encoder of {@link #cs}, used to test whether a decoded escape is representable. */
    final CharsetEncoder encoder;

    /**
     * Creates a converter for the given native charset.
     *
     * @param cs the native charset; must support encoding
     */
    public Native2Ascii(Charset cs) {
        this.cs = cs;
        this.encoder = cs.newEncoder();
    }

    /**
     * ASCII to Native conversion.
     *
     * Reads {@code infile} as US-ASCII, replaces every Unicode escape that the
     * native charset can encode by the character it denotes, and writes the
     * result to {@code outfile} in the native charset. The input is processed
     * line by line and every output line is terminated with the platform
     * line separator.
     *
     * @param infile  the ASCII file to read
     * @param outfile the native-encoded file to write
     * @throws IOException if reading or writing fails
     */
    public void asciiToNative(Path infile, Path outfile) throws IOException {
        try (BufferedReader in = Files.newBufferedReader(infile, US_ASCII);
             BufferedReader reader = new BufferedReader(new A2NFilter(in));
             BufferedWriter writer = Files.newBufferedWriter(outfile, cs)) {
            String line;
            while ((line = reader.readLine()) != null) {
                writer.write(line);
                writer.newLine();
            }
        }
    }

    /**
     * Native to ASCII conversion.
     *
     * Reads {@code infile} in the native charset and writes it to
     * {@code outfile} as US-ASCII, with every character above U+007F replaced
     * by its Unicode escape. The input is processed line by line and every
     * output line is terminated with the platform line separator.
     *
     * @param infile  the native-encoded file to read
     * @param outfile the ASCII file to write
     * @throws IOException if reading or writing fails
     */
    public void nativeToAscii(Path infile, Path outfile) throws IOException {
        try (BufferedReader reader = Files.newBufferedReader(infile, cs);
             BufferedWriter out = Files.newBufferedWriter(outfile, US_ASCII);
             BufferedWriter writer = new BufferedWriter(new N2AFilter(out))) {
            String line;
            while ((line = reader.readLine()) != null) {
                writer.write(line);
                writer.newLine();
            }
        }
    }

    /**
     * A copy of native2ascii N2AFilter: a writer that passes ASCII characters
     * through unchanged and writes every other character as a Unicode escape
     * with four lower-case hexadecimal digits.
     */
    class N2AFilter extends FilterWriter {
        public N2AFilter(Writer out) {
            super(out);
        }

        /**
         * Writes a single character, escaping it if it is not ASCII.
         */
        public void write(char b) throws IOException {
            write(new char[] { b }, 0, 1);
        }

        /**
         * Writes a single character given as an int. Overridden because the
         * inherited version hands the character to the underlying writer
         * without escaping it.
         */
        @Override
        public void write(int c) throws IOException {
            write((char) c);
        }

        /**
         * Writes part of a string. Overridden because the inherited version
         * hands the text to the underlying writer without escaping it.
         */
        @Override
        public void write(String str, int off, int len) throws IOException {
            char[] chars = str.substring(off, off + len).toCharArray();
            write(chars, 0, chars.length);
        }

        /**
         * Writes {@code len} characters of {@code buf} starting at index
         * {@code off}, escaping every character above U+007F.
         */
        @Override
        public void write(char[] buf, int off, int len) throws IOException {
            for (int i = off, end = off + len; i < end; i++) {
                char c = buf[i];
                if (c > '\u007f') {
                    // backslash, 'u', then the four hex digits, most
                    // significant nibble first
                    out.write('\\');
                    out.write('u');
                    for (int shift = 12; shift >= 0; shift -= 4) {
                        out.write(Character.forDigit((c >> shift) & 0xF, 16));
                    }
                } else {
                    out.write(c);
                }
            }
        }
    }

    /**
     * A copy of native2ascii A2NFilter: a reader that replaces Unicode
     * escapes in its input by the characters they denote, provided the
     * enclosing converter's charset can encode them. Everything else,
     * including malformed or unencodable escapes, is passed through verbatim.
     *
     * Only the {@code read} methods are filtered; the other Reader operations
     * are inherited from FilterReader and act on the underlying reader.
     */
    class A2NFilter extends FilterReader {
        // Input held back from the previous pass: a backslash and the (fewer
        // than five) characters after it, kept until enough input has been
        // seen to decide whether they form an escape.
        private char[] trailChars = null;

        // Decoded characters that have not been handed to the caller yet.
        private char[] pending = new char[0];
        private int pendingPos = 0;
        private int pendingEnd = 0;

        public A2NFilter(Reader in) {
            super(in);
        }

        /**
         * Reads decoded characters into {@code buf[off..off+len)}.
         *
         * @return the number of characters stored (at least one when
         *         {@code len > 0}), or -1 at end of input
         * @throws IndexOutOfBoundsException if {@code off}/{@code len} do not
         *         describe a range inside {@code buf}
         */
        @Override
        public int read(char[] buf, int off, int len) throws IOException {
            if (off < 0 || len < 0 || len > buf.length - off) {
                throw new IndexOutOfBoundsException();
            }
            if (len == 0) {
                return 0;
            }
            // A pass may yield nothing (it only saw the start of a possible
            // escape), so keep going until there is output or input ends.
            while (pendingPos == pendingEnd) {
                if (!decodeMore(len)) {
                    return -1;
                }
            }
            int n = Math.min(len, pendingEnd - pendingPos);
            System.arraycopy(pending, pendingPos, buf, off, n);
            pendingPos += n;
            return n;
        }

        /**
         * Reads up to {@code len} further characters from the underlying
         * reader, decodes them together with any held-back input and makes
         * the result the new pending output.
         *
         * @return false if the input is exhausted and nothing was held back
         */
        private boolean decodeMore(int len) throws IOException {
            int numChars = (trailChars == null) ? 0 : trailChars.length;
            char[] cBuf = new char[numChars + len];
            if (trailChars != null) {
                System.arraycopy(trailChars, 0, cBuf, 0, numChars);
                trailChars = null;
            }

            int n = in.read(cBuf, numChars, len);
            boolean eof = n < 0;
            if (eof) {
                if (numChars == 0) {
                    return false;
                }
            } else {
                numChars += n;
            }

            // Decoding never produces more characters than it consumes.
            char[] decoded = new char[numChars];
            int retChars = 0;

            for (int i = 0; i < numChars; ) {
                char c = cBuf[i++];

                if (c != '\\' || eof) {
                    // Non-backslash characters always pass straight through.
                    // At end of input only held-back characters (at most
                    // five) are left; they cannot hold a complete escape, so
                    // a backslash among them is a literal U+005C.
                    decoded[retChars++] = c;
                    continue;
                }

                int remaining = numChars - i;
                if (remaining < 5) {
                    // Might be the start of an escape, but there is not
                    // enough input to tell: hold it back for the next pass.
                    trailChars = Arrays.copyOfRange(cBuf, i - 1, numChars);
                    break;
                }
                // At this point at least five characters follow the backslash

                c = cBuf[i++];
                if (c != 'u') {
                    // Not a unicode escape, so copy both characters
                    decoded[retChars++] = '\\';
                    decoded[retChars++] = c;
                    continue;
                }

                int code = parseHex4(cBuf, i);
                if (code >= 0 && encoder.canEncode((char) code)) {
                    decoded[retChars++] = (char) code;
                    i += 4; // skip the four hex digits just consumed
                } else {
                    // Malformed or not encodable: keep the backslash and 'u';
                    // the following characters are then handled normally.
                    decoded[retChars++] = '\\';
                    decoded[retChars++] = 'u';
                }
            }

            pending = decoded;
            pendingPos = 0;
            pendingEnd = retChars;
            return true;
        }

        /**
         * Parses exactly four hexadecimal digits starting at
         * {@code chars[start]}. Unlike Integer.parseInt, a leading sign is
         * not accepted.
         *
         * @return the value (0..0xFFFF), or -1 if any character is not a hex digit
         */
        private int parseHex4(char[] chars, int start) {
            int value = 0;
            for (int k = 0; k < 4; k++) {
                int digit = Character.digit(chars[start + k], 16);
                if (digit < 0) {
                    return -1;
                }
                value = (value << 4) | digit;
            }
            return value;
        }

        /**
         * Reads a single decoded character.
         *
         * @return the character, or -1 at end of input
         */
        @Override
        public int read() throws IOException {
            char[] buf = new char[1];
            return (read(buf, 0, 1) == -1) ? -1 : buf[0];
        }
    }
}