33663
|
1 |
/*
 * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
|
|
25 |
|
|
26 |
package java.lang;
|
|
27 |
|
|
28 |
import java.nio.charset.Charset;
|
|
29 |
import java.util.Arrays;
|
|
30 |
|
|
31 |
import static java.lang.String.LATIN1;
|
|
32 |
import static java.lang.String.UTF16;
|
|
33 |
import static java.lang.String.COMPACT_STRINGS;
|
|
34 |
import static java.lang.Character.isSurrogate;
|
|
35 |
import static java.lang.Character.highSurrogate;
|
|
36 |
import static java.lang.Character.lowSurrogate;
|
|
37 |
import static java.lang.Character.isSupplementaryCodePoint;
|
|
38 |
import static java.lang.StringUTF16.putChar;
|
|
39 |
|
|
40 |
class StringDecoderUTF8 extends StringCoding.StringDecoder {
|
|
41 |
|
|
42 |
StringDecoderUTF8(Charset cs, String rcn) {
|
|
43 |
super(cs, rcn);
|
|
44 |
}
|
|
45 |
|
|
46 |
private static boolean isNotContinuation(int b) {
|
|
47 |
return (b & 0xc0) != 0x80;
|
|
48 |
}
|
|
49 |
|
|
50 |
private static boolean isMalformed3(int b1, int b2, int b3) {
|
|
51 |
return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
|
|
52 |
(b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80;
|
|
53 |
}
|
|
54 |
|
|
55 |
private static boolean isMalformed3_2(int b1, int b2) {
|
|
56 |
return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
|
|
57 |
(b2 & 0xc0) != 0x80;
|
|
58 |
}
|
|
59 |
|
|
60 |
private static boolean isMalformed4(int b2, int b3, int b4) {
|
|
61 |
return (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80 ||
|
|
62 |
(b4 & 0xc0) != 0x80;
|
|
63 |
}
|
|
64 |
|
|
65 |
private static boolean isMalformed4_2(int b1, int b2) {
|
|
66 |
return (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) ||
|
|
67 |
(b1 == 0xf4 && (b2 & 0xf0) != 0x80) ||
|
|
68 |
(b2 & 0xc0) != 0x80;
|
|
69 |
}
|
|
70 |
|
|
71 |
private static boolean isMalformed4_3(int b3) {
|
|
72 |
return (b3 & 0xc0) != 0x80;
|
|
73 |
}
|
|
74 |
|
|
75 |
// for nb == 3/4
|
|
76 |
private static int malformedN(byte[] src, int sp, int nb) {
|
|
77 |
if (nb == 3) {
|
|
78 |
int b1 = src[sp++];
|
|
79 |
int b2 = src[sp++]; // no need to lookup b3
|
|
80 |
return ((b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
|
|
81 |
isNotContinuation(b2)) ? 1 : 2;
|
|
82 |
} else if (nb == 4) { // we don't care the speed here
|
|
83 |
int b1 = src[sp++] & 0xff;
|
|
84 |
int b2 = src[sp++] & 0xff;
|
|
85 |
if (b1 > 0xf4 ||
|
|
86 |
(b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) ||
|
|
87 |
(b1 == 0xf4 && (b2 & 0xf0) != 0x80) ||
|
|
88 |
isNotContinuation(b2))
|
|
89 |
return 1;
|
|
90 |
if (isNotContinuation(src[sp++]))
|
|
91 |
return 2;
|
|
92 |
return 3;
|
|
93 |
}
|
|
94 |
assert false;
|
|
95 |
return -1;
|
|
96 |
}
|
|
97 |
|
|
98 |
private static char repl = '\ufffd';
|
|
99 |
|
|
100 |
StringCoding.Result decode(byte[] src, int sp, int len) {
|
|
101 |
return decode(src, sp, len, result);
|
|
102 |
}
|
|
103 |
|
|
104 |
static StringCoding.Result decode(byte[] src, int sp, int len,
|
|
105 |
StringCoding.Result ret) {
|
|
106 |
int sl = sp + len;
|
|
107 |
byte[] dst = new byte[len];
|
|
108 |
int dp = 0;
|
|
109 |
if (COMPACT_STRINGS) { // Latin1 only loop
|
|
110 |
while (sp < sl) {
|
|
111 |
int b1 = src[sp];
|
|
112 |
if (b1 >= 0) {
|
|
113 |
dst[dp++] = (byte)b1;
|
|
114 |
sp++;
|
|
115 |
continue;
|
|
116 |
}
|
|
117 |
if ((b1 == (byte)0xc2 || b1 == (byte)0xc3) &&
|
|
118 |
sp + 1 < sl) {
|
|
119 |
int b2 = src[sp + 1];
|
|
120 |
if (!isNotContinuation(b2)) {
|
|
121 |
dst[dp++] = (byte)(((b1 << 6) ^ b2)^
|
|
122 |
(((byte) 0xC0 << 6) ^
|
|
123 |
((byte) 0x80 << 0)));
|
|
124 |
sp += 2;
|
|
125 |
continue;
|
|
126 |
}
|
|
127 |
}
|
|
128 |
// anything not a latin1, including the repl
|
|
129 |
// we have to go with the utf16
|
|
130 |
break;
|
|
131 |
}
|
|
132 |
if (sp == sl) {
|
|
133 |
if (dp != dst.length) {
|
|
134 |
dst = Arrays.copyOf(dst, dp);
|
|
135 |
}
|
|
136 |
return ret.with(dst, LATIN1);
|
|
137 |
}
|
|
138 |
}
|
|
139 |
if (dp == 0) {
|
|
140 |
dst = new byte[len << 1];
|
|
141 |
} else {
|
|
142 |
byte[] buf = new byte[len << 1];
|
|
143 |
StringLatin1.inflate(dst, 0, buf, 0, dp);
|
|
144 |
dst = buf;
|
|
145 |
}
|
|
146 |
while (sp < sl) {
|
|
147 |
int b1 = src[sp++];
|
|
148 |
if (b1 >= 0) {
|
|
149 |
putChar(dst, dp++, (char) b1);
|
|
150 |
} else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) {
|
|
151 |
if (sp < sl) {
|
|
152 |
int b2 = src[sp++];
|
|
153 |
if (isNotContinuation(b2)) {
|
|
154 |
putChar(dst, dp++, repl);
|
|
155 |
sp--;
|
|
156 |
} else {
|
|
157 |
putChar(dst, dp++, (char)(((b1 << 6) ^ b2)^
|
|
158 |
(((byte) 0xC0 << 6) ^
|
|
159 |
((byte) 0x80 << 0))));
|
|
160 |
}
|
|
161 |
continue;
|
|
162 |
}
|
|
163 |
putChar(dst, dp++, repl);
|
|
164 |
break;
|
|
165 |
} else if ((b1 >> 4) == -2) {
|
|
166 |
if (sp + 1 < sl) {
|
|
167 |
int b2 = src[sp++];
|
|
168 |
int b3 = src[sp++];
|
|
169 |
if (isMalformed3(b1, b2, b3)) {
|
|
170 |
putChar(dst, dp++, repl);
|
|
171 |
sp -= 3;
|
|
172 |
sp += malformedN(src, sp, 3);
|
|
173 |
} else {
|
|
174 |
char c = (char)((b1 << 12) ^
|
|
175 |
(b2 << 6) ^
|
|
176 |
(b3 ^
|
|
177 |
(((byte) 0xE0 << 12) ^
|
|
178 |
((byte) 0x80 << 6) ^
|
|
179 |
((byte) 0x80 << 0))));
|
|
180 |
putChar(dst, dp++, isSurrogate(c) ? repl : c);
|
|
181 |
}
|
|
182 |
continue;
|
|
183 |
}
|
|
184 |
if (sp < sl && isMalformed3_2(b1, src[sp])) {
|
|
185 |
putChar(dst, dp++, repl);
|
|
186 |
continue;
|
|
187 |
}
|
|
188 |
putChar(dst, dp++, repl);
|
|
189 |
break;
|
|
190 |
} else if ((b1 >> 3) == -2) {
|
|
191 |
if (sp + 2 < sl) {
|
|
192 |
int b2 = src[sp++];
|
|
193 |
int b3 = src[sp++];
|
|
194 |
int b4 = src[sp++];
|
|
195 |
int uc = ((b1 << 18) ^
|
|
196 |
(b2 << 12) ^
|
|
197 |
(b3 << 6) ^
|
|
198 |
(b4 ^
|
|
199 |
(((byte) 0xF0 << 18) ^
|
|
200 |
((byte) 0x80 << 12) ^
|
|
201 |
((byte) 0x80 << 6) ^
|
|
202 |
((byte) 0x80 << 0))));
|
|
203 |
if (isMalformed4(b2, b3, b4) ||
|
|
204 |
!isSupplementaryCodePoint(uc)) { // shortest form check
|
|
205 |
putChar(dst, dp++, repl);
|
|
206 |
sp -= 4;
|
|
207 |
sp += malformedN(src, sp, 4);
|
|
208 |
} else {
|
|
209 |
putChar(dst, dp++, highSurrogate(uc));
|
|
210 |
putChar(dst, dp++, lowSurrogate(uc));
|
|
211 |
}
|
|
212 |
continue;
|
|
213 |
}
|
|
214 |
b1 &= 0xff;
|
|
215 |
if (b1 > 0xf4 ||
|
|
216 |
sp < sl && isMalformed4_2(b1, src[sp] & 0xff)) {
|
|
217 |
putChar(dst, dp++, repl);
|
|
218 |
continue;
|
|
219 |
}
|
|
220 |
sp++;
|
|
221 |
putChar(dst, dp++, repl);
|
|
222 |
if (sp < sl && isMalformed4_3(src[sp])) {
|
|
223 |
continue;
|
|
224 |
}
|
|
225 |
break;
|
|
226 |
} else {
|
|
227 |
putChar(dst, dp++, repl);
|
|
228 |
}
|
|
229 |
}
|
|
230 |
if (dp != len) {
|
|
231 |
dst = Arrays.copyOf(dst, dp << 1);
|
|
232 |
}
|
|
233 |
return ret.with(dst, UTF16);
|
|
234 |
}
|
|
235 |
}
|